Overview
昨天跟Chris
讨论SVM
分类预测准确性的时候,知道PSSM_AC
的作用比PSSM
作用更明显,于是决定将以前的python
脚本改进一下,输出PSSM
和PSSM_AC
这两个文件,方便观察。该脚本包括两部分,本文将按顺序记录下来。
以前的脚本可以参考我之前的文章蛋白质序列特征提取方法之——PSSM。
1. t34pssm.py
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 | #! /usr/bin/env python import fileinput import sys from os import listdir from os.path import isfile, join import re from pssm_ft import * smplfasta = sys.argv[ 1 ] spfasta = sys.argv[ 2 ] check_head = re. compile (r '\>' ) #read from undersample fasta, store smplist = [] smpcnt = 0 for line, strin in enumerate (fileinput. input (smplfasta)): if check_head.match(strin): smplist.append(strin.strip()) smpcnt + = 1 onlyfiles = [ f for f in listdir(spfasta) if isfile(join(spfasta,f)) ] fastaDict = {} for fi in onlyfiles: cntnt = '' for line, strin in enumerate (fileinput. input (spfasta + '/' + fi)): if line = = 0 : cntnt + = strin.strip() if cntnt in fastaDict: print strin fastaDict[cntnt] = fi pssmdir = sys.argv[ 3 ] finalist = [] for smp in smplist: finalist.append(pssmdir + '/' + fastaDict[smp].split( '.' )[ 0 ] + '.pssm' ) for fi in finalist: pssm(fi, 'total_train_60.pssm' , 'total_train_60.pssm_AC' ) #此处多了一个参数,多了一个输出文件 |
2. pssm_ft.py
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 | import sys import numpy as np import math import re import fileinput GROUP = 10 def scale(x, max, min): """ scale x to [-1,1] """ try: return -(1-(x-min)/(max-min))+((x-min)/(max-min)) except Exception: print "ZeroDivisionError, max-min:\s", max-min def pssm_ac_cal(PSSM, g, l): """ :param PSSM: ['1','-1'...] :param g: group :param l: sequence length :return PSSM_AC: g * 20 matrix """ PSSM_AC = np.array([ [0.0] * 20 ] * g) for pg in range(g): l_g = l - pg - 1 for pj in range(20): sum_jl = 0.0 for i in range(l): sum_jl += PSSM[i][pj] sum_jl /= l pssm_acjg = 0.0 for i in range(l_g): pssm_acjg += (PSSM[i][pj]-sum_jl) * (PSSM[i+pg+1][pj]-sum_jl) pssm_acjg /= l_g PSSM_AC[pg][pj] = pssm_acjg return PSSM_AC def pssm(fi,output,output_AC): # 0-19 represents amino acid 'ARNDCQEGHILKMFPSTWYV' Amino_vec = "ARNDCQEGHILKMFPSTWYV" PSSM = [] ACC = [ [0.0] * 20 ] * 20 seq_cn = 0 for line, strin in enumerate(fileinput.input(fi)): if line > 2: str_vec = strin.split()[1:22] if len(str_vec) == 0: break PSSM.append(map(int, str_vec[1:])) seq_cn += 1 #test1_1 += float(str_vec[1]) test1_1 = -103 ACC[Amino_vec.index(str_vec[0])] = map(sum, zip(map(int, str_vec[1:]), ACC[Amino_vec.index(str_vec[0])])) fileinput.close() # section for ACC features ACC_np = np.array(ACC) ACC_np = np.divide(ACC_np, seq_cn) amcnt_max = np.amax(ACC_np) amcnt_min = np.amin(ACC_np) vfunc = np.vectorize(scale) ACC_np = vfunc(ACC_np, amcnt_max,amcnt_min) ACC_shp = np.shape(ACC_np) #section for PSSM_AC features PSSM = np.array(PSSM) PSSM_AC = pssm_ac_cal(PSSM, GROUP, seq_cn) PSSM_shp = np.shape(PSSM_AC) file_out = file(output,'a') file_out_AC = file(output_AC,'a') #此处将保存文件的代码拆分开 np.savetxt(file_out, [np.reshape(ACC_np, (ACC_shp[0] * ACC_shp[1], ))], delimiter=",") np.savetxt(file_out_AC, [np.reshape(PSSM_AC, (PSSM_shp[0] * PSSM_shp[1], ))], delimiter=",") |