-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathDatabaseBuilder.py
More file actions
308 lines (240 loc) · 14.9 KB
/
DatabaseBuilder.py
File metadata and controls
308 lines (240 loc) · 14.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri March 26 10:19:44 2021
@author: sb069
"""
#makes the filepaths global variables
from collections import Counter
import sys
import textwrap
import math
import numpy as np
global iupfix
global nonproteinor
global pfamtxt
global prositetxt
global txt
import os
global g
global Qtxt
g=0
from isol import *
#for iupred2a
PATH="Resources Used/iupred2a"
def DatabaseBuilder():
Question()
def Question():
global nonproteinor
global pfamtxt
global prositetxt
global txt
global csv3
global question1
global Qtxt
global g
global finalin
global PSpath
global Inpath
global Fullpath
global Prositeq
Fullpath=os.getcwd()
PSpath="'" + Fullpath + "/Resources Used/PS_Scan/ps_scan_Linux/" + "'"
Inpath= Fullpath + "/Input Files"
Outpath=Fullpath + "/Output Files"
os.environ['Input']= Inpath
os.environ['PS']= PSpath
os.environ['Output']=Outpath
os.environ['PSS']= "Resources Used/PS_Scan/ps_scan_Linux/"
if g==0:
HMMscanq=input("Do you want to run HMMscan?[Y/n] ")
Prositeq=input("Do you want to run prosite scan?[Y/n] ")
else:
pass
nonproteinor=""
question1=input("Is this the enzyme , non-enzyme or an unlabeled file:" )
question1=question1.strip()
if question1=="enzyme":
nonproteinor="1"
if question1==float:
question1=str(question1)
elif question1=="non-enzyme" or "unlabeled":
nonproteinor="0"
else:
print("Please enter 'enzyme' , 'non-enzyme' or 'unlabeled' ")
return Question()
txt=input("Please enter the name of the fasta file you wish to use:" )
Qtxt="Input Files/" + txt
csv3=input("Please enter what you wish to name the output file:" )
csv3="Output Files/" + csv3 + ".csv"
pfamtxt="Machine Learning/PFAM/" + txt #+"' "
prositetxt="Machine Learning/Prosite/" + txt
print("The class number is " + nonproteinor)
if Prositeq =="n" and HMMscanq == "n":
count()
elif Prositeq == "n" and HMMscanq=="Y":
print("Remember to have the Prosite file in FASTA and in 'PredFast/Machine Learning/Prosite' with the same name as the input file")
HMMERSCAN()
elif Prositeq== "Y" and HMMscanq=="n":
print("Remember to have the Pfam file in FASTA and in 'PredFast/Machine Learning/Pfam' with the same name as the input file")
PrositeScan()
elif Prositeq=="Y" and HMMscanq=="Y":
HMMERSCAN()
else:
print("Invalid Selection. Both programs will run.")
HMMERSCAN()
def HMMERSCAN():
global txt
global Qtxt
global hmmin
global Fullpath
global Prositeq
hmmin="'" +Qtxt+ "'"
print("The dataset will now be built Hmmer is running")
hmmstr="hmmscan --domtblout 'Machine Learning/PFAM/" + txt +"'" + " 'Machine Learning/PFAM/Pfam-A.hmm' " + hmmin + " >/dev/null"
os.system(hmmstr)
if Prositeq=="Y":
print("Hmmer has completed Prosite Scan is Running")
PrositeScan()
elif Prositeq=="n":
print("HMMScan has finished please wait for the dataset to be fully built")
count()
else:
print("Invalid Selection The program will now restart")
PrositeScan()
def PrositeScan():
global txt
global Qtxt
global hmmin
import subprocess
global Inpath
#PSin= Inpath+ "/" +txt
prositein= "'" + "Machine Learning/Prosite/" + txt + "'"
psstr1="perl 'Resources Used/PS_Scan/ps_scan_Linux/ps_scan.pl' " + hmmin + " -o fasta > " + prositein
psscan='Resources Used/PS_Scan/ps_scan_Linux/ps_scan.pl'
exportpath= "'" + Fullpath + "/Resources Used/PS_Scan/ps_scan_Linux/pfscan'"
exportpath='"'+exportpath+'"'
exportpath="my $PFSCAN =" + exportpath + ";"
#print(exportpath)
# This is how it appears in proscan we are going to manually replace path in PERL program: my $PFSCAN = 'pfscan';
with open(psscan, 'r') as prositefile:
prositedata=prositefile.read()
prositedata=prositedata.replace("my $PFSCAN = 'pfscan';" , exportpath)
prositefile.close()
with open(psscan, 'w') as prositeoutfile:
prositeoutfile.write(prositedata)
prositeoutfile.close()
subprocess.call(psstr1, shell= True)
print("Prosite Scan has finished please wait for the dataset to be fully built")
count()
def count():
global nonproteinor
global pfamtxt
global prositetxt
global txt
global csv3
global question1
global g
global Qtxt
PS=""
with open(Qtxt, "r") as txt3:
proteinout = []
#actual creation of data from sequence
for line in txt3 :
if ">" in line:
PS="0" #defines PS and PF as String
PF="0"
name=str(line[1:7]) #length of query name
with open(prositetxt,"r") as r:
for line in r:
if name in line:
PS=str(line.split(": PS",1)[1]) #allows us to get a string for Prosite
PS=PS[0:5]
with open(pfamtxt,"r") as p:
x=0
for line in p:
if name in line and x<1: #the x<1 just returns the top result setting it higher returns a different result IE 2 would return the 2nd result for that query name
PF=str(line.split(" PF",1)[1])
PF=PF[0:5] #returns just the part of the list that is the PF family
x+=1 #counter increases by one per run allows us to return the top sequence
elif ">" not in line:
sequence=line #important for pI
iupfix=sequence #important for iupred2a
counted = Counter(line)
lin= ((len(line))-1)
lins= str(lin)
#needed for iupred2a
input_pKa_set='IPC_protein' #this uses the same pka set they use
alph= str((counted["A"]*1.420 + counted["R"]*0.980 + counted["N"]*0.670 + counted["D"]*1.010 + counted["C"]*0.700 + counted["Q"]*1.110 + counted["E"]*1.510 + counted["G"]*0.570 + counted["H"]*1.000 + counted["I"]*1.080 + counted["L"]*1.210 + counted["K"]*1.160 + counted["M"]*1.450 + counted["F"]*1.130 + counted["P"]*0.570 + counted["S"]*0.770 + counted["T"]*0.830 + counted["W"]*1.080 + counted["Y"]*0.690 + counted["V"]*1.060 ) / lin) #alpha helix %
beta= str((counted["A"]*0.709 + counted["R"]*0.920 + counted["N"]*0.604 + counted["D"]*0.541 + counted["C"]*1.191 + counted["Q"]*0.840 + counted["E"]*0.567 + counted["G"]*0.657 + counted["H"]*0.863 + counted["I"]*1.799 + counted["L"]*1.261 + counted["K"]*0.721 + counted["M"]*1.210 + counted["F"]*1.393 + counted["P"]*0.354 + counted["S"]*0.928 + counted["T"]*1.221 + counted["W"]*1.306 + counted["Y"]*1.266 + counted["V"]*1.965 ) / lin) #beta strand %
bulk= str((counted["A"]*11.500 + counted["R"]*14.280 + counted["N"]*12.820 + counted["D"]*11.680 + counted["C"]*13.460 + counted["Q"]*14.450 + counted["E"]*13.570 + counted["G"]*3.400 + counted["H"]*13.690 + counted["I"]*21.400 + counted["L"]*21.400 + counted["K"]*15.710 + counted["M"]*16.250 + counted["F"]*19.800 + counted["P"]*17.430 + counted["S"]*9.470 + counted["T"]*15.770 + counted["W"]*21.670 + counted["Y"]*18.030 + counted["V"]*21.570) / lin)#bulkiness
ref= str((counted["A"]*4.340 + counted["R"]*26.660 + counted["N"]*13.280 + counted["D"]*12.000 + counted["C"]*35.770 + counted["Q"]*17.560 + counted["E"]*17.260 + counted["G"]*0.000 + counted["H"]*21.810 + counted["I"]*19.060 + counted["L"]*18.780 + counted["K"]*21.290 + counted["M"]*21.640 + counted["F"]*29.400 + counted["P"]*10.930 + counted["S"]*6.350 + counted["T"]*11.010 + counted["W"]*42.530 + counted["Y"]*31.530 + counted["V"]*13.920) / lin)#refractivity
pI= str(predict_isoelectric_point(sequence, input_pKa_set)) #pI
coil= str((counted["A"]* 0.824 + counted["R"]*0.893 + counted["N"]*1.167 + counted["D"]*1.197 + counted["C"]*0.953 + counted["Q"]*0.947 + counted["E"]*0.761 + counted["G"]* 1.251 + counted["H"]*1.068 + counted["I"]*0.886 + counted["L"]*0.810 + counted["K"]*0.897 + counted["M"]*0.810 + counted["F"]*0.797 + counted["P"]* 1.540 + counted["S"]*1.130 + counted["T"]*1.148 + counted["W"]*0.941 + counted["Y"]*1.109 + counted["V"]*0.772) / lin) # random coil
buried= str((counted["A"]*11.200 + counted["R"]*0.500 + counted["N"]*2.900 + counted["D"]*2.900 + counted["C"]*4.100 + counted["Q"]*1.600 + counted["E"]*1.800 + counted["G"]* 11.800 + counted["H"]*2.000 + counted["I"]*8.600 + counted["L"]*11.700 + counted["K"]*0.500 + counted["M"]*1.900 + counted["F"]*5.100 + counted["P"]*2.700 + counted["S"]*8.000 + counted["T"]*2.200 + counted["W"]*0.941 + counted["Y"]*2.600 + counted["V"]*12.900) / lin) # buried residues
hratio= str((counted["A"]*0.000 + counted["R"]*0.650 + counted["N"]*1.330 + counted["D"]*1.380 + counted["C"]*2.750 + counted["Q"]*0.890 + counted["E"]*0.920 + counted["G"]* 0.740 + counted["H"]* 0.580 + counted["I"]* 0.000+ counted["L"]*0.000 + counted["K"]*0.330 + counted["M"]*0.000 + counted["F"]*0.000 + counted["P"]*0.390 + counted["S"]*1.420 + counted["T"]*0.710 + counted["W"]*0.130 + counted["Y"]*0.200 + counted["V"]*0.000) / lin) #Hetero Ratio
cdensity= str((counted["A"]*1.401 + counted["R"]*1.1 + counted["N"]*1.66 + counted["D"]*1.54 + counted["C"]*0.0 + counted["Q"]*0 + counted["E"]*1.460 + counted["G"]*1.607 + counted["H"]*0.0 + counted["I"]*0.0+ counted["L"]*1.191+ counted["K"]*0.0 + counted["M"]*1.340 + counted["F"]*0 + counted["P"]*0 + counted["S"]*1.537 + counted["T"]*0.0 + counted["W"]*0.0 + counted["Y"]*1.456 + counted["V"]*1.230) / lin) # crystal density prediction
#for solubility some were listed as only "very soluble" at g/mL these values were given into the database as 100
dalt= str(counted["A"]*89 + counted["R"]*174 + counted["N"]*132 + counted["D"]*133 + counted["C"]*121 + counted["Q"]*146 + counted["E"]*147 + counted["G"]*75 + counted["H"]*155 + counted["I"]*131 + counted["L"]*131 + counted["K"]*146 + counted["M"]*149 + counted["F"]*165 + counted["P"]*115 + counted["S"]*105 + counted["T"]*119 + counted["W"]*204 + counted["Y"]*181 + counted["V"]*117)#counts daltons
polar= str((counted["A"]*8.100 + counted["R"]*10.500 + counted["N"]*11.600 + counted["D"]*13.000 + counted["C"]*5.500 + counted["Q"]*10.500 + counted["E"]*12.300 + counted["G"]*9.000 + counted["H"]*10.400 + counted["I"]*5.200 + counted["L"]*4.900 + counted["K"]*11.300 + counted["M"]*5.700 + counted["F"]*5.200 + counted["P"]*8.000 + counted["S"]*9.200 + counted["T"]*8.600 + counted["W"]*5.400 + counted["Y"]*6.200 + counted["V"]*5.900) / lin ) #gets average polarity for the protein
hydro= str((counted["A"]*-0.400 + counted["R"]*-0.590 + counted["N"]*-0.920 + counted["D"]*-1.310 + counted["C"]*0.170 + counted["Q"]*-0.910 + counted["E"]*-1.220 + counted["G"]*-0.670 + counted["H"]*-0.640 + counted["I"]*1.250 + counted["L"]*1.220 + counted["K"]*-0.670 + counted["M"]*1.020 + counted["F"]*1.920 + counted["P"]*-0.490 + counted["S"]*-0.550 + counted["T"]*-0.280 + counted["W"]*0.500 + counted["Y"]* 1.670 + counted["V"]*0.910) / lin) # gets average hydrophobicity for protein
flex= str((counted["A"]*0.360 + counted["R"]*0.530 + counted["N"]*0.460 + counted["D"]*0.510 + counted["C"]*0.350 + counted["Q"]*0.490 + counted["E"]* 0.500 + counted["G"]*0.540 + counted["H"]*0.320 + counted["I"]*0.460 + counted["L"]*0.370 + counted["K"]*0.470 + counted["M"]*0.300 + counted["F"]*0.310 + counted["P"]*0.510 + counted["S"]*0.510 + counted["T"]*0.440 + counted["W"]*0.310 + counted["Y"]*0.420 + counted["V"]*0.390) / lin) #gets average flexibility for the protein
proteinout.append(name + "," + str(counted["A"]/ lin)+ "," + str(counted["R"]/ lin)+ "," + str(counted["N"]/ lin)+ "," + str(counted["D"]/ lin)+ "," + str(counted["C"]/ lin)+ "," + str(counted["Q"]/ lin)+ "," + str(counted["E"]/ lin)+ "," + str(counted["G"]/ lin)+ "," + str(counted["H"]/ lin)+ "," + str(counted["I"]/ lin)+ "," + str(counted["L"]/ lin)+ "," + str(counted["K"]/ lin)+ "," + str(counted["M"]/ lin)+ "," + str(counted["F"]/ lin)+ "," + str(counted["P"]/ lin)+ "," + str(counted["S"]/ lin)+ "," + str(counted["T"]/ lin)+ "," + str(counted["W"]/ lin)+ "," + str(counted["Y"]/ lin)+ "," + str(counted["V"]/ lin)+ "," + lins + "," + dalt + "," + hydro + "," + polar + "," + flex + "," + pI + "," + ref + "," + bulk +"," + alph+ "," + beta + "," + coil+ "," + buried + "," + hratio + "," + cdensity + "," + str(PS) + "," + str(PF) + "," + nonproteinor) #this is what actually makes the dataset #this is what actually makes the dataset
proteinfinal='\n'.join(proteinout)
#removing final and last to remove lines
#closing file
txt3.close()
global g
# Make a new file
global csv3
csv1= open(csv3, "w")
csv1.write(proteinfinal)
csv1.close()
r.close()
p.close()
print("This has made the " + question1 + " part of the file have you run both enzymes and non-enzymes")
print("Output Files located in the folder 'Output Files")
if g<1 and question1!="unlabeled":
g+=1
DatabaseBuilder()
elif g>1 or g==1:
Finalizer()
else:
Finalizer()
def Finalizer():
global csv3
global txt
final=""
header="Machine Learning/Header.txt"
Ftxt=csv3
Ftxt2='no file'
#Ftxt3=input("Please enter what you wish to name the database: ")
Ftxt3=r"Databases/" + txt + ".csv"
if Ftxt2=="no file":
with open(header) as header1:
header2=header1.read()
with open(Ftxt) as read1:
data1=read1.read()
final+= header2
final+= "\n"
final+= data1
with open (Ftxt3, 'w+') as output:
output.write(final)
print("The dataset is now ready to be used in an algorithm the file will be found in 'Databases' ")
else:
Ftxt2="Output Files/" + Ftxt2
with open(header) as header1:
header2=header1.read()
with open(Ftxt) as read1:
data1=read1.read()
with open(Ftxt2) as read2:
data2=read2.read()
final+= header2
final+= "\n"
final+= data1
final+= "\n"
final+=data2
with open (Ftxt3, 'w+') as output:
output.write(final)
print("The full dataset is now ready to be used in an algorithm the file will be found in 'Databases'")
DatabaseBuilder()