forked from ecemecemk/HoliLoc
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathprotein_sl_predictor.py
248 lines (187 loc) · 10.1 KB
/
protein_sl_predictor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
# Suppress warnings
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import argparse
import pandas as pd
import numpy as np
import cv2
from tensorflow.keras.models import load_model
from sklearn.preprocessing import MinMaxScaler
import h5py
# The 22 subcellular-location labels reported by the predictors in this file.
# Each predict_* function maps position i of a model's output vector to
# class_names[i], so the order of this list is significant.
# NOTE(review): presumably this order must match the label order used when the
# models were trained -- confirm before reordering or extending.
class_names = ["Actin_filaments", "Aggresome", "Cell_junctions", "MTOC", "Centrosome",
"Cytoplasmic_bodies", "Cytosol", "ER", "Focal_adhesion_sites",
"Golgi_apparatus", "Intermediate_filaments", "Microtubules",
"Mitotic_spindle", "Nuclear_bodies", "Nuclear_membrane",
"Nuclear_speckles", "Nucleoli", "Nucleoli_fibrillar_center",
"PM", "Nucleoplasm", "Mitochondria", "Cytokinetic_bridge"]
def predict_protein_location(model_type, **kwargs):
    """Route a prediction request to the predictor selected by ``model_type``.

    Args:
        model_type: One of "image", "sequence", "PPI" or "HoliLoc"; the
            comparison is exact (case-sensitive).
        **kwargs: Forwarded unchanged to the selected predictor.

    Returns:
        Whatever the selected predictor returns. For an unrecognised
        ``model_type`` a message is printed and an empty list is returned.
    """
    # Guard clauses: every recognised model type returns immediately.
    if model_type == "image":
        return predict_protein_location_with_image(**kwargs)
    if model_type == "sequence":
        return predict_protein_location_with_sequence(**kwargs)
    if model_type == "PPI":
        return predict_protein_location_with_PPI(**kwargs)
    if model_type == "HoliLoc":
        return predict_protein_location_with_HoliLoc(**kwargs)

    # Nothing matched: report the problem and give the caller an empty result.
    print(f"Invalid model type: {model_type}. Please choose from image, sequence, PPI, or HoliLoc.")
    return []
def prompt_for_image_args():
    """Ask on stdin for the inputs of the image-based predictor.

    Returns:
        dict with the keys ``image_path`` and ``image_model_path`` holding the
        raw strings typed by the user, asked in that order.
    """
    image_path = input('Enter the path to the protein image: ')
    image_model_path = input('Enter the path to the Image Based Model File: ')
    return {'image_path': image_path, 'image_model_path': image_model_path}
def prompt_for_sequence_args():
    """Ask on stdin for the inputs of the sequence-based predictor.

    Returns:
        dict with the keys ``target_protein``, ``sequence_model_path`` and
        ``sequence_embeddings_path`` holding the raw strings typed by the
        user, asked in that order.
    """
    answers = {}
    answers['target_protein'] = input('Enter the UniProt ID of the target protein: ')
    answers['sequence_model_path'] = input('Enter the path to the Sequence model file: ')
    answers['sequence_embeddings_path'] = input('Enter the path to the sequence embeddings file: ')
    return answers
def prompt_for_PPI_args():
    """Ask on stdin for the inputs of the PPI-based predictor.

    Returns:
        dict with the keys ``target_protein``, ``PPI_model_path`` and
        ``PPI_embeddings_path`` holding the raw strings typed by the user,
        asked in that order.
    """
    answers = {}
    answers['target_protein'] = input('Enter the UniProt ID of the target protein: ')
    answers['PPI_model_path'] = input('Enter the path to the PPI model file: ')
    answers['PPI_embeddings_path'] = input('Enter the path to the PPI embeddings CSV file: ')
    return answers
def prompt_for_HoliLoc_args():
    """Ask on stdin for the inputs of the combined HoliLoc predictor.

    Returns:
        dict with the keys ``target_protein``, ``image_path``,
        ``holiloc_model_path``, ``sequence_embeddings_path`` and
        ``ppi_embeddings_path`` holding the raw strings typed by the user,
        asked in that order.
    """
    # (result key, prompt text) pairs, listed in the order they are asked.
    questions = (
        ('target_protein', 'Enter the UniProt ID of the target protein: '),
        ('image_path', 'Enter the path to the protein image: '),
        ('holiloc_model_path', 'Enter the path to the Holiloc model file: '),
        ('sequence_embeddings_path', 'Enter the path to the sequence embeddings file: '),
        ('ppi_embeddings_path', 'Enter the path to the PPI embeddings CSV file: '),
    )
    answers = {}
    for key, prompt_text in questions:
        answers[key] = input(prompt_text)
    return answers
def predict_protein_location_with_image(image_path, image_model_path):
    """Predict subcellular locations from a protein image.

    Args:
        image_path: Path of the image file, read with ``cv2.imread``.
        image_model_path: Path of the saved model, loaded with ``load_model``.

    Returns:
        List of ``class_names`` entries whose model score is not below the
        decision threshold. If the image cannot be read, a message is printed
        and an empty list is returned (the same convention the other
        predictors in this file use for missing inputs).
    """
    # Decision cut-off applied to every output score.
    # NOTE(review): presumably tuned during model evaluation -- confirm.
    best_threshold_image = 0.09530000000000001

    # Load Model
    image_model = load_model(image_model_path)

    # Load Image and Get Image Feature Vector
    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread reports a missing/unreadable file by returning None
        # instead of raising; without this check cvtColor fails with an
        # opaque OpenCV assertion error.
        print(f"Image could not be read from {image_path}. Unable to make predictions.")
        return []
    image_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    image_rgb = cv2.resize(image_rgb, (224, 224))
    # Scale pixel values by 1/255 as float32 (the array is already 3-D, so no
    # further stacking is needed).
    image_feature_vector = image_rgb.astype(np.float32) / 255

    # Get prediction: add a leading batch axis of size 1 for the model.
    image_feature_vector_single = np.expand_dims(image_feature_vector, axis=0)
    pred_image = image_model.predict(image_feature_vector_single)
    outcome_image = np.where(pred_image < best_threshold_image, 0, 1)
    return [class_names[i] for i, value in enumerate(outcome_image[0]) if value == 1]
def predict_protein_location_with_sequence(target_protein, sequence_model_path, sequence_embeddings_path):
    """Predict subcellular locations from a protein's sequence embedding.

    Args:
        target_protein: Identifier used as the entry name inside the HDF5
            embeddings file (the CLI asks for a UniProt ID).
        sequence_model_path: Path of the saved model, loaded with ``load_model``.
        sequence_embeddings_path: Path of the HDF5 file opened with ``h5py``.

    Returns:
        List of ``class_names`` entries whose model score is not below the
        decision threshold. If no embedding is stored for ``target_protein``,
        a message is printed and an empty list is returned.
    """
    # Decision cut-off applied to every output score.
    # NOTE(review): presumably tuned during model evaluation -- confirm.
    best_threshold_sequence = 0.0369

    # Load Model
    sequence_model = load_model(sequence_model_path)

    # Fetch only the requested embedding. Reading every dataset in the file
    # just to pick one costs time and memory proportional to the whole file.
    with h5py.File(sequence_embeddings_path, "r") as file:
        if target_protein not in file:
            print(f"Sequence embedding not found for {target_protein}. Unable to make predictions.")
            return []
        target_embedding_sequence = np.array(file[target_protein])

    # Min-max normalise the embedding against its own minimum and maximum:
    # the scaler is fitted on this single vector, presented as one column.
    sequence_embeddings_array_2d = target_embedding_sequence.reshape(-1, 1)
    scaler = MinMaxScaler(feature_range=(0, 1))
    sequence_embeddings_array_normalized = scaler.fit_transform(sequence_embeddings_array_2d)
    sequence_embeddings_array_normalized = sequence_embeddings_array_normalized.reshape(-1,)

    # Get prediction: add a leading batch axis of size 1 for the model.
    sequence_embeddings_single = np.expand_dims(sequence_embeddings_array_normalized, axis=0)
    pred_sequence = sequence_model.predict(sequence_embeddings_single)
    outcome_sequence = np.where(pred_sequence < best_threshold_sequence, 0, 1)
    return [class_names[i] for i, value in enumerate(outcome_sequence[0]) if value == 1]
def predict_protein_location_with_PPI(target_protein, PPI_model_path, PPI_embeddings_path):
    """Predict subcellular locations from a protein's PPI embedding.

    Args:
        target_protein: Value looked up in the ``UNIPROT`` column of the CSV.
        PPI_model_path: Path of the saved model, loaded with ``load_model``.
        PPI_embeddings_path: Path of a CSV file with ``UNIPROT`` and
            ``PPI_Embedding`` columns; the embedding is stored as text.

    Returns:
        List of ``class_names`` entries whose model score is not below the
        decision threshold. If no row matches ``target_protein``, a message is
        printed and an empty list is returned.
    """
    # Decision cut-off applied to every output score.
    # NOTE(review): presumably tuned during model evaluation -- confirm.
    best_threshold_PPI = 0.029400000000000003

    human_interactome = pd.read_csv(PPI_embeddings_path)

    # Load Model
    PPI_model = load_model(PPI_model_path)

    # Select the target's row first so that only one cell has to be parsed.
    is_target = human_interactome['UNIPROT'] == target_protein
    matching_embeddings = human_interactome.loc[is_target, 'PPI_Embedding']
    if matching_embeddings.empty:
        print(f"PPI embedding not found for {target_protein}. Unable to make predictions.")
        return []

    # SECURITY NOTE(review): eval() executes whatever expression the CSV cell
    # contains, so the embeddings file must come from a trusted source. If the
    # cells are plain Python literals, ast.literal_eval is a safer parser --
    # confirm the stored format before switching.
    target_embedding_graph = np.array(eval(matching_embeddings.iloc[0]))
    target_embedding_graph = target_embedding_graph.astype(float)

    # Min-max normalise the embedding against its own minimum and maximum:
    # the scaler is fitted on this single vector, presented as one column.
    target_embedding_graph_2d = target_embedding_graph.reshape(-1, 1)
    scaler = MinMaxScaler(feature_range=(0, 1))
    PPI_embeddings_array_normalized = scaler.fit_transform(target_embedding_graph_2d)
    PPI_embeddings_array_normalized = PPI_embeddings_array_normalized.reshape(-1,)

    # Get prediction: add a leading batch axis of size 1 for the model.
    PPI_embeddings_single = np.expand_dims(PPI_embeddings_array_normalized, axis=0)
    pred_PPI = PPI_model.predict(PPI_embeddings_single)
    outcome_PPI = np.where(pred_PPI < best_threshold_PPI, 0, 1)
    return [class_names[i] for i, value in enumerate(outcome_PPI[0]) if value == 1]
def predict_protein_location_with_HoliLoc(target_protein, image_path, holiloc_model_path, sequence_embeddings_path, ppi_embeddings_path):
    """Predict subcellular locations from image, sequence and PPI inputs together.

    Args:
        target_protein: Identifier used both as the entry name in the HDF5
            sequence-embeddings file and as the ``UNIPROT`` value in the CSV.
        image_path: Path of the image file, read with ``cv2.imread``.
        holiloc_model_path: Path of the saved model, loaded with ``load_model``.
        sequence_embeddings_path: Path of the HDF5 file opened with ``h5py``.
        ppi_embeddings_path: Path of a CSV file with ``UNIPROT`` and
            ``PPI_Embedding`` columns; the embedding is stored as text.

    Returns:
        List of ``class_names`` entries whose model score is not below the
        decision threshold. If the image cannot be read, or no sequence or PPI
        embedding exists for ``target_protein``, a message is printed and an
        empty list is returned (matching the single-input predictors).
    """
    # Decision cut-off applied to every output score.
    # NOTE(review): presumably tuned during model evaluation -- confirm.
    best_threshold_holiloc = 0.11560000000000001

    # Load Model
    holiloc_model = load_model(holiloc_model_path)

    # Load Image Data and Obtain Image Feature Vector
    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread reports a missing/unreadable file by returning None
        # instead of raising; without this check cvtColor fails with an
        # opaque OpenCV assertion error.
        print(f"Image could not be read from {image_path}. Unable to make predictions.")
        return []
    image_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    image_rgb = cv2.resize(image_rgb, (224, 224))
    # Scale pixel values by 1/255 as float32 (the array is already 3-D, so no
    # further stacking is needed).
    image_feature_vector = image_rgb.astype(np.float32) / 255

    # Get Sequence Embedding: fetch only the requested entry. A missing entry
    # previously left the variable unbound and crashed with UnboundLocalError.
    with h5py.File(sequence_embeddings_path, "r") as file:
        if target_protein not in file:
            print(f"Sequence embedding not found for {target_protein}. Unable to make predictions.")
            return []
        target_embedding_sequence = np.array(file[target_protein])

    # Min-max normalise the embedding against its own minimum and maximum:
    # the scaler is fitted on this single vector, presented as one column.
    sequence_embeddings_array_2d = target_embedding_sequence.reshape(-1, 1)
    sequence_scaler = MinMaxScaler(feature_range=(0, 1))
    sequence_embeddings_array_normalized = sequence_scaler.fit_transform(sequence_embeddings_array_2d)
    sequence_embeddings_array_normalized = sequence_embeddings_array_normalized.reshape(-1,)

    # Get PPI Embedding: select the target's row first so that only one cell
    # has to be parsed, and report a missing row instead of raising IndexError.
    human_interactome = pd.read_csv(ppi_embeddings_path)
    is_target = human_interactome['UNIPROT'] == target_protein
    matching_embeddings = human_interactome.loc[is_target, 'PPI_Embedding']
    if matching_embeddings.empty:
        print(f"PPI embedding not found for {target_protein}. Unable to make predictions.")
        return []

    # SECURITY NOTE(review): eval() executes whatever expression the CSV cell
    # contains, so the embeddings file must come from a trusted source. If the
    # cells are plain Python literals, ast.literal_eval is a safer parser --
    # confirm the stored format before switching.
    target_embedding_graph = np.array(eval(matching_embeddings.iloc[0]))
    target_embedding_graph = target_embedding_graph.astype(float)
    target_embedding_graph_2d = target_embedding_graph.reshape(-1, 1)
    # A separate scaler fitted on the PPI vector alone (fit_transform refits,
    # so this matches refitting a shared scaler).
    ppi_scaler = MinMaxScaler(feature_range=(0, 1))
    PPI_embeddings_array_normalized = ppi_scaler.fit_transform(target_embedding_graph_2d)
    PPI_embeddings_array_normalized = PPI_embeddings_array_normalized.reshape(-1,)

    # Get Prediction: each input gets a leading batch axis of size 1 and the
    # three are passed to the model as a list, in image/sequence/PPI order.
    image_feature_vector_single = np.expand_dims(image_feature_vector, axis=0)
    sequence_embeddings_single = np.expand_dims(sequence_embeddings_array_normalized, axis=0)
    PPI_embeddings_single = np.expand_dims(PPI_embeddings_array_normalized, axis=0)
    pred_holiloc = holiloc_model.predict([image_feature_vector_single, sequence_embeddings_single, PPI_embeddings_single])
    outcome_holiloc = np.where(pred_holiloc < best_threshold_holiloc, 0, 1)
    return [class_names[i] for i, value in enumerate(outcome_holiloc[0]) if value == 1]
def parse_args():
    """Parse the command-line options of the prediction script.

    Returns:
        argparse.Namespace whose ``model_type`` attribute holds the value of
        the required ``--model_type`` option.
    """
    cli = argparse.ArgumentParser(description='Protein Subcellular Location Prediction')
    cli.add_argument(
        '--model_type',
        required=True,
        type=str,
        help='Type of the model (image, sequence, PPI, HoliLoc)',
    )
    namespace = cli.parse_args()
    return namespace
def main():
    """Command-line entry point.

    Reads ``--model_type`` from the command line, asks interactively for the
    inputs that model needs, runs the prediction and prints the result. For an
    unrecognised model type a message is printed and nothing else happens.
    """
    args = parse_args()

    # Model type -> function that prompts for that model's inputs.
    prompt_by_model_type = {
        "image": prompt_for_image_args,
        "sequence": prompt_for_sequence_args,
        "PPI": prompt_for_PPI_args,
        "HoliLoc": prompt_for_HoliLoc_args,
    }
    ask_for_inputs = prompt_by_model_type.get(args.model_type)
    if ask_for_inputs is None:
        print(f"Invalid model type: {args.model_type}. Please choose from image, sequence, PPI, or HoliLoc.")
        return

    # Start from the parsed options, add the prompted answers, then set
    # 'model_type' explicitly so it is always passed as a keyword argument.
    all_args = dict(vars(args))
    all_args.update(ask_for_inputs())
    all_args['model_type'] = args.model_type

    predictions = predict_protein_location(**all_args)
    print("Predicted Subcellular Locations:", predictions)


# Run the CLI only when the file is executed as a script, not when imported.
if __name__ == "__main__":
    main()