Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
88 changes: 48 additions & 40 deletions analysis/compile_cas9_fidelity.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,24 @@
from tqdm import tqdm
import os

from Bio.Align import PairwiseAligner, substitution_matrices
from sequence_models.utils import parse_fasta
from tqdm import tqdm

from dayhoff.analysis_utils import get_all_paths, results_to_pandas

base_path = "/home/kevyan/generations/cas9/"
base_path = "/home/kevyan/generations/cas9-no-order/"

models = ["short_cas9s_1.0_minp0.00_new"]
for m in models:
pdb_paths, mpnn_paths = get_all_paths(os.path.join(base_path, "%s_structures/pdb/esmfold/" %m), os.path.join(base_path, "%s_structures/esmfoldmpnn_iftemp_1" %m))
fold_df, mpnn_df, df = results_to_pandas(pdb_paths, mpnn_paths, name="")
df['model'] = m
model = "short_cas9s_1.0_minp0.00_new"
folding_df = pd.read_csv(os.path.join(base_path, 'esmfold_proteinmpnn_merge_data.csv'))
seqs, names = parse_fasta(os.path.join(base_path, "%s.fasta" % model), return_names=True)
df = folding_df[folding_df['if_temp'] == 1.0]
name_df = pd.DataFrame()
name_df['sequence'] = seqs
name_df['file'] = names
df = pd.merge(name_df, df, how='left', on='file')
# for m in models:
# pdb_paths, mpnn_paths = get_all_paths(os.path.join(base_path, "%s_structures/pdb/esmfold/" %m), os.path.join(base_path, "%s_structures/esmfoldmpnn_iftemp_1" %m))
# fold_df, mpnn_df, df = results_to_pandas(pdb_paths, mpnn_paths, name="")
# df['model'] = m

aligner = PairwiseAligner()
aligner.substitution_matrix = substitution_matrices.load("BLOSUM62")
Expand All @@ -22,41 +28,43 @@
aligner.query_end_gap_score = 0.0
with tqdm(total=len(df)) as pbar:
homologs, homolog_names = parse_fasta(os.path.join('/home/kevyan/data/characterized_cas9s', "naturals.fasta"), return_names=True)
for idx, row in df.iterrows():
s = row['sequence']
s = s.replace("-", "")
s = s.replace("<mask2>", "")
s = s.replace("<mask1>", "")
s = s.replace("<mask3>", "")
s = s.replace("<eos>", "")
best_matches = -1
best_homolog_sequence = None
best_homolog_name = None
best_cterm_gaps = None
for hs, hn in zip(homologs, homolog_names):
alignment = aligner.align(s, hs)
if alignment.score > best_matches:
best_matches = alignment.score
best_homolog_sequence = hs
best_homolog_name = hn
best_cterm_gaps = len(hs) - alignment[0].aligned[1, -1, 1]
df.loc[idx, 'gen_length'] = len(s)
df.loc[idx, 'best_matches'] = best_matches
df.loc[idx, 'match_length'] = len(best_homolog_sequence)
df.loc[idx, 'homolog_name'] = best_homolog_name
df.loc[idx, 'homolog_sequence'] = best_homolog_sequence
df.loc[idx, 'cterm_gaps'] = best_cterm_gaps
pbar.update(1)

for model in models:
seqs, names = parse_fasta(os.path.join(base_path, "%s.fasta" %model), return_names=True)
for s, n in zip(seqs, names):
s = s.replace("-", "")
s = s.replace("<mask2>", "")
s = s.replace("<mask1>", "")
s = s.replace("<mask3>", "")
s = s.replace("<eos>", "")
best_matches = -1
best_homolog_sequence = None
best_homolog_name = None
best_cterm_gaps = None
for hs, hn in zip(homologs, homolog_names):
alignment = aligner.align(s, hs)
if alignment.score > best_matches:
best_matches = alignment.score
best_homolog_sequence = hs
best_homolog_name = hn
best_cterm_gaps = len(hs) - alignment[0].aligned[1, -1, 1]

idx = df[(df['model'] == model) & (df['file'] == n)].index[0]
df.loc[idx, 'sequence'] = s
df.loc[idx, 'gen_length'] = len(s)
df.loc[idx, 'best_matches'] = best_matches
df.loc[idx, 'match_length'] = len(best_homolog_sequence)
df.loc[idx, 'homolog_name'] = best_homolog_name
df.loc[idx, 'homolog_sequence'] = best_homolog_sequence
df.loc[idx, 'cterm_gaps'] = best_cterm_gaps
pbar.update(1)

df['plddt'] = df['esmfoldplddt']
df['scperplexity'] = df['proteinmpnnperplexity']
df['seq_id'] = df['best_matches'] / df['gen_length']
df = df.sort_values(['cterm_gaps', 'plddt'], ascending=[True, False])
df['name'] = [f.split('_')[-1] for f in df['file']]
df.to_csv(os.path.join(base_path, "%s_fidelity.csv" %models[0]), index=False)
df.to_csv(os.path.join(base_path, "%s_fidelity.csv" %model), index=False)

df = pd.read_csv(os.path.join(base_path, "%s_fidelity.csv" %model))

df[df['plddt'] > .70].head(10)[['name', 'match_length', 'gen_length', 'plddt', 'cterm_gaps', 'best_matches']]
# 52, 8, and 50 have the most domain hits
# 52, 8, and 50 have the most domain hits
df[df['plddt'] > 0.7].shape
df.loc[[0, 1, 2, 18, 19, 21], ['name', 'sequence']].values
df.loc[[0, 1, 2, 18, 19, 21], ['name', 'homolog_name', 'homolog_sequence']].values
123 changes: 81 additions & 42 deletions analysis/compile_msa_fidelity.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,17 @@
import os
from collections import Counter
from tqdm import tqdm

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from Bio.Align import PairwiseAligner
from scipy.stats import pearsonr
from sequence_models.utils import parse_fasta
from tqdm import tqdm
from scipy.stats import ttest_rel

from dayhoff.analysis_utils import results_to_pandas, get_all_paths, run_tmscore
from sequence_models.utils import parse_fasta
from dayhoff.analysis_utils import get_all_paths, results_to_pandas, run_tmscore

sns.set_theme(font_scale=1.2)
Expand All @@ -18,17 +20,16 @@
base_path = "/home/kevyan/generations/queries_from_homologs"

models = ["natural", "xlstm", "evodiff", "evodiff_nom",
"gap_1.0_0.01", "gap_1.2_0.01", "gap_1.0_0.00", "gap_1.1_0.05", "gap_1.0_0.00_nom", "gap_1.0_0.05_nom",
"ccmgen", "ccmgen_short",
"indel_1.0_0.00", "indel_1.2_0.01", "indel_1.0_0.01", "indel_1.1_0.05", "indel_1.0_0.00_nom", "indel_1.0_0.05_nom"]
"gap_1.0_0.05_nom",
"ccmgen",
"indel_1.0_0.05_nom"]
raw_dfs = []
for m in models:
pdb_paths, mpnn_paths = get_all_paths(os.path.join(base_path, "%s_structures/pdb/esmfold/" %m), os.path.join(base_path, "%s_structures/esmfoldmpnn_iftemp_1" %m))
fold_df, mpnn_df, merged_df = results_to_pandas(pdb_paths, mpnn_paths, name="")
merged_df['model'] = m
raw_dfs.append(merged_df)
df = pd.concat(raw_dfs, ignore_index=True)

model_to_name = {m: m for m in models}
model_to_name['natural'] = 'queries'
model_to_name['gap_1.0_0.01'] = '3b-cooled_25000_gap_t1.0_0.01'
Expand Down Expand Up @@ -60,11 +61,12 @@
gen_length = len(s)
n_homologs = len(homologs)
best_id = -1
if model == 'natural':
homologs = homologs[1:]
for i, homolog in enumerate(homologs):
if i == 0:
query_length = len(homolog)
if model == 'natural':
continue

alignment = aligner.align(s, homolog)
if alignment.score > best_matches:
best_matches = alignment.score
Expand Down Expand Up @@ -100,56 +102,75 @@
len(set(df['file']))
df.to_csv(os.path.join(base_path, "compiled_fidelities.csv"), index=False)
df = pd.read_csv(os.path.join(base_path, "compiled_fidelities.csv"))
models_to_plot = {
"natural": "Natural",
"ccmgen": "CCMgen",
"evodiff_nom": "EvoDiff-MSA",
"xlstm": 'Prot-xLSTM',
"gap_1.0_0.05_nom": "Aligned",
"indel_1.0_0.05_nom": "Unaligned"
}



print("model R(n_homologs, plddt)")
for model in models:
df_lim = df[df['model'] == model]
print(model, pearsonr(df_lim['n_homologs'], df_lim['plddt']).statistic)

print("model R(n_homologs, seq_id)")
for model in models:
df_lim = df[df['model'] == model]
print(model, pearsonr(df_lim['n_homologs'], df_lim['seq_id']).statistic)

print("model R(n_homologs, tmscore)")
for model in models[1:]:
df_lim = df[df['model'] == model]
print(model, pearsonr(df_lim['n_homologs'], df_lim['tmscore']).statistic)

print("model R(seq_id, tmscore)")
for model in models[1:]:
df_lim = df[df['model'] == model]
print(model, pearsonr(df_lim['seq_id'], df_lim['tmscore']).statistic)


grouped = df.groupby('model')
model1 = "indel_1.0_0.05_nom"
model2 = "gap_1.0_0.05_nom"
df.columns
all_names = list(set(df['file']))
metrics = {
"plddt": [],
"perplexity": [],
"tmscore": [],
"seq_id": []
}
m_dict = {
"plddt": "pLDDT",
"perplexity": "scPerplexity",
"tmscore": "TM score",
"seq_id": "Sequence identity"
}
for n in all_names:
for m in metrics:
first = df[(df['model'] == model1) & (df['file'] == n)][m].values[0]
second = df[(df['model'] == model2) & (df['file'] == n)][m].values[0]
metrics[m].append(first - second)
fig, axs = plt.subplots(1, 4, figsize=(14.4, 4.8))
for i, (m, ax) in enumerate(zip(metrics, axs)):
print(np.array(metrics[m]).mean())
_ = sns.stripplot(y=metrics[m], ax=ax, alpha=0.7)
if i == 0:
_ = ax.set_ylabel(models_to_plot[model1] + " - " + models_to_plot[model2])
_ = ax.set_xlabel(m_dict[m])
_ = ax.axhline(y=0, color='gray')
_ = ax.set_xticks(ax.get_xticks())
_ = ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')
_ = fig.savefig(os.path.join(base_path, "ulal_diff.pdf"), dpi=300, bbox_inches="tight")


grouped = df[['model', 'plddt', 'perplexity', 'tmscore', 'seq_id']].groupby('model')
final_models = ["gap_1.0_0.05_nom", "indel_1.0_0.05_nom", "ccmgen", "xlstm", "evodiff_nom", "natural"]
metrics = grouped.agg(['mean', 'std']).loc[final_models].reset_index()
for i, row in metrics.iterrows():
print_me = [models_to_plot[row['model'].values[0]]]
for m in ['plddt', 'perplexity', 'tmscore', 'seq_id']:
print_me.append("$%.2f \\pm %.2f$" % (row[m]['mean'], row[m]['std']))
print_me = " & ".join(print_me) + "\\\\"
print(print_me)
grouped.seq_id.agg(['mean', 'std'])
grouped.tmscore.agg(['mean', 'std'])
grouped.plddt.agg(['mean', 'std'])
grouped.perplexity.agg(['mean', 'std'])

models_to_plot = {
"natural": "Natural",
"ccmgen_short": "CCMgen",
"evodiff_nom": "EvoDiff-MSA",
"xlstm": 'Prot-xLSTM',
"gap_1.0_0.05_nom": "Alignment conditioning",
"indel_1.0_0.05_nom": "Homolog conditioning"
}
pal = sns.color_palette()
model_to_hue = {
"natural": "gray",
"ccmgen_short": pal[-4],
"ccmgen": pal[-4],
"xlstm": pal[-1],
"evodiff_nom": pal[-2],
"gap_1.0_0.05_nom": pal[1],
"indel_1.0_0.05_nom": pal[2],
}
cdfs = {
"seq_id": (plt.subplots(), "Sequence Identity"),
"tmscore": (plt.subplots(), "TM score"),
"tmscore": (plt.subplots(), "TM-score"),
"plddt": (plt.subplots(), "pLDDT"),
"perplexity": (plt.subplots(), "scPerplexity"),
}
Expand Down Expand Up @@ -202,3 +223,21 @@
# 6202062 gap plddt 0.896246
# A0A174Z1L0 indel plddt 0.941450 longer than query
# 76841376 indel plddt 0.938367

df.head()
df.columns
pivoted = {}
out_models = {
"gap_1.0_0.05_nom": "aligned", "indel_1.0_0.05_nom": "unaligned", "ccmgen": "CCMgen", "xlstm": "prot-xLSTM", "evodiff_nom": "EvoDiff-MSA", "natural": "natural"
}
ttest_outfile = os.path.join(base_path, "ttest_results.csv")
with open(ttest_outfile, 'w') as f:
f.write("metric,model1,model2,t,p\n")
for m in ['plddt', 'perplexity', 'tmscore', 'seq_id']:
df2 = df.pivot(index='file', columns='model', values=m)
for i, model1 in enumerate(final_models):
for model2 in final_models[i + 1:]:
r = ttest_rel(df2[model1], df2[model2])
f.write(','.join([m, out_models[model1], out_models[model2], str(r.statistic), str(r.pvalue)]) + "\n")
if ttest_rel(df2[model1], df2[model2]).pvalue > 0.05:
print(m, model1, model2)
Loading