microsoft · sarahalamdari · Jul 11, 2025 · Apr 23, 2025 · Jul 2, 2025 · Jul 2, 2025
diff --git a/analysis/compile_cas9_fidelity.py b/analysis/compile_cas9_fidelity.py
@@ -1,18 +1,24 @@
+from tqdm import tqdm
 import os
 
 from Bio.Align import PairwiseAligner, substitution_matrices
 from sequence_models.utils import parse_fasta
-from tqdm import tqdm
 
-from dayhoff.analysis_utils import get_all_paths, results_to_pandas
 
-base_path = "/home/kevyan/generations/cas9/"
+base_path = "/home/kevyan/generations/cas9-no-order/"
 
-models = ["short_cas9s_1.0_minp0.00_new"]
-for m in models:
-    pdb_paths, mpnn_paths = get_all_paths(os.path.join(base_path, "%s_structures/pdb/esmfold/" %m), os.path.join(base_path, "%s_structures/esmfoldmpnn_iftemp_1" %m))
-    fold_df, mpnn_df, df = results_to_pandas(pdb_paths, mpnn_paths, name="")
-    df['model'] = m
+model = "short_cas9s_1.0_minp0.00_new"
+folding_df = pd.read_csv(os.path.join(base_path, 'esmfold_proteinmpnn_merge_data.csv'))
+seqs, names = parse_fasta(os.path.join(base_path, "%s.fasta" % model), return_names=True)
+df = folding_df[folding_df['if_temp'] == 1.0]
+name_df = pd.DataFrame()
+name_df['sequence'] = seqs
+name_df['file'] = names
+df = pd.merge(name_df, df, how='left', on='file')
+# for m in models:
+#     pdb_paths, mpnn_paths = get_all_paths(os.path.join(base_path, "%s_structures/pdb/esmfold/" %m), os.path.join(base_path, "%s_structures/esmfoldmpnn_iftemp_1" %m))
+#     fold_df, mpnn_df, df = results_to_pandas(pdb_paths, mpnn_paths, name="")
+#     df['model'] = m
 
 aligner = PairwiseAligner()
 aligner.substitution_matrix = substitution_matrices.load("BLOSUM62")
@@ -22,41 +28,43 @@
 aligner.query_end_gap_score = 0.0
 with tqdm(total=len(df)) as pbar:
     homologs, homolog_names = parse_fasta(os.path.join('/home/kevyan/data/characterized_cas9s', "naturals.fasta"), return_names=True)
+    for idx, row in df.iterrows():
+        s = row['sequence']
+        s = s.replace("-", "")
+        s = s.replace("<mask2>", "")
+        s = s.replace("<mask1>", "")
+        s = s.replace("<mask3>", "")
+        s = s.replace("<eos>", "")
+        best_matches = -1
+        best_homolog_sequence = None
+        best_homolog_name = None
+        best_cterm_gaps = None
+        for hs, hn in zip(homologs, homolog_names):
+            alignment = aligner.align(s, hs)
+            if alignment.score > best_matches:
+                best_matches = alignment.score
+                best_homolog_sequence = hs
+                best_homolog_name = hn
+                best_cterm_gaps = len(hs) - alignment[0].aligned[1, -1, 1]
+        df.loc[idx, 'gen_length'] = len(s)
+        df.loc[idx, 'best_matches'] = best_matches
+        df.loc[idx, 'match_length'] = len(best_homolog_sequence)
+        df.loc[idx, 'homolog_name'] = best_homolog_name
+        df.loc[idx, 'homolog_sequence'] = best_homolog_sequence
+        df.loc[idx, 'cterm_gaps'] = best_cterm_gaps
+        pbar.update(1)
 
-    for model in models:
-        seqs, names = parse_fasta(os.path.join(base_path, "%s.fasta" %model), return_names=True)
-        for s, n in zip(seqs, names):
-            s = s.replace("-", "")
-            s = s.replace("<mask2>", "")
-            s = s.replace("<mask1>", "")
-            s = s.replace("<mask3>", "")
-            s = s.replace("<eos>", "")
-            best_matches = -1
-            best_homolog_sequence = None
-            best_homolog_name = None
-            best_cterm_gaps = None
-            for hs, hn in zip(homologs, homolog_names):
-                alignment = aligner.align(s, hs)
-                if alignment.score > best_matches:
-                    best_matches = alignment.score
-                    best_homolog_sequence = hs
-                    best_homolog_name = hn
-                    best_cterm_gaps = len(hs) - alignment[0].aligned[1, -1, 1]
-
-            idx = df[(df['model'] == model) & (df['file'] == n)].index[0]
-            df.loc[idx, 'sequence'] = s
-            df.loc[idx, 'gen_length'] = len(s)
-            df.loc[idx, 'best_matches'] = best_matches
-            df.loc[idx, 'match_length'] = len(best_homolog_sequence)
-            df.loc[idx, 'homolog_name'] = best_homolog_name
-            df.loc[idx, 'homolog_sequence'] = best_homolog_sequence
-            df.loc[idx, 'cterm_gaps'] = best_cterm_gaps
-            pbar.update(1)
-
+df['plddt'] = df['esmfoldplddt']
+df['scperplexity'] = df['proteinmpnnperplexity']
 df['seq_id'] = df['best_matches'] / df['gen_length']
 df = df.sort_values(['cterm_gaps', 'plddt'], ascending=[True, False])
 df['name'] = [f.split('_')[-1] for f in df['file']]
-df.to_csv(os.path.join(base_path, "%s_fidelity.csv" %models[0]), index=False)
+df.to_csv(os.path.join(base_path, "%s_fidelity.csv" %model), index=False)
+
+df = pd.read_csv(os.path.join(base_path, "%s_fidelity.csv" %model))
 
 df[df['plddt'] > .70].head(10)[['name', 'match_length', 'gen_length', 'plddt', 'cterm_gaps', 'best_matches']]
-# 52, 8, and 50 have the most domain hits
+# 52, 8, and 50 have the most domain hits
+df[df['plddt'] > 0.7].shape
+df.loc[[0, 1, 2, 18, 19, 21], ['name', 'sequence']].values
+df.loc[[0, 1, 2, 18, 19, 21], ['name', 'homolog_name', 'homolog_sequence']].values
diff --git a/analysis/compile_msa_fidelity.py b/analysis/compile_msa_fidelity.py
@@ -1,15 +1,17 @@
 import os
 from collections import Counter
+from tqdm import tqdm
 
 import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
 import seaborn as sns
 from Bio.Align import PairwiseAligner
 from scipy.stats import pearsonr
-from sequence_models.utils import parse_fasta
-from tqdm import tqdm
+from scipy.stats import ttest_rel
 
+from dayhoff.analysis_utils import results_to_pandas, get_all_paths, run_tmscore
+from sequence_models.utils import parse_fasta
 from dayhoff.analysis_utils import get_all_paths, results_to_pandas, run_tmscore
 
 sns.set_theme(font_scale=1.2)
@@ -18,17 +20,16 @@
 base_path = "/home/kevyan/generations/queries_from_homologs"
 
 models = ["natural", "xlstm", "evodiff", "evodiff_nom",
-          "gap_1.0_0.01", "gap_1.2_0.01", "gap_1.0_0.00", "gap_1.1_0.05", "gap_1.0_0.00_nom", "gap_1.0_0.05_nom",
-          "ccmgen", "ccmgen_short",
-          "indel_1.0_0.00", "indel_1.2_0.01", "indel_1.0_0.01", "indel_1.1_0.05", "indel_1.0_0.00_nom", "indel_1.0_0.05_nom"]
+          "gap_1.0_0.05_nom",
+          "ccmgen",
+          "indel_1.0_0.05_nom"]
 raw_dfs = []
 for m in models:
     pdb_paths, mpnn_paths = get_all_paths(os.path.join(base_path, "%s_structures/pdb/esmfold/" %m), os.path.join(base_path, "%s_structures/esmfoldmpnn_iftemp_1" %m))
     fold_df, mpnn_df, merged_df = results_to_pandas(pdb_paths, mpnn_paths, name="")
     merged_df['model'] = m
     raw_dfs.append(merged_df)
 df = pd.concat(raw_dfs, ignore_index=True)
-
 model_to_name = {m: m for m in models}
 model_to_name['natural'] = 'queries'
 model_to_name['gap_1.0_0.01'] = '3b-cooled_25000_gap_t1.0_0.01'
@@ -60,11 +61,12 @@
             gen_length = len(s)
             n_homologs = len(homologs)
             best_id = -1
+            if model == 'natural':
+                homologs = homologs[1:]
             for i, homolog in enumerate(homologs):
                 if i == 0:
                     query_length = len(homolog)
-                    if model == 'natural':
-                        continue
+
                 alignment = aligner.align(s, homolog)
                 if alignment.score > best_matches:
                     best_matches = alignment.score
@@ -100,56 +102,75 @@
 len(set(df['file']))
 df.to_csv(os.path.join(base_path, "compiled_fidelities.csv"), index=False)
 df = pd.read_csv(os.path.join(base_path, "compiled_fidelities.csv"))
+models_to_plot = {
+    "natural": "Natural",
+    "ccmgen": "CCMgen",
+    "evodiff_nom": "EvoDiff-MSA",
+    "xlstm": 'Prot-xLSTM',
+        "gap_1.0_0.05_nom": "Aligned",
+        "indel_1.0_0.05_nom": "Unaligned"
+}
 
-
-
-print("model R(n_homologs, plddt)")
-for model in models:
-    df_lim = df[df['model'] == model]
-    print(model, pearsonr(df_lim['n_homologs'], df_lim['plddt']).statistic)
-
-print("model R(n_homologs, seq_id)")
-for model in models:
-    df_lim = df[df['model'] == model]
-    print(model, pearsonr(df_lim['n_homologs'], df_lim['seq_id']).statistic)
-
-print("model R(n_homologs, tmscore)")
-for model in models[1:]:
-    df_lim = df[df['model'] == model]
-    print(model, pearsonr(df_lim['n_homologs'], df_lim['tmscore']).statistic)
-
-print("model R(seq_id, tmscore)")
-for model in models[1:]:
-    df_lim = df[df['model'] == model]
-    print(model, pearsonr(df_lim['seq_id'], df_lim['tmscore']).statistic)
-
-
-grouped = df.groupby('model')
+model1 = "indel_1.0_0.05_nom"
+model2 = "gap_1.0_0.05_nom"
+df.columns
+all_names = list(set(df['file']))
+metrics = {
+    "plddt": [],
+    "perplexity": [],
+    "tmscore": [],
+    "seq_id": []
+}
+m_dict = {
+    "plddt": "pLDDT",
+    "perplexity": "scPerplexity",
+    "tmscore": "TM score",
+    "seq_id": "Sequence identity"
+}
+for n in all_names:
+    for m in metrics:
+        first = df[(df['model'] == model1) & (df['file'] == n)][m].values[0]
+        second = df[(df['model'] == model2) & (df['file'] == n)][m].values[0]
+        metrics[m].append(first - second)
+fig, axs = plt.subplots(1, 4, figsize=(14.4, 4.8))
+for i, (m, ax) in enumerate(zip(metrics, axs)):
+    print(np.array(metrics[m]).mean())
+    _ = sns.stripplot(y=metrics[m], ax=ax, alpha=0.7)
+    if i == 0:
+        _ = ax.set_ylabel(models_to_plot[model1] + " - " + models_to_plot[model2])
+    _ = ax.set_xlabel(m_dict[m])
+    _ = ax.axhline(y=0, color='gray')
+    _ = ax.set_xticks(ax.get_xticks())
+    _ = ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')
+_ = fig.savefig(os.path.join(base_path, "ulal_diff.pdf"), dpi=300, bbox_inches="tight")
+
+
+grouped = df[['model', 'plddt', 'perplexity', 'tmscore', 'seq_id']].groupby('model')
+final_models = ["gap_1.0_0.05_nom", "indel_1.0_0.05_nom", "ccmgen", "xlstm", "evodiff_nom", "natural"]
+metrics = grouped.agg(['mean', 'std']).loc[final_models].reset_index()
+for i, row in metrics.iterrows():
+    print_me = [models_to_plot[row['model'].values[0]]]
+    for m in ['plddt', 'perplexity', 'tmscore', 'seq_id']:
+        print_me.append("$%.2f \\pm %.2f$" % (row[m]['mean'], row[m]['std']))
+    print_me = " & ".join(print_me) + "\\\\"
+    print(print_me)
 grouped.seq_id.agg(['mean', 'std'])
 grouped.tmscore.agg(['mean', 'std'])
 grouped.plddt.agg(['mean', 'std'])
 grouped.perplexity.agg(['mean', 'std'])
 
-models_to_plot = {
-    "natural": "Natural",
-    "ccmgen_short": "CCMgen",
-    "evodiff_nom": "EvoDiff-MSA",
-    "xlstm": 'Prot-xLSTM',
-        "gap_1.0_0.05_nom": "Alignment conditioning",
-        "indel_1.0_0.05_nom": "Homolog conditioning"
-}
 pal = sns.color_palette()
 model_to_hue = {
     "natural": "gray",
-    "ccmgen_short": pal[-4],
+    "ccmgen": pal[-4],
     "xlstm": pal[-1],
     "evodiff_nom": pal[-2],
     "gap_1.0_0.05_nom": pal[1],
     "indel_1.0_0.05_nom": pal[2],
 }
 cdfs = {
     "seq_id": (plt.subplots(), "Sequence Identity"),
-    "tmscore": (plt.subplots(), "TM score"),
+    "tmscore": (plt.subplots(), "TM-score"),
     "plddt": (plt.subplots(), "pLDDT"),
     "perplexity": (plt.subplots(), "scPerplexity"),
 }
@@ -202,3 +223,21 @@
 # 6202062 gap plddt 0.896246
 # A0A174Z1L0 indel plddt 0.941450 longer than query
 # 76841376 indel plddt 0.938367
+
+df.head()
+df.columns
+pivoted = {}
+out_models = {
+"gap_1.0_0.05_nom": "aligned", "indel_1.0_0.05_nom": "unaligned", "ccmgen": "CCMgen", "xlstm": "prot-xLSTM", "evodiff_nom": "EvoDiff-MSA", "natural": "natural"
+}
+ttest_outfile = os.path.join(base_path, "ttest_results.csv")
+with open(ttest_outfile, 'w') as f:
+    f.write("metric,model1,model2,t,p\n")
+    for m in ['plddt', 'perplexity', 'tmscore', 'seq_id']:
+        df2 = df.pivot(index='file', columns='model', values=m)
+        for i, model1 in enumerate(final_models):
+            for model2 in final_models[i + 1:]:
+                r = ttest_rel(df2[model1], df2[model2])
+                f.write(','.join([m, out_models[model1], out_models[model2], str(r.statistic), str(r.pvalue)]) + "\n")
+            if ttest_rel(df2[model1], df2[model2]).pvalue > 0.05:
+                print(m, model1, model2)