Skip to content

Commit 28cd6ac

Browse files
committed
Save progress. Average distance working.
1 parent 62c4adf commit 28cd6ac

File tree

6 files changed

+61
-99
lines changed

6 files changed

+61
-99
lines changed

README.md

+6-2
Original file line numberDiff line numberDiff line change
@@ -162,7 +162,9 @@ We provide scripts for analysing JAMUN and original MD trajectories in [https://
162162

163163
## Data Generation
164164

165-
We also provide scripts for generating the MD simulation data with [OpenMM](https://openmm.org/), including energy minimization and calibration steps with NVT and NPT ensembles.
165+
### Running Molecular Dynamics with OpenMM
166+
167+
We provide scripts for generating MD simulation data with [OpenMM](https://openmm.org/), including energy minimization and calibration steps with NVT and NPT ensembles.
166168

167169
```bash
168170
python scripts/generate_data/run_simulation.py [INIT_PDB]
@@ -171,7 +173,9 @@ python scripts/generate_data/run_simulation.py [INIT_PDB]
171173
The defaults correspond to our setup for the capped diamines.
172174
Please run this script with the `-h` flag to see all simulation parameters.
173175

174-
## Preprocessing
176+
### Preprocessing
177+
178+
Some datasets require preprocessing for easier consumption, e.g. the MDGen data:
175179

176180
```bash
177181
source .env

src/jamun/cmdline/train.py

+3-4
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
import jamun
1818
from jamun.hydra import instantiate_dict_cfg
1919
from jamun.hydra.utils import format_resolver
20-
from jamun.utils import compute_average_squared_distance_from_data, dist_log, find_checkpoint
20+
from jamun.utils import compute_average_squared_distance_from_datasets, dist_log, find_checkpoint
2121

2222
dotenv.load_dotenv(".env", verbose=True)
2323
OmegaConf.register_new_resolver("format", format_resolver)
@@ -27,10 +27,9 @@ def compute_average_squared_distance_from_config(cfg: OmegaConf) -> float:
2727
"""Computes the average squared distance for normalization from the data."""
2828
datamodule = hydra.utils.instantiate(cfg.data.datamodule)
2929
datamodule.setup("compute_normalization")
30-
train_dataloader = datamodule.train_dataloader()
30+
train_datasets = datamodule.datasets["train"]
3131
cutoff = cfg.model.max_radius
32-
average_squared_distance = compute_average_squared_distance_from_data(train_dataloader, cutoff, cfg.trainer)
33-
average_squared_distance = float(average_squared_distance)
32+
average_squared_distance = compute_average_squared_distance_from_datasets(train_datasets, cutoff)
3433
return average_squared_distance
3534

3635

src/jamun/data/_utils.py

+5-1
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
import collections
22
import os
33
import re
4-
import random
54
from typing import List, Optional, Sequence
65

6+
import pandas as pd
77
import hydra
88
import requests
99
import torch
@@ -124,6 +124,7 @@ def parse_datasets_from_directory_new(
124124
max_datasets: Optional[int] = None,
125125
max_datasets_offset: Optional[int] = None,
126126
filter_codes: Optional[Sequence[str]] = None,
127+
split_csv: Optional[str] = None,
127128
as_iterable: bool = False,
128129
**dataset_kwargs,
129130
) -> List[MDtrajDataset]:
@@ -170,6 +171,9 @@ def parse_datasets_from_directory_new(
170171
pdb_files[code] = pdb_file
171172

172173
# Filter out codes
174+
if split_csv is not None:
175+
filter_codes = pd.read_csv("train.csv")["entry"].tolist()
176+
173177
if filter_codes is not None:
174178
codes = [code for code in codes if code in set(filter_codes)]
175179

src/jamun/metrics/_save_trajectory.py

+13-13
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ def __init__(self, save_true_trajectory: bool = False, *args, **kwargs):
2525
for ext in self.true_samples_extensions:
2626
os.makedirs(os.path.join(self.true_samples_dir, ext), exist_ok=True)
2727

28-
self.pred_samples_extensions = ["npy", "pdb", "dcd"]
28+
self.pred_samples_extensions = ["pdb", "dcd"]
2929
for ext in self.pred_samples_extensions:
3030
os.makedirs(os.path.join(self.pred_samples_dir, ext), exist_ok=True)
3131

@@ -69,29 +69,29 @@ def on_sample_end(self):
6969
label = self.dataset.label()
7070
label = label.replace("/", "_").replace("=", "-")
7171

72-
for ext in ["npy", "pdb", "dcd"]:
73-
filename = self.filename_pred("joined", ext)
74-
artifact = wandb.Artifact(f"{label}_pred_samples_joined", type="pred_samples_joined")
75-
artifact.add_file(filename, f"pred_samples_joined.{ext}")
76-
wandb.log_artifact(artifact)
72+
# for ext in self.pred_samples_extensions:
73+
# filename = self.filename_pred("joined", ext)
74+
# artifact = wandb.Artifact(f"{label}_pred_samples_joined", type="pred_samples_joined")
75+
# artifact.add_file(filename, f"pred_samples_joined.{ext}")
76+
# wandb.log_artifact(artifact)
7777

7878
def compute(self) -> Dict[str, float]:
7979
# Save the predicted samples as numpy files.
80-
samples_np = self.sample_tensors(new=True).cpu().detach().numpy()
81-
for trajectory_index, sample in enumerate(samples_np):
82-
np.save(self.filename_pred(trajectory_index, "npy"), sample)
80+
# samples_np = self.sample_tensors(new=True).cpu().detach().numpy()
81+
# for trajectory_index, sample in enumerate(samples_np):
82+
# np.save(self.filename_pred(trajectory_index, "npy"), sample)
8383

84-
samples_joined_np = self.joined_sample_tensor().cpu().detach().numpy()
85-
np.save(self.filename_pred("joined", "npy"), samples_joined_np)
84+
# samples_joined_np = self.joined_sample_tensor().cpu().detach().numpy()
85+
# np.save(self.filename_pred("joined", "npy"), samples_joined_np)
8686

8787
# Save the predict sample trajectory as a PDB and DCD file.
8888
pred_trajectories = self.sample_trajectories(new=True)
8989
for trajectory_index, pred_trajectory in enumerate(pred_trajectories, start=self.num_chains_seen):
90-
utils.save_pdb(pred_trajectory, self.filename_pred(trajectory_index, "pdb"))
90+
# utils.save_pdb(pred_trajectory, self.filename_pred(trajectory_index, "pdb"))
9191
pred_trajectory.save_dcd(self.filename_pred(trajectory_index, "dcd"))
9292

9393
pred_trajectory_joined = self.joined_sample_trajectory()
94-
utils.save_pdb(pred_trajectory_joined, self.filename_pred("joined", "pdb"))
94+
# utils.save_pdb(pred_trajectory_joined, self.filename_pred("joined", "pdb"))
9595
pred_trajectory_joined.save_dcd(self.filename_pred("joined", "dcd"))
9696

9797
return {}

src/jamun/utils/__init__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
from .align import align_A_to_B, align_A_to_B_batched
2-
from .average_squared_distance import compute_average_squared_distance, compute_average_squared_distance_from_data
2+
from .average_squared_distance import compute_average_squared_distance, compute_average_squared_distance_from_datasets
33
from .checkpoint import find_checkpoint, find_checkpoint_directory, get_wandb_run_config
44
from .data_with_residue_info import DataWithResidueInformation
55
from .dist_log import dist_log, wandb_dist_log

src/jamun/utils/average_squared_distance.py

+33-78
Original file line numberDiff line numberDiff line change
@@ -112,45 +112,6 @@ def compute_final_statistics(self):
112112
return average_squared_distance
113113

114114

115-
def compute_average_squared_distance_from_data(
116-
datamodule: pl.LightningDataModule,
117-
cutoff: float,
118-
trainer_cfg: Dict[str, Any],
119-
num_estimation_graphs: int = 5000,
120-
verbose: bool = False
121-
):
122-
"""Compute normalization using a Lightning trainer.
123-
124-
Args:
125-
datamodule: The Lightning datamodule
126-
cutoff (float): The radius cutoff for distance calculations
127-
compute_average_squared_distance_fn (callable): Function to compute average
128-
squared distance for a graph
129-
trainer_cfg: Configuration for the Lightning trainer
130-
num_estimation_graphs (int): Maximum number of graphs to process
131-
verbose (bool): Whether to print detailed statistics
132-
133-
Returns:
134-
float: The computed average squared distance
135-
"""
136-
137-
# Create the normalization module
138-
norm_module = ComputeNormalizationModule(
139-
cutoff=cutoff,
140-
num_estimation_graphs=num_estimation_graphs,
141-
verbose=verbose
142-
)
143-
144-
# Create the trainer
145-
trainer = hydra.utils.instantiate(trainer_cfg)
146-
147-
# Fit without any callbacks or loggers
148-
trainer.fit(norm_module, datamodule=datamodule)
149-
150-
# Compute and return the final statistics
151-
return norm_module.compute_final_statistics()
152-
153-
154115
def compute_distance_matrix(x: np.ndarray, cutoff: Optional[float] = None) -> np.ndarray:
155116
"""Computes the distance matrix between points in x, ignoring self-distances."""
156117
if x.shape[-1] != 3:
@@ -177,42 +138,36 @@ def compute_average_squared_distance(x: np.ndarray, cutoff: Optional[float] = No
177138
return np.mean(dist_x**2)
178139

179140

180-
# def compute_average_squared_distance_from_data(
181-
# dataloader: torch.utils.data.DataLoader,
182-
# cutoff: float,
183-
# num_estimation_graphs: int = 5000,
184-
# verbose: bool = False,
185-
# ) -> float:
186-
# """Computes the average squared distance for normalization."""
187-
# avg_sq_dists = collections.defaultdict(list)
188-
# num_graphs = 0
189-
# for batch in dataloader:
190-
# for graph in batch.to_data_list():
191-
# pos = np.asarray(graph.pos)
192-
# avg_sq_dist = compute_average_squared_distance(pos, cutoff=cutoff)
193-
# avg_sq_dists[graph.dataset_label].append(avg_sq_dist)
194-
# num_graphs += 1
195-
196-
# if num_graphs >= num_estimation_graphs:
197-
# break
198-
199-
# mean_avg_sq_dist = sum(np.sum(avg_sq_dists[label]) for label in avg_sq_dists) / num_graphs
200-
# utils.dist_log(f"Mean average squared distance = {mean_avg_sq_dist:0.3f} nm^2")
201-
202-
# if verbose:
203-
# utils.dist_log(f"For cutoff {cutoff} nm:")
204-
# for label in sorted(avg_sq_dists):
205-
# utils.dist_log(
206-
# f"- Dataset {label}: Average squared distance = {np.mean(avg_sq_dists[label]):0.3f} +- {np.std(avg_sq_dists[label]):0.3f} nm^2"
207-
# )
208-
209-
# # Average across all processes, if distributed.
210-
# print("torch.distributed.is_initialized():", torch.distributed.is_initialized())
211-
# mean_avg_sq_dist = torch.tensor(mean_avg_sq_dist, device="cuda")
212-
213-
# print("mean_avg_sq_dist bef:", mean_avg_sq_dist)
214-
# torch.distributed.all_reduce(mean_avg_sq_dist, op=torch.distributed.ReduceOp.AVG)
215-
# mean_avg_sq_dist = mean_avg_sq_dist.item()
216-
# print("mean_avg_sq_dist aft:", mean_avg_sq_dist)
217-
218-
# return mean_avg_sq_dist
141+
def compute_average_squared_distance_from_datasets(
142+
datasets: Sequence[torch.utils.data.Dataset],
143+
cutoff: float,
144+
num_estimation_datasets: int = 50,
145+
num_estimation_graphs_per_dataset: int = 100,
146+
verbose: bool = False,
147+
) -> float:
148+
"""Computes the average squared distance for normalization."""
149+
avg_sq_dists = collections.defaultdict(list)
150+
151+
for dataset in datasets[:num_estimation_datasets]:
152+
num_graphs = 0
153+
154+
for graph in dataset:
155+
pos = np.asarray(graph.pos)
156+
avg_sq_dist = compute_average_squared_distance(pos, cutoff=cutoff)
157+
avg_sq_dists[graph.dataset_label].append(avg_sq_dist)
158+
num_graphs += 1
159+
160+
if num_graphs >= num_estimation_graphs_per_dataset:
161+
break
162+
163+
mean_avg_sq_dist = sum(np.sum(avg_sq_dists[label]) for label in avg_sq_dists) / num_graphs
164+
utils.dist_log(f"Mean average squared distance = {mean_avg_sq_dist:0.3f} nm^2")
165+
166+
if verbose:
167+
utils.dist_log(f"For cutoff {cutoff} nm:")
168+
for label in sorted(avg_sq_dists):
169+
utils.dist_log(
170+
f"- Dataset {label}: Average squared distance = {np.mean(avg_sq_dists[label]):0.3f} +- {np.std(avg_sq_dists[label]):0.3f} nm^2"
171+
)
172+
173+
return float(mean_avg_sq_dist)

0 commit comments

Comments
 (0)