3 changes: 2 additions & 1 deletion pyproject.toml
@@ -11,10 +11,11 @@ dependencies = [
"e3nn",
"esm",
"biotite",
"pymol-open-source",
"pymol-open-source-whl>=3.1.0.4",
"scipy",
"pandas",
"numpy",
"rdkit",
"matplotlib",
Comment on lines +14 to 19
Copilot AI Mar 30, 2026

rdkit is added as a required runtime dependency, but it isn’t referenced anywhere in the repo (src/scripts/tests). Since RDKit significantly increases install size and can complicate builds on some platforms, consider removing it until it’s actually needed, or making it an optional extra for ligand-specific workflows.
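If RDKit is kept for upcoming ligand work, one way to act on this suggestion is a PEP 621 optional extra (a sketch against the pyproject.toml above; the extra name `ligand` is hypothetical, not from the repo):

```toml
[project.optional-dependencies]
# Hypothetical extra: installed on demand via `pip install .[ligand]`
ligand = ["rdkit"]
```

This keeps the default install lean while letting ligand-specific workflows pull RDKit in explicitly.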
"pillow",
"loguru",
51 changes: 44 additions & 7 deletions src/dataset.py
@@ -47,16 +47,17 @@ def element_onehot(symbols: list[str]) -> Tensor:

def parse_asu_with_biotite(
path: str,
) -> tuple[bts.AtomArray, bts.AtomArray]:
) -> tuple[bts.AtomArray, bts.AtomArray, bts.AtomArray]:
"""
Comment on lines 48 to 51
Copilot AI Mar 30, 2026

parse_asu_with_biotite() now returns a 3-tuple, which is a breaking API change. There are existing call sites (e.g., embedding generation scripts) that still unpack only 2 values and will crash at runtime unless updated to accept/ignore the new ligand_atoms return value.
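The breakage and its minimal fix can be sketched as follows (the stub below stands in for the real parser; it is illustration only, not repo code):

```python
# Stub standing in for the real parse_asu_with_biotite (illustration only).
def parse_asu_with_biotite(path):
    return "protein_atoms", "water_atoms", "ligand_atoms"

# Old 2-value call sites now raise "too many values to unpack".
try:
    protein, water = parse_asu_with_biotite("1abc.pdb")
except ValueError as exc:
    print(type(exc).__name__)  # ValueError

# Minimal fix: accept and ignore the new ligand return value.
protein, water, _ = parse_asu_with_biotite("1abc.pdb")
print(protein, water)  # protein_atoms water_atoms
```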
Parse PDB file and extract protein and water atoms.
Parse PDB file and extract protein, water, and ligand atoms.

Args:
path: Path to PDB file

Returns:
Tuple of (protein_atoms, water_atoms) as biotite AtomArrays.
Hydrogen atoms are excluded.
Tuple of (protein_atoms, water_atoms, ligand_atoms) as biotite AtomArrays.
Hydrogen atoms are excluded. ligand_atoms contains all non-protein, non-water
HETATM records (small molecules, ions, cofactors).

Notes:
- model=1: Uses first model in PDB (standard for X-ray structures)
@@ -71,11 +71,13 @@ def parse_asu_with_biotite(

protein_mask = bts.filter_amino_acids(atoms)
water_mask = (atoms.res_name == "HOH") | (atoms.res_name == "WAT")
ligand_mask = ~protein_mask & ~water_mask

protein_atoms = atoms[protein_mask]
water_atoms = atoms[water_mask]
ligand_atoms = atoms[ligand_mask]
Comment on lines 73 to +79
Copilot AI Mar 30, 2026

ligand_mask = ~protein_mask & ~water_mask will classify any non-amino-acid, non-water atoms as ligands, including non-protein polymers recorded as ATOM (e.g., DNA/RNA) and other non-hetero records. This contradicts the docstring claim of “non-water HETATM records” and can silently mix non-ligand atoms into the ligand set. Consider restricting ligands to atoms.hetero (and excluding HOH/WAT), or otherwise explicitly encoding the intended selection criteria.

return protein_atoms, water_atoms
return protein_atoms, water_atoms, ligand_atoms
Comment on lines 73 to +81

⚠️ Potential issue | 🟠 Major

Use atoms.hetero flag to properly isolate HETATM ligands.

The docstring explicitly states that ligand_atoms should contain "non-protein, non-water HETATM records", but the current logic ~protein_mask & ~water_mask only uses negation and will also include ATOM records like nucleic acids or other polymers that are neither proteins nor water. This silently breaks the documented contract and changes node counts for mixed complexes.

Biotite's PDB parser preserves the ATOM vs HETATM distinction via the atoms.hetero flag, so the fix is straightforward:

Suggested fix
     protein_mask = bts.filter_amino_acids(atoms)
     water_mask = (atoms.res_name == "HOH") | (atoms.res_name == "WAT")
-    ligand_mask = ~protein_mask & ~water_mask
+    hetero_mask = atoms.hetero.astype(bool)
+    ligand_mask = hetero_mask & ~protein_mask & ~water_mask
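The difference between the two mask formulations can be sketched with toy NumPy stand-ins for the biotite annotations (the atom records below are invented, not from any real PDB entry):

```python
import numpy as np

# Toy stand-ins for the biotite annotations (invented atoms, not a real PDB).
res_name = np.array(["ALA", "GLY", "HOH", "DA", "HEM"])
is_amino = np.array([True, True, False, False, False])   # filter_amino_acids result
hetero   = np.array([False, False, True, False, True])    # True for HETATM records

water_mask = (res_name == "HOH") | (res_name == "WAT")

# Original PR logic: negation alone also sweeps in the DNA ATOM record ("DA").
loose_ligand = ~is_amino & ~water_mask

# Suggested fix: additionally require the HETATM flag.
strict_ligand = hetero & ~is_amino & ~water_mask

print(loose_ligand.tolist())   # [False, False, False, True, True]
print(strict_ligand.tolist())  # [False, False, False, False, True]
```

Only the HEM cofactor survives the stricter mask; the nucleic-acid atom is correctly excluded.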



def get_crystal_contacts_pymol(
@@ -665,6 +668,7 @@ def __init__(
base_pdb_dir: str = "/sb/wankowicz_lab/data/srivasv/pdb_redo_data",
cutoff: float = 8.0,
include_mates: bool = True,
include_ligands: bool = False,
geometry_cache_name: str = "geometry",
preprocess: bool = True,
duplicate_single_sample: int = 1,
@@ -691,8 +695,12 @@ def __init__(
base_pdb_dir: Base directory containing PDB subdirectories
cutoff: Distance cutoff for PP edges and crystal contacts (Angstroms)
include_mates: If True, include symmetry mate atoms as protein nodes
include_ligands: If True, include non-water het atoms (ligands, ions,
cofactors) as protein nodes. They are appended after
protein (and mate) atoms with a boolean is_ligand mask.
geometry_cache_name: Base name for geometry cache directory. When
include_mates=True, "_mates" is appended automatically.
When include_ligands=True, "_lig" is appended.
Default is "geometry", resulting in "geometry/" or
"geometry_mates/" subdirectories.
preprocess: If True, run preprocessing on missing cached files
@@ -720,8 +728,10 @@ def __init__(
"""

self.cache_dir = Path(processed_dir)
# Directory-based separation: geometry/ vs geometry_mates/
# Directory-based separation: geometry/ vs geometry_mates/ vs geometry_lig/ etc.
cache_suffix = "_mates" if include_mates else ""
if include_ligands:
cache_suffix += "_lig"
self.geometry_dir = self.cache_dir / f"{geometry_cache_name}{cache_suffix}"
self.base_pdb_dir = Path(base_pdb_dir)
self.cutoff = cutoff
@@ -731,6 +741,7 @@
else:
self.embedding_dir = None
self.include_mates = include_mates
self.include_ligands = include_ligands
self.duplicate_single_sample = duplicate_single_sample

self.max_com_dist = max_com_dist
@@ -867,7 +878,7 @@ def _preprocess_one(self, entry: dict, cache_path: Path):
"""
pdb_path = str(entry["pdb_path"])

protein_atoms, water_atoms = parse_asu_with_biotite(pdb_path)
protein_atoms, water_atoms, ligand_atoms = parse_asu_with_biotite(pdb_path)

# check inter-chain interactions for multi-chain proteins
chain_valid, chain_reason, _ = check_chain_interactions(
@@ -1046,6 +1057,26 @@ def _preprocess_one(self, entry: dict, cache_path: Path):
final_protein_x = protein_x
final_protein_res_idx = protein_res_idx

# Append ligand atoms after protein (and mate) atoms when enabled.
# is_ligand mask marks which protein-type nodes are ligand atoms.
# Ligands always go last so num_asu_protein and mate counts are unaffected,
# preserving ESM/SLAE embedding alignment via _pad_atom_embeddings_for_mates.
if self.include_ligands and ligand_atoms:
ligand_pos = torch.tensor(ligand_atoms.coord, dtype=torch.float32) - center
ligand_elements = [str(e).upper() for e in ligand_atoms.element]
ligand_x = element_onehot(ligand_elements)
final_protein_pos = torch.cat([final_protein_pos, ligand_pos], dim=0)
final_protein_x = torch.cat([final_protein_x, ligand_x], dim=0)
# Ligand atoms get residue_index = -1 (sentinel; no residue embedding)
ligand_res_idx = torch.full((len(ligand_atoms),), -1, dtype=torch.long)
Comment on lines +1070 to +1071
Copilot AI Mar 30, 2026

Assigning residue_index = -1 for ligand atoms will break any downstream code that uses residue_index for scatter/gather (e.g., residue pooling via torch_scatter), since those ops generally require non-negative indices. Either mask out ligand atoms wherever residue pooling/indexing is used, or map ligands to a valid non-negative index (and update num_residues accordingly) to keep the tensor safe for scatter-based pooling.

Suggested change
-            # Ligand atoms get residue_index = -1 (sentinel; no residue embedding)
-            ligand_res_idx = torch.full((len(ligand_atoms),), -1, dtype=torch.long)
+            # Assign ligands to valid residue indices after existing residues
+            max_res_idx = (
+                final_protein_res_idx.max().item()
+                if final_protein_res_idx.numel() > 0
+                else -1
+            )
+            ligand_res_idx = torch.full(
+                (len(ligand_atoms),),
+                max_res_idx + 1,
+                dtype=torch.long,
+            )
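The alternative in the suggested change — mapping ligands to the next valid residue index rather than -1 — can be sketched with NumPy (names mirror the diff, but the values are invented):

```python
import numpy as np

# Toy residue indices for the existing protein atoms (values invented).
final_protein_res_idx = np.array([0, 0, 1, 1])
n_ligand_atoms = 2

# Next free residue index; the -1 fallback covers an empty protein array.
max_res_idx = int(final_protein_res_idx.max()) if final_protein_res_idx.size else -1
ligand_res_idx = np.full(n_ligand_atoms, max_res_idx + 1)

combined = np.concatenate([final_protein_res_idx, ligand_res_idx])
print(combined.tolist())  # [0, 0, 1, 1, 2, 2]
```

All ligand atoms share one synthetic residue index, so every entry stays non-negative and safe for scatter-style pooling.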
final_protein_res_idx = torch.cat(
[final_protein_res_idx, ligand_res_idx], dim=0
)
is_ligand = torch.zeros(final_protein_pos.size(0), dtype=torch.bool)
is_ligand[-len(ligand_atoms) :] = True
else:
is_ligand = torch.zeros(final_protein_pos.size(0), dtype=torch.bool)
Comment on lines +1070 to +1078

⚠️ Potential issue | 🔴 Critical

Verify that _pool_by_residue masks out ligand atoms before scatter operations.

This change introduces -1 sentinel values in residue_index for ligand atoms, but the scatter operations in _pool_by_residue (lines 276–282 in gvp_encoder.py) cannot handle negative indices. Although pool_residue is currently disabled (pool_residue=False by default), if this feature is enabled in the future, the code will fail when calling scatter_mean, scatter_add, or scatter_max with a mixed residue_index containing -1 values.

Add a mask using is_ligand before the scatter operations:

mask = ~is_ligand  # or retrieve from data
atom_embed_masked = atom_embed[mask]
residue_index_masked = residue_index[mask]
res_embed = scatter_mean(atom_embed_masked, residue_index_masked, dim=0, dim_size=num_residues)

Also verify the same masking is applied at any other scatter/pooling operations keyed by residue_index if pool_residue becomes enabled.
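The masking step the reviewer asks for can be sketched with a NumPy stand-in for scatter_mean (shapes and names are illustrative, not the project's API):

```python
import numpy as np

# Toy atom features and residue indices; -1 marks ligand atoms (invented data).
atom_embed = np.arange(10, dtype=float).reshape(5, 2)   # 5 atoms, 2 features
residue_index = np.array([0, 0, 1, -1, -1])
is_ligand = residue_index < 0

# Mask ligands out BEFORE any scatter keyed by residue_index.
mask = ~is_ligand
idx = residue_index[mask]
emb = atom_embed[mask]

# Mean-pool per residue: scatter-add then divide by per-residue counts.
num_residues = int(idx.max()) + 1
sums = np.zeros((num_residues, 2))
np.add.at(sums, idx, emb)                               # scatter-add analogue
counts = np.bincount(idx, minlength=num_residues)[:, None]
res_embed = sums / counts

print(res_embed.tolist())  # [[1.0, 2.0], [4.0, 5.0]]
```

Without the mask, the -1 entries would index the last row of the output buffer (NumPy) or raise (torch_scatter), silently or loudly corrupting the pooled embeddings.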


# Compute PP edges and features
if final_protein_pos.size(0) > 0:
pp_edge_index = radius_graph(final_protein_pos, r=self.cutoff, loop=False)
@@ -1071,6 +1102,7 @@ def _preprocess_one(self, entry: dict, cache_path: Path):
"protein_pos": final_protein_pos,
"protein_x": final_protein_x,
"protein_res_idx": final_protein_res_idx,
"is_ligand": is_ligand,
"water_pos": water_pos,
"water_x": water_x,
# PP topology and features (precomputed)
@@ -1164,6 +1196,10 @@ def __getitem__(self, idx: int) -> HeteroData:
protein_pos = cached["protein_pos"]
protein_x = cached["protein_x"]
protein_res_idx = cached["protein_res_idx"]
# is_ligand may be absent in older caches (backward compat: all False)
is_ligand = cached.get(
"is_ligand", torch.zeros(protein_pos.size(0), dtype=torch.bool)
)
pp_edge_index = cached["pp_edge_index"]
pp_edge_unit_vectors = cached["pp_edge_unit_vectors"]
pp_edge_rbf = cached["pp_edge_rbf"]
@@ -1185,6 +1221,7 @@ def __getitem__(self, idx: int) -> HeteroData:
data["protein"].x = protein_x
data["protein"].pos = protein_pos
data["protein"].residue_index = protein_res_idx
data["protein"].is_ligand = is_ligand
data["protein"].num_nodes = protein_pos.size(0)
data["protein"].num_residues = num_residues
data["protein"].num_protein_residues = num_protein_residues
30 changes: 27 additions & 3 deletions src/encoder_base.py
@@ -15,6 +15,8 @@
import torch
import torch.nn as nn

from src.constants import NODE_FEATURE_DIM


if TYPE_CHECKING:
from torch_geometric.data import HeteroData
@@ -177,6 +179,15 @@ def __init__(
self._embedding_dim: int | None = embedding_dim
self._embedding_key = embedding_key
self._encoder_type = encoder_type
# Learnable projection for ligand atoms (element one-hot -> embedding space).
# Ligands have no ESM/SLAE embeddings; this replaces zero-padding with a
# learned representation parameterized only by element type.
# Lazily initialized on first forward when embedding_dim becomes known.
self.ligand_embed: nn.Linear | None = (
nn.Linear(NODE_FEATURE_DIM, embedding_dim, bias=False)
if embedding_dim is not None
else None
)
Comment on lines +182 to +190

⚠️ Potential issue | 🟠 Major

Move ligand_embed initialization to __init__.

The module is created inside forward() after the optimizer has already captured model parameters. Since PyTorch optimizers iterate over model.parameters() at construction time, any submodules registered later in forward() are not included in param_groups and will not be updated. Similarly, DDP and FSDP will not replicate or synchronize these parameters. Move the nn.Linear initialization to __init__, or call optimizer.add_param_group() with the new parameters before training (though full DDP replication may still fail without explicit hooks).

Affects lines 182–190 (declaration) and 237–242 (forward instantiation).
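The failure mode — parameters created after the optimizer snapshot — can be illustrated without PyTorch (the classes below are toys standing in for nn.Module and torch.optim, not real torch APIs):

```python
# Toy illustration (plain Python, not torch): an optimizer snapshots the
# parameter list at construction, so anything a module registers later in
# forward() is invisible to it.
class Model:
    def __init__(self):
        self.params = {"weight": 1.0}          # registered up front

    def forward(self):
        # Lazily registered parameter, mimicking the ligand_embed pattern.
        self.params.setdefault("ligand_embed", 0.5)

class Optimizer:
    def __init__(self, params):
        self.tracked = list(params)            # snapshot, like torch.optim

model = Model()
opt = Optimizer(model.params)                  # captures only "weight"
model.forward()                                # registers ligand_embed too late

print(opt.tracked)                             # ['weight']
print(sorted(model.params))                    # ['ligand_embed', 'weight']
```

The model ends up holding a parameter the optimizer never saw, which is exactly why the review asks for construction-time registration.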


@property
def output_dims(self) -> tuple[int, int]:
@@ -203,13 +214,15 @@ def forward(
"""
Read cached embeddings and return (s, V, None).

On first call, infers embedding dimension from the data.
On first call, infers embedding dimension from the data. If ligand atoms
are present (data['protein'].is_ligand), their zero-padded embedding rows
are replaced with a learned projection from element one-hot features.

Args:
data: HeteroData with cached embeddings in data['protein']

Returns:
s: (N, embedding_dim) — raw embeddings
s: (N, embedding_dim) — embeddings (ligand rows via learned projection)
V: (N, 0, 3) — empty vector features
pp_edge_attr: None — cached embedding encoders don't process edges
"""
@@ -221,9 +234,20 @@ def forward(

embeddings = data["protein"][self._embedding_key]

# Infer dimension on first forward
# Infer dimension on first forward and lazily init ligand head
if self._embedding_dim is None:
self._embedding_dim = embeddings.size(-1)
self.ligand_embed = nn.Linear(
NODE_FEATURE_DIM, self._embedding_dim, bias=False
).to(embeddings.device)

# Replace zero-padded ligand rows with learned element projection
lig_mask = getattr(data["protein"], "is_ligand", None)
if lig_mask is not None and lig_mask.any():
embeddings = embeddings.clone()
embeddings[lig_mask] = self.ligand_embed(
data["protein"].x[lig_mask].to(embeddings.device)
)

V = embeddings.new_empty(embeddings.size(0), 0, 3)
return embeddings, V, None
6 changes: 6 additions & 0 deletions tests/conftest.py
@@ -67,6 +67,12 @@ def pdb_1deu():
return _resolve_pdb_path("1deu")


@pytest.fixture
def pdb_4h0b():
"""4h0b - has non-water ligand HETATMs for ligand support tests."""
return _resolve_pdb_path("4h0b")


# ============== Shared encoder fixtures ==============

