openforcefield · Yoshanuikabundi · Feb 19, 2025 · Feb 19, 2025 · Feb 19, 2025
diff --git a/devtools/conda-envs/test_env.yaml b/devtools/conda-envs/test_env.yaml
@@ -9,6 +9,7 @@ dependencies:
   - rdkit
   - openmm
   - pyxdg
+  - networkx
 
   # Lints
   - basedpyright

diff --git a/openff/pablo/_pdb_data.py b/openff/pablo/_pdb_data.py
@@ -9,6 +9,8 @@
 from os import PathLike
 from typing import IO, Any, DefaultDict, Self
 
+import networkx
+
 from ._utils import __UNSET__, dec_hex, int_or_none, with_neighbours
 from .exceptions import (
     UnknownOrAmbiguousSerialInConectError,
@@ -368,7 +370,7 @@
 
         yield tuple(indices)
 
-    def subset_matches_residue(
+    def subset_matches_residue_names(
         self,
         res_atom_idcs: Sequence[int],
         residue_definition: ResidueDefinition,
@@ -482,6 +484,44 @@
             logging.debug("    Match failed: Missing atoms do not specify link")
             return None
 
+    def subset_matches_residue_conect(
+        self,
+        res_atom_idcs: Sequence[int],
+        residue_definition: ResidueDefinition,
+    ) -> ResidueMatch | None:
+        conect_graph = networkx.Graph()
+        for a in res_atom_idcs:
+            conect_graph.add_node(a, elem_charge=(self.element[a], self.charge[a]))
+            for b in self.conects[a]:
+                if b in res_atom_idcs:
+                    conect_graph.add_node(
+                        b,
+                        elem_charge=(self.element[b], self.charge[b]),
+                    )
+                    conect_graph.add_edge(a, b)
+        resdef_graph = residue_definition.to_graph()
+        networkx.set_node_attributes(
+            resdef_graph,
+            {
+                atom: {"elem_charge": (atom.symbol, atom.charge)}
+                for atom in resdef_graph.nodes
+            },
+        )
+
+        isomorphism = networkx.vf2pp_isomorphism(
+            conect_graph,
+            resdef_graph,
+            node_label="elem_charge",
+        )
+        if isomorphism is None:
+            return None
+        else:
+            return ResidueMatch(
+                residue_definition=residue_definition,
+                index_to_atomdef=isomorphism,
+                crosslink=None,
+            )
+
     def get_residue_matches(
         self,
         residue_database: Mapping[str, Iterable[ResidueDefinition]],
@@ -503,17 +543,27 @@
 
             residue_matches: list[ResidueMatch] = []
             for residue_definition in residue_database.get(res_name, []):
-                match = self.subset_matches_residue(
+                match = self.subset_matches_residue_names(
                     res_atom_idcs,
                     residue_definition,
                 )
 
                 if match is not None:
                     residue_matches.append(match)
 
+            if len(residue_matches) == 0:
+                for residue_definition in residue_database.get(res_name, []):
+                    match = self.subset_matches_residue_conect(
+                        res_atom_idcs,
+                        residue_definition,
+                    )
+
+                    if match is not None:
+                        residue_matches.append(match)
+
             if len(residue_matches) == 0:
                 for residue_definition in additional_substructures:
-                    match = self.subset_matches_residue(
+                    match = self.subset_matches_residue_names(
                         res_atom_idcs,
                         residue_definition,
                     )

diff --git a/openff/pablo/_tests/test_pdb_data.py b/openff/pablo/_tests/test_pdb_data.py
@@ -365,15 +365,15 @@ def test_subset_matches_residue_raises_on_empty(
         cys_def: ResidueDefinition,
     ):
         with pytest.raises(ValueError):
-            hewl_data.subset_matches_residue([], cys_def)
+            hewl_data.subset_matches_residue_names([], cys_def)
 
     def test_subset_matches_residue_succeeds_when_all_atoms_present(
         self,
         cys_def: ResidueDefinition,
         cys_data: PdbData,
     ):
         assert (
-            cys_data.subset_matches_residue(
+            cys_data.subset_matches_residue_names(
                 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13],
                 cys_def,
             )
@@ -399,15 +399,15 @@ def test_subset_matches_residue_fails_on_multiply_matched_atom(self):
                 name=["H1", "H"],
                 charge=charges,
                 element=elements,
-            ).subset_matches_residue([0, 1], res_def)
+            ).subset_matches_residue_names([0, 1], res_def)
             is None
         )
         assert (
             PdbData(
                 name=["O", "H"],
                 charge=charges,
                 element=elements,
-            ).subset_matches_residue([0, 1], res_def)
+            ).subset_matches_residue_names([0, 1], res_def)
             is not None
         )
 
@@ -418,7 +418,7 @@ def test_subset_matches_residue_succeeds_when_all_leaving_atoms_absent(
     ):
         leaving_atoms = {atom.name for atom in cys_def.atoms if atom.leaving}
         assert (
-            cys_data.subset_matches_residue(
+            cys_data.subset_matches_residue_names(
                 [
                     i
                     for i, name in enumerate(cys_data.name)
@@ -438,7 +438,7 @@ def test_subset_matches_residue_fails_when_nonleaving_atom_missing(
         assert cys_data.name[1] == "CA"
         assert cys_def.name_to_atom["CA"].leaving is False
         assert (
-            cys_data.subset_matches_residue(
+            cys_data.subset_matches_residue_names(
                 [0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13],
                 cys_def,
             )
@@ -453,7 +453,7 @@ def test_subset_matches_residue_fails_when_linkage_leaving_atoms_partially_missi
         # Missing atom 13 (HXT), one of two posterior bond leaving atoms
         assert cys_data.name[13] == "HXT"
         assert (
-            cys_data.subset_matches_residue(
+            cys_data.subset_matches_residue_names(
                 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
                 cys_def,
             )
@@ -473,7 +473,7 @@ def test_subset_matches_residue_succeeds_when_posterior_bond_leaving_atoms_missi
             if name not in getattr(cys_def, bond_name + "_leaving_atoms")
         ]
         assert len(subset) in [12, 13]
-        assert cys_data.subset_matches_residue(subset, cys_def) is not None
+        assert cys_data.subset_matches_residue_names(subset, cys_def) is not None
 
     def test_subset_matches_residue_fails_on_element_mismatch(
         self,
@@ -482,7 +482,8 @@ def test_subset_matches_residue_fails_on_element_mismatch(
     ):
         cys_data.element[5] = "Zr"
         assert (
-            cys_data.subset_matches_residue(range(len(cys_def.atoms)), cys_def) is None
+            cys_data.subset_matches_residue_names(range(len(cys_def.atoms)), cys_def)
+            is None
         )
 
     def test_subset_matches_residue_tolerates_none_charge_and_empty_element(self):
@@ -495,7 +496,7 @@ def test_subset_matches_residue_tolerates_none_charge_and_empty_element(self):
             residue_name="HPL",
         )
         data = PdbData(name=["H"], element=[""], charge=[None])
-        assert data.subset_matches_residue([0], resdef) is not None
+        assert data.subset_matches_residue_names([0], resdef) is not None
 
     def test_subset_matches_residue_tolerates_wrong_case_element(self):
         resdef = ResidueDefinition(
@@ -507,7 +508,7 @@ def test_subset_matches_residue_tolerates_wrong_case_element(self):
             residue_name="HPL",
         )
         data = PdbData(name=["H"], element=["h"], charge=[1])
-        assert data.subset_matches_residue([0], resdef) is not None
+        assert data.subset_matches_residue_names([0], resdef) is not None
 
     def test_get_residue_matches_loads_vicinal_disulfide(
         self,

diff --git a/openff/pablo/ccd/_ccdcache.py b/openff/pablo/ccd/_ccdcache.py
@@ -1,17 +1,12 @@
 from collections.abc import Callable, Iterable, Iterator, Mapping, Sequence
 from copy import deepcopy
-from io import StringIO
 from pathlib import Path
-from typing import Self, no_type_check
+from typing import Self
 from urllib.request import urlopen
 
 import xdg.BaseDirectory as xdg_base_dir
-from openmm.app.internal.pdbx.reader.PdbxReader import PdbxReader
 
-from ..chem import PEPTIDE_BOND, PHOSPHODIESTER_BOND
 from ..residue import (
-    AtomDefinition,
-    BondDefinition,
     ResidueDefinition,
     _skip_residue_definition_validation,
 )
@@ -142,7 +137,7 @@ def _apply_patches(
         return definitions
 
     def _add_definition_from_str(self, s: str, res_name: str | None = None) -> None:
-        definition = self._res_def_from_ccd_str(s)
+        definition = ResidueDefinition.from_ccd_cif_str(s)
         self._add_and_patch_definition(definition, res_name)
 
     def _add_and_patch_definition(
@@ -186,89 +181,6 @@ def _glob(self, pattern: str) -> Iterator[Path]:
         for path in self._paths:
             yield from path.glob(pattern)
 
-    @staticmethod
-    def _res_def_from_ccd_str(s: str) -> ResidueDefinition:
-        @no_type_check
-        def inner(s):
-            # TODO: Handle residues like CL with a single atom properly (no tables)
-            data = []
-            with StringIO(s) as file:
-                PdbxReader(file).read(data)
-            block = data[0]
-
-            parent_residue_name = (
-                block.getObj("chem_comp").getValue("mon_nstd_parent_comp_id").upper()
-            )
-            parent_residue_name = (
-                None if parent_residue_name == "?" else parent_residue_name
-            )
-            residueName = block.getObj("chem_comp").getValue("id").upper()
-            residue_description = block.getObj("chem_comp").getValue("name")
-            linking_type = block.getObj("chem_comp").getValue("type").upper()
-            linking_bond = LINKING_TYPES[linking_type]
-
-            atomData = block.getObj("chem_comp_atom")
-            atomNameCol = atomData.getAttributeIndex("atom_id")
-            altAtomNameCol = atomData.getAttributeIndex("alt_atom_id")
-            symbolCol = atomData.getAttributeIndex("type_symbol")
-            leavingCol = atomData.getAttributeIndex("pdbx_leaving_atom_flag")
-            chargeCol = atomData.getAttributeIndex("charge")
-            aromaticCol = atomData.getAttributeIndex("pdbx_aromatic_flag")
-            stereoCol = atomData.getAttributeIndex("pdbx_stereo_config")
-
-            atoms = [
-                AtomDefinition(
-                    name=row[atomNameCol],
-                    synonyms=tuple(
-                        [row[altAtomNameCol]]
-                        if row[altAtomNameCol] != row[atomNameCol]
-                        else [],
-                    ),
-                    symbol=row[symbolCol][0:1].upper() + row[symbolCol][1:].lower(),
-                    leaving=row[leavingCol] == "Y",
-                    charge=int(row[chargeCol]),
-                    aromatic=row[aromaticCol] == "Y",
-                    stereo=None if row[stereoCol] == "N" else row[stereoCol],
-                )
-                for row in atomData.getRowList()
-            ]
-
-            bondData = block.getObj("chem_comp_bond")
-            if bondData is not None:
-                atom1Col = bondData.getAttributeIndex("atom_id_1")
-                atom2Col = bondData.getAttributeIndex("atom_id_2")
-                orderCol = bondData.getAttributeIndex("value_order")
-                aromaticCol = bondData.getAttributeIndex("pdbx_aromatic_flag")
-                stereoCol = bondData.getAttributeIndex("pdbx_stereo_config")
-                bonds = [
-                    BondDefinition(
-                        atom1=row[atom1Col],
-                        atom2=row[atom2Col],
-                        order={"SING": 1, "DOUB": 2, "TRIP": 3, "QUAD": 4}[
-                            row[orderCol]
-                        ],
-                        aromatic=row[aromaticCol] == "Y",
-                        stereo=None if row[stereoCol] == "N" else row[stereoCol],
-                    )
-                    for row in bondData.getRowList()
-                ]
-            else:
-                bonds = []
-
-            with _skip_residue_definition_validation():
-                residue_definition = ResidueDefinition(
-                    residue_name=residueName,
-                    description=residue_description,
-                    linking_bond=linking_bond,
-                    crosslink=None,
-                    atoms=tuple(atoms),
-                    bonds=tuple(bonds),
-                )
-
-            return residue_definition
-
-        return inner(s)
-
     def __contains__(self, value: object) -> bool:
         if value in self._definitions:
             return True
@@ -300,38 +212,3 @@ def with_(
             new._extra_definitions_set = True
             new._add_definitions(resdefs, resname)
         return new
-
-
-# TODO: Fill in this data
-LINKING_TYPES: dict[str, BondDefinition | None] = {
-    # "D-beta-peptide, C-gamma linking".upper(): [],
-    # "D-gamma-peptide, C-delta linking".upper(): [],
-    # "D-peptide COOH carboxy terminus".upper(): [],
-    # "D-peptide NH3 amino terminus".upper(): [],
-    # "D-peptide linking".upper(): [],
-    # "D-saccharide".upper(): [],
-    # "D-saccharide, alpha linking".upper(): [],
-    # "D-saccharide, beta linking".upper(): [],
-    # "DNA OH 3 prime terminus".upper(): [],
-    # "DNA OH 5 prime terminus".upper(): [],
-    "DNA linking".upper(): PHOSPHODIESTER_BOND,
-    "L-DNA linking".upper(): PHOSPHODIESTER_BOND,
-    "L-RNA linking".upper(): PHOSPHODIESTER_BOND,
-    # "L-beta-peptide, C-gamma linking".upper(): [],
-    # "L-gamma-peptide, C-delta linking".upper(): [],
-    # "L-peptide COOH carboxy terminus".upper(): [],
-    # "L-peptide NH3 amino terminus".upper(): [],
-    "L-peptide linking".upper(): PEPTIDE_BOND,
-    # "L-saccharide".upper(): [],
-    # "L-saccharide, alpha linking".upper(): [],
-    # "L-saccharide, beta linking".upper(): [],
-    # "RNA OH 3 prime terminus".upper(): [],
-    # "RNA OH 5 prime terminus".upper(): [],
-    "RNA linking".upper(): PHOSPHODIESTER_BOND,
-    "non-polymer".upper(): None,
-    # "other".upper(): [],
-    "peptide linking".upper(): PEPTIDE_BOND,
-    "peptide-like".upper(): PEPTIDE_BOND,
-    # "saccharide".upper(): [],
-}
-"""Map from the CCD's linking types to the bond formed between two such monomers"""
diff --git a/openff/pablo/ccd/patches.py b/openff/pablo/ccd/patches.py
@@ -4,15 +4,14 @@
 
 from itertools import combinations
 
-from openff.pablo.chem import DISULFIDE_BOND
+from openff.pablo.chem import DISULFIDE_BOND, PEPTIDE_BOND
 
 from .._utils import flatten, unwrap
 from ..residue import (
     AtomDefinition,
     BondDefinition,
     ResidueDefinition,
 )
-from ._ccdcache import PEPTIDE_BOND
 
 __all__ = [
     "ACIDIC_PROTONS",
-Original file line number
+Diff line change
@@ Expand Up / @@ -9,6 +9,7 @@ dependencies: @@
       - rdkit
       - openmm
       - pyxdg
+      - networkx
       # Lints
       - basedpyright
@@ Expand Down @@