Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Connectivity matching #69

Draft
wants to merge 3 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions devtools/conda-envs/test_env.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ dependencies:
- rdkit
- openmm
- pyxdg
- networkx

# Lints
- basedpyright
Expand Down
56 changes: 53 additions & 3 deletions openff/pablo/_pdb_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
from os import PathLike
from typing import IO, Any, DefaultDict, Self

import networkx

from ._utils import __UNSET__, dec_hex, int_or_none, with_neighbours
from .exceptions import (
UnknownOrAmbiguousSerialInConectError,
Expand Down Expand Up @@ -368,7 +370,7 @@

yield tuple(indices)

def subset_matches_residue(
def subset_matches_residue_names(
self,
res_atom_idcs: Sequence[int],
residue_definition: ResidueDefinition,
Expand Down Expand Up @@ -482,6 +484,44 @@
logging.debug(" Match failed: Missing atoms do not specify link")
return None

def subset_matches_residue_conect(
self,
res_atom_idcs: Sequence[int],
residue_definition: ResidueDefinition,
) -> ResidueMatch | None:
conect_graph = networkx.Graph()

Check warning on line 492 in openff/pablo/_pdb_data.py

View workflow job for this annotation

GitHub Actions / main-tests (ubuntu-latest, 3.11)

Type of "conect_graph" is partially unknown   Type of "conect_graph" is "Graph[Unknown]" (reportUnknownVariableType)
for a in res_atom_idcs:
conect_graph.add_node(a, elem_charge=(self.element[a], self.charge[a]))

Check warning on line 494 in openff/pablo/_pdb_data.py

View workflow job for this annotation

GitHub Actions / main-tests (ubuntu-latest, 3.11)

Type of "add_node" is partially unknown   Type of "add_node" is "(node_for_adding: Unknown, **attr: Unknown) -> None" (reportUnknownMemberType)
for b in self.conects[a]:
if b in res_atom_idcs:
conect_graph.add_node(

Check warning on line 497 in openff/pablo/_pdb_data.py

View workflow job for this annotation

GitHub Actions / main-tests (ubuntu-latest, 3.11)

Type of "add_node" is partially unknown   Type of "add_node" is "(node_for_adding: Unknown, **attr: Unknown) -> None" (reportUnknownMemberType)
b,
elem_charge=(self.element[b], self.charge[b]),
)
conect_graph.add_edge(a, b)

Check warning on line 501 in openff/pablo/_pdb_data.py

View workflow job for this annotation

GitHub Actions / main-tests (ubuntu-latest, 3.11)

Type of "add_edge" is partially unknown   Type of "add_edge" is "(u_of_edge: Unknown, v_of_edge: Unknown, **attr: Unknown) -> None" (reportUnknownMemberType)
resdef_graph = residue_definition.to_graph()
networkx.set_node_attributes(
resdef_graph,
{
atom: {"elem_charge": (atom.symbol, atom.charge)}
for atom in resdef_graph.nodes
},
)

isomorphism = networkx.vf2pp_isomorphism(

Check warning on line 511 in openff/pablo/_pdb_data.py

View workflow job for this annotation

GitHub Actions / main-tests (ubuntu-latest, 3.11)

Type of "isomorphism" is unknown (reportUnknownVariableType)

Check warning on line 511 in openff/pablo/_pdb_data.py

View workflow job for this annotation

GitHub Actions / main-tests (ubuntu-latest, 3.11)

Type of "vf2pp_isomorphism" is partially unknown   Type of "vf2pp_isomorphism" is "_dispatchable[(G1: Unknown, G2: Unknown, node_label: Any | None = None, default_label: Any | None = None), Unknown]" (reportUnknownMemberType)
conect_graph,
resdef_graph,
node_label="elem_charge",
)
if isomorphism is None:
return None
else:
return ResidueMatch(
residue_definition=residue_definition,
index_to_atomdef=isomorphism,

Check warning on line 521 in openff/pablo/_pdb_data.py

View workflow job for this annotation

GitHub Actions / main-tests (ubuntu-latest, 3.11)

Argument type is unknown   Argument corresponds to parameter "index_to_atomdef" in function "__init__" (reportUnknownArgumentType)
crosslink=None,
)

def get_residue_matches(
self,
residue_database: Mapping[str, Iterable[ResidueDefinition]],
Expand All @@ -503,17 +543,27 @@

residue_matches: list[ResidueMatch] = []
for residue_definition in residue_database.get(res_name, []):
match = self.subset_matches_residue(
match = self.subset_matches_residue_names(
res_atom_idcs,
residue_definition,
)

if match is not None:
residue_matches.append(match)

if len(residue_matches) == 0:
for residue_definition in residue_database.get(res_name, []):
match = self.subset_matches_residue_conect(
res_atom_idcs,
residue_definition,
)

if match is not None:
residue_matches.append(match)

if len(residue_matches) == 0:
for residue_definition in additional_substructures:
match = self.subset_matches_residue(
match = self.subset_matches_residue_names(
res_atom_idcs,
residue_definition,
)
Expand Down
23 changes: 12 additions & 11 deletions openff/pablo/_tests/test_pdb_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -365,15 +365,15 @@ def test_subset_matches_residue_raises_on_empty(
cys_def: ResidueDefinition,
):
with pytest.raises(ValueError):
hewl_data.subset_matches_residue([], cys_def)
hewl_data.subset_matches_residue_names([], cys_def)

def test_subset_matches_residue_succeeds_when_all_atoms_present(
self,
cys_def: ResidueDefinition,
cys_data: PdbData,
):
assert (
cys_data.subset_matches_residue(
cys_data.subset_matches_residue_names(
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13],
cys_def,
)
Expand All @@ -399,15 +399,15 @@ def test_subset_matches_residue_fails_on_multiply_matched_atom(self):
name=["H1", "H"],
charge=charges,
element=elements,
).subset_matches_residue([0, 1], res_def)
).subset_matches_residue_names([0, 1], res_def)
is None
)
assert (
PdbData(
name=["O", "H"],
charge=charges,
element=elements,
).subset_matches_residue([0, 1], res_def)
).subset_matches_residue_names([0, 1], res_def)
is not None
)

Expand All @@ -418,7 +418,7 @@ def test_subset_matches_residue_succeeds_when_all_leaving_atoms_absent(
):
leaving_atoms = {atom.name for atom in cys_def.atoms if atom.leaving}
assert (
cys_data.subset_matches_residue(
cys_data.subset_matches_residue_names(
[
i
for i, name in enumerate(cys_data.name)
Expand All @@ -438,7 +438,7 @@ def test_subset_matches_residue_fails_when_nonleaving_atom_missing(
assert cys_data.name[1] == "CA"
assert cys_def.name_to_atom["CA"].leaving is False
assert (
cys_data.subset_matches_residue(
cys_data.subset_matches_residue_names(
[0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13],
cys_def,
)
Expand All @@ -453,7 +453,7 @@ def test_subset_matches_residue_fails_when_linkage_leaving_atoms_partially_missi
# Missing atom 13 (HXT), one of two posterior bond leaving atoms
assert cys_data.name[13] == "HXT"
assert (
cys_data.subset_matches_residue(
cys_data.subset_matches_residue_names(
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
cys_def,
)
Expand All @@ -473,7 +473,7 @@ def test_subset_matches_residue_succeeds_when_posterior_bond_leaving_atoms_missi
if name not in getattr(cys_def, bond_name + "_leaving_atoms")
]
assert len(subset) in [12, 13]
assert cys_data.subset_matches_residue(subset, cys_def) is not None
assert cys_data.subset_matches_residue_names(subset, cys_def) is not None

def test_subset_matches_residue_fails_on_element_mismatch(
self,
Expand All @@ -482,7 +482,8 @@ def test_subset_matches_residue_fails_on_element_mismatch(
):
cys_data.element[5] = "Zr"
assert (
cys_data.subset_matches_residue(range(len(cys_def.atoms)), cys_def) is None
cys_data.subset_matches_residue_names(range(len(cys_def.atoms)), cys_def)
is None
)

def test_subset_matches_residue_tolerates_none_charge_and_empty_element(self):
Expand All @@ -495,7 +496,7 @@ def test_subset_matches_residue_tolerates_none_charge_and_empty_element(self):
residue_name="HPL",
)
data = PdbData(name=["H"], element=[""], charge=[None])
assert data.subset_matches_residue([0], resdef) is not None
assert data.subset_matches_residue_names([0], resdef) is not None

def test_subset_matches_residue_tolerates_wrong_case_element(self):
resdef = ResidueDefinition(
Expand All @@ -507,7 +508,7 @@ def test_subset_matches_residue_tolerates_wrong_case_element(self):
residue_name="HPL",
)
data = PdbData(name=["H"], element=["h"], charge=[1])
assert data.subset_matches_residue([0], resdef) is not None
assert data.subset_matches_residue_names([0], resdef) is not None

def test_get_residue_matches_loads_vicinal_disulfide(
self,
Expand Down
127 changes: 2 additions & 125 deletions openff/pablo/ccd/_ccdcache.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,12 @@
from collections.abc import Callable, Iterable, Iterator, Mapping, Sequence
from copy import deepcopy
from io import StringIO
from pathlib import Path
from typing import Self, no_type_check
from typing import Self
from urllib.request import urlopen

import xdg.BaseDirectory as xdg_base_dir
from openmm.app.internal.pdbx.reader.PdbxReader import PdbxReader

from ..chem import PEPTIDE_BOND, PHOSPHODIESTER_BOND
from ..residue import (
AtomDefinition,
BondDefinition,
ResidueDefinition,
_skip_residue_definition_validation,
)
Expand Down Expand Up @@ -142,7 +137,7 @@ def _apply_patches(
return definitions

def _add_definition_from_str(self, s: str, res_name: str | None = None) -> None:
definition = self._res_def_from_ccd_str(s)
definition = ResidueDefinition.from_ccd_cif_str(s)
self._add_and_patch_definition(definition, res_name)

def _add_and_patch_definition(
Expand Down Expand Up @@ -186,89 +181,6 @@ def _glob(self, pattern: str) -> Iterator[Path]:
for path in self._paths:
yield from path.glob(pattern)

@staticmethod
def _res_def_from_ccd_str(s: str) -> ResidueDefinition:
@no_type_check
def inner(s):
# TODO: Handle residues like CL with a single atom properly (no tables)
data = []
with StringIO(s) as file:
PdbxReader(file).read(data)
block = data[0]

parent_residue_name = (
block.getObj("chem_comp").getValue("mon_nstd_parent_comp_id").upper()
)
parent_residue_name = (
None if parent_residue_name == "?" else parent_residue_name
)
residueName = block.getObj("chem_comp").getValue("id").upper()
residue_description = block.getObj("chem_comp").getValue("name")
linking_type = block.getObj("chem_comp").getValue("type").upper()
linking_bond = LINKING_TYPES[linking_type]

atomData = block.getObj("chem_comp_atom")
atomNameCol = atomData.getAttributeIndex("atom_id")
altAtomNameCol = atomData.getAttributeIndex("alt_atom_id")
symbolCol = atomData.getAttributeIndex("type_symbol")
leavingCol = atomData.getAttributeIndex("pdbx_leaving_atom_flag")
chargeCol = atomData.getAttributeIndex("charge")
aromaticCol = atomData.getAttributeIndex("pdbx_aromatic_flag")
stereoCol = atomData.getAttributeIndex("pdbx_stereo_config")

atoms = [
AtomDefinition(
name=row[atomNameCol],
synonyms=tuple(
[row[altAtomNameCol]]
if row[altAtomNameCol] != row[atomNameCol]
else [],
),
symbol=row[symbolCol][0:1].upper() + row[symbolCol][1:].lower(),
leaving=row[leavingCol] == "Y",
charge=int(row[chargeCol]),
aromatic=row[aromaticCol] == "Y",
stereo=None if row[stereoCol] == "N" else row[stereoCol],
)
for row in atomData.getRowList()
]

bondData = block.getObj("chem_comp_bond")
if bondData is not None:
atom1Col = bondData.getAttributeIndex("atom_id_1")
atom2Col = bondData.getAttributeIndex("atom_id_2")
orderCol = bondData.getAttributeIndex("value_order")
aromaticCol = bondData.getAttributeIndex("pdbx_aromatic_flag")
stereoCol = bondData.getAttributeIndex("pdbx_stereo_config")
bonds = [
BondDefinition(
atom1=row[atom1Col],
atom2=row[atom2Col],
order={"SING": 1, "DOUB": 2, "TRIP": 3, "QUAD": 4}[
row[orderCol]
],
aromatic=row[aromaticCol] == "Y",
stereo=None if row[stereoCol] == "N" else row[stereoCol],
)
for row in bondData.getRowList()
]
else:
bonds = []

with _skip_residue_definition_validation():
residue_definition = ResidueDefinition(
residue_name=residueName,
description=residue_description,
linking_bond=linking_bond,
crosslink=None,
atoms=tuple(atoms),
bonds=tuple(bonds),
)

return residue_definition

return inner(s)

def __contains__(self, value: object) -> bool:
if value in self._definitions:
return True
Expand Down Expand Up @@ -300,38 +212,3 @@ def with_(
new._extra_definitions_set = True
new._add_definitions(resdefs, resname)
return new


# TODO: Fill in this data
LINKING_TYPES: dict[str, BondDefinition | None] = {
# "D-beta-peptide, C-gamma linking".upper(): [],
# "D-gamma-peptide, C-delta linking".upper(): [],
# "D-peptide COOH carboxy terminus".upper(): [],
# "D-peptide NH3 amino terminus".upper(): [],
# "D-peptide linking".upper(): [],
# "D-saccharide".upper(): [],
# "D-saccharide, alpha linking".upper(): [],
# "D-saccharide, beta linking".upper(): [],
# "DNA OH 3 prime terminus".upper(): [],
# "DNA OH 5 prime terminus".upper(): [],
"DNA linking".upper(): PHOSPHODIESTER_BOND,
"L-DNA linking".upper(): PHOSPHODIESTER_BOND,
"L-RNA linking".upper(): PHOSPHODIESTER_BOND,
# "L-beta-peptide, C-gamma linking".upper(): [],
# "L-gamma-peptide, C-delta linking".upper(): [],
# "L-peptide COOH carboxy terminus".upper(): [],
# "L-peptide NH3 amino terminus".upper(): [],
"L-peptide linking".upper(): PEPTIDE_BOND,
# "L-saccharide".upper(): [],
# "L-saccharide, alpha linking".upper(): [],
# "L-saccharide, beta linking".upper(): [],
# "RNA OH 3 prime terminus".upper(): [],
# "RNA OH 5 prime terminus".upper(): [],
"RNA linking".upper(): PHOSPHODIESTER_BOND,
"non-polymer".upper(): None,
# "other".upper(): [],
"peptide linking".upper(): PEPTIDE_BOND,
"peptide-like".upper(): PEPTIDE_BOND,
# "saccharide".upper(): [],
}
"""Map from the CCD's linking types to the bond formed between two such monomers"""
3 changes: 1 addition & 2 deletions openff/pablo/ccd/patches.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,14 @@

from itertools import combinations

from openff.pablo.chem import DISULFIDE_BOND
from openff.pablo.chem import DISULFIDE_BOND, PEPTIDE_BOND

from .._utils import flatten, unwrap
from ..residue import (
AtomDefinition,
BondDefinition,
ResidueDefinition,
)
from ._ccdcache import PEPTIDE_BOND

__all__ = [
"ACIDIC_PROTONS",
Expand Down
Loading
Loading