diff --git a/src/peppr/match.py b/src/peppr/match.py index 58b8866..0bf6d57 100644 --- a/src/peppr/match.py +++ b/src/peppr/match.py @@ -22,6 +22,7 @@ from rdkit.Chem import ( AssignStereochemistryFrom3D, BondType, + GetMolFrags, Mol, SanitizeFlags, SanitizeMol, @@ -1464,4 +1465,9 @@ def _to_mol(molecule: struc.AtomArray) -> Mol: BondType.DOUBLE, ]: bond.SetBondType(BondType.ONEANDAHALF) + frags = GetMolFrags(mol) + if len(frags) != 1: + raise struc.BadStructureError( + f"Molecule contains multiple disconnected fragments: {len(frags)}" + ) return mol diff --git a/src/peppr/sanitize.py b/src/peppr/sanitize.py index 60b8780..e101147 100644 --- a/src/peppr/sanitize.py +++ b/src/peppr/sanitize.py @@ -2,6 +2,7 @@ __all__ = ["sanitize"] +from itertools import product import rdkit import rdkit.Chem.AllChem as Chem @@ -24,7 +25,7 @@ def sanitize(mol: Chem.Mol, max_fix_iterations: int = 1000) -> None: ----- Deals with cases: - - ``AtomValenceException``: add charge if total valence is exceeding default (for N, O atoms only) + - ``AtomValenceException``: add charge if total valence is exceeding default (for B, N, O atoms only) - ``KekulizeException``: for ring N atoms with unspecified protonation """ mol.UpdatePropertyCache(strict=False) @@ -59,7 +60,7 @@ def sanitize(mol: Chem.Mol, max_fix_iterations: int = 1000) -> None: def _fix_problem(mol: Chem.Mol, problem: Exception) -> None: """ - Apply fixes for common problems. + Apply fixes for common input problems with RDKit SanitizeMol. Parameters ---------- @@ -109,17 +110,37 @@ def _fix_problem(mol: Chem.Mol, problem: Exception) -> None: # currently not implemented return if problem.GetType() == "KekulizeException": - # hack: only works for nitrogens with missing explicit Hs + # hack: fix KekulizeException related to ring N atoms with unspecified + # protonation by setting one or more ring nitrogens as "[nH]" + possible_nHs = [] + original_nH_state = [] for atidx in problem.GetAtomIndices(): at = mol.GetAtomWithIdx(atidx) - # set one of the nitrogens with two bonds in a ring system as "[nH]" - if ( - at.GetSymbol() == "N" - and at.GetDegree() == 2 - and at.GetTotalNumHs() == 0 - ): - at.SetNumExplicitHs(1) - break + if at.GetSymbol() == "N" and at.GetDegree() == 2: + possible_nHs.append(at) + original_nH_state.append(at.GetNumExplicitHs()) + + if not len(possible_nHs): + # no candidate nitrogens to fix the problem + # note this early return is unnecessary as the loops below will + # not be entered, but it makes the intention clearer + return + + # if there is one or more candidates - iterate through all the possible + # combinations of explicit Hs for the candidate nitrogens + for combo in product([0, 1], repeat=len(possible_nHs)): + for nH, nH_explicit in zip(possible_nHs, combo): + nH.SetNumExplicitHs(nH_explicit) + # and check if the problem is resolved + check_problems = Chem.DetectChemistryProblems( + mol, sanitizeOps=Chem.rdmolops.SanitizeFlags.SANITIZE_KEKULIZE + ) + if not [p for p in check_problems if _is_same_problem(p, problem)]: + # problem is resolved - no need to try other combinations + return + # if all combos failed - reset explicit Hs back to the original values + for nH, nH_explicit in zip(possible_nHs, original_nH_state): + nH.SetNumExplicitHs(nH_explicit) def _is_same_problem(problem1: Exception, problem2: Exception) -> bool: @@ -143,6 +164,13 @@ def _is_same_problem(problem1: Exception, problem2: Exception) -> bool: if problem1.GetAtomIdx() != problem2.GetAtomIdx(): return False elif problem_type == "KekulizeException": - if problem1.GetAtomIndices() != problem2.GetAtomIndices(): + if ( + len( + set(problem1.GetAtomIndices()).intersection( + set(problem2.GetAtomIndices()) + ) + ) + == 0 + ): return False return True diff --git a/tests/test_charge.py b/tests/test_charge.py index b67c6f5..47b0b7d 100644 --- a/tests/test_charge.py +++ b/tests/test_charge.py @@ -90,10 +90,6 @@ def test_estimate_formal_charges_from_smiles(smiles, ph, ref_charges): # imine example ("MFG", 7.4, {}), ("MFG", 3, {"N10": 1}), - # tetrazole and imidazole motifs - ("3GK", 7.4, {"N1": -1}), - ("3GK", 5, {"N1": -1, "N21": 1}), - ("3GK", 4, {"N21": 1}), # thiophenol example ("JKE", 7.4, {"SD": -1, "OD2": -1}), ("JKE", 6, {"OD2": -1}), @@ -133,6 +129,47 @@ def test_estimate_formal_charges(comp_name, ph, ref_charged_atoms, include_hydro assert test_charged_atoms == ref_charged_atoms +@pytest.mark.parametrize( + ["comp_name", "ph", "acceptatble_charge_combos"], + [ + # tetrazole and imidazole motifs + ("3GK", 7.4, [{"N1": -1}, {"N5": -1}]), + ( + "3GK", + 5, + [ + {"N1": -1, "N21": 1}, + {"N1": -1, "N23": 1}, + {"N5": -1, "N21": 1}, + {"N5": -1, "N23": 1}, + ], + ), + ("3GK", 4, [{"N21": 1}, {"N23": 1}]), + ], +) +def test_estimate_formal_charges_heterocycles_nohydrogens( + comp_name, ph, acceptatble_charge_combos +): + """ + Check if formal charges are estimated correctly for more complex small molecules. + Especially when protonation is not specified and several options are possible. + """ + atoms = info.residue(comp_name) + # Do not rely on charges extracted from the CCD + atoms.del_annotation("charge") + # remove hydrogens to check if the method can estimate charges without relying on them + atoms = atoms[atoms.element != "H"] + + charges = peppr.estimate_formal_charges(atoms, ph) + test_charged_atoms = { + atom_name: charge + for atom_name, charge in zip(atoms.atom_name, charges) + if charge != 0 + } + + assert test_charged_atoms in acceptatble_charge_combos + + def test_estimate_formal_charges_peptide(): """ Check if a peptide has the expected charges at the termini and the acidic/basic diff --git a/tests/test_contacts.py b/tests/test_contacts.py index 3e2459e..74dda5a 100644 --- a/tests/test_contacts.py +++ b/tests/test_contacts.py @@ -212,7 +212,13 @@ def test_salt_bridge_identification( contact_measurement = peppr.ContactMeasurement(receptor, ligand, cutoff=4.0, ph=6.0) test_contacts = contact_measurement.find_salt_bridges(use_resonance=use_resonance) - assert len(test_contacts) == len(expected_contacts) + if use_resonance: + # the expected use should recover all expected contacts + assert len(test_contacts) == len(expected_contacts) + else: + # If resonance is not considered, some contacts may be missed + assert len(test_contacts) <= len(expected_contacts) + for i, j in test_contacts: contact = ( receptor.res_id[i].item(), @@ -316,23 +322,21 @@ def test_find_resonance_charges(): ligand.set_annotation("charge", peppr.estimate_formal_charges(ligand, 7.4)) # create rdkit ligand object ligand_mol = rdkit_interface.to_mol(ligand) - try: - peppr.sanitize(ligand_mol) - except Exception: - return np.nan - ligand_charged_atoms = np.where(ligand.charge != 0)[0] - assert np.equal(ligand_charged_atoms, [0, 7, 8]).all() + peppr.sanitize(ligand_mol) + + charged_atoms = np.where(ligand.charge != 0)[0] + assert np.equal(charged_atoms, [0, 7, 8]).all() # get charged atoms and their resonance groups pos_mask, neg_mask, ligand_conjugated_groups = peppr.find_resonance_charges( ligand_mol ) assert len(set(ligand_conjugated_groups)) < len(ligand_conjugated_groups), ( - "Number of groups expected less than number of atoms as some are conjugated" + "Number of groups should be less than number of atoms as some are conjugated" ) charged_atom_mask = pos_mask | neg_mask ligand_charged_in_resonance_atoms = np.where(charged_atom_mask)[0] - assert set(ligand_charged_atoms) != set(ligand_charged_in_resonance_atoms), ( + assert set(charged_atoms) != set(ligand_charged_in_resonance_atoms), ( "Charged atoms do not match those found in resonance structures" ) assert np.equal(ligand_charged_in_resonance_atoms, [0, 3, 6, 7, 8]).all() diff --git a/tests/test_match.py b/tests/test_match.py index 538e12e..f08a502 100644 --- a/tests/test_match.py +++ b/tests/test_match.py @@ -483,8 +483,16 @@ def test_unmatchable_molecules(use_heuristic): reference = reference[reference.element != "H"] _annotate_atom_order(reference) pose = reference.copy() - # Remove bonds in one structure to make the bond graphs incompatible - pose.bonds = struc.BondList(pose.array_length()) + bond_list = pose.bonds + bond_array = bond_list.as_array().copy() + + # shuffle the bond types to create an incompatible bond graph + bond_array[:, 2] = np.random.permutation(bond_array[:, 2]) + pose.bonds = struc.BondList(bond_list.get_atom_count(), bond_array) + _annotate_atom_order(pose) + assert not np.all( + reference.bonds == pose.bonds + ) # Sanity check to make sure the bond lists are different with pytest.warns(peppr.GraphMatchWarning, match="Incompatible bond graph"): matched_reference, matched_pose = peppr.find_optimal_match( @@ -628,3 +636,30 @@ def _check_match( matched_reference.atom_id ) assert len(np.unique(matched_pose.atom_id)) == len(matched_pose.atom_id) + + +@pytest.mark.parametrize("disconnected", [True, False]) +def test__to_mol_disconnected(disconnected: bool) -> None: + """ + Test that disconnected mol in rdKit is raising exception + + We are creating a molecule with just appending a ligand to another + if `disconnected` is True + + Parameters + ---------- + disconnected: bool + Whether test a case of disconnected mol + """ + ccd = "BTN" + mol_atom_array = struc.info.residue(ccd) + if disconnected: + mol_atom_array += struc.info.residue(ccd) + with pytest.raises( + struc.BadStructureError, + match="Molecule contains multiple disconnected fragments", + ): + mol = peppr.match._to_mol(mol_atom_array) + else: + mol = peppr.match._to_mol(mol_atom_array) + assert mol.GetNumAtoms() == len(mol_atom_array), f"Failed for {ccd}" diff --git a/tests/test_sanitize.py b/tests/test_sanitize.py index 8cb3653..c128072 100644 --- a/tests/test_sanitize.py +++ b/tests/test_sanitize.py @@ -10,6 +10,7 @@ "smiles", [ "c1ccc2nc3c(c(=O)c2c1)cccc3", + "O=c1c(cccc2)c2c(Cc3ccccc3)nn1", "n1cccc1", "CN(C)(C)C", "C1CCCC=O1",