diff --git a/Code/GraphMol/MolHash/Wrap/rdMolHash.cpp b/Code/GraphMol/MolHash/Wrap/rdMolHash.cpp index e4099888c..e73c98c53 100644 --- a/Code/GraphMol/MolHash/Wrap/rdMolHash.cpp +++ b/Code/GraphMol/MolHash/Wrap/rdMolHash.cpp @@ -56,4 +56,6 @@ BOOST_PYTHON_MODULE(rdMolHash) { python::arg("useCxSmiles") = false, python::arg("cxFlagsToSkip") = 0), "Generate a hash for a molecule. The func argument determines " "which hash is generated."); + python::scope().attr("excludeFromTautomerismProp") = + MolHash::excludeFromTautomerismProp; } diff --git a/Code/GraphMol/MolHash/catch_tests.cpp b/Code/GraphMol/MolHash/catch_tests.cpp index a97e4db2b..23e971911 100644 --- a/Code/GraphMol/MolHash/catch_tests.cpp +++ b/Code/GraphMol/MolHash/catch_tests.cpp @@ -477,35 +477,40 @@ TEST_CASE("tautomer v2") { // different hashes. Previously the algorithm incorrectly treated // aromatic heteroatoms like pyridine N as tautomeric candidates, // causing stereo to be stripped. - {{"c1ccccc1/C=C/c1ncccc1"}, // in ChEMBL (CHEMBL1877619) + {{"c1ccccc1/C=C/c1ncccc1"}, // in ChEMBL (CHEMBL1877619) {"c1ccccc1/C=C\\c1ncccc1", "c1ccccc1C=Cc1ncccc1"}}, // 5-benzylidenerhodanine: E/Z isomers are NOT tautomers and should // have different hashes. The exocyclic C=C to phenyl should // preserve stereochemistry. - {{"O=C1NC(=S)S/C1=C/c2ccccc2"}, // in ChEMBL (CHEMBL4796170) + {{"O=C1NC(=S)S/C1=C/c2ccccc2"}, // in ChEMBL (CHEMBL4796170) {"O=C1NC(=S)S/C1=C\\c2ccccc2", "O=C1NC(=S)SC1=Cc2ccccc2"}}, // E/Z hydrazones with exocyclic C=N to a ring: E and Z isomers // are NOT tautomers and should have different hashes. - {{"c1ccccc1N/N=C2\\CCCCC2C"}, // in SureChEMBL (11696321) - {"c1ccccc1N/N=C2/CCCCC2C", - "c1ccccc1NN=C2CCCCC2C"}}, + {{"c1ccccc1N/N=C2\\CCCCC2C"}, // in SureChEMBL (11696321) + {"c1ccccc1N/N=C2/CCCCC2C", "c1ccccc1NN=C2CCCCC2C"}}, // --------------------------- // stereocenters near amide bonds should not be destroyed // by extension through flagged bonds // --------------------------- // proline-like stereocenter between two amide C=O groups {{"NC(=O)[C@H]1CCCN1C=O"}, - {"NC(=O)[C@@H]1CCCN1C=O"}}, // in SureChEMBL (8959051) + {"NC(=O)[C@@H]1CCCN1C=O"}}, // in SureChEMBL (8959051) // stereocenters near amide bonds on pyrrolidine ring - {{"CC(=O)N[C@H]1CCNC1"}, {"CC(=O)N[C@@H]1CCNC1", "CC(=O)NC1CCNC1"}}, // in SureChEMBL (39850) - // stereocenter adjacent to pyrimidine/pyrazole: enantiomers should differ - // (stereocenter connected via single non-conjugated N-C bond) - {{"C[C@H](c1ccccc1)Nc2ncc(c(n2)Nc3cc([nH]n3)C4CC4)Cl"}, // in SureChEMBL (4072338) + {{"CC(=O)N[C@H]1CCNC1"}, + {"CC(=O)N[C@@H]1CCNC1", + "CC(=O)NC1CCNC1"}}, // in SureChEMBL (39850) + // stereocenter adjacent to pyrimidine/pyrazole: enantiomers should + // differ (stereocenter connected via single non-conjugated N-C + // bond) + {{"C[C@H](c1ccccc1)Nc2ncc(c(n2)Nc3cc([nH]n3)C4CC4)Cl"}, // in + // SureChEMBL + // (4072338) {"C[C@@H](c1ccccc1)Nc2ncc(c(n2)Nc3cc([nH]n3)C4CC4)Cl"}}, - // diastereomers on indole ring: aromatic C should not pull in stereocenters + // diastereomers on indole ring: aromatic C should not pull in + // stereocenters {{"C[C@@H]1Cc2c3ccccc3[nH]c2[C@@H](N1CC(F)(F)F)c4cc(ccc4Cl)OCCNCCCF"}, - {"C[C@@H]1Cc2c3ccccc3[nH]c2[C@H](N1CC(F)(F)F)c4cc(ccc4Cl)OCCNCCCF"}}, // in ChEMBL CHEMBL5972799 - }; + {"C[C@@H]1Cc2c3ccccc3[nH]c2[C@H](N1CC(F)(F)F)c4cc(ccc4Cl)OCCNCCCF"}}, // in ChEMBL CHEMBL5972799 + }; for (const auto &[same, diff] : data) { std::unique_ptr m{SmilesToMol(same[0])}; REQUIRE(m); @@ -1141,4 +1146,79 @@ TEST_CASE("github #8654: stereogroups incorrectly included in hash") { CHECK(hsh1 == hsh2); } } +} + +TEST_CASE("exclude atoms with properties") { + SECTION("basics") { + auto m = "OC=CC"_smiles; + REQUIRE(m); + RWMol m2(*m); + auto hsh1 = MolHash::MolHash(&m2, MolHash::HashFunction::HetAtomTautomerv2); + CHECK(hsh1 == "[CH3]-[C]:[C]:[O]_3_0"); + + for (auto i : {0, 1, 2}) { + INFO("excluding atom " + std::to_string(i)); + RWMol m3(*m); + m3.getAtomWithIdx(i)->setProp(MolHash::excludeFromTautomerismProp, "1"); + auto hsh2 = + MolHash::MolHash(&m3, MolHash::HashFunction::HetAtomTautomerv2); + CHECK(hsh2 == "[CH3]-[CH]=[CH]-[OH]_0_0"); + } + } + SECTION("more complex example") { + auto m = "OC=CC=C"_smiles; + REQUIRE(m); + RWMol m2(*m); + auto hsh1 = MolHash::MolHash(&m2, MolHash::HashFunction::HetAtomTautomerv2); + CHECK(hsh1 == "[C]:[C]:[C]:[C]:[O]_6_0"); + + { + RWMol m3(*m); + m3.getAtomWithIdx(3)->setProp(MolHash::excludeFromTautomerismProp, "1"); + auto hsh2 = + MolHash::MolHash(&m3, MolHash::HashFunction::HetAtomTautomerv2); + CHECK(hsh2 == "[CH2]=[CH]-[C]:[C]:[O]_3_0"); + } + { + RWMol m3(*m); + m3.getAtomWithIdx(4)->setProp(MolHash::excludeFromTautomerismProp, "1"); + auto hsh2 = + MolHash::MolHash(&m3, MolHash::HashFunction::HetAtomTautomerv2); + CHECK(hsh2 == "[CH2]=[C]:[C]:[C]:[O]_4_0"); + } + } +} + +TEST_CASE("exclude bonds with properties") { + SECTION("basics") { + auto m = "OC=CC"_smiles; + REQUIRE(m); + RWMol m2(*m); + auto hsh1 = MolHash::MolHash(&m2, MolHash::HashFunction::HetAtomTautomerv2); + CHECK(hsh1 == "[CH3]-[C]:[C]:[O]_3_0"); + + for (auto i : {0, 1}) { + INFO("excluding bond " + std::to_string(i)); + RWMol m3(*m); + m3.getBondWithIdx(i)->setProp(MolHash::excludeFromTautomerismProp, "1"); + auto hsh2 = + MolHash::MolHash(&m3, MolHash::HashFunction::HetAtomTautomerv2); + CHECK(hsh2 == "[CH3]-[CH]=[CH]-[OH]_0_0"); + } + } + SECTION("more complex example") { + auto m = "OC=CC=C"_smiles; + REQUIRE(m); + RWMol m2(*m); + auto hsh1 = MolHash::MolHash(&m2, MolHash::HashFunction::HetAtomTautomerv2); + CHECK(hsh1 == "[C]:[C]:[C]:[C]:[O]_6_0"); + + { + RWMol m3(*m); + m3.getBondWithIdx(2)->setProp(MolHash::excludeFromTautomerismProp, "1"); + auto hsh2 = + MolHash::MolHash(&m3, MolHash::HashFunction::HetAtomTautomerv2); + CHECK(hsh2 == "[CH2]=[CH]-[C]:[C]:[O]_3_0"); + } + } } \ No newline at end of file diff --git a/Code/GraphMol/MolHash/hashfunctions.cpp b/Code/GraphMol/MolHash/hashfunctions.cpp index 97463eefd..950c0ac0e 100644 --- a/Code/GraphMol/MolHash/hashfunctions.cpp +++ b/Code/GraphMol/MolHash/hashfunctions.cpp @@ -384,9 +384,12 @@ std::string MesomerHash(RWMol *mol, bool netq, bool useCXSmiles, } namespace details { - +constexpr std::uint64_t atomFlagProperty = + 1; //*< atom has the exclude property constexpr std::uint64_t bondFlagCarboxyl = 1; //*< bond involved in in carboxyl, amide, etc. +constexpr std::uint64_t bondFlagProperty = + 2; //*< bond has the exclude property std::vector getBondFlags(const ROMol &mol) { // FIX: oversimplified, but should work for now static std::vector patterns{ @@ -412,7 +415,14 @@ std::vector getBondFlags(const ROMol &mol) { } } - std::vector bondFlags(mol.getNumBonds(), 0); + std::vector bondFlags; + bondFlags.reserve(mol.getNumBonds()); + std::ranges::transform( + mol.bonds(), std::back_inserter(bondFlags), [](const auto &bnd) { + return bnd->hasProp(MolHash::excludeFromTautomerismProp) + ? details::bondFlagProperty + : 0; + }); for (const auto &qry : queries) { auto matches = SubstructMatch(mol, *qry); for (const auto &match : matches) { @@ -466,7 +476,8 @@ bool isPossibleTautomericBond(const Bond *bptr, bool isPossibleStartingBond(const Bond *bptr, const std::vector &atomFlags, const std::vector &bondFlags) { - if (bondFlags[bptr->getIdx()]) { + if (bondFlags[bptr->getIdx()] || atomFlags[bptr->getBeginAtom()->getIdx()] || + atomFlags[bptr->getEndAtom()->getIdx()]) { return false; } auto heteroBeg = isHeteroAtom(bptr->getBeginAtom()) && @@ -507,7 +518,7 @@ bool hasStartBond(const Atom *aptr, const boost::dynamic_bitset<> &startBonds) { // (indicating the stereocenter could become sp2 in a tautomer). // Aromatic atoms shouldn't pull in substituent stereocenters. bool shouldSkipChiralNeighbor(const Atom *sourceAtom, const Atom *targetAtom, - const Bond *connectingBond) { + const Bond *connectingBond) { if (targetAtom->getHybridization() != Atom::SP3 || targetAtom->getTotalNumHs() != 1) { return false; @@ -532,7 +543,8 @@ bool skipNeighborBond(const Atom *atom, const Atom *otherAtom, const boost::dynamic_bitset<> &startBonds, const std::vector &atomFlags, const std::vector &bondFlags) { - if (bondFlags[nbrBond->getIdx()]) { + if (bondFlags[nbrBond->getIdx()] || atomFlags[otherAtom->getIdx()] || + atomFlags[atom->getIdx()]) { return true; } @@ -545,7 +557,7 @@ bool skipNeighborBond(const Atom *atom, const Atom *otherAtom, if (!nbrBond->getIsAromatic()) { const Atom *aromaticAtom = nullptr; const Atom *nonAromaticAtom = nullptr; - + if (atom->getIsAromatic() && !otherAtom->getIsAromatic()) { aromaticAtom = atom; nonAromaticAtom = otherAtom; @@ -553,7 +565,7 @@ bool skipNeighborBond(const Atom *atom, const Atom *otherAtom, aromaticAtom = otherAtom; nonAromaticAtom = atom; } - + // If we have an aromatic->non-aromatic transition if (aromaticAtom && nonAromaticAtom) { // Aromatic atom must have no H and be carbon @@ -561,7 +573,8 @@ bool skipNeighborBond(const Atom *atom, const Atom *otherAtom, aromaticAtom->getAtomicNum() == 6 && nonAromaticAtom->getAtomicNum() == 6) { // Check if nonAromaticAtom is part of a C=C system - for (const auto &bnd : atom->getOwningMol().atomBonds(nonAromaticAtom)) { + for (const auto &bnd : + atom->getOwningMol().atomBonds(nonAromaticAtom)) { if (bnd->getBondType() == Bond::BondType::DOUBLE && !bnd->getIsAromatic()) { auto otherEnd = bnd->getOtherAtom(nonAromaticAtom); @@ -627,9 +640,14 @@ std::string TautomerHashv2(RWMol *mol, bool proto, bool useCXSmiles, unsigned int hcount = 0; int charge = 0; - // we aren't current doing anything with atomFlags, but we have added them in - // analogy to the bondFlags as a kind of future proofing. - std::vector atomFlags(mol->getNumAtoms(), 0); + std::vector atomFlags; + atomFlags.reserve(mol->getNumAtoms()); + std::ranges::transform(mol->atoms(), std::back_inserter(atomFlags), + [](const auto &atom) { + return atom->hasProp(excludeFromTautomerismProp) + ? details::atomFlagProperty + : 0; + }); auto bondFlags = details::getBondFlags(*mol); boost::dynamic_bitset<> bondsToModify(mol->getNumBonds()); @@ -922,10 +940,10 @@ std::string TautomerHashv2(RWMol *mol, bool proto, bool useCXSmiles, if (!bondsToModify[bptr->getIdx()]) { continue; } - // Preserve E/Z stereo on exocyclic double bonds (one atom in ring, one not) - // to avoid merging distinct geometric isomers (e.g., E/Z hydrazones). - // For these bonds, keep them as DOUBLE bonds (not AROMATIC) so that - // E/Z stereo is preserved in the SMILES output. + // Preserve E/Z stereo on exocyclic double bonds (one atom in ring, one + // not) to avoid merging distinct geometric isomers (e.g., E/Z + // hydrazones). For these bonds, keep them as DOUBLE bonds (not AROMATIC) + // so that E/Z stereo is preserved in the SMILES output. bool isExocyclicWithStereo = false; if (bptr->getStereo() != Bond::BondStereo::STEREONONE) { bool beginInRing = queryIsAtomInRing(bptr->getBeginAtom()); diff --git a/Code/GraphMol/MolHash/nmmolhash.h b/Code/GraphMol/MolHash/nmmolhash.h index e0e5ca2be..e138ba5c1 100644 --- a/Code/GraphMol/MolHash/nmmolhash.h +++ b/Code/GraphMol/MolHash/nmmolhash.h @@ -22,6 +22,11 @@ namespace RDKit { class RWMol; namespace MolHash { +// atoms or bonds with this property set (value not important) will not be +// included in tautomerism when using the HetAtomTautomerv2 or +// HetAtomProtomerv2 hash functions. +constexpr const char *excludeFromTautomerismProp = + "_MolHashExcludeFromTautomerism"; enum class HashFunction { AnonymousGraph = 1, ElementGraph = 2,