From 61981921d18ba11dad32cd1b3c14f6cf30af2069 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eloy=20F=C3=A9lix?= Date: Fri, 24 Apr 2026 14:19:47 +0200 Subject: [PATCH] Tautomer insensitive hash v2, E/Z and stereocenter-preservation (#9128) * Tautomer insensitive hash v2, E/Z and stereocenter-preservation * Preserve E/Z stereochemistry and stereocenters in TautomerHashv2 Simplify extension logic to better protect stereocenters connected via single bonds to aromatic systems. Preserve E/Z stereo on exocyclic double bonds to distinguish geometric isomers (e.g., E/Z hydrazones). * add helper function to remove duplicated code * Fix ring info and bond aromaticity handling in MolHash - Add fastFindRings check in TautomerHashv2 before ring queries - Set isAromatic consistent with bond type (true for AROMATIC bonds) - Fix inverted condition in RegioisomerHash * more consistent hashes regardless of stereo annotation --- Code/GraphMol/MolHash/catch_tests.cpp | 42 +++++++-- Code/GraphMol/MolHash/hashfunctions.cpp | 120 ++++++++++++++++++++++-- 2 files changed, 144 insertions(+), 18 deletions(-) diff --git a/Code/GraphMol/MolHash/catch_tests.cpp b/Code/GraphMol/MolHash/catch_tests.cpp index 15e119869..a97e4db2b 100644 --- a/Code/GraphMol/MolHash/catch_tests.cpp +++ b/Code/GraphMol/MolHash/catch_tests.cpp @@ -471,15 +471,41 @@ TEST_CASE("tautomer v2") { "C1C=CC=C2C=C(O)NC=12", }}, // --------------------------- - // some areas for potential improvement + // E/Z isomers with heteroaromatic rings // --------------------------- - // these two are tautomers, but the current algorithm does not catch - // them - // problematic because pyridine is recognized as tautomeric - {{"c1ccccc1/C=C/c1ncccc1", "c1ccccc1/C=C\\c1ncccc1", - "c1ccccc1C=Cc1ncccc1"}, - {}}, - }; + // Stilbene with pyridyl: E and Z are NOT tautomers and should have + // different hashes. Previously the algorithm incorrectly treated + // aromatic heteroatoms like pyridine N as tautomeric candidates, + // causing stereo to be stripped. + {{"c1ccccc1/C=C/c1ncccc1"}, // in ChEMBL (CHEMBL1877619) + {"c1ccccc1/C=C\\c1ncccc1", "c1ccccc1C=Cc1ncccc1"}}, + // 5-benzylidenerhodanine: E/Z isomers are NOT tautomers and should + // have different hashes. The exocyclic C=C to phenyl should + // preserve stereochemistry. + {{"O=C1NC(=S)S/C1=C/c2ccccc2"}, // in ChEMBL (CHEMBL4796170) + {"O=C1NC(=S)S/C1=C\\c2ccccc2", "O=C1NC(=S)SC1=Cc2ccccc2"}}, + // E/Z hydrazones with exocyclic C=N to a ring: E and Z isomers + // are NOT tautomers and should have different hashes. + {{"c1ccccc1N/N=C2\\CCCCC2C"}, // in SureChEMBL (11696321) + {"c1ccccc1N/N=C2/CCCCC2C", + "c1ccccc1NN=C2CCCCC2C"}}, + // --------------------------- + // stereocenters near amide bonds should not be destroyed + // by extension through flagged bonds + // --------------------------- + // proline-like stereocenter between two amide C=O groups + {{"NC(=O)[C@H]1CCCN1C=O"}, + {"NC(=O)[C@@H]1CCCN1C=O"}}, // in SureChEMBL (8959051) + // stereocenters near amide bonds on pyrrolidine ring + {{"CC(=O)N[C@H]1CCNC1"}, {"CC(=O)N[C@@H]1CCNC1", "CC(=O)NC1CCNC1"}}, // in SureChEMBL (39850) + // stereocenter adjacent to pyrimidine/pyrazole: enantiomers should differ + // (stereocenter connected via single non-conjugated N-C bond) + {{"C[C@H](c1ccccc1)Nc2ncc(c(n2)Nc3cc([nH]n3)C4CC4)Cl"}, // in SureChEMBL (4072338) + {"C[C@@H](c1ccccc1)Nc2ncc(c(n2)Nc3cc([nH]n3)C4CC4)Cl"}}, + // diastereomers on indole ring: aromatic C should not pull in stereocenters + {{"C[C@@H]1Cc2c3ccccc3[nH]c2[C@@H](N1CC(F)(F)F)c4cc(ccc4Cl)OCCNCCCF"}, + {"C[C@@H]1Cc2c3ccccc3[nH]c2[C@H](N1CC(F)(F)F)c4cc(ccc4Cl)OCCNCCCF"}}, // in ChEMBL CHEMBL5972799 + }; for (const auto &[same, diff] : data) { std::unique_ptr m{SmilesToMol(same[0])}; REQUIRE(m); diff --git a/Code/GraphMol/MolHash/hashfunctions.cpp b/Code/GraphMol/MolHash/hashfunctions.cpp index 5a8d5e3f7..97463eefd 100644 --- a/Code/GraphMol/MolHash/hashfunctions.cpp +++ b/Code/GraphMol/MolHash/hashfunctions.cpp @@ -461,7 +461,7 @@ bool isPossibleTautomericBond(const Bond *bptr, isCandidateAtom(bptr->getEndAtom(), atomFlags); } -// a bond is a possible starting bond if it involves a candidate hetereoatom +// a bond is a possible starting bond if it involves a candidate heteroatom // (definition above) and an unsaturated atom bool isPossibleStartingBond(const Bond *bptr, const std::vector &atomFlags, @@ -502,6 +502,28 @@ bool hasStartBond(const Atom *aptr, const boost::dynamic_bitset<> &startBonds) { return false; } +// Don't extend to a stereocenter via single non-conjugated bonds, +// unless the source atom has a non-aromatic unsaturated bond +// (indicating the stereocenter could become sp2 in a tautomer). +// Aromatic atoms shouldn't pull in substituent stereocenters. +bool shouldSkipChiralNeighbor(const Atom *sourceAtom, const Atom *targetAtom, + const Bond *connectingBond) { + if (targetAtom->getHybridization() != Atom::SP3 || + targetAtom->getTotalNumHs() != 1) { + return false; + } + if (isUnsaturatedBond(connectingBond) || connectingBond->getIsConjugated()) { + return false; + } + // Check if source has a non-aromatic unsaturated bond + for (const auto &srcBnd : sourceAtom->getOwningMol().atomBonds(sourceAtom)) { + if (!srcBnd->getIsAromatic() && isUnsaturatedBond(srcBnd)) { + return false; // Source could participate in tautomerism + } + } + return true; // Skip this chiral neighbor +} + // skip the neighbor bond if otherAtom isn't a candidate and doesn't have a // start bond OR if the bond is neither unsaturated nor conjugated and atom // doesn't have a start bond @@ -510,11 +532,53 @@ bool skipNeighborBond(const Atom *atom, const Atom *otherAtom, const boost::dynamic_bitset<> &startBonds, const std::vector &atomFlags, const std::vector &bondFlags) { - return (bondFlags[nbrBond->getIdx()] || - ((!isCandidateAtom(otherAtom, atomFlags) && - !hasStartBond(otherAtom, startBonds)) || - (!isUnsaturatedBond(nbrBond) && !nbrBond->getIsConjugated() && - !hasStartBond(atom, startBonds)))); + if (bondFlags[nbrBond->getIdx()]) { + return true; + } + + // Special case: prevent extending from an aromatic carbon (with no H) + // to an exocyclic C=C system. This handles cases like stilbene-pyridyl + // (c1ccccc1/C=C/c1ncccc1) where the aromatic C has no mobile H but the + // algorithm would otherwise extend to the exocyclic C=C and destroy E/Z. + // Check BOTH directions: either the aromatic atom extending to vinyl, + // or the vinyl atom being connected from aromatic. + if (!nbrBond->getIsAromatic()) { + const Atom *aromaticAtom = nullptr; + const Atom *nonAromaticAtom = nullptr; + + if (atom->getIsAromatic() && !otherAtom->getIsAromatic()) { + aromaticAtom = atom; + nonAromaticAtom = otherAtom; + } else if (!atom->getIsAromatic() && otherAtom->getIsAromatic()) { + aromaticAtom = otherAtom; + nonAromaticAtom = atom; + } + + // If we have an aromatic->non-aromatic transition + if (aromaticAtom && nonAromaticAtom) { + // Aromatic atom must have no H and be carbon + if (aromaticAtom->getTotalNumHs() == 0 && + aromaticAtom->getAtomicNum() == 6 && + nonAromaticAtom->getAtomicNum() == 6) { + // Check if nonAromaticAtom is part of a C=C system + for (const auto &bnd : atom->getOwningMol().atomBonds(nonAromaticAtom)) { + if (bnd->getBondType() == Bond::BondType::DOUBLE && + !bnd->getIsAromatic()) { + auto otherEnd = bnd->getOtherAtom(nonAromaticAtom); + if (otherEnd->getAtomicNum() == 6) { + // This is a C=C double bond - skip extending + return true; + } + } + } + } + } + } + + return ((!isCandidateAtom(otherAtom, atomFlags) && + !hasStartBond(otherAtom, startBonds)) || + (!isUnsaturatedBond(nbrBond) && !nbrBond->getIsConjugated() && + !hasStartBond(atom, startBonds))); } // counts the number of neighboring atoms that have conjugated bonds. @@ -556,6 +620,9 @@ bool checkForOverreach( std::string TautomerHashv2(RWMol *mol, bool proto, bool useCXSmiles, unsigned cxFlagsToSkip = 0) { PRECONDITION(mol, "bad molecule"); + if (!mol->getRingInfo()->isFindFastOrBetter()) { + MolOps::fastFindRings(*mol); + } std::string result; unsigned int hcount = 0; int charge = 0; @@ -630,6 +697,9 @@ std::string TautomerHashv2(RWMol *mol, bool proto, bool useCXSmiles, numConjugatedNeighbors)) { continue; } + if (shouldSkipChiralNeighbor(atm, oatom, nbrBond)) { + continue; + } // if the bond is not eligible, then we can skip this neighbor if (skipNeighborBond(atm, oatom, nbrBond, startBonds, atomFlags, @@ -711,6 +781,9 @@ std::string TautomerHashv2(RWMol *mol, bool proto, bool useCXSmiles, numConjugatedNeighbors)) { continue; } + if (shouldSkipChiralNeighbor(atm, oatom, nbrBnd)) { + continue; + } if ((skipNeighborBond(atm, oatom, nbrBnd, startBonds, atomFlags, bondFlags) && skipNeighborBond(oatom, atm, nbrBnd, startBonds, atomFlags, @@ -806,6 +879,19 @@ std::string TautomerHashv2(RWMol *mol, bool proto, bool useCXSmiles, if (!oatom->getTotalNumHs()) { continue; } + // don't extend to reach a potential stereocenter — doing so would + // incorrectly pull the stereocenter into the tautomeric system and + // destroy its chirality. Use structural criteria (SP3 + 1H) for + // chain atoms so behavior is consistent regardless of annotation. + // For ring atoms, only skip if chirality is actually annotated, + // since ring SP3+1H atoms often genuinely participate in ring + // tautomerism (e.g. purine NH, glutarimide). + if (oatom->getHybridization() == Atom::SP3 && + oatom->getTotalNumHs() == 1 && + (!queryIsAtomInRing(oatom) || + oatom->getChiralTag() != Atom::CHI_UNSPECIFIED)) { + continue; + } unsigned int numModifiedNeighbors = 0; for (const auto nbr : mol->atomNeighbors(oatom)) { if (nbr == atm) { @@ -836,9 +922,23 @@ std::string TautomerHashv2(RWMol *mol, bool proto, bool useCXSmiles, if (!bondsToModify[bptr->getIdx()]) { continue; } - bptr->setIsAromatic(false); - bptr->setBondType(Bond::AROMATIC); - bptr->setStereo(Bond::BondStereo::STEREONONE); + // Preserve E/Z stereo on exocyclic double bonds (one atom in ring, one not) + // to avoid merging distinct geometric isomers (e.g., E/Z hydrazones). + // For these bonds, keep them as DOUBLE bonds (not AROMATIC) so that + // E/Z stereo is preserved in the SMILES output. + bool isExocyclicWithStereo = false; + if (bptr->getStereo() != Bond::BondStereo::STEREONONE) { + bool beginInRing = queryIsAtomInRing(bptr->getBeginAtom()); + bool endInRing = queryIsAtomInRing(bptr->getEndAtom()); + isExocyclicWithStereo = (beginInRing != endInRing); + } + if (!isExocyclicWithStereo) { + bptr->setBondType(Bond::AROMATIC); + bptr->setIsAromatic(true); // Must be consistent with bond type + bptr->setStereo(Bond::BondStereo::STEREONONE); + } else { + bptr->setIsAromatic(false); + } atomsToModify.set(bptr->getBeginAtomIdx()); atomsToModify.set(bptr->getEndAtomIdx()); } @@ -1220,7 +1320,7 @@ std::string RegioisomerHash(RWMol *mol, bool useCXSmiles, // we need a copy of the molecule so that we can loop over the bonds of // something while modifying something else RDKit::ROMol molcpy(*mol); - if (molcpy.getRingInfo()->isFindFastOrBetter()) { + if (!molcpy.getRingInfo()->isFindFastOrBetter()) { MolOps::fastFindRings(molcpy); } for (int i = molcpy.getNumBonds() - 1; i >= 0; --i) {