Tautomer insensitive hash v2, E/Z and stereocenter-preservation (#9128)

* Tautomer insensitive hash v2, E/Z and stereocenter-preservation

* Preserve E/Z stereochemistry and stereocenters in TautomerHashv2

Simplify extension logic to better protect stereocenters connected via
single bonds to aromatic systems. Preserve E/Z stereo on exocyclic
double bonds to distinguish geometric isomers (e.g., E/Z hydrazones).

* add helper function to remove duplicated code

* Fix ring info and bond aromaticity handling in MolHash

- Add fastFindRings check in TautomerHashv2 before ring queries
- Set isAromatic consistent with bond type (true for AROMATIC bonds)
- Fix inverted condition in RegioisomerHash

* more consistent hashes regardless of stereo annotation
This commit is contained in:
Eloy Félix
2026-04-24 14:19:47 +02:00
committed by GitHub
parent 6cac6afcb3
commit cb251343b9
2 changed files with 144 additions and 18 deletions

View File

@@ -471,15 +471,41 @@ TEST_CASE("tautomer v2") {
"C1C=CC=C2C=C(O)NC=12",
}},
// ---------------------------
// some areas for potential improvement
// E/Z isomers with heteroaromatic rings
// ---------------------------
// these two are tautomers, but the current algorithm does not catch
// them
// problematic because pyridine is recognized as tautomeric
{{"c1ccccc1/C=C/c1ncccc1", "c1ccccc1/C=C\\c1ncccc1",
"c1ccccc1C=Cc1ncccc1"},
{}},
};
// Stilbene with pyridyl: E and Z are NOT tautomers and should have
// different hashes. Previously the algorithm incorrectly treated
// aromatic heteroatoms like pyridine N as tautomeric candidates,
// causing stereo to be stripped.
{{"c1ccccc1/C=C/c1ncccc1"}, // in ChEMBL (CHEMBL1877619)
{"c1ccccc1/C=C\\c1ncccc1", "c1ccccc1C=Cc1ncccc1"}},
// 5-benzylidenerhodanine: E/Z isomers are NOT tautomers and should
// have different hashes. The exocyclic C=C to phenyl should
// preserve stereochemistry.
{{"O=C1NC(=S)S/C1=C/c2ccccc2"}, // in ChEMBL (CHEMBL4796170)
{"O=C1NC(=S)S/C1=C\\c2ccccc2", "O=C1NC(=S)SC1=Cc2ccccc2"}},
// E/Z hydrazones with exocyclic C=N to a ring: E and Z isomers
// are NOT tautomers and should have different hashes.
{{"c1ccccc1N/N=C2\\CCCCC2C"}, // in SureChEMBL (11696321)
{"c1ccccc1N/N=C2/CCCCC2C",
"c1ccccc1NN=C2CCCCC2C"}},
// ---------------------------
// stereocenters near amide bonds should not be destroyed
// by extension through flagged bonds
// ---------------------------
// proline-like stereocenter between two amide C=O groups
{{"NC(=O)[C@H]1CCCN1C=O"},
{"NC(=O)[C@@H]1CCCN1C=O"}}, // in SureChEMBL (8959051)
// stereocenters near amide bonds on pyrrolidine ring
{{"CC(=O)N[C@H]1CCNC1"}, {"CC(=O)N[C@@H]1CCNC1", "CC(=O)NC1CCNC1"}}, // in SureChEMBL (39850)
// stereocenter adjacent to pyrimidine/pyrazole: enantiomers should differ
// (stereocenter connected via single non-conjugated N-C bond)
{{"C[C@H](c1ccccc1)Nc2ncc(c(n2)Nc3cc([nH]n3)C4CC4)Cl"}, // in SureChEMBL (4072338)
{"C[C@@H](c1ccccc1)Nc2ncc(c(n2)Nc3cc([nH]n3)C4CC4)Cl"}},
// diastereomers on indole ring: aromatic C should not pull in stereocenters
{{"C[C@@H]1Cc2c3ccccc3[nH]c2[C@@H](N1CC(F)(F)F)c4cc(ccc4Cl)OCCNCCCF"},
{"C[C@@H]1Cc2c3ccccc3[nH]c2[C@H](N1CC(F)(F)F)c4cc(ccc4Cl)OCCNCCCF"}}, // in ChEMBL CHEMBL5972799
};
for (const auto &[same, diff] : data) {
std::unique_ptr<RWMol> m{SmilesToMol(same[0])};
REQUIRE(m);

View File

@@ -461,7 +461,7 @@ bool isPossibleTautomericBond(const Bond *bptr,
isCandidateAtom(bptr->getEndAtom(), atomFlags);
}
// a bond is a possible starting bond if it involves a candidate hetereoatom
// a bond is a possible starting bond if it involves a candidate heteroatom
// (definition above) and an unsaturated atom
bool isPossibleStartingBond(const Bond *bptr,
const std::vector<std::uint64_t> &atomFlags,
@@ -502,6 +502,28 @@ bool hasStartBond(const Atom *aptr, const boost::dynamic_bitset<> &startBonds) {
return false;
}
// Don't extend to a stereocenter via single non-conjugated bonds,
// unless the source atom has a non-aromatic unsaturated bond
// (indicating the stereocenter could become sp2 in a tautomer).
// Aromatic atoms shouldn't pull in substituent stereocenters.
bool shouldSkipChiralNeighbor(const Atom *sourceAtom, const Atom *targetAtom,
const Bond *connectingBond) {
if (targetAtom->getHybridization() != Atom::SP3 ||
targetAtom->getTotalNumHs() != 1) {
return false;
}
if (isUnsaturatedBond(connectingBond) || connectingBond->getIsConjugated()) {
return false;
}
// Check if source has a non-aromatic unsaturated bond
for (const auto &srcBnd : sourceAtom->getOwningMol().atomBonds(sourceAtom)) {
if (!srcBnd->getIsAromatic() && isUnsaturatedBond(srcBnd)) {
return false; // Source could participate in tautomerism
}
}
return true; // Skip this chiral neighbor
}
// skip the neighbor bond if otherAtom isn't a candidate and doesn't have a
// start bond OR if the bond is neither unsaturated nor conjugated and atom
// doesn't have a start bond
@@ -510,11 +532,53 @@ bool skipNeighborBond(const Atom *atom, const Atom *otherAtom,
const boost::dynamic_bitset<> &startBonds,
const std::vector<std::uint64_t> &atomFlags,
const std::vector<std::uint64_t> &bondFlags) {
return (bondFlags[nbrBond->getIdx()] ||
((!isCandidateAtom(otherAtom, atomFlags) &&
!hasStartBond(otherAtom, startBonds)) ||
(!isUnsaturatedBond(nbrBond) && !nbrBond->getIsConjugated() &&
!hasStartBond(atom, startBonds))));
if (bondFlags[nbrBond->getIdx()]) {
return true;
}
// Special case: prevent extending from an aromatic carbon (with no H)
// to an exocyclic C=C system. This handles cases like stilbene-pyridyl
// (c1ccccc1/C=C/c1ncccc1) where the aromatic C has no mobile H but the
// algorithm would otherwise extend to the exocyclic C=C and destroy E/Z.
// Check BOTH directions: either the aromatic atom extending to vinyl,
// or the vinyl atom being connected from aromatic.
if (!nbrBond->getIsAromatic()) {
const Atom *aromaticAtom = nullptr;
const Atom *nonAromaticAtom = nullptr;
if (atom->getIsAromatic() && !otherAtom->getIsAromatic()) {
aromaticAtom = atom;
nonAromaticAtom = otherAtom;
} else if (!atom->getIsAromatic() && otherAtom->getIsAromatic()) {
aromaticAtom = otherAtom;
nonAromaticAtom = atom;
}
// If we have an aromatic->non-aromatic transition
if (aromaticAtom && nonAromaticAtom) {
// Aromatic atom must have no H and be carbon
if (aromaticAtom->getTotalNumHs() == 0 &&
aromaticAtom->getAtomicNum() == 6 &&
nonAromaticAtom->getAtomicNum() == 6) {
// Check if nonAromaticAtom is part of a C=C system
for (const auto &bnd : atom->getOwningMol().atomBonds(nonAromaticAtom)) {
if (bnd->getBondType() == Bond::BondType::DOUBLE &&
!bnd->getIsAromatic()) {
auto otherEnd = bnd->getOtherAtom(nonAromaticAtom);
if (otherEnd->getAtomicNum() == 6) {
// This is a C=C double bond - skip extending
return true;
}
}
}
}
}
}
return ((!isCandidateAtom(otherAtom, atomFlags) &&
!hasStartBond(otherAtom, startBonds)) ||
(!isUnsaturatedBond(nbrBond) && !nbrBond->getIsConjugated() &&
!hasStartBond(atom, startBonds)));
}
// counts the number of neighboring atoms that have conjugated bonds.
@@ -556,6 +620,9 @@ bool checkForOverreach(
std::string TautomerHashv2(RWMol *mol, bool proto, bool useCXSmiles,
unsigned cxFlagsToSkip = 0) {
PRECONDITION(mol, "bad molecule");
if (!mol->getRingInfo()->isFindFastOrBetter()) {
MolOps::fastFindRings(*mol);
}
std::string result;
unsigned int hcount = 0;
int charge = 0;
@@ -630,6 +697,9 @@ std::string TautomerHashv2(RWMol *mol, bool proto, bool useCXSmiles,
numConjugatedNeighbors)) {
continue;
}
if (shouldSkipChiralNeighbor(atm, oatom, nbrBond)) {
continue;
}
// if the bond is not eligible, then we can skip this neighbor
if (skipNeighborBond(atm, oatom, nbrBond, startBonds, atomFlags,
@@ -711,6 +781,9 @@ std::string TautomerHashv2(RWMol *mol, bool proto, bool useCXSmiles,
numConjugatedNeighbors)) {
continue;
}
if (shouldSkipChiralNeighbor(atm, oatom, nbrBnd)) {
continue;
}
if ((skipNeighborBond(atm, oatom, nbrBnd, startBonds, atomFlags,
bondFlags) &&
skipNeighborBond(oatom, atm, nbrBnd, startBonds, atomFlags,
@@ -806,6 +879,19 @@ std::string TautomerHashv2(RWMol *mol, bool proto, bool useCXSmiles,
if (!oatom->getTotalNumHs()) {
continue;
}
// don't extend to reach a potential stereocenter — doing so would
// incorrectly pull the stereocenter into the tautomeric system and
// destroy its chirality. Use structural criteria (SP3 + 1H) for
// chain atoms so behavior is consistent regardless of annotation.
// For ring atoms, only skip if chirality is actually annotated,
// since ring SP3+1H atoms often genuinely participate in ring
// tautomerism (e.g. purine NH, glutarimide).
if (oatom->getHybridization() == Atom::SP3 &&
oatom->getTotalNumHs() == 1 &&
(!queryIsAtomInRing(oatom) ||
oatom->getChiralTag() != Atom::CHI_UNSPECIFIED)) {
continue;
}
unsigned int numModifiedNeighbors = 0;
for (const auto nbr : mol->atomNeighbors(oatom)) {
if (nbr == atm) {
@@ -836,9 +922,23 @@ std::string TautomerHashv2(RWMol *mol, bool proto, bool useCXSmiles,
if (!bondsToModify[bptr->getIdx()]) {
continue;
}
bptr->setIsAromatic(false);
bptr->setBondType(Bond::AROMATIC);
bptr->setStereo(Bond::BondStereo::STEREONONE);
// Preserve E/Z stereo on exocyclic double bonds (one atom in ring, one not)
// to avoid merging distinct geometric isomers (e.g., E/Z hydrazones).
// For these bonds, keep them as DOUBLE bonds (not AROMATIC) so that
// E/Z stereo is preserved in the SMILES output.
bool isExocyclicWithStereo = false;
if (bptr->getStereo() != Bond::BondStereo::STEREONONE) {
bool beginInRing = queryIsAtomInRing(bptr->getBeginAtom());
bool endInRing = queryIsAtomInRing(bptr->getEndAtom());
isExocyclicWithStereo = (beginInRing != endInRing);
}
if (!isExocyclicWithStereo) {
bptr->setBondType(Bond::AROMATIC);
bptr->setIsAromatic(true); // Must be consistent with bond type
bptr->setStereo(Bond::BondStereo::STEREONONE);
} else {
bptr->setIsAromatic(false);
}
atomsToModify.set(bptr->getBeginAtomIdx());
atomsToModify.set(bptr->getEndAtomIdx());
}
@@ -1220,7 +1320,7 @@ std::string RegioisomerHash(RWMol *mol, bool useCXSmiles,
// we need a copy of the molecule so that we can loop over the bonds of
// something while modifying something else
RDKit::ROMol molcpy(*mol);
if (molcpy.getRingInfo()->isFindFastOrBetter()) {
if (!molcpy.getRingInfo()->isFindFastOrBetter()) {
MolOps::fastFindRings(molcpy);
}
for (int i = molcpy.getNumBonds() - 1; i >= 0; --i) {