mirror of
https://github.com/rdkit/rdkit.git
synced 2026-06-03 21:44:30 +08:00
Tautomer insensitive hash v2, E/Z and stereocenter-preservation (#9128)
* Tautomer insensitive hash v2, E/Z and stereocenter-preservation * Preserve E/Z stereochemistry and stereocenters in TautomerHashv2 Simplify extension logic to better protect stereocenters connected via single bonds to aromatic systems. Preserve E/Z stereo on exocyclic double bonds to distinguish geometric isomers (e.g., E/Z hydrazones). * add helper function to remove duplicated code * Fix ring info and bond aromaticity handling in MolHash - Add fastFindRings check in TautomerHashv2 before ring queries - Set isAromatic consistent with bond type (true for AROMATIC bonds) - Fix inverted condition in RegioisomerHash * more consistent hashes regardless of stereo annotation
This commit is contained in:
@@ -471,15 +471,41 @@ TEST_CASE("tautomer v2") {
|
||||
"C1C=CC=C2C=C(O)NC=12",
|
||||
}},
|
||||
// ---------------------------
|
||||
// some areas for potential improvement
|
||||
// E/Z isomers with heteroaromatic rings
|
||||
// ---------------------------
|
||||
// these two are tautomers, but the current algorithm does not catch
|
||||
// them
|
||||
// problematic because pyridine is recognized as tautomeric
|
||||
{{"c1ccccc1/C=C/c1ncccc1", "c1ccccc1/C=C\\c1ncccc1",
|
||||
"c1ccccc1C=Cc1ncccc1"},
|
||||
{}},
|
||||
};
|
||||
// Stilbene with pyridyl: E and Z are NOT tautomers and should have
|
||||
// different hashes. Previously the algorithm incorrectly treated
|
||||
// aromatic heteroatoms like pyridine N as tautomeric candidates,
|
||||
// causing stereo to be stripped.
|
||||
{{"c1ccccc1/C=C/c1ncccc1"}, // in ChEMBL (CHEMBL1877619)
|
||||
{"c1ccccc1/C=C\\c1ncccc1", "c1ccccc1C=Cc1ncccc1"}},
|
||||
// 5-benzylidenerhodanine: E/Z isomers are NOT tautomers and should
|
||||
// have different hashes. The exocyclic C=C to phenyl should
|
||||
// preserve stereochemistry.
|
||||
{{"O=C1NC(=S)S/C1=C/c2ccccc2"}, // in ChEMBL (CHEMBL4796170)
|
||||
{"O=C1NC(=S)S/C1=C\\c2ccccc2", "O=C1NC(=S)SC1=Cc2ccccc2"}},
|
||||
// E/Z hydrazones with exocyclic C=N to a ring: E and Z isomers
|
||||
// are NOT tautomers and should have different hashes.
|
||||
{{"c1ccccc1N/N=C2\\CCCCC2C"}, // in SureChEMBL (11696321)
|
||||
{"c1ccccc1N/N=C2/CCCCC2C",
|
||||
"c1ccccc1NN=C2CCCCC2C"}},
|
||||
// ---------------------------
|
||||
// stereocenters near amide bonds should not be destroyed
|
||||
// by extension through flagged bonds
|
||||
// ---------------------------
|
||||
// proline-like stereocenter between two amide C=O groups
|
||||
{{"NC(=O)[C@H]1CCCN1C=O"},
|
||||
{"NC(=O)[C@@H]1CCCN1C=O"}}, // in SureChEMBL (8959051)
|
||||
// stereocenters near amide bonds on pyrrolidine ring
|
||||
{{"CC(=O)N[C@H]1CCNC1"}, {"CC(=O)N[C@@H]1CCNC1", "CC(=O)NC1CCNC1"}}, // in SureChEMBL (39850)
|
||||
// stereocenter adjacent to pyrimidine/pyrazole: enantiomers should differ
|
||||
// (stereocenter connected via single non-conjugated N-C bond)
|
||||
{{"C[C@H](c1ccccc1)Nc2ncc(c(n2)Nc3cc([nH]n3)C4CC4)Cl"}, // in SureChEMBL (4072338)
|
||||
{"C[C@@H](c1ccccc1)Nc2ncc(c(n2)Nc3cc([nH]n3)C4CC4)Cl"}},
|
||||
// diastereomers on indole ring: aromatic C should not pull in stereocenters
|
||||
{{"C[C@@H]1Cc2c3ccccc3[nH]c2[C@@H](N1CC(F)(F)F)c4cc(ccc4Cl)OCCNCCCF"},
|
||||
{"C[C@@H]1Cc2c3ccccc3[nH]c2[C@H](N1CC(F)(F)F)c4cc(ccc4Cl)OCCNCCCF"}}, // in ChEMBL CHEMBL5972799
|
||||
};
|
||||
for (const auto &[same, diff] : data) {
|
||||
std::unique_ptr<RWMol> m{SmilesToMol(same[0])};
|
||||
REQUIRE(m);
|
||||
|
||||
@@ -461,7 +461,7 @@ bool isPossibleTautomericBond(const Bond *bptr,
|
||||
isCandidateAtom(bptr->getEndAtom(), atomFlags);
|
||||
}
|
||||
|
||||
// a bond is a possible starting bond if it involves a candidate hetereoatom
|
||||
// a bond is a possible starting bond if it involves a candidate heteroatom
|
||||
// (definition above) and an unsaturated atom
|
||||
bool isPossibleStartingBond(const Bond *bptr,
|
||||
const std::vector<std::uint64_t> &atomFlags,
|
||||
@@ -502,6 +502,28 @@ bool hasStartBond(const Atom *aptr, const boost::dynamic_bitset<> &startBonds) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Don't extend to a stereocenter via single non-conjugated bonds,
|
||||
// unless the source atom has a non-aromatic unsaturated bond
|
||||
// (indicating the stereocenter could become sp2 in a tautomer).
|
||||
// Aromatic atoms shouldn't pull in substituent stereocenters.
|
||||
bool shouldSkipChiralNeighbor(const Atom *sourceAtom, const Atom *targetAtom,
|
||||
const Bond *connectingBond) {
|
||||
if (targetAtom->getHybridization() != Atom::SP3 ||
|
||||
targetAtom->getTotalNumHs() != 1) {
|
||||
return false;
|
||||
}
|
||||
if (isUnsaturatedBond(connectingBond) || connectingBond->getIsConjugated()) {
|
||||
return false;
|
||||
}
|
||||
// Check if source has a non-aromatic unsaturated bond
|
||||
for (const auto &srcBnd : sourceAtom->getOwningMol().atomBonds(sourceAtom)) {
|
||||
if (!srcBnd->getIsAromatic() && isUnsaturatedBond(srcBnd)) {
|
||||
return false; // Source could participate in tautomerism
|
||||
}
|
||||
}
|
||||
return true; // Skip this chiral neighbor
|
||||
}
|
||||
|
||||
// skip the neighbor bond if otherAtom isn't a candidate and doesn't have a
|
||||
// start bond OR if the bond is neither unsaturated nor conjugated and atom
|
||||
// doesn't have a start bond
|
||||
@@ -510,11 +532,53 @@ bool skipNeighborBond(const Atom *atom, const Atom *otherAtom,
|
||||
const boost::dynamic_bitset<> &startBonds,
|
||||
const std::vector<std::uint64_t> &atomFlags,
|
||||
const std::vector<std::uint64_t> &bondFlags) {
|
||||
return (bondFlags[nbrBond->getIdx()] ||
|
||||
((!isCandidateAtom(otherAtom, atomFlags) &&
|
||||
!hasStartBond(otherAtom, startBonds)) ||
|
||||
(!isUnsaturatedBond(nbrBond) && !nbrBond->getIsConjugated() &&
|
||||
!hasStartBond(atom, startBonds))));
|
||||
if (bondFlags[nbrBond->getIdx()]) {
|
||||
return true;
|
||||
}
|
||||
|
||||
// Special case: prevent extending from an aromatic carbon (with no H)
|
||||
// to an exocyclic C=C system. This handles cases like stilbene-pyridyl
|
||||
// (c1ccccc1/C=C/c1ncccc1) where the aromatic C has no mobile H but the
|
||||
// algorithm would otherwise extend to the exocyclic C=C and destroy E/Z.
|
||||
// Check BOTH directions: either the aromatic atom extending to vinyl,
|
||||
// or the vinyl atom being connected from aromatic.
|
||||
if (!nbrBond->getIsAromatic()) {
|
||||
const Atom *aromaticAtom = nullptr;
|
||||
const Atom *nonAromaticAtom = nullptr;
|
||||
|
||||
if (atom->getIsAromatic() && !otherAtom->getIsAromatic()) {
|
||||
aromaticAtom = atom;
|
||||
nonAromaticAtom = otherAtom;
|
||||
} else if (!atom->getIsAromatic() && otherAtom->getIsAromatic()) {
|
||||
aromaticAtom = otherAtom;
|
||||
nonAromaticAtom = atom;
|
||||
}
|
||||
|
||||
// If we have an aromatic->non-aromatic transition
|
||||
if (aromaticAtom && nonAromaticAtom) {
|
||||
// Aromatic atom must have no H and be carbon
|
||||
if (aromaticAtom->getTotalNumHs() == 0 &&
|
||||
aromaticAtom->getAtomicNum() == 6 &&
|
||||
nonAromaticAtom->getAtomicNum() == 6) {
|
||||
// Check if nonAromaticAtom is part of a C=C system
|
||||
for (const auto &bnd : atom->getOwningMol().atomBonds(nonAromaticAtom)) {
|
||||
if (bnd->getBondType() == Bond::BondType::DOUBLE &&
|
||||
!bnd->getIsAromatic()) {
|
||||
auto otherEnd = bnd->getOtherAtom(nonAromaticAtom);
|
||||
if (otherEnd->getAtomicNum() == 6) {
|
||||
// This is a C=C double bond - skip extending
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return ((!isCandidateAtom(otherAtom, atomFlags) &&
|
||||
!hasStartBond(otherAtom, startBonds)) ||
|
||||
(!isUnsaturatedBond(nbrBond) && !nbrBond->getIsConjugated() &&
|
||||
!hasStartBond(atom, startBonds)));
|
||||
}
|
||||
|
||||
// counts the number of neighboring atoms that have conjugated bonds.
|
||||
@@ -556,6 +620,9 @@ bool checkForOverreach(
|
||||
std::string TautomerHashv2(RWMol *mol, bool proto, bool useCXSmiles,
|
||||
unsigned cxFlagsToSkip = 0) {
|
||||
PRECONDITION(mol, "bad molecule");
|
||||
if (!mol->getRingInfo()->isFindFastOrBetter()) {
|
||||
MolOps::fastFindRings(*mol);
|
||||
}
|
||||
std::string result;
|
||||
unsigned int hcount = 0;
|
||||
int charge = 0;
|
||||
@@ -630,6 +697,9 @@ std::string TautomerHashv2(RWMol *mol, bool proto, bool useCXSmiles,
|
||||
numConjugatedNeighbors)) {
|
||||
continue;
|
||||
}
|
||||
if (shouldSkipChiralNeighbor(atm, oatom, nbrBond)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// if the bond is not eligible, then we can skip this neighbor
|
||||
if (skipNeighborBond(atm, oatom, nbrBond, startBonds, atomFlags,
|
||||
@@ -711,6 +781,9 @@ std::string TautomerHashv2(RWMol *mol, bool proto, bool useCXSmiles,
|
||||
numConjugatedNeighbors)) {
|
||||
continue;
|
||||
}
|
||||
if (shouldSkipChiralNeighbor(atm, oatom, nbrBnd)) {
|
||||
continue;
|
||||
}
|
||||
if ((skipNeighborBond(atm, oatom, nbrBnd, startBonds, atomFlags,
|
||||
bondFlags) &&
|
||||
skipNeighborBond(oatom, atm, nbrBnd, startBonds, atomFlags,
|
||||
@@ -806,6 +879,19 @@ std::string TautomerHashv2(RWMol *mol, bool proto, bool useCXSmiles,
|
||||
if (!oatom->getTotalNumHs()) {
|
||||
continue;
|
||||
}
|
||||
// don't extend to reach a potential stereocenter — doing so would
|
||||
// incorrectly pull the stereocenter into the tautomeric system and
|
||||
// destroy its chirality. Use structural criteria (SP3 + 1H) for
|
||||
// chain atoms so behavior is consistent regardless of annotation.
|
||||
// For ring atoms, only skip if chirality is actually annotated,
|
||||
// since ring SP3+1H atoms often genuinely participate in ring
|
||||
// tautomerism (e.g. purine NH, glutarimide).
|
||||
if (oatom->getHybridization() == Atom::SP3 &&
|
||||
oatom->getTotalNumHs() == 1 &&
|
||||
(!queryIsAtomInRing(oatom) ||
|
||||
oatom->getChiralTag() != Atom::CHI_UNSPECIFIED)) {
|
||||
continue;
|
||||
}
|
||||
unsigned int numModifiedNeighbors = 0;
|
||||
for (const auto nbr : mol->atomNeighbors(oatom)) {
|
||||
if (nbr == atm) {
|
||||
@@ -836,9 +922,23 @@ std::string TautomerHashv2(RWMol *mol, bool proto, bool useCXSmiles,
|
||||
if (!bondsToModify[bptr->getIdx()]) {
|
||||
continue;
|
||||
}
|
||||
bptr->setIsAromatic(false);
|
||||
bptr->setBondType(Bond::AROMATIC);
|
||||
bptr->setStereo(Bond::BondStereo::STEREONONE);
|
||||
// Preserve E/Z stereo on exocyclic double bonds (one atom in ring, one not)
|
||||
// to avoid merging distinct geometric isomers (e.g., E/Z hydrazones).
|
||||
// For these bonds, keep them as DOUBLE bonds (not AROMATIC) so that
|
||||
// E/Z stereo is preserved in the SMILES output.
|
||||
bool isExocyclicWithStereo = false;
|
||||
if (bptr->getStereo() != Bond::BondStereo::STEREONONE) {
|
||||
bool beginInRing = queryIsAtomInRing(bptr->getBeginAtom());
|
||||
bool endInRing = queryIsAtomInRing(bptr->getEndAtom());
|
||||
isExocyclicWithStereo = (beginInRing != endInRing);
|
||||
}
|
||||
if (!isExocyclicWithStereo) {
|
||||
bptr->setBondType(Bond::AROMATIC);
|
||||
bptr->setIsAromatic(true); // Must be consistent with bond type
|
||||
bptr->setStereo(Bond::BondStereo::STEREONONE);
|
||||
} else {
|
||||
bptr->setIsAromatic(false);
|
||||
}
|
||||
atomsToModify.set(bptr->getBeginAtomIdx());
|
||||
atomsToModify.set(bptr->getEndAtomIdx());
|
||||
}
|
||||
@@ -1220,7 +1320,7 @@ std::string RegioisomerHash(RWMol *mol, bool useCXSmiles,
|
||||
// we need a copy of the molecule so that we can loop over the bonds of
|
||||
// something while modifying something else
|
||||
RDKit::ROMol molcpy(*mol);
|
||||
if (molcpy.getRingInfo()->isFindFastOrBetter()) {
|
||||
if (!molcpy.getRingInfo()->isFindFastOrBetter()) {
|
||||
MolOps::fastFindRings(molcpy);
|
||||
}
|
||||
for (int i = molcpy.getNumBonds() - 1; i >= 0; --i) {
|
||||
|
||||
Reference in New Issue
Block a user