add ability to block atoms/bonds from participating in tautomer zones (#9297)

* add ability to block atoms/bonds from participating in tautomer zones

* be more structured with the atom flag

* response to review

---------

Co-authored-by: = <=>
This commit is contained in:
Greg Landrum
2026-05-29 05:38:02 +02:00
committed by greg landrum
parent aa0190d2db
commit 2c3c7d4257
4 changed files with 133 additions and 28 deletions

View File

@@ -56,4 +56,6 @@ BOOST_PYTHON_MODULE(rdMolHash) {
python::arg("useCxSmiles") = false, python::arg("cxFlagsToSkip") = 0),
"Generate a hash for a molecule. The func argument determines "
"which hash is generated.");
python::scope().attr("excludeFromTautomerismProp") =
MolHash::excludeFromTautomerismProp;
}

View File

@@ -477,35 +477,40 @@ TEST_CASE("tautomer v2") {
// different hashes. Previously the algorithm incorrectly treated
// aromatic heteroatoms like pyridine N as tautomeric candidates,
// causing stereo to be stripped.
{{"c1ccccc1/C=C/c1ncccc1"}, // in ChEMBL (CHEMBL1877619)
{{"c1ccccc1/C=C/c1ncccc1"}, // in ChEMBL (CHEMBL1877619)
{"c1ccccc1/C=C\\c1ncccc1", "c1ccccc1C=Cc1ncccc1"}},
// 5-benzylidenerhodanine: E/Z isomers are NOT tautomers and should
// have different hashes. The exocyclic C=C to phenyl should
// preserve stereochemistry.
{{"O=C1NC(=S)S/C1=C/c2ccccc2"}, // in ChEMBL (CHEMBL4796170)
{{"O=C1NC(=S)S/C1=C/c2ccccc2"}, // in ChEMBL (CHEMBL4796170)
{"O=C1NC(=S)S/C1=C\\c2ccccc2", "O=C1NC(=S)SC1=Cc2ccccc2"}},
// E/Z hydrazones with exocyclic C=N to a ring: E and Z isomers
// are NOT tautomers and should have different hashes.
{{"c1ccccc1N/N=C2\\CCCCC2C"}, // in SureChEMBL (11696321)
{"c1ccccc1N/N=C2/CCCCC2C",
"c1ccccc1NN=C2CCCCC2C"}},
{{"c1ccccc1N/N=C2\\CCCCC2C"}, // in SureChEMBL (11696321)
{"c1ccccc1N/N=C2/CCCCC2C", "c1ccccc1NN=C2CCCCC2C"}},
// ---------------------------
// stereocenters near amide bonds should not be destroyed
// by extension through flagged bonds
// ---------------------------
// proline-like stereocenter between two amide C=O groups
{{"NC(=O)[C@H]1CCCN1C=O"},
{"NC(=O)[C@@H]1CCCN1C=O"}}, // in SureChEMBL (8959051)
{"NC(=O)[C@@H]1CCCN1C=O"}}, // in SureChEMBL (8959051)
// stereocenters near amide bonds on pyrrolidine ring
{{"CC(=O)N[C@H]1CCNC1"}, {"CC(=O)N[C@@H]1CCNC1", "CC(=O)NC1CCNC1"}}, // in SureChEMBL (39850)
// stereocenter adjacent to pyrimidine/pyrazole: enantiomers should differ
// (stereocenter connected via single non-conjugated N-C bond)
{{"C[C@H](c1ccccc1)Nc2ncc(c(n2)Nc3cc([nH]n3)C4CC4)Cl"}, // in SureChEMBL (4072338)
{{"CC(=O)N[C@H]1CCNC1"},
{"CC(=O)N[C@@H]1CCNC1",
"CC(=O)NC1CCNC1"}}, // in SureChEMBL (39850)
// stereocenter adjacent to pyrimidine/pyrazole: enantiomers should
// differ (stereocenter connected via single non-conjugated N-C
// bond)
{{"C[C@H](c1ccccc1)Nc2ncc(c(n2)Nc3cc([nH]n3)C4CC4)Cl"}, // in
// SureChEMBL
// (4072338)
{"C[C@@H](c1ccccc1)Nc2ncc(c(n2)Nc3cc([nH]n3)C4CC4)Cl"}},
// diastereomers on indole ring: aromatic C should not pull in stereocenters
// diastereomers on indole ring: aromatic C should not pull in
// stereocenters
{{"C[C@@H]1Cc2c3ccccc3[nH]c2[C@@H](N1CC(F)(F)F)c4cc(ccc4Cl)OCCNCCCF"},
{"C[C@@H]1Cc2c3ccccc3[nH]c2[C@H](N1CC(F)(F)F)c4cc(ccc4Cl)OCCNCCCF"}}, // in ChEMBL CHEMBL5972799
};
{"C[C@@H]1Cc2c3ccccc3[nH]c2[C@H](N1CC(F)(F)F)c4cc(ccc4Cl)OCCNCCCF"}}, // in ChEMBL CHEMBL5972799
};
for (const auto &[same, diff] : data) {
std::unique_ptr<RWMol> m{SmilesToMol(same[0])};
REQUIRE(m);
@@ -1141,4 +1146,79 @@ TEST_CASE("github #8654: stereogroups incorrectly included in hash") {
CHECK(hsh1 == hsh2);
}
}
}
TEST_CASE("exclude atoms with properties") {
SECTION("basics") {
auto m = "OC=CC"_smiles;
REQUIRE(m);
RWMol m2(*m);
auto hsh1 = MolHash::MolHash(&m2, MolHash::HashFunction::HetAtomTautomerv2);
CHECK(hsh1 == "[CH3]-[C]:[C]:[O]_3_0");
for (auto i : {0, 1, 2}) {
INFO("excluding atom " + std::to_string(i));
RWMol m3(*m);
m3.getAtomWithIdx(i)->setProp(MolHash::excludeFromTautomerismProp, "1");
auto hsh2 =
MolHash::MolHash(&m3, MolHash::HashFunction::HetAtomTautomerv2);
CHECK(hsh2 == "[CH3]-[CH]=[CH]-[OH]_0_0");
}
}
SECTION("more complex example") {
auto m = "OC=CC=C"_smiles;
REQUIRE(m);
RWMol m2(*m);
auto hsh1 = MolHash::MolHash(&m2, MolHash::HashFunction::HetAtomTautomerv2);
CHECK(hsh1 == "[C]:[C]:[C]:[C]:[O]_6_0");
{
RWMol m3(*m);
m3.getAtomWithIdx(3)->setProp(MolHash::excludeFromTautomerismProp, "1");
auto hsh2 =
MolHash::MolHash(&m3, MolHash::HashFunction::HetAtomTautomerv2);
CHECK(hsh2 == "[CH2]=[CH]-[C]:[C]:[O]_3_0");
}
{
RWMol m3(*m);
m3.getAtomWithIdx(4)->setProp(MolHash::excludeFromTautomerismProp, "1");
auto hsh2 =
MolHash::MolHash(&m3, MolHash::HashFunction::HetAtomTautomerv2);
CHECK(hsh2 == "[CH2]=[C]:[C]:[C]:[O]_4_0");
}
}
}
TEST_CASE("exclude bonds with properties") {
SECTION("basics") {
auto m = "OC=CC"_smiles;
REQUIRE(m);
RWMol m2(*m);
auto hsh1 = MolHash::MolHash(&m2, MolHash::HashFunction::HetAtomTautomerv2);
CHECK(hsh1 == "[CH3]-[C]:[C]:[O]_3_0");
for (auto i : {0, 1}) {
INFO("excluding bond " + std::to_string(i));
RWMol m3(*m);
m3.getBondWithIdx(i)->setProp(MolHash::excludeFromTautomerismProp, "1");
auto hsh2 =
MolHash::MolHash(&m3, MolHash::HashFunction::HetAtomTautomerv2);
CHECK(hsh2 == "[CH3]-[CH]=[CH]-[OH]_0_0");
}
}
SECTION("more complex example") {
auto m = "OC=CC=C"_smiles;
REQUIRE(m);
RWMol m2(*m);
auto hsh1 = MolHash::MolHash(&m2, MolHash::HashFunction::HetAtomTautomerv2);
CHECK(hsh1 == "[C]:[C]:[C]:[C]:[O]_6_0");
{
RWMol m3(*m);
m3.getBondWithIdx(2)->setProp(MolHash::excludeFromTautomerismProp, "1");
auto hsh2 =
MolHash::MolHash(&m3, MolHash::HashFunction::HetAtomTautomerv2);
CHECK(hsh2 == "[CH2]=[CH]-[C]:[C]:[O]_3_0");
}
}
}

View File

@@ -384,9 +384,12 @@ std::string MesomerHash(RWMol *mol, bool netq, bool useCXSmiles,
}
namespace details {
constexpr std::uint64_t atomFlagProperty =
1; //*< atom has the exclude property
constexpr std::uint64_t bondFlagCarboxyl =
1; //*< bond involved in in carboxyl, amide, etc.
constexpr std::uint64_t bondFlagProperty =
2; //*< bond has the exclude property
std::vector<std::uint64_t> getBondFlags(const ROMol &mol) {
// FIX: oversimplified, but should work for now
static std::vector<std::string> patterns{
@@ -412,7 +415,14 @@ std::vector<std::uint64_t> getBondFlags(const ROMol &mol) {
}
}
std::vector<std::uint64_t> bondFlags(mol.getNumBonds(), 0);
std::vector<std::uint64_t> bondFlags;
bondFlags.reserve(mol.getNumBonds());
std::ranges::transform(
mol.bonds(), std::back_inserter(bondFlags), [](const auto &bnd) {
return bnd->hasProp(MolHash::excludeFromTautomerismProp)
? details::bondFlagProperty
: 0;
});
for (const auto &qry : queries) {
auto matches = SubstructMatch(mol, *qry);
for (const auto &match : matches) {
@@ -466,7 +476,8 @@ bool isPossibleTautomericBond(const Bond *bptr,
bool isPossibleStartingBond(const Bond *bptr,
const std::vector<std::uint64_t> &atomFlags,
const std::vector<std::uint64_t> &bondFlags) {
if (bondFlags[bptr->getIdx()]) {
if (bondFlags[bptr->getIdx()] || atomFlags[bptr->getBeginAtom()->getIdx()] ||
atomFlags[bptr->getEndAtom()->getIdx()]) {
return false;
}
auto heteroBeg = isHeteroAtom(bptr->getBeginAtom()) &&
@@ -507,7 +518,7 @@ bool hasStartBond(const Atom *aptr, const boost::dynamic_bitset<> &startBonds) {
// (indicating the stereocenter could become sp2 in a tautomer).
// Aromatic atoms shouldn't pull in substituent stereocenters.
bool shouldSkipChiralNeighbor(const Atom *sourceAtom, const Atom *targetAtom,
const Bond *connectingBond) {
const Bond *connectingBond) {
if (targetAtom->getHybridization() != Atom::SP3 ||
targetAtom->getTotalNumHs() != 1) {
return false;
@@ -532,7 +543,8 @@ bool skipNeighborBond(const Atom *atom, const Atom *otherAtom,
const boost::dynamic_bitset<> &startBonds,
const std::vector<std::uint64_t> &atomFlags,
const std::vector<std::uint64_t> &bondFlags) {
if (bondFlags[nbrBond->getIdx()]) {
if (bondFlags[nbrBond->getIdx()] || atomFlags[otherAtom->getIdx()] ||
atomFlags[atom->getIdx()]) {
return true;
}
@@ -545,7 +557,7 @@ bool skipNeighborBond(const Atom *atom, const Atom *otherAtom,
if (!nbrBond->getIsAromatic()) {
const Atom *aromaticAtom = nullptr;
const Atom *nonAromaticAtom = nullptr;
if (atom->getIsAromatic() && !otherAtom->getIsAromatic()) {
aromaticAtom = atom;
nonAromaticAtom = otherAtom;
@@ -553,7 +565,7 @@ bool skipNeighborBond(const Atom *atom, const Atom *otherAtom,
aromaticAtom = otherAtom;
nonAromaticAtom = atom;
}
// If we have an aromatic->non-aromatic transition
if (aromaticAtom && nonAromaticAtom) {
// Aromatic atom must have no H and be carbon
@@ -561,7 +573,8 @@ bool skipNeighborBond(const Atom *atom, const Atom *otherAtom,
aromaticAtom->getAtomicNum() == 6 &&
nonAromaticAtom->getAtomicNum() == 6) {
// Check if nonAromaticAtom is part of a C=C system
for (const auto &bnd : atom->getOwningMol().atomBonds(nonAromaticAtom)) {
for (const auto &bnd :
atom->getOwningMol().atomBonds(nonAromaticAtom)) {
if (bnd->getBondType() == Bond::BondType::DOUBLE &&
!bnd->getIsAromatic()) {
auto otherEnd = bnd->getOtherAtom(nonAromaticAtom);
@@ -627,9 +640,14 @@ std::string TautomerHashv2(RWMol *mol, bool proto, bool useCXSmiles,
unsigned int hcount = 0;
int charge = 0;
// we aren't current doing anything with atomFlags, but we have added them in
// analogy to the bondFlags as a kind of future proofing.
std::vector<std::uint64_t> atomFlags(mol->getNumAtoms(), 0);
std::vector<std::uint64_t> atomFlags;
atomFlags.reserve(mol->getNumAtoms());
std::ranges::transform(mol->atoms(), std::back_inserter(atomFlags),
[](const auto &atom) {
return atom->hasProp(excludeFromTautomerismProp)
? details::atomFlagProperty
: 0;
});
auto bondFlags = details::getBondFlags(*mol);
boost::dynamic_bitset<> bondsToModify(mol->getNumBonds());
@@ -922,10 +940,10 @@ std::string TautomerHashv2(RWMol *mol, bool proto, bool useCXSmiles,
if (!bondsToModify[bptr->getIdx()]) {
continue;
}
// Preserve E/Z stereo on exocyclic double bonds (one atom in ring, one not)
// to avoid merging distinct geometric isomers (e.g., E/Z hydrazones).
// For these bonds, keep them as DOUBLE bonds (not AROMATIC) so that
// E/Z stereo is preserved in the SMILES output.
// Preserve E/Z stereo on exocyclic double bonds (one atom in ring, one
// not) to avoid merging distinct geometric isomers (e.g., E/Z
// hydrazones). For these bonds, keep them as DOUBLE bonds (not AROMATIC)
// so that E/Z stereo is preserved in the SMILES output.
bool isExocyclicWithStereo = false;
if (bptr->getStereo() != Bond::BondStereo::STEREONONE) {
bool beginInRing = queryIsAtomInRing(bptr->getBeginAtom());

View File

@@ -22,6 +22,11 @@
namespace RDKit {
class RWMol;
namespace MolHash {
// atoms or bonds with this property set (value not important) will not be
// included in tautomerism when using the HetAtomTautomerv2 or
// HetAtomProtomerv2 hash functions.
constexpr const char *excludeFromTautomerismProp =
"_MolHashExcludeFromTautomerism";
enum class HashFunction {
AnonymousGraph = 1,
ElementGraph = 2,