mirror of
https://github.com/rdkit/rdkit.git
synced 2026-06-03 21:44:30 +08:00
add ability to block atoms/bonds from participating in tautomer zones (#9297)
* add ability to block atoms/bonds from participating in tautomer zones * be more structured with the atom flag * response to review --------- Co-authored-by: = <=>
This commit is contained in:
@@ -56,4 +56,6 @@ BOOST_PYTHON_MODULE(rdMolHash) {
|
||||
python::arg("useCxSmiles") = false, python::arg("cxFlagsToSkip") = 0),
|
||||
"Generate a hash for a molecule. The func argument determines "
|
||||
"which hash is generated.");
|
||||
python::scope().attr("excludeFromTautomerismProp") =
|
||||
MolHash::excludeFromTautomerismProp;
|
||||
}
|
||||
|
||||
@@ -477,35 +477,40 @@ TEST_CASE("tautomer v2") {
|
||||
// different hashes. Previously the algorithm incorrectly treated
|
||||
// aromatic heteroatoms like pyridine N as tautomeric candidates,
|
||||
// causing stereo to be stripped.
|
||||
{{"c1ccccc1/C=C/c1ncccc1"}, // in ChEMBL (CHEMBL1877619)
|
||||
{{"c1ccccc1/C=C/c1ncccc1"}, // in ChEMBL (CHEMBL1877619)
|
||||
{"c1ccccc1/C=C\\c1ncccc1", "c1ccccc1C=Cc1ncccc1"}},
|
||||
// 5-benzylidenerhodanine: E/Z isomers are NOT tautomers and should
|
||||
// have different hashes. The exocyclic C=C to phenyl should
|
||||
// preserve stereochemistry.
|
||||
{{"O=C1NC(=S)S/C1=C/c2ccccc2"}, // in ChEMBL (CHEMBL4796170)
|
||||
{{"O=C1NC(=S)S/C1=C/c2ccccc2"}, // in ChEMBL (CHEMBL4796170)
|
||||
{"O=C1NC(=S)S/C1=C\\c2ccccc2", "O=C1NC(=S)SC1=Cc2ccccc2"}},
|
||||
// E/Z hydrazones with exocyclic C=N to a ring: E and Z isomers
|
||||
// are NOT tautomers and should have different hashes.
|
||||
{{"c1ccccc1N/N=C2\\CCCCC2C"}, // in SureChEMBL (11696321)
|
||||
{"c1ccccc1N/N=C2/CCCCC2C",
|
||||
"c1ccccc1NN=C2CCCCC2C"}},
|
||||
{{"c1ccccc1N/N=C2\\CCCCC2C"}, // in SureChEMBL (11696321)
|
||||
{"c1ccccc1N/N=C2/CCCCC2C", "c1ccccc1NN=C2CCCCC2C"}},
|
||||
// ---------------------------
|
||||
// stereocenters near amide bonds should not be destroyed
|
||||
// by extension through flagged bonds
|
||||
// ---------------------------
|
||||
// proline-like stereocenter between two amide C=O groups
|
||||
{{"NC(=O)[C@H]1CCCN1C=O"},
|
||||
{"NC(=O)[C@@H]1CCCN1C=O"}}, // in SureChEMBL (8959051)
|
||||
{"NC(=O)[C@@H]1CCCN1C=O"}}, // in SureChEMBL (8959051)
|
||||
// stereocenters near amide bonds on pyrrolidine ring
|
||||
{{"CC(=O)N[C@H]1CCNC1"}, {"CC(=O)N[C@@H]1CCNC1", "CC(=O)NC1CCNC1"}}, // in SureChEMBL (39850)
|
||||
// stereocenter adjacent to pyrimidine/pyrazole: enantiomers should differ
|
||||
// (stereocenter connected via single non-conjugated N-C bond)
|
||||
{{"C[C@H](c1ccccc1)Nc2ncc(c(n2)Nc3cc([nH]n3)C4CC4)Cl"}, // in SureChEMBL (4072338)
|
||||
{{"CC(=O)N[C@H]1CCNC1"},
|
||||
{"CC(=O)N[C@@H]1CCNC1",
|
||||
"CC(=O)NC1CCNC1"}}, // in SureChEMBL (39850)
|
||||
// stereocenter adjacent to pyrimidine/pyrazole: enantiomers should
|
||||
// differ (stereocenter connected via single non-conjugated N-C
|
||||
// bond)
|
||||
{{"C[C@H](c1ccccc1)Nc2ncc(c(n2)Nc3cc([nH]n3)C4CC4)Cl"}, // in
|
||||
// SureChEMBL
|
||||
// (4072338)
|
||||
{"C[C@@H](c1ccccc1)Nc2ncc(c(n2)Nc3cc([nH]n3)C4CC4)Cl"}},
|
||||
// diastereomers on indole ring: aromatic C should not pull in stereocenters
|
||||
// diastereomers on indole ring: aromatic C should not pull in
|
||||
// stereocenters
|
||||
{{"C[C@@H]1Cc2c3ccccc3[nH]c2[C@@H](N1CC(F)(F)F)c4cc(ccc4Cl)OCCNCCCF"},
|
||||
{"C[C@@H]1Cc2c3ccccc3[nH]c2[C@H](N1CC(F)(F)F)c4cc(ccc4Cl)OCCNCCCF"}}, // in ChEMBL CHEMBL5972799
|
||||
};
|
||||
{"C[C@@H]1Cc2c3ccccc3[nH]c2[C@H](N1CC(F)(F)F)c4cc(ccc4Cl)OCCNCCCF"}}, // in ChEMBL CHEMBL5972799
|
||||
};
|
||||
for (const auto &[same, diff] : data) {
|
||||
std::unique_ptr<RWMol> m{SmilesToMol(same[0])};
|
||||
REQUIRE(m);
|
||||
@@ -1141,4 +1146,79 @@ TEST_CASE("github #8654: stereogroups incorrectly included in hash") {
|
||||
CHECK(hsh1 == hsh2);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
TEST_CASE("exclude atoms with properties") {
|
||||
SECTION("basics") {
|
||||
auto m = "OC=CC"_smiles;
|
||||
REQUIRE(m);
|
||||
RWMol m2(*m);
|
||||
auto hsh1 = MolHash::MolHash(&m2, MolHash::HashFunction::HetAtomTautomerv2);
|
||||
CHECK(hsh1 == "[CH3]-[C]:[C]:[O]_3_0");
|
||||
|
||||
for (auto i : {0, 1, 2}) {
|
||||
INFO("excluding atom " + std::to_string(i));
|
||||
RWMol m3(*m);
|
||||
m3.getAtomWithIdx(i)->setProp(MolHash::excludeFromTautomerismProp, "1");
|
||||
auto hsh2 =
|
||||
MolHash::MolHash(&m3, MolHash::HashFunction::HetAtomTautomerv2);
|
||||
CHECK(hsh2 == "[CH3]-[CH]=[CH]-[OH]_0_0");
|
||||
}
|
||||
}
|
||||
SECTION("more complex example") {
|
||||
auto m = "OC=CC=C"_smiles;
|
||||
REQUIRE(m);
|
||||
RWMol m2(*m);
|
||||
auto hsh1 = MolHash::MolHash(&m2, MolHash::HashFunction::HetAtomTautomerv2);
|
||||
CHECK(hsh1 == "[C]:[C]:[C]:[C]:[O]_6_0");
|
||||
|
||||
{
|
||||
RWMol m3(*m);
|
||||
m3.getAtomWithIdx(3)->setProp(MolHash::excludeFromTautomerismProp, "1");
|
||||
auto hsh2 =
|
||||
MolHash::MolHash(&m3, MolHash::HashFunction::HetAtomTautomerv2);
|
||||
CHECK(hsh2 == "[CH2]=[CH]-[C]:[C]:[O]_3_0");
|
||||
}
|
||||
{
|
||||
RWMol m3(*m);
|
||||
m3.getAtomWithIdx(4)->setProp(MolHash::excludeFromTautomerismProp, "1");
|
||||
auto hsh2 =
|
||||
MolHash::MolHash(&m3, MolHash::HashFunction::HetAtomTautomerv2);
|
||||
CHECK(hsh2 == "[CH2]=[C]:[C]:[C]:[O]_4_0");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
TEST_CASE("exclude bonds with properties") {
|
||||
SECTION("basics") {
|
||||
auto m = "OC=CC"_smiles;
|
||||
REQUIRE(m);
|
||||
RWMol m2(*m);
|
||||
auto hsh1 = MolHash::MolHash(&m2, MolHash::HashFunction::HetAtomTautomerv2);
|
||||
CHECK(hsh1 == "[CH3]-[C]:[C]:[O]_3_0");
|
||||
|
||||
for (auto i : {0, 1}) {
|
||||
INFO("excluding bond " + std::to_string(i));
|
||||
RWMol m3(*m);
|
||||
m3.getBondWithIdx(i)->setProp(MolHash::excludeFromTautomerismProp, "1");
|
||||
auto hsh2 =
|
||||
MolHash::MolHash(&m3, MolHash::HashFunction::HetAtomTautomerv2);
|
||||
CHECK(hsh2 == "[CH3]-[CH]=[CH]-[OH]_0_0");
|
||||
}
|
||||
}
|
||||
SECTION("more complex example") {
|
||||
auto m = "OC=CC=C"_smiles;
|
||||
REQUIRE(m);
|
||||
RWMol m2(*m);
|
||||
auto hsh1 = MolHash::MolHash(&m2, MolHash::HashFunction::HetAtomTautomerv2);
|
||||
CHECK(hsh1 == "[C]:[C]:[C]:[C]:[O]_6_0");
|
||||
|
||||
{
|
||||
RWMol m3(*m);
|
||||
m3.getBondWithIdx(2)->setProp(MolHash::excludeFromTautomerismProp, "1");
|
||||
auto hsh2 =
|
||||
MolHash::MolHash(&m3, MolHash::HashFunction::HetAtomTautomerv2);
|
||||
CHECK(hsh2 == "[CH2]=[CH]-[C]:[C]:[O]_3_0");
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -384,9 +384,12 @@ std::string MesomerHash(RWMol *mol, bool netq, bool useCXSmiles,
|
||||
}
|
||||
|
||||
namespace details {
|
||||
|
||||
constexpr std::uint64_t atomFlagProperty =
|
||||
1; //*< atom has the exclude property
|
||||
constexpr std::uint64_t bondFlagCarboxyl =
|
||||
1; //*< bond involved in in carboxyl, amide, etc.
|
||||
constexpr std::uint64_t bondFlagProperty =
|
||||
2; //*< bond has the exclude property
|
||||
std::vector<std::uint64_t> getBondFlags(const ROMol &mol) {
|
||||
// FIX: oversimplified, but should work for now
|
||||
static std::vector<std::string> patterns{
|
||||
@@ -412,7 +415,14 @@ std::vector<std::uint64_t> getBondFlags(const ROMol &mol) {
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<std::uint64_t> bondFlags(mol.getNumBonds(), 0);
|
||||
std::vector<std::uint64_t> bondFlags;
|
||||
bondFlags.reserve(mol.getNumBonds());
|
||||
std::ranges::transform(
|
||||
mol.bonds(), std::back_inserter(bondFlags), [](const auto &bnd) {
|
||||
return bnd->hasProp(MolHash::excludeFromTautomerismProp)
|
||||
? details::bondFlagProperty
|
||||
: 0;
|
||||
});
|
||||
for (const auto &qry : queries) {
|
||||
auto matches = SubstructMatch(mol, *qry);
|
||||
for (const auto &match : matches) {
|
||||
@@ -466,7 +476,8 @@ bool isPossibleTautomericBond(const Bond *bptr,
|
||||
bool isPossibleStartingBond(const Bond *bptr,
|
||||
const std::vector<std::uint64_t> &atomFlags,
|
||||
const std::vector<std::uint64_t> &bondFlags) {
|
||||
if (bondFlags[bptr->getIdx()]) {
|
||||
if (bondFlags[bptr->getIdx()] || atomFlags[bptr->getBeginAtom()->getIdx()] ||
|
||||
atomFlags[bptr->getEndAtom()->getIdx()]) {
|
||||
return false;
|
||||
}
|
||||
auto heteroBeg = isHeteroAtom(bptr->getBeginAtom()) &&
|
||||
@@ -507,7 +518,7 @@ bool hasStartBond(const Atom *aptr, const boost::dynamic_bitset<> &startBonds) {
|
||||
// (indicating the stereocenter could become sp2 in a tautomer).
|
||||
// Aromatic atoms shouldn't pull in substituent stereocenters.
|
||||
bool shouldSkipChiralNeighbor(const Atom *sourceAtom, const Atom *targetAtom,
|
||||
const Bond *connectingBond) {
|
||||
const Bond *connectingBond) {
|
||||
if (targetAtom->getHybridization() != Atom::SP3 ||
|
||||
targetAtom->getTotalNumHs() != 1) {
|
||||
return false;
|
||||
@@ -532,7 +543,8 @@ bool skipNeighborBond(const Atom *atom, const Atom *otherAtom,
|
||||
const boost::dynamic_bitset<> &startBonds,
|
||||
const std::vector<std::uint64_t> &atomFlags,
|
||||
const std::vector<std::uint64_t> &bondFlags) {
|
||||
if (bondFlags[nbrBond->getIdx()]) {
|
||||
if (bondFlags[nbrBond->getIdx()] || atomFlags[otherAtom->getIdx()] ||
|
||||
atomFlags[atom->getIdx()]) {
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -545,7 +557,7 @@ bool skipNeighborBond(const Atom *atom, const Atom *otherAtom,
|
||||
if (!nbrBond->getIsAromatic()) {
|
||||
const Atom *aromaticAtom = nullptr;
|
||||
const Atom *nonAromaticAtom = nullptr;
|
||||
|
||||
|
||||
if (atom->getIsAromatic() && !otherAtom->getIsAromatic()) {
|
||||
aromaticAtom = atom;
|
||||
nonAromaticAtom = otherAtom;
|
||||
@@ -553,7 +565,7 @@ bool skipNeighborBond(const Atom *atom, const Atom *otherAtom,
|
||||
aromaticAtom = otherAtom;
|
||||
nonAromaticAtom = atom;
|
||||
}
|
||||
|
||||
|
||||
// If we have an aromatic->non-aromatic transition
|
||||
if (aromaticAtom && nonAromaticAtom) {
|
||||
// Aromatic atom must have no H and be carbon
|
||||
@@ -561,7 +573,8 @@ bool skipNeighborBond(const Atom *atom, const Atom *otherAtom,
|
||||
aromaticAtom->getAtomicNum() == 6 &&
|
||||
nonAromaticAtom->getAtomicNum() == 6) {
|
||||
// Check if nonAromaticAtom is part of a C=C system
|
||||
for (const auto &bnd : atom->getOwningMol().atomBonds(nonAromaticAtom)) {
|
||||
for (const auto &bnd :
|
||||
atom->getOwningMol().atomBonds(nonAromaticAtom)) {
|
||||
if (bnd->getBondType() == Bond::BondType::DOUBLE &&
|
||||
!bnd->getIsAromatic()) {
|
||||
auto otherEnd = bnd->getOtherAtom(nonAromaticAtom);
|
||||
@@ -627,9 +640,14 @@ std::string TautomerHashv2(RWMol *mol, bool proto, bool useCXSmiles,
|
||||
unsigned int hcount = 0;
|
||||
int charge = 0;
|
||||
|
||||
// we aren't current doing anything with atomFlags, but we have added them in
|
||||
// analogy to the bondFlags as a kind of future proofing.
|
||||
std::vector<std::uint64_t> atomFlags(mol->getNumAtoms(), 0);
|
||||
std::vector<std::uint64_t> atomFlags;
|
||||
atomFlags.reserve(mol->getNumAtoms());
|
||||
std::ranges::transform(mol->atoms(), std::back_inserter(atomFlags),
|
||||
[](const auto &atom) {
|
||||
return atom->hasProp(excludeFromTautomerismProp)
|
||||
? details::atomFlagProperty
|
||||
: 0;
|
||||
});
|
||||
auto bondFlags = details::getBondFlags(*mol);
|
||||
|
||||
boost::dynamic_bitset<> bondsToModify(mol->getNumBonds());
|
||||
@@ -922,10 +940,10 @@ std::string TautomerHashv2(RWMol *mol, bool proto, bool useCXSmiles,
|
||||
if (!bondsToModify[bptr->getIdx()]) {
|
||||
continue;
|
||||
}
|
||||
// Preserve E/Z stereo on exocyclic double bonds (one atom in ring, one not)
|
||||
// to avoid merging distinct geometric isomers (e.g., E/Z hydrazones).
|
||||
// For these bonds, keep them as DOUBLE bonds (not AROMATIC) so that
|
||||
// E/Z stereo is preserved in the SMILES output.
|
||||
// Preserve E/Z stereo on exocyclic double bonds (one atom in ring, one
|
||||
// not) to avoid merging distinct geometric isomers (e.g., E/Z
|
||||
// hydrazones). For these bonds, keep them as DOUBLE bonds (not AROMATIC)
|
||||
// so that E/Z stereo is preserved in the SMILES output.
|
||||
bool isExocyclicWithStereo = false;
|
||||
if (bptr->getStereo() != Bond::BondStereo::STEREONONE) {
|
||||
bool beginInRing = queryIsAtomInRing(bptr->getBeginAtom());
|
||||
|
||||
@@ -22,6 +22,11 @@
|
||||
namespace RDKit {
|
||||
class RWMol;
|
||||
namespace MolHash {
|
||||
// atoms or bonds with this property set (value not important) will not be
|
||||
// included in tautomerism when using the HetAtomTautomerv2 or
|
||||
// HetAtomProtomerv2 hash functions.
|
||||
constexpr const char *excludeFromTautomerismProp =
|
||||
"_MolHashExcludeFromTautomerism";
|
||||
enum class HashFunction {
|
||||
AnonymousGraph = 1,
|
||||
ElementGraph = 2,
|
||||
|
||||
Reference in New Issue
Block a user