From 53203079c159a207f8cff373abb0dbc38b091c1e Mon Sep 17 00:00:00 2001 From: David Cosgrove Date: Wed, 29 Oct 2025 20:50:19 +0000 Subject: [PATCH] Allow Multiple Core Hits in the Same Molecule in RGroupDecomposition (#8813) * Allow the same core to match more than once in a molecule. * Update annotation. * Changes after review. --------- Co-authored-by: David Cosgrove --- .../RGroupDecomposition/RGroupDecomp.cpp | 349 ++++++++++-------- .../RGroupDecomposition/RGroupDecompParams.h | 48 +-- .../Wrap/rdRGroupComposition.cpp | 8 +- .../RGroupDecomposition/catch_rgd.cpp | 82 ++++ 4 files changed, 302 insertions(+), 185 deletions(-) diff --git a/Code/GraphMol/RGroupDecomposition/RGroupDecomp.cpp b/Code/GraphMol/RGroupDecomposition/RGroupDecomp.cpp index b4f272aaf..f35989e20 100644 --- a/Code/GraphMol/RGroupDecomposition/RGroupDecomp.cpp +++ b/Code/GraphMol/RGroupDecomposition/RGroupDecomp.cpp @@ -346,6 +346,36 @@ int RGroupDecomposition::getMatchingCoreInternal( return core_idx; } +namespace { +// Take the matches, all from the same molecule and split them so that +// different atom sets are separated out. So that if a core hits +// more than once in the molecule, both sets of R Groups will be +// returned. +std::vector> splitNonUniqueMatches( + const std::vector &tmatches, unsigned int nAtoms) { + std::vector> outMatches; + std::vector> atomSets; + for (const auto &match : tmatches) { + boost::dynamic_bitset<> atomSet(nAtoms); + for (const auto &mp : match) { + atomSet.set(mp.second); + } + if (std::find(atomSets.begin(), atomSets.end(), atomSet) == + atomSets.end()) { + atomSets.push_back(atomSet); + outMatches.push_back(std::vector(1, match)); + } else { + for (size_t i = 0; i < atomSets.size(); ++i) { + if (atomSet == atomSets[i]) { + outMatches[i].push_back(match); + } + } + } + } + return outMatches; +} +} // namespace + int RGroupDecomposition::add(const ROMol &inmol) { RWMOL_SPTR mol(new RWMol(inmol)); const RCore *rcore; @@ -371,6 +401,7 @@ int RGroupDecomposition::add(const ROMol &inmol) { } } } + // mark any wildcards in input molecule: for (auto &atom : mol->atoms()) { if (atom->getAtomicNum() == 0) { @@ -397,180 +428,192 @@ int RGroupDecomposition::add(const ROMol &inmol) { std::vector potentialMatches; constexpr size_t MAX_PERMUTATIONS = 100000; - std::unique_ptr tMol; - for (const auto &tmatche : tmatches) { - const bool replaceDummies = false; - const bool labelByIndex = true; - const bool requireDummyMatch = false; - // TODO see if we need replaceCoreAtomsWithMolMatches or can just use - // rcore->core - auto coreCopy = rcore->replaceCoreAtomsWithMolMatches(*mol, tmatche); - tMol.reset(replaceCore(*mol, *coreCopy, tmatche, replaceDummies, - labelByIndex, requireDummyMatch)); + std::vector> nonUniqueMatches; + if (data->params.allowMultipleCoresInSameMol) { + nonUniqueMatches = splitNonUniqueMatches(tmatches, mol->getNumAtoms()); + } else { + nonUniqueMatches.push_back(tmatches); + } + + for (const auto &splitMatch : nonUniqueMatches) { + std::unique_ptr tMol; + for (const auto &tmatche : splitMatch) { + const bool replaceDummies = false; + const bool labelByIndex = true; + const bool requireDummyMatch = false; + // TODO see if we need replaceCoreAtomsWithMolMatches or can just use + // rcore->core + auto coreCopy = rcore->replaceCoreAtomsWithMolMatches(*mol, tmatche); + tMol.reset(replaceCore(*mol, *coreCopy, tmatche, replaceDummies, + labelByIndex, requireDummyMatch)); #ifdef VERBOSE - std::cerr << "Core Match core_idx " << core_idx << " idx " - << data->matches.size() << ": " << MolToSmarts(*coreCopy) - << std::endl; + std::cerr << "Core Match core_idx " << core_idx << " idx " + << data->matches.size() << ": " << MolToSmarts(*coreCopy) + << std::endl; #endif - if (tMol) { + if (tMol) { #ifdef VERBOSE - std::cerr << "All Fragments " << MolToSmiles(*tMol) << std::endl; + std::cerr << "All Fragments " << MolToSmiles(*tMol) << std::endl; #endif - R_DECOMP match; - // rlabel rgroups - MOL_SPTR_VECT fragments = MolOps::getMolFrags(*tMol, false); - std::set coreAtomAnyMatched; - // get the sidechains - for (size_t i = 0; i < fragments.size(); ++i) { - const auto &newMol = fragments[i]; - std::vector rlabelsOnSideChain; - newMol->setProp("core", core_idx); - newMol->setProp("idx", data->matches.size()); - newMol->setProp("frag_idx", i); + R_DECOMP match; + // rlabel rgroups + MOL_SPTR_VECT fragments = MolOps::getMolFrags(*tMol, false); + std::set coreAtomAnyMatched; + // get the sidechains + for (size_t i = 0; i < fragments.size(); ++i) { + const auto &newMol = fragments[i]; + std::vector rlabelsOnSideChain; + newMol->setProp("core", core_idx); + newMol->setProp("idx", data->matches.size()); + newMol->setProp("frag_idx", i); #ifdef VERBOSE - std::cerr << "Fragment " << MolToSmiles(*newMol) << std::endl; + std::cerr << "Fragment " << MolToSmiles(*newMol) << std::endl; #endif - for (auto sideChainAtom : newMol->atoms()) { - if (sideChainAtom->getAtomicNum() != 0) { - // we are only interested in sidechain R group atoms - continue; + for (auto sideChainAtom : newMol->atoms()) { + if (sideChainAtom->getAtomicNum() != 0) { + // we are only interested in sidechain R group atoms + continue; + } + if (!sideChainAtom->hasProp(_rgroupInputDummy)) { + // this is the index of the core atom that the R group + // atom is attached to + unsigned int coreAtomIndex = sideChainAtom->getIsotope(); + auto coreAtom = rcore->core->getAtomWithIdx(coreAtomIndex); + coreAtomAnyMatched.insert(coreAtomIndex); + int rlabel; + if (coreAtom->getPropIfPresent(RLABEL, rlabel)) { + std::vector rlabelsOnSideChainAtom; + sideChainAtom->getPropIfPresent(SIDECHAIN_RLABELS, + rlabelsOnSideChainAtom); + rlabelsOnSideChainAtom.push_back(rlabel); + sideChainAtom->setProp(SIDECHAIN_RLABELS, + rlabelsOnSideChainAtom); + data->labels.insert(rlabel); // keep track of all labels used + rlabelsOnSideChain.push_back(rlabel); + if (const auto [bondIdx, end] = + newMol->getAtomBonds(sideChainAtom); + bondIdx != end) { + auto connectingBond = (*newMol)[*bondIdx]; + if (connectingBond->getStereo() > + Bond::BondStereo::STEREOANY) { + // TODO: how to handle bond stereo on rgroups connected to + // core by stereo double bonds + connectingBond->setStereo(Bond::BondStereo::STEREOANY); + } + } + } + } else { + // restore input wildcard + sideChainAtom->clearProp(_rgroupInputDummy); + } } - if (!sideChainAtom->hasProp(_rgroupInputDummy)) { - // this is the index of the core atom that the R group - // atom is attached to - unsigned int coreAtomIndex = sideChainAtom->getIsotope(); - auto coreAtom = rcore->core->getAtomWithIdx(coreAtomIndex); - coreAtomAnyMatched.insert(coreAtomIndex); - int rlabel; - if (coreAtom->getPropIfPresent(RLABEL, rlabel)) { - std::vector rlabelsOnSideChainAtom; - sideChainAtom->getPropIfPresent(SIDECHAIN_RLABELS, - rlabelsOnSideChainAtom); - rlabelsOnSideChainAtom.push_back(rlabel); - sideChainAtom->setProp(SIDECHAIN_RLABELS, rlabelsOnSideChainAtom); - data->labels.insert(rlabel); // keep track of all labels used - rlabelsOnSideChain.push_back(rlabel); - if (const auto [bondIdx, end] = - newMol->getAtomBonds(sideChainAtom); - bondIdx != end) { - auto connectingBond = (*newMol)[*bondIdx]; - if (connectingBond->getStereo() > Bond::BondStereo::STEREOANY) { - // TODO: how to handle bond stereo on rgroups connected to - // core by stereo double bonds - connectingBond->setStereo(Bond::BondStereo::STEREOANY); + if (data->params.includeTargetMolInResults) { + setTargetAtomBondIndices(*newMol, true); + } + if (!rlabelsOnSideChain.empty()) { +#ifdef VERBOSE + std::string newCoreSmi = MolToSmiles(*newMol, true); +#endif + + for (auto rlabel : rlabelsOnSideChain) { + ADD_MATCH(match, rlabel); + match[rlabel]->add(newMol, rlabelsOnSideChain); +#ifdef VERBOSE + std::cerr << "Fragment " << i << " R" << rlabel << " " + << MolToSmiles(*newMol) << std::endl; +#endif + } + } else { + // special case, only one fragment + if (fragments.size() == 1) { // need to make a new core + // remove the sidechains + + // GJ I think if we ever get here that it's really an error and I + // believe that I've fixed the case where this code was called. + // Still, I'm too scared to delete the block. + RWMol newCore(*mol); + + for (const auto &mvpair : tmatche) { + const Atom *coreAtm = rcore->core->getAtomWithIdx(mvpair.first); + Atom *newCoreAtm = newCore.getAtomWithIdx(mvpair.second); + int rlabel; + if (coreAtm->getPropIfPresent(RLABEL, rlabel)) { + newCoreAtm->setProp(RLABEL, rlabel); + } + newCoreAtm->setProp("keep", true); + } + + newCore.beginBatchEdit(); + for (const auto atom : newCore.atoms()) { + if (!atom->hasProp("keep")) { + newCore.removeAtom(atom); + } + } + newCore.commitBatchEdit(); + if (newCore.getNumAtoms()) { + std::string newCoreSmi = MolToSmiles(newCore, true); + // add a new core if possible + auto newcore = data->newCores.find(newCoreSmi); + int core_idx = 0; + if (newcore == data->newCores.end()) { + core_idx = data->newCores[newCoreSmi] = data->newCoreLabel--; + data->cores[core_idx] = RCore(newCore); + return add(inmol); } } } - } else { - // restore input wildcard - sideChainAtom->clearProp(_rgroupInputDummy); } } - if (data->params.includeTargetMolInResults) { - setTargetAtomBondIndices(*newMol, true); - } - if (!rlabelsOnSideChain.empty()) { -#ifdef VERBOSE - std::string newCoreSmi = MolToSmiles(*newMol, true); -#endif - for (auto rlabel : rlabelsOnSideChain) { - ADD_MATCH(match, rlabel); - match[rlabel]->add(newMol, rlabelsOnSideChain); -#ifdef VERBOSE - std::cerr << "Fragment " << i << " R" << rlabel << " " - << MolToSmiles(*newMol) << std::endl; -#endif + if (!match.empty()) { + // this is the number of user-defined R labels associated with + // non-hydrogen substituents + auto numberUserGroupsInMatch = std::accumulate( + match.begin(), match.end(), 0, + [](int sum, + const std::pair> &p) { + return p.first > 0 && !p.second->is_hydrogen ? ++sum : sum; + }); + int numberMissingUserGroups = + rcore->numberUserRGroups - numberUserGroupsInMatch; + CHECK_INVARIANT(numberMissingUserGroups >= 0, + "Data error in missing user rgroup count"); + const auto extractedCore = + rcore->extractCoreFromMolMatch(*mol, tmatche, params()); + if (data->params.includeTargetMolInResults) { + setTargetAtomBondIndices(*extractedCore, false); } - } else { - // special case, only one fragment - if (fragments.size() == 1) { // need to make a new core - // remove the sidechains - - // GJ I think if we ever get here that it's really an error and I - // believe that I've fixed the case where this code was called. - // Still, I'm too scared to delete the block. - RWMol newCore(*mol); - - for (const auto &mvpair : tmatche) { - const Atom *coreAtm = rcore->core->getAtomWithIdx(mvpair.first); - Atom *newCoreAtm = newCore.getAtomWithIdx(mvpair.second); - int rlabel; - if (coreAtm->getPropIfPresent(RLABEL, rlabel)) { - newCoreAtm->setProp(RLABEL, rlabel); - } - newCoreAtm->setProp("keep", true); - } - - newCore.beginBatchEdit(); - for (const auto atom : newCore.atoms()) { - if (!atom->hasProp("keep")) { - newCore.removeAtom(atom); - } - } - newCore.commitBatchEdit(); - if (newCore.getNumAtoms()) { - std::string newCoreSmi = MolToSmiles(newCore, true); - // add a new core if possible - auto newcore = data->newCores.find(newCoreSmi); - int core_idx = 0; - if (newcore == data->newCores.end()) { - core_idx = data->newCores[newCoreSmi] = data->newCoreLabel--; - data->cores[core_idx] = RCore(newCore); - return add(inmol); - } - } + potentialMatches.emplace_back(core_idx, numberMissingUserGroups, + match, extractedCore); + if (data->params.includeTargetMolInResults) { + potentialMatches.back().setTargetMoleculeForHighlights(mol); } } } + } - if (!match.empty()) { - // this is the number of user-defined R labels associated with - // non-hydrogen substituents - auto numberUserGroupsInMatch = std::accumulate( - match.begin(), match.end(), 0, - [](int sum, - const std::pair> &p) { - return p.first > 0 && !p.second->is_hydrogen ? ++sum : sum; - }); - int numberMissingUserGroups = - rcore->numberUserRGroups - numberUserGroupsInMatch; - CHECK_INVARIANT(numberMissingUserGroups >= 0, - "Data error in missing user rgroup count"); - const auto extractedCore = - rcore->extractCoreFromMolMatch(*mol, tmatche, params()); - if (data->params.includeTargetMolInResults) { - setTargetAtomBondIndices(*extractedCore, false); - } - potentialMatches.emplace_back(core_idx, numberMissingUserGroups, match, - extractedCore); - if (data->params.includeTargetMolInResults) { - potentialMatches.back().setTargetMoleculeForHighlights(mol); - } + if (potentialMatches.empty()) { + BOOST_LOG(rdDebugLog) + << "No attachment points in side chains" << std::endl; + return -2; + } + + if (data->params.matchingStrategy != GA) { + size_t N = 1; + for (auto matche = data->matches.begin() + data->previousMatchSize; + matche != data->matches.end(); ++matche) { + size_t sz = matche->size(); + N *= sz; + } + // Highly symmetric cores can lead to a very large number of + // permutations to test. Fall back to Greedy for the current chunk + // when the number is too high. + if (N * potentialMatches.size() > MAX_PERMUTATIONS) { + data->process(data->prunePermutations); } } + data->matches.push_back(std::move(potentialMatches)); } - if (potentialMatches.empty()) { - BOOST_LOG(rdDebugLog) << "No attachment points in side chains" << std::endl; - return -2; - } - - if (data->params.matchingStrategy != GA) { - size_t N = 1; - for (auto matche = data->matches.begin() + data->previousMatchSize; - matche != data->matches.end(); ++matche) { - size_t sz = matche->size(); - N *= sz; - } - // Highly symmetric cores can lead to a very large number of - // permutations to test. Fall back to Greedy for the current chunk - // when the number is too high. - if (N * potentialMatches.size() > MAX_PERMUTATIONS) { - data->process(data->prunePermutations); - } - } - data->matches.push_back(std::move(potentialMatches)); - if (!data->matches.empty()) { if (data->params.matchingStrategy & Greedy || (data->params.matchingStrategy & GreedyChunks && diff --git a/Code/GraphMol/RGroupDecomposition/RGroupDecompParams.h b/Code/GraphMol/RGroupDecomposition/RGroupDecompParams.h index ba968df83..f0237ddb2 100644 --- a/Code/GraphMol/RGroupDecomposition/RGroupDecompParams.h +++ b/Code/GraphMol/RGroupDecomposition/RGroupDecompParams.h @@ -18,40 +18,23 @@ namespace RDKit { -BETTER_ENUM(RGroupLabels, unsigned int, - IsotopeLabels = 0x01, - AtomMapLabels = 0x02, - AtomIndexLabels = 0x04, - RelabelDuplicateLabels = 0x08, - MDLRGroupLabels = 0x10, - DummyAtomLabels = 0x20, // These are rgroups but will get relabelled - AutoDetect = 0xFF -); - -BETTER_ENUM(RGroupMatching, unsigned int, - Greedy = 0x01, - GreedyChunks = 0x02, - Exhaustive = 0x04, // not really useful for large sets - NoSymmetrization = 0x08, - GA = 0x10 -); - BETTER_ENUM( - RGroupLabelling, unsigned int, - AtomMap = 0x01, - Isotope = 0x02, - MDLRGroup = 0x04 -); + RGroupLabels, unsigned int, IsotopeLabels = 0x01, AtomMapLabels = 0x02, + AtomIndexLabels = 0x04, RelabelDuplicateLabels = 0x08, + MDLRGroupLabels = 0x10, + DummyAtomLabels = 0x20, // These are rgroups but will get relabelled + AutoDetect = 0xFF); -BETTER_ENUM(RGroupCoreAlignment, unsigned int, - NoAlignment = 0x0, - MCS = 0x01 -); +BETTER_ENUM(RGroupMatching, unsigned int, Greedy = 0x01, GreedyChunks = 0x02, + Exhaustive = 0x04, // not really useful for large sets + NoSymmetrization = 0x08, GA = 0x10); -BETTER_ENUM(RGroupScore, unsigned int, - Match = 0x1, - FingerprintVariance = 0x4 -); +BETTER_ENUM(RGroupLabelling, unsigned int, AtomMap = 0x01, Isotope = 0x02, + MDLRGroup = 0x04); + +BETTER_ENUM(RGroupCoreAlignment, unsigned int, NoAlignment = 0x0, MCS = 0x01); + +BETTER_ENUM(RGroupScore, unsigned int, Match = 0x1, FingerprintVariance = 0x4); struct RDKIT_RGROUPDECOMPOSITION_EXPORT RGroupDecompositionParameters { unsigned int labels = RGroupLabels::AutoDetect; @@ -75,6 +58,9 @@ struct RDKIT_RGROUPDECOMPOSITION_EXPORT RGroupDecompositionParameters { bool allowNonTerminalRGroups = false; //! unlabelled core atoms can have multiple rgroups bool allowMultipleRGroupsOnUnlabelled = false; + //! Permit a core to match more than once in the same molecule if the sets of + // matched atoms are not equal. + bool allowMultipleCoresInSameMol = false; // extended query settings for core matching bool doTautomers = false; bool doEnumeration = false; diff --git a/Code/GraphMol/RGroupDecomposition/Wrap/rdRGroupComposition.cpp b/Code/GraphMol/RGroupDecomposition/Wrap/rdRGroupComposition.cpp index fefa87e99..bb89a8cc3 100644 --- a/Code/GraphMol/RGroupDecomposition/Wrap/rdRGroupComposition.cpp +++ b/Code/GraphMol/RGroupDecomposition/Wrap/rdRGroupComposition.cpp @@ -288,7 +288,10 @@ struct rgroupdecomp_wrapper { "input structure\n" " - doEnumeration: expand input cores into enumerated mol bundles\n" " - allowMultipleRGroupsOnUnlabelled: permit more than one rgroup to " - "be attached to an unlabelled core atom"; + "be attached to an unlabelled core atom\n" + " - allowMultipleCoresInSameMol: permit a core to match more than" + " once in the same molecule if the sets of matched atoms are not equal" + " (default=False)"; python::class_( "RGroupDecompositionParameters", docString.c_str(), python::init<>(python::args("self"), "Constructor, takes no arguments")) @@ -338,6 +341,9 @@ struct rgroupdecomp_wrapper { .def_readwrite("allowMultipleRGroupsOnUnlabelled", &RDKit::RGroupDecompositionParameters:: allowMultipleRGroupsOnUnlabelled) + .def_readwrite( + "allowMultipleCoresInSameMol", + &RDKit::RGroupDecompositionParameters::allowMultipleCoresInSameMol) .def_readwrite("doTautomers", &RDKit::RGroupDecompositionParameters::doTautomers) .def_readwrite("doEnumeration", diff --git a/Code/GraphMol/RGroupDecomposition/catch_rgd.cpp b/Code/GraphMol/RGroupDecomposition/catch_rgd.cpp index c15c686f5..86089758b 100644 --- a/Code/GraphMol/RGroupDecomposition/catch_rgd.cpp +++ b/Code/GraphMol/RGroupDecomposition/catch_rgd.cpp @@ -1141,3 +1141,85 @@ TEST_CASE("includeTargetMolInResults") { } } } + +TEST_CASE("Multiple Core Hits") { + { + std::vector cores{ + "c1([*:9])c([*:8])c([*:7])c2c(c1([*:10]))c(c([*:5])n2([*:6]))[CH2]C([*:3])([*:4])[N,n]([*:1])([*:2])"_smarts}; + REQUIRE(cores.front()); + std::vector mols{ + "CC1(C)N2[C@@H](Cc3c1[nH]c4ccccc34)C(=O)N(CCc5c[nH]c6ccccc56)CC2=O"_smiles}; + REQUIRE(mols.front()); + RGroupRows rows; + RGroupDecompositionParameters ps; + ps.allowMultipleCoresInSameMol = true; + auto n = RGroupDecompose(cores, mols, rows, nullptr, ps); + CHECK(n == 1); + CHECK(flatten_whitespace(toJSON(rows)) == flatten_whitespace(R"JSON( +[ + { + "Core":"c1ccc2c(C[C@H](N([*:1])[*:2])[*:3])c([*:5])[nH]c2c1", + "R1":"O=C(CN(CCc1c[nH]c2ccccc12)C(=O)[*:3])[*:1]", + "R2":"CC(C)([*:2])[*:5]", + "R3":"O=C(CN(CCc1c[nH]c2ccccc12)C(=O)[*:3])[*:1]", + "R5":"CC(C)([*:2])[*:5]" + }, + { + "Core":"c1ccc2c(CC(N([*:1])[*:2])[*:3])c([*:5])[nH]c2c1", + "R1":"CC1(C)c2[nH]c3ccccc3c2C[C@@H](C(=O)[*:2])N1C(=O)C[*:1]", + "R2":"CC1(C)c2[nH]c3ccccc3c2C[C@@H](C(=O)[*:2])N1C(=O)C[*:1]", + "R3":"[H][*:3]", + "R5":"[H][*:5]" + } +])JSON")); + } + { + std::vector cores{"c1ccccc1"_smarts}; + std::vector mols{"Fc1ccccc1Nc2ccc(Cl)cc2"_smiles, + "c1cc(O)cc(Oc2cccc(Br)c2)c1"_smiles, + "Ic1ccccc1"_smiles}; + RGroupRows rows; + RGroupDecompositionParameters ps; + ps.allowMultipleCoresInSameMol = true; + auto n = RGroupDecompose(cores, mols, rows, nullptr, ps); + CHECK(n == 3); + CHECK(flatten_whitespace(toJSON(rows)) == flatten_whitespace(R"JSON( +[ + { + "Core":"c1cc([*:4])c([*:3])c([*:2])c1[*:1]", + "R1":"[H][*:1]", + "R2":"[H][*:2]", + "R3":"F[*:3]", + "R4":"Clc1ccc(N[*:4])cc1" + }, + { + "Core":"c1cc([*:4])c([*:3])c([*:2])c1[*:1]", + "R1":"Fc1ccccc1N[*:1]", + "R2":"[H][*:2]", + "R3":"[H][*:3]", + "R4":"Cl[*:4]" + }, + { + "Core":"c1cc([*:4])c([*:3])c([*:2])c1[*:1]", + "R1":"[H][*:1]", + "R2":"O[*:2]", + "R3":"[H][*:3]", + "R4":"Brc1cccc(O[*:4])c1" + }, + { + "Core":"c1cc([*:4])c([*:3])c([*:2])c1[*:1]", + "R1":"[H][*:1]", + "R2":"Oc1cccc(O[*:2])c1", + "R3":"[H][*:3]", + "R4":"Br[*:4]" + }, + { + "Core":"c1cc([*:4])c([*:3])c([*:2])c1[*:1]", + "R1":"[H][*:1]", + "R2":"[H][*:2]", + "R3":"[H][*:3]", + "R4":"I[*:4]" + } +])JSON")); + } +} \ No newline at end of file