Allow Multiple Core Hits in the Same Molecule in RGroupDecomposition (#8813)

* Allow the same core to match more than once in a molecule.
* Update annotation.
* Changes after review.

---------

Co-authored-by: David Cosgrove <david@cozchemix.co.uk>
2026-06-03 21:44:30 +08:00 · 2025-10-29 20:50:19 +00:00
parent e6f37dc498
commit 53203079c1
4 changed files with 302 additions and 185 deletions
--- a/Code/GraphMol/RGroupDecomposition/RGroupDecomp.cpp
+++ b/Code/GraphMol/RGroupDecomposition/RGroupDecomp.cpp
@@ -346,6 +346,36 @@ int RGroupDecomposition::getMatchingCoreInternal(
  return core_idx;
 }

+namespace {
+// Group the matches (all from the same molecule) by the set of atom indices
+// they cover.  Each returned inner vector holds the matches that hit one
+// particular atom set, in first-seen order, so that if a core hits more than
+// once in the molecule, each set of R Groups will be returned.
+std::vector<std::vector<MatchVectType>> splitNonUniqueMatches(
+    const std::vector<MatchVectType> &tmatches, unsigned int nAtoms) {
+  // outMatches[i] collects the matches whose atom set is atomSets[i].
+  std::vector<std::vector<MatchVectType>> outMatches;
+  std::vector<boost::dynamic_bitset<>> atomSets;
+  for (const auto &match : tmatches) {
+    // Flag each matched atom; mp.second indexes into the nAtoms bits.
+    boost::dynamic_bitset<> atomSet(nAtoms);
+    for (const auto &mp : match) {
+      atomSet.set(mp.second);
+    }
+    // A single search tells us whether this atom set has been seen before
+    // and, if so, which group it belongs to.
+    const auto known = std::find(atomSets.begin(), atomSets.end(), atomSet);
+    if (known == atomSets.end()) {
+      atomSets.push_back(atomSet);
+      outMatches.push_back(std::vector<MatchVectType>(1, match));
+    } else {
+      outMatches[known - atomSets.begin()].push_back(match);
+    }
+  }
+  return outMatches;
+}
+}  // namespace
+
 int RGroupDecomposition::add(const ROMol &inmol) {
  RWMOL_SPTR mol(new RWMol(inmol));
  const RCore *rcore;
@@ -371,6 +401,7 @@ int RGroupDecomposition::add(const ROMol &inmol) {
      }
    }
  }
+
  // mark any wildcards in input molecule:
  for (auto &atom : mol->atoms()) {
    if (atom->getAtomicNum() == 0) {
@@ -397,8 +428,16 @@ int RGroupDecomposition::add(const ROMol &inmol) {
  std::vector<RGroupMatch> potentialMatches;
  constexpr size_t MAX_PERMUTATIONS = 100000;

+  std::vector<std::vector<MatchVectType>> nonUniqueMatches;
+  if (data->params.allowMultipleCoresInSameMol) {
+    nonUniqueMatches = splitNonUniqueMatches(tmatches, mol->getNumAtoms());
+  } else {
+    nonUniqueMatches.push_back(tmatches);
+  }
+
+  for (const auto &splitMatch : nonUniqueMatches) {
    std::unique_ptr<ROMol> tMol;
-  for (const auto &tmatche : tmatches) {
+    for (const auto &tmatche : splitMatch) {
      const bool replaceDummies = false;
      const bool labelByIndex = true;
      const bool requireDummyMatch = false;
@@ -447,14 +486,16 @@ int RGroupDecomposition::add(const ROMol &inmol) {
                sideChainAtom->getPropIfPresent(SIDECHAIN_RLABELS,
                                                rlabelsOnSideChainAtom);
                rlabelsOnSideChainAtom.push_back(rlabel);
-              sideChainAtom->setProp(SIDECHAIN_RLABELS, rlabelsOnSideChainAtom);
+                sideChainAtom->setProp(SIDECHAIN_RLABELS,
+                                       rlabelsOnSideChainAtom);
                data->labels.insert(rlabel);  // keep track of all labels used
                rlabelsOnSideChain.push_back(rlabel);
                if (const auto [bondIdx, end] =
                        newMol->getAtomBonds(sideChainAtom);
                    bondIdx != end) {
                  auto connectingBond = (*newMol)[*bondIdx];
-                if (connectingBond->getStereo() > Bond::BondStereo::STEREOANY) {
+                  if (connectingBond->getStereo() >
+                      Bond::BondStereo::STEREOANY) {
                    // TODO: how to handle bond stereo on rgroups connected to
                    // core by stereo double bonds
                    connectingBond->setStereo(Bond::BondStereo::STEREOANY);
@@ -542,16 +583,18 @@ int RGroupDecomposition::add(const ROMol &inmol) {
          if (data->params.includeTargetMolInResults) {
            setTargetAtomBondIndices(*extractedCore, false);
          }
-        potentialMatches.emplace_back(core_idx, numberMissingUserGroups, match,
-                                      extractedCore);
+          potentialMatches.emplace_back(core_idx, numberMissingUserGroups,
+                                        match, extractedCore);
          if (data->params.includeTargetMolInResults) {
            potentialMatches.back().setTargetMoleculeForHighlights(mol);
          }
        }
      }
    }
+
    if (potentialMatches.empty()) {
-    BOOST_LOG(rdDebugLog) << "No attachment points in side chains" << std::endl;
+      BOOST_LOG(rdDebugLog)
+          << "No attachment points in side chains" << std::endl;
      return -2;
    }

@@ -570,7 +613,7 @@ int RGroupDecomposition::add(const ROMol &inmol) {
      }
    }
    data->matches.push_back(std::move(potentialMatches));
-
+  }
  if (!data->matches.empty()) {
    if (data->params.matchingStrategy & Greedy ||
        (data->params.matchingStrategy & GreedyChunks &&
--- a/Code/GraphMol/RGroupDecomposition/RGroupDecompParams.h
+++ b/Code/GraphMol/RGroupDecomposition/RGroupDecompParams.h
@@ -18,40 +18,23 @@

 namespace RDKit {

-BETTER_ENUM(RGroupLabels, unsigned int,
-  IsotopeLabels = 0x01,
-  AtomMapLabels = 0x02,
-  AtomIndexLabels = 0x04,
-  RelabelDuplicateLabels = 0x08,
+BETTER_ENUM(
+    RGroupLabels, unsigned int, IsotopeLabels = 0x01, AtomMapLabels = 0x02,
+    AtomIndexLabels = 0x04, RelabelDuplicateLabels = 0x08,
    MDLRGroupLabels = 0x10,
    DummyAtomLabels = 0x20,  // These are rgroups but will get relabelled
-  AutoDetect = 0xFF
-);
+    AutoDetect = 0xFF);

-BETTER_ENUM(RGroupMatching, unsigned int,
-  Greedy = 0x01,
-  GreedyChunks = 0x02,
+BETTER_ENUM(RGroupMatching, unsigned int, Greedy = 0x01, GreedyChunks = 0x02,
            Exhaustive = 0x04,  // not really useful for large sets
-  NoSymmetrization = 0x08,
-  GA = 0x10
-);
+            NoSymmetrization = 0x08, GA = 0x10);

-BETTER_ENUM(
-  RGroupLabelling, unsigned int,
-  AtomMap = 0x01,
-  Isotope = 0x02,
-  MDLRGroup = 0x04
-);
+BETTER_ENUM(RGroupLabelling, unsigned int, AtomMap = 0x01, Isotope = 0x02,
+            MDLRGroup = 0x04);

-BETTER_ENUM(RGroupCoreAlignment, unsigned int,
-  NoAlignment = 0x0,
-  MCS = 0x01
-);
+BETTER_ENUM(RGroupCoreAlignment, unsigned int, NoAlignment = 0x0, MCS = 0x01);

-BETTER_ENUM(RGroupScore, unsigned int,
-  Match = 0x1,
-  FingerprintVariance = 0x4
-);
+BETTER_ENUM(RGroupScore, unsigned int, Match = 0x1, FingerprintVariance = 0x4);

 struct RDKIT_RGROUPDECOMPOSITION_EXPORT RGroupDecompositionParameters {
  unsigned int labels = RGroupLabels::AutoDetect;
@@ -75,6 +58,9 @@ struct RDKIT_RGROUPDECOMPOSITION_EXPORT RGroupDecompositionParameters {
  bool allowNonTerminalRGroups = false;
  //! unlabelled core atoms can have multiple rgroups
  bool allowMultipleRGroupsOnUnlabelled = false;
+  //! Permit a core to match more than once in the same molecule if the sets of
+  //! matched atoms are not equal.
+  bool allowMultipleCoresInSameMol = false;
  // extended query settings for core matching
  bool doTautomers = false;
  bool doEnumeration = false;
--- a/Code/GraphMol/RGroupDecomposition/Wrap/rdRGroupComposition.cpp
+++ b/Code/GraphMol/RGroupDecomposition/Wrap/rdRGroupComposition.cpp
@@ -288,7 +288,10 @@ struct rgroupdecomp_wrapper {
        "input structure\n"
        "    - doEnumeration: expand input cores into enumerated mol bundles\n"
        "    - allowMultipleRGroupsOnUnlabelled: permit more than one rgroup to "
-        "be attached to an unlabelled core atom";
+        "be attached to an unlabelled core atom\n"
+        "    - allowMultipleCoresInSameMol: permit a core to match more than"
+        " once in the same molecule if the sets of matched atoms are not equal"
+        " (default=False)";
    python::class_<RDKit::RGroupDecompositionParameters>(
        "RGroupDecompositionParameters", docString.c_str(),
        python::init<>(python::args("self"), "Constructor, takes no arguments"))
@@ -338,6 +341,9 @@ struct rgroupdecomp_wrapper {
        .def_readwrite("allowMultipleRGroupsOnUnlabelled",
                       &RDKit::RGroupDecompositionParameters::
                           allowMultipleRGroupsOnUnlabelled)
+        .def_readwrite(
+            "allowMultipleCoresInSameMol",
+            &RDKit::RGroupDecompositionParameters::allowMultipleCoresInSameMol)
        .def_readwrite("doTautomers",
                       &RDKit::RGroupDecompositionParameters::doTautomers)
        .def_readwrite("doEnumeration",
--- a/Code/GraphMol/RGroupDecomposition/catch_rgd.cpp
+++ b/Code/GraphMol/RGroupDecomposition/catch_rgd.cpp
@@ -1141,3 +1141,85 @@ TEST_CASE("includeTargetMolInResults") {
    }
  }
 }
+
+TEST_CASE("Multiple Core Hits") {
+  {  // single molecule, one core: two rows are expected from it
+    std::vector<ROMOL_SPTR> cores{
+        "c1([*:9])c([*:8])c([*:7])c2c(c1([*:10]))c(c([*:5])n2([*:6]))[CH2]C([*:3])([*:4])[N,n]([*:1])([*:2])"_smarts};
+    REQUIRE(cores.front());
+    std::vector<ROMOL_SPTR> mols{
+        "CC1(C)N2[C@@H](Cc3c1[nH]c4ccccc34)C(=O)N(CCc5c[nH]c6ccccc56)CC2=O"_smiles};
+    REQUIRE(mols.front());
+    RGroupRows rows;
+    RGroupDecompositionParameters ps;
+    ps.allowMultipleCoresInSameMol = true;  // option under test
+    auto n = RGroupDecompose(cores, mols, rows, nullptr, ps);
+    CHECK(n == 1);  // one input molecule, although two rows are expected below
+    CHECK(flatten_whitespace(toJSON(rows)) == flatten_whitespace(R"JSON(
+[
+  {
+    "Core":"c1ccc2c(C[C@H](N([*:1])[*:2])[*:3])c([*:5])[nH]c2c1",
+    "R1":"O=C(CN(CCc1c[nH]c2ccccc12)C(=O)[*:3])[*:1]",
+    "R2":"CC(C)([*:2])[*:5]",
+    "R3":"O=C(CN(CCc1c[nH]c2ccccc12)C(=O)[*:3])[*:1]",
+    "R5":"CC(C)([*:2])[*:5]"
+  },
+  {
+    "Core":"c1ccc2c(CC(N([*:1])[*:2])[*:3])c([*:5])[nH]c2c1",
+    "R1":"CC1(C)c2[nH]c3ccccc3c2C[C@@H](C(=O)[*:2])N1C(=O)C[*:1]",
+    "R2":"CC1(C)c2[nH]c3ccccc3c2C[C@@H](C(=O)[*:2])N1C(=O)C[*:1]",
+    "R3":"[H][*:3]",
+    "R5":"[H][*:5]"
+  }
+])JSON"));
+  }
+  {  // benzene core against three molecules; the first two have two rings each
+    std::vector<ROMOL_SPTR> cores{"c1ccccc1"_smarts};
+    std::vector<ROMOL_SPTR> mols{"Fc1ccccc1Nc2ccc(Cl)cc2"_smiles,
+                                 "c1cc(O)cc(Oc2cccc(Br)c2)c1"_smiles,
+                                 "Ic1ccccc1"_smiles};
+    RGroupRows rows;
+    RGroupDecompositionParameters ps;
+    ps.allowMultipleCoresInSameMol = true;  // option under test
+    auto n = RGroupDecompose(cores, mols, rows, nullptr, ps);
+    CHECK(n == 3);  // three input molecules, five rows expected below
+    CHECK(flatten_whitespace(toJSON(rows)) == flatten_whitespace(R"JSON(
+[
+  {
+    "Core":"c1cc([*:4])c([*:3])c([*:2])c1[*:1]",
+    "R1":"[H][*:1]",
+    "R2":"[H][*:2]",
+    "R3":"F[*:3]",
+    "R4":"Clc1ccc(N[*:4])cc1"
+  },
+  {
+    "Core":"c1cc([*:4])c([*:3])c([*:2])c1[*:1]",
+    "R1":"Fc1ccccc1N[*:1]",
+    "R2":"[H][*:2]",
+    "R3":"[H][*:3]",
+    "R4":"Cl[*:4]"
+  },
+  {
+    "Core":"c1cc([*:4])c([*:3])c([*:2])c1[*:1]",
+    "R1":"[H][*:1]",
+    "R2":"O[*:2]",
+    "R3":"[H][*:3]",
+    "R4":"Brc1cccc(O[*:4])c1"
+  },
+  {
+    "Core":"c1cc([*:4])c([*:3])c([*:2])c1[*:1]",
+    "R1":"[H][*:1]",
+    "R2":"Oc1cccc(O[*:2])c1",
+    "R3":"[H][*:3]",
+    "R4":"Br[*:4]"
+  },
+  {
+    "Core":"c1cc([*:4])c([*:3])c([*:2])c1[*:1]",
+    "R1":"[H][*:1]",
+    "R2":"[H][*:2]",
+    "R3":"[H][*:3]",
+    "R4":"I[*:4]"
+  }
+])JSON"));
+  }
+}