mirror of
https://github.com/rdkit/rdkit.git
synced 2026-06-03 21:44:30 +08:00
Allow Multiple Core Hits in the Same Molecule in RGroupDecomposition (#8813)
* Allow the same core to match more than once in a molecule. * Update annotation. * Changes after review. --------- Co-authored-by: David Cosgrove <david@cozchemix.co.uk>
This commit is contained in:
committed by
greg landrum
parent
e6f37dc498
commit
53203079c1
@@ -346,6 +346,36 @@ int RGroupDecomposition::getMatchingCoreInternal(
|
||||
return core_idx;
|
||||
}
|
||||
|
||||
namespace {
|
||||
// Take the matches, all from the same molecule and split them so that
|
||||
// different atom sets are separated out. So that if a core hits
|
||||
// more than once in the molecule, both sets of R Groups will be
|
||||
// returned.
|
||||
std::vector<std::vector<MatchVectType>> splitNonUniqueMatches(
|
||||
const std::vector<MatchVectType> &tmatches, unsigned int nAtoms) {
|
||||
std::vector<std::vector<MatchVectType>> outMatches;
|
||||
std::vector<boost::dynamic_bitset<>> atomSets;
|
||||
for (const auto &match : tmatches) {
|
||||
boost::dynamic_bitset<> atomSet(nAtoms);
|
||||
for (const auto &mp : match) {
|
||||
atomSet.set(mp.second);
|
||||
}
|
||||
if (std::find(atomSets.begin(), atomSets.end(), atomSet) ==
|
||||
atomSets.end()) {
|
||||
atomSets.push_back(atomSet);
|
||||
outMatches.push_back(std::vector<MatchVectType>(1, match));
|
||||
} else {
|
||||
for (size_t i = 0; i < atomSets.size(); ++i) {
|
||||
if (atomSet == atomSets[i]) {
|
||||
outMatches[i].push_back(match);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return outMatches;
|
||||
}
|
||||
} // namespace
|
||||
|
||||
int RGroupDecomposition::add(const ROMol &inmol) {
|
||||
RWMOL_SPTR mol(new RWMol(inmol));
|
||||
const RCore *rcore;
|
||||
@@ -371,6 +401,7 @@ int RGroupDecomposition::add(const ROMol &inmol) {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// mark any wildcards in input molecule:
|
||||
for (auto &atom : mol->atoms()) {
|
||||
if (atom->getAtomicNum() == 0) {
|
||||
@@ -397,8 +428,16 @@ int RGroupDecomposition::add(const ROMol &inmol) {
|
||||
std::vector<RGroupMatch> potentialMatches;
|
||||
constexpr size_t MAX_PERMUTATIONS = 100000;
|
||||
|
||||
std::vector<std::vector<MatchVectType>> nonUniqueMatches;
|
||||
if (data->params.allowMultipleCoresInSameMol) {
|
||||
nonUniqueMatches = splitNonUniqueMatches(tmatches, mol->getNumAtoms());
|
||||
} else {
|
||||
nonUniqueMatches.push_back(tmatches);
|
||||
}
|
||||
|
||||
for (const auto &splitMatch : nonUniqueMatches) {
|
||||
std::unique_ptr<ROMol> tMol;
|
||||
for (const auto &tmatche : tmatches) {
|
||||
for (const auto &tmatche : splitMatch) {
|
||||
const bool replaceDummies = false;
|
||||
const bool labelByIndex = true;
|
||||
const bool requireDummyMatch = false;
|
||||
@@ -447,14 +486,16 @@ int RGroupDecomposition::add(const ROMol &inmol) {
|
||||
sideChainAtom->getPropIfPresent(SIDECHAIN_RLABELS,
|
||||
rlabelsOnSideChainAtom);
|
||||
rlabelsOnSideChainAtom.push_back(rlabel);
|
||||
sideChainAtom->setProp(SIDECHAIN_RLABELS, rlabelsOnSideChainAtom);
|
||||
sideChainAtom->setProp(SIDECHAIN_RLABELS,
|
||||
rlabelsOnSideChainAtom);
|
||||
data->labels.insert(rlabel); // keep track of all labels used
|
||||
rlabelsOnSideChain.push_back(rlabel);
|
||||
if (const auto [bondIdx, end] =
|
||||
newMol->getAtomBonds(sideChainAtom);
|
||||
bondIdx != end) {
|
||||
auto connectingBond = (*newMol)[*bondIdx];
|
||||
if (connectingBond->getStereo() > Bond::BondStereo::STEREOANY) {
|
||||
if (connectingBond->getStereo() >
|
||||
Bond::BondStereo::STEREOANY) {
|
||||
// TODO: how to handle bond stereo on rgroups connected to
|
||||
// core by stereo double bonds
|
||||
connectingBond->setStereo(Bond::BondStereo::STEREOANY);
|
||||
@@ -542,16 +583,18 @@ int RGroupDecomposition::add(const ROMol &inmol) {
|
||||
if (data->params.includeTargetMolInResults) {
|
||||
setTargetAtomBondIndices(*extractedCore, false);
|
||||
}
|
||||
potentialMatches.emplace_back(core_idx, numberMissingUserGroups, match,
|
||||
extractedCore);
|
||||
potentialMatches.emplace_back(core_idx, numberMissingUserGroups,
|
||||
match, extractedCore);
|
||||
if (data->params.includeTargetMolInResults) {
|
||||
potentialMatches.back().setTargetMoleculeForHighlights(mol);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (potentialMatches.empty()) {
|
||||
BOOST_LOG(rdDebugLog) << "No attachment points in side chains" << std::endl;
|
||||
BOOST_LOG(rdDebugLog)
|
||||
<< "No attachment points in side chains" << std::endl;
|
||||
return -2;
|
||||
}
|
||||
|
||||
@@ -570,7 +613,7 @@ int RGroupDecomposition::add(const ROMol &inmol) {
|
||||
}
|
||||
}
|
||||
data->matches.push_back(std::move(potentialMatches));
|
||||
|
||||
}
|
||||
if (!data->matches.empty()) {
|
||||
if (data->params.matchingStrategy & Greedy ||
|
||||
(data->params.matchingStrategy & GreedyChunks &&
|
||||
|
||||
@@ -18,40 +18,23 @@
|
||||
|
||||
namespace RDKit {
|
||||
|
||||
BETTER_ENUM(RGroupLabels, unsigned int,
|
||||
IsotopeLabels = 0x01,
|
||||
AtomMapLabels = 0x02,
|
||||
AtomIndexLabels = 0x04,
|
||||
RelabelDuplicateLabels = 0x08,
|
||||
BETTER_ENUM(
|
||||
RGroupLabels, unsigned int, IsotopeLabels = 0x01, AtomMapLabels = 0x02,
|
||||
AtomIndexLabels = 0x04, RelabelDuplicateLabels = 0x08,
|
||||
MDLRGroupLabels = 0x10,
|
||||
DummyAtomLabels = 0x20, // These are rgroups but will get relabelled
|
||||
AutoDetect = 0xFF
|
||||
);
|
||||
AutoDetect = 0xFF);
|
||||
|
||||
BETTER_ENUM(RGroupMatching, unsigned int,
|
||||
Greedy = 0x01,
|
||||
GreedyChunks = 0x02,
|
||||
BETTER_ENUM(RGroupMatching, unsigned int, Greedy = 0x01, GreedyChunks = 0x02,
|
||||
Exhaustive = 0x04, // not really useful for large sets
|
||||
NoSymmetrization = 0x08,
|
||||
GA = 0x10
|
||||
);
|
||||
NoSymmetrization = 0x08, GA = 0x10);
|
||||
|
||||
BETTER_ENUM(
|
||||
RGroupLabelling, unsigned int,
|
||||
AtomMap = 0x01,
|
||||
Isotope = 0x02,
|
||||
MDLRGroup = 0x04
|
||||
);
|
||||
BETTER_ENUM(RGroupLabelling, unsigned int, AtomMap = 0x01, Isotope = 0x02,
|
||||
MDLRGroup = 0x04);
|
||||
|
||||
BETTER_ENUM(RGroupCoreAlignment, unsigned int,
|
||||
NoAlignment = 0x0,
|
||||
MCS = 0x01
|
||||
);
|
||||
BETTER_ENUM(RGroupCoreAlignment, unsigned int, NoAlignment = 0x0, MCS = 0x01);
|
||||
|
||||
BETTER_ENUM(RGroupScore, unsigned int,
|
||||
Match = 0x1,
|
||||
FingerprintVariance = 0x4
|
||||
);
|
||||
BETTER_ENUM(RGroupScore, unsigned int, Match = 0x1, FingerprintVariance = 0x4);
|
||||
|
||||
struct RDKIT_RGROUPDECOMPOSITION_EXPORT RGroupDecompositionParameters {
|
||||
unsigned int labels = RGroupLabels::AutoDetect;
|
||||
@@ -75,6 +58,9 @@ struct RDKIT_RGROUPDECOMPOSITION_EXPORT RGroupDecompositionParameters {
|
||||
bool allowNonTerminalRGroups = false;
|
||||
//! unlabelled core atoms can have multiple rgroups
|
||||
bool allowMultipleRGroupsOnUnlabelled = false;
|
||||
//! Permit a core to match more than once in the same molecule if the sets of
|
||||
//! matched atoms are not equal.
|
||||
bool allowMultipleCoresInSameMol = false;
|
||||
// extended query settings for core matching
|
||||
bool doTautomers = false;
|
||||
bool doEnumeration = false;
|
||||
|
||||
@@ -288,7 +288,10 @@ struct rgroupdecomp_wrapper {
|
||||
"input structure\n"
|
||||
" - doEnumeration: expand input cores into enumerated mol bundles\n"
|
||||
" - allowMultipleRGroupsOnUnlabelled: permit more than one rgroup to "
|
||||
"be attached to an unlabelled core atom";
|
||||
"be attached to an unlabelled core atom\n"
|
||||
" - allowMultipleCoresInSameMol: permit a core to match more than"
|
||||
" once in the same molecule if the sets of matched atoms are not equal"
|
||||
" (default=False)";
|
||||
python::class_<RDKit::RGroupDecompositionParameters>(
|
||||
"RGroupDecompositionParameters", docString.c_str(),
|
||||
python::init<>(python::args("self"), "Constructor, takes no arguments"))
|
||||
@@ -338,6 +341,9 @@ struct rgroupdecomp_wrapper {
|
||||
.def_readwrite("allowMultipleRGroupsOnUnlabelled",
|
||||
&RDKit::RGroupDecompositionParameters::
|
||||
allowMultipleRGroupsOnUnlabelled)
|
||||
.def_readwrite(
|
||||
"allowMultipleCoresInSameMol",
|
||||
&RDKit::RGroupDecompositionParameters::allowMultipleCoresInSameMol)
|
||||
.def_readwrite("doTautomers",
|
||||
&RDKit::RGroupDecompositionParameters::doTautomers)
|
||||
.def_readwrite("doEnumeration",
|
||||
|
||||
@@ -1141,3 +1141,85 @@ TEST_CASE("includeTargetMolInResults") {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Exercises RGroupDecompositionParameters::allowMultipleCoresInSameMol: with
// the flag set, one input molecule may contribute more than one row to the
// decomposition result.
TEST_CASE("Multiple Core Hits") {
  {
    // One labelled core, one molecule: the molecule is counted once by
    // RGroupDecompose but two rows are expected in the output.
    std::vector<ROMOL_SPTR> cores{
        "c1([*:9])c([*:8])c([*:7])c2c(c1([*:10]))c(c([*:5])n2([*:6]))[CH2]C([*:3])([*:4])[N,n]([*:1])([*:2])"_smarts};
    REQUIRE(cores.front());
    std::vector<ROMOL_SPTR> mols{
        "CC1(C)N2[C@@H](Cc3c1[nH]c4ccccc34)C(=O)N(CCc5c[nH]c6ccccc56)CC2=O"_smiles};
    REQUIRE(mols.front());

    RGroupDecompositionParameters params;
    params.allowMultipleCoresInSameMol = true;
    RGroupRows rows;
    const auto numDecomposed =
        RGroupDecompose(cores, mols, rows, nullptr, params);
    CHECK(numDecomposed == 1);

    const auto expectedRows = flatten_whitespace(R"JSON(
[
{
"Core":"c1ccc2c(C[C@H](N([*:1])[*:2])[*:3])c([*:5])[nH]c2c1",
"R1":"O=C(CN(CCc1c[nH]c2ccccc12)C(=O)[*:3])[*:1]",
"R2":"CC(C)([*:2])[*:5]",
"R3":"O=C(CN(CCc1c[nH]c2ccccc12)C(=O)[*:3])[*:1]",
"R5":"CC(C)([*:2])[*:5]"
},
{
"Core":"c1ccc2c(CC(N([*:1])[*:2])[*:3])c([*:5])[nH]c2c1",
"R1":"CC1(C)c2[nH]c3ccccc3c2C[C@@H](C(=O)[*:2])N1C(=O)C[*:1]",
"R2":"CC1(C)c2[nH]c3ccccc3c2C[C@@H](C(=O)[*:2])N1C(=O)C[*:1]",
"R3":"[H][*:3]",
"R5":"[H][*:5]"
}
])JSON");
    const auto actualRows = flatten_whitespace(toJSON(rows));
    CHECK(actualRows == expectedRows);
  }
  {
    // Unlabelled benzene core against three molecules: all three are
    // decomposed, and five rows are expected in total.
    std::vector<ROMOL_SPTR> cores{"c1ccccc1"_smarts};
    std::vector<ROMOL_SPTR> mols{"Fc1ccccc1Nc2ccc(Cl)cc2"_smiles,
                                 "c1cc(O)cc(Oc2cccc(Br)c2)c1"_smiles,
                                 "Ic1ccccc1"_smiles};

    RGroupDecompositionParameters params;
    params.allowMultipleCoresInSameMol = true;
    RGroupRows rows;
    const auto numDecomposed =
        RGroupDecompose(cores, mols, rows, nullptr, params);
    CHECK(numDecomposed == 3);

    const auto expectedRows = flatten_whitespace(R"JSON(
[
{
"Core":"c1cc([*:4])c([*:3])c([*:2])c1[*:1]",
"R1":"[H][*:1]",
"R2":"[H][*:2]",
"R3":"F[*:3]",
"R4":"Clc1ccc(N[*:4])cc1"
},
{
"Core":"c1cc([*:4])c([*:3])c([*:2])c1[*:1]",
"R1":"Fc1ccccc1N[*:1]",
"R2":"[H][*:2]",
"R3":"[H][*:3]",
"R4":"Cl[*:4]"
},
{
"Core":"c1cc([*:4])c([*:3])c([*:2])c1[*:1]",
"R1":"[H][*:1]",
"R2":"O[*:2]",
"R3":"[H][*:3]",
"R4":"Brc1cccc(O[*:4])c1"
},
{
"Core":"c1cc([*:4])c([*:3])c([*:2])c1[*:1]",
"R1":"[H][*:1]",
"R2":"Oc1cccc(O[*:2])c1",
"R3":"[H][*:3]",
"R4":"Br[*:4]"
},
{
"Core":"c1cc([*:4])c([*:3])c([*:2])c1[*:1]",
"R1":"[H][*:1]",
"R2":"[H][*:2]",
"R3":"[H][*:3]",
"R4":"I[*:4]"
}
])JSON");
    const auto actualRows = flatten_whitespace(toJSON(rows));
    CHECK(actualRows == expectedRows);
  }
}
|
||||
Reference in New Issue
Block a user