Files
rdkit/Code/GraphMol/RGroupDecomposition/RGroupDecomp.cpp
David Cosgrove 53203079c1 Allow Multiple Core Hits in the Same Molecule in RGroupDecomposition (#8813)
* Allow the same core to match more than once in a molecule.

* Update annotation.

* Changes after review.

---------

Co-authored-by: David Cosgrove <david@cozchemix.co.uk>
2025-10-31 13:49:35 +01:00

885 lines
34 KiB
C++

//
// Copyright (c) 2017-2021, Novartis Institutes for BioMedical Research Inc.
// and other RDKit contributors
//
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following
// disclaimer in the documentation and/or other materials provided
// with the distribution.
// * Neither the name of Novartis Institutes for BioMedical Research Inc.
// nor the names of its contributors may be used to endorse or promote
// products derived from this software without specific prior written
// permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
#include "RGroupDecomp.h"
#include "RGroupDecompData.h"
#include <GraphMol/RDKitBase.h>
#include <GraphMol/Substruct/SubstructMatch.h>
#include <GraphMol/SmilesParse/SmilesWrite.h>
#include <GraphMol/SmilesParse/SmilesParse.h>
#include <GraphMol/ChemTransforms/ChemTransforms.h>
#include <boost/dynamic_bitset.hpp>
#include <set>
#include <utility>
#include <vector>
#include "GraphMol/TautomerQuery/TautomerQuery.h"
// #define VERBOSE 1
namespace RDKit {
// Attachment Points
// labeled cores => isotopes
// atom mappings
// atom indices => use -1 - atom index, range is [-1, ...., -num_atoms]
const std::string RLABEL = "tempRlabel";
const std::string RLABEL_TYPE = "tempRlabelType";
const std::string RLABEL_CORE_INDEX = "rLabelCoreIndex";
const std::string SIDECHAIN_RLABELS = "sideChainRlabels";
const std::string done = "RLABEL_PROCESSED";
const std::string _rgroupInputDummy = "_rgroupInputDummy";
const std::string UNLABELED_CORE_ATTACHMENT = "unlabeledCoreAttachment";
namespace {
static const std::string TARGET_ATOM_IDX = "__rgdTargetAtomIdx";
static const std::string TARGET_BOND_IDX = "__rgdTargetBondIdx";
void ADD_MATCH(R_DECOMP &match, int rlabel) {
if (match.find(rlabel) == match.end()) {
match[rlabel] = boost::make_shared<RGroupData>();
}
}
} // namespace
RGroupDecomposition::RGroupDecomposition(
const ROMol &inputCore, const RGroupDecompositionParameters &params)
: data(new RGroupDecompData(inputCore, params)) {}
RGroupDecomposition::RGroupDecomposition(
const std::vector<ROMOL_SPTR> &cores,
const RGroupDecompositionParameters &params)
: data(new RGroupDecompData(cores, params)) {}
RGroupDecomposition::~RGroupDecomposition() { delete data; }
void RGroupDecomposition::labelAtomBondIndices(RWMol &mol) {
for (const auto targetAtom : mol.atoms()) {
targetAtom->setProp(TARGET_ATOM_IDX, targetAtom->getIdx());
}
for (const auto targetBond : mol.bonds()) {
targetBond->setProp(TARGET_BOND_IDX, targetBond->getIdx());
}
}
void RGroupDecomposition::setTargetAtomBondIndices(
ROMol &mol, bool includeBondsToRLabels) const {
std::vector<int> atomIndices(mol.getNumAtoms(), -1);
std::vector<int> bondIndices(mol.getNumBonds(), -1);
int largestAtomIdx = -1;
bool isHydrogen = RGroupData::isMolHydrogen(mol);
for (const auto atom : mol.atoms()) {
int targetAtomIdx;
if (atom->getPropIfPresent(TARGET_ATOM_IDX, targetAtomIdx)) {
atom->clearProp(TARGET_ATOM_IDX);
if ((atom->getAtomicNum() == 1 && data->params.removeHydrogensPostMatch &&
!isHydrogen) ||
(atom->getAtomicNum() == 0 && !atom->hasProp(_rgroupInputDummy))) {
continue;
}
int atomIdx = atom->getIdx();
atomIndices[atomIdx] = targetAtomIdx;
largestAtomIdx = std::max(atomIdx, largestAtomIdx);
}
}
atomIndices.resize(largestAtomIdx + 1);
int largestBondIdx = -1;
for (const auto bond : mol.bonds()) {
int targetBondIdx;
if (bond->getPropIfPresent(TARGET_BOND_IDX, targetBondIdx)) {
bond->clearProp(TARGET_BOND_IDX);
if ((bond->getBeginAtom()->getAtomicNum() == 1 ||
bond->getEndAtom()->getAtomicNum() == 1) &&
data->params.removeHydrogensPostMatch && !isHydrogen) {
continue;
}
if (!includeBondsToRLabels &&
((bond->getBeginAtom()->getAtomicNum() == 0 &&
!bond->getBeginAtom()->hasProp(_rgroupInputDummy)) ||
(bond->getEndAtom()->getAtomicNum() == 0 &&
!bond->getEndAtom()->hasProp(_rgroupInputDummy)))) {
continue;
}
int bondIdx = bond->getIdx();
bondIndices[bondIdx] = targetBondIdx;
largestBondIdx = std::max(bondIdx, largestBondIdx);
}
}
bondIndices.resize(largestBondIdx + 1);
std::vector<int> highlightAtoms;
highlightAtoms.reserve(atomIndices.size());
std::copy_if(atomIndices.begin(), atomIndices.end(),
std::back_inserter(highlightAtoms),
[](const auto atomIdx) { return atomIdx != -1; });
std::vector<int> highlightBonds;
highlightBonds.reserve(bondIndices.size());
std::copy_if(bondIndices.begin(), bondIndices.end(),
std::back_inserter(highlightBonds),
[](const auto bondIdx) { return bondIdx != -1; });
mol.setProp(common_properties::_rgroupTargetAtoms, highlightAtoms);
mol.setProp(common_properties::_rgroupTargetBonds, highlightBonds);
}
int RGroupDecomposition::getMatchingCoreIdx(
const ROMol &mol, std::vector<MatchVectType> *matches) {
RWMol rwmol(mol);
std::vector<MatchVectType> matchesTmp;
const RCore *rcore;
auto coreIdx = getMatchingCoreInternal(rwmol, rcore, matchesTmp);
if (matches) {
std::set<MatchVectType> uniqueMatches;
int numAtoms = mol.getNumAtoms();
for (const auto &match : matchesTmp) {
MatchVectType heavyMatch;
heavyMatch.reserve(match.size());
std::copy_if(
std::make_move_iterator(match.begin()),
std::make_move_iterator(match.end()), std::back_inserter(heavyMatch),
[numAtoms](const auto &pair) { return pair.second < numAtoms; });
std::sort(heavyMatch.begin(), heavyMatch.end());
uniqueMatches.insert(heavyMatch);
}
*matches =
std::vector<MatchVectType>(uniqueMatches.begin(), uniqueMatches.end());
}
return coreIdx;
}
int RGroupDecomposition::getMatchingCoreInternal(
RWMol &mol, const RCore *&rcore, std::vector<MatchVectType> &matches) {
rcore = nullptr;
int core_idx = -1;
const bool explicitOnly = false;
const bool addCoords = true;
MolOps::addHs(mol, explicitOnly, addCoords);
std::vector<MatchVectType> tmatches;
std::vector<MatchVectType> tmatches_filtered;
// Find the first matching core (onlyMatchAtRGroups)
// or the first core that requires the smallest number
// of newly added labels and is a superstructure of
// the first matching core
int global_min_heavy_nbrs = -1;
SubstructMatchParameters sssparams(params().substructmatchParams);
sssparams.uniquify = false;
sssparams.recursionPossible = true;
for (auto &core : data->cores) {
{
// matching the core to the molecule is a two step process
// First match to a reduced representation (the core minus terminal
// R-groups). Next, match the R-groups. We do this as the core may not be
// a substructure match for the molecule if a single molecule atom matches
// 2 RGroup attachments (see https://github.com/rdkit/rdkit/pull/4002)
// match the reduced representation:
std::vector<MatchVectType> baseMatches;
if (params().doTautomers) {
// Here we are attempting to enumerate tautomers of the core
if (auto tautomerQuery = core.second.getMatchingTautomerQuery();
tautomerQuery != nullptr) {
// query atom indices from the tautomer query are the same as the
// template matching molecule
baseMatches = tautomerQuery->substructOf(mol, sssparams);
} else {
// However, if it is not possible to Kekulize the core, we revert back
// to the non-tautomer matching.
baseMatches =
SubstructMatch(mol, *core.second.matchingMol, sssparams);
}
} else {
baseMatches = SubstructMatch(mol, *core.second.matchingMol, sssparams);
}
tmatches.clear();
for (const auto &baseMatch : baseMatches) {
// Match the R Groups
// Important: there can be multiple core indices matching
// the same target idx, because of #4002
auto matchesIncludingRGroups =
core.second.matchTerminalUserRGroups(mol, baseMatch, sssparams);
/*
std::cerr << "baseMatch ";
for (const auto &pair : baseMatch) std::cerr << "(" << pair.first <<","
<< pair.second << "),"; std::cerr << std::endl; std::cerr <<
"matchesIncludingRGroups "; for (const auto &matchWithDummy :
matchesIncludingRGroups) { for (const auto &pair : matchWithDummy)
std::cerr << "(" << pair.first
<<"," << pair.second << "),"; std::cerr << " /// ";
}
std::cerr << std::endl;
*/
tmatches.insert(
tmatches.end(),
std::make_move_iterator(matchesIncludingRGroups.cbegin()),
std::make_move_iterator(matchesIncludingRGroups.cend()));
}
}
if (tmatches.empty()) {
continue;
}
std::vector<int> tmatches_heavy_nbrs(tmatches.size(), 0);
size_t i = 0;
for (const auto &mv : tmatches) {
bool passes_filter = data->params.onlyMatchAtRGroups;
// targetToCoreIndices maps each atom idx in the molecule to a vector
// of atom indices. This vector may be empty (if the atom in the molecule
// has no match with core) or not. When not empty, it will most often
// contain a single atom idx, corresponding to the matching index in the
// core, as usually a core atom can only match a single molecule atom.
// However, there is an important exception to this rule, i.e. when
// the core bears a single R-group dummy at a certain position, while
// the molecule has multiple substituents at the corresponding
// position; in this case, the vector will contain the indices of the
// root atom in all substituents which match a single R-group dummy on
// the core.
std::vector<std::vector<int>> targetToCoreIndices(mol.getNumAtoms());
for (const auto &match : mv) {
targetToCoreIndices[match.second].push_back(match.first);
}
for (const auto &match : mv) {
const auto atm = mol.getAtomWithIdx(match.second);
// is this a labelled rgroup or not?
if (!core.second.isCoreAtomUserLabelled(match.first)) {
// nope... if any neighbor is not part of the substructure
// check if it is a hydrogen; otherwise, if onlyMatchAtRGroups
// is true, skip the match
for (const auto &nbri :
boost::make_iterator_range(mol.getAtomNeighbors(atm))) {
const auto &nbr = mol[nbri];
if (nbr->getAtomicNum() != 1 &&
targetToCoreIndices.at(nbr->getIdx()).empty()) {
if (data->params.onlyMatchAtRGroups) {
passes_filter = false;
break;
} else {
// for each match, we keep track of the number of
// R labels that need to be added to match all
// non-user-labelled R groups in this molecule
// if we use this core for RGD
++tmatches_heavy_nbrs[i];
}
}
}
} else if (core.second.isTerminalRGroupWithUserLabel(match.first) &&
data->params.onlyMatchAtRGroups &&
!core.second.checkAllBondsToRGroupPresent(
mol, match.second, targetToCoreIndices)) {
// labelled R-group
passes_filter = false;
}
if (!passes_filter && data->params.onlyMatchAtRGroups) {
break;
}
}
if (passes_filter) {
tmatches_filtered.push_back(std::move(mv));
}
++i;
}
if (!data->params.onlyMatchAtRGroups) {
// tmatches_heavy_nbrs.size() = tmatches.size(), and
// tmatches.size() cannot be empty, otherwise we should not be here
// but let's check it in case something changes upstream
CHECK_INVARIANT(!tmatches_heavy_nbrs.empty(),
"tmatches_heavy_nbrs must not be empty");
int min_heavy_nbrs = *std::min_element(tmatches_heavy_nbrs.begin(),
tmatches_heavy_nbrs.end());
if (!rcore || (min_heavy_nbrs < global_min_heavy_nbrs &&
!SubstructMatch(*core.second.core, *rcore->core, sssparams)
.empty())) {
i = 0;
tmatches_filtered.clear();
for (const auto heavy_nbrs : tmatches_heavy_nbrs) {
if (heavy_nbrs <= min_heavy_nbrs) {
tmatches_filtered.push_back(std::move(tmatches[i]));
}
++i;
}
global_min_heavy_nbrs = min_heavy_nbrs;
rcore = &core.second;
core_idx = core.first;
if (global_min_heavy_nbrs == 0) {
break;
}
}
} else if (!tmatches_filtered.empty()) {
rcore = &core.second;
core_idx = core.first;
break;
}
}
if (rcore) {
matches = std::move(tmatches_filtered);
}
return core_idx;
}
namespace {
// Take the matches, all from the same molecule and split them so that
// different atom sets are separated out. So that if a core hits
// more than once in the molecule, both sets of R Groups will be
// returned.
std::vector<std::vector<MatchVectType>> splitNonUniqueMatches(
const std::vector<MatchVectType> &tmatches, unsigned int nAtoms) {
std::vector<std::vector<MatchVectType>> outMatches;
std::vector<boost::dynamic_bitset<>> atomSets;
for (const auto &match : tmatches) {
boost::dynamic_bitset<> atomSet(nAtoms);
for (const auto &mp : match) {
atomSet.set(mp.second);
}
if (std::find(atomSets.begin(), atomSets.end(), atomSet) ==
atomSets.end()) {
atomSets.push_back(atomSet);
outMatches.push_back(std::vector<MatchVectType>(1, match));
} else {
for (size_t i = 0; i < atomSets.size(); ++i) {
if (atomSet == atomSets[i]) {
outMatches[i].push_back(match);
}
}
}
}
return outMatches;
}
} // namespace
int RGroupDecomposition::add(const ROMol &inmol) {
RWMOL_SPTR mol(new RWMol(inmol));
const RCore *rcore;
std::vector<MatchVectType> tmatches;
// Add Hs for better symmetrization
auto core_idx = getMatchingCoreInternal(*mol, rcore, tmatches);
if (rcore == nullptr) {
BOOST_LOG(rdDebugLog) << "No core matches" << std::endl;
return -1;
}
if (data->params.includeTargetMolInResults) {
labelAtomBondIndices(*mol);
}
if (tmatches.size() > 1) {
if (data->params.matchingStrategy == NoSymmetrization) {
tmatches.resize(1);
} else if (data->matches.size() == 0) {
// Greedy strategy just grabs the first match and
// takes the best matches from the rest
if (data->params.matchingStrategy == Greedy) {
tmatches.resize(1);
}
}
}
// mark any wildcards in input molecule:
for (auto &atom : mol->atoms()) {
if (atom->getAtomicNum() == 0) {
atom->setProp(_rgroupInputDummy, true);
// clean any existing R group numbers
atom->setIsotope(0);
atom->setAtomMapNum(0);
atom->clearProp(common_properties::_MolFileRLabel);
atom->setProp(common_properties::dummyLabel, "*");
}
}
// strategies
// ==========
// Exhaustive - saves all matches and optimizes later exhaustive
// May never finish due to combinatorial complexity
// Greedy - matches to *FIRST* available match
// GreedyChunks - default - process every N chunks, unless
// MAX_PERMUTATIONS is exceeded, in which case it falls back to
// Greedy for the current chunk
// Should probably scan all mols first to find match with
// smallest number of matches...
std::vector<RGroupMatch> potentialMatches;
constexpr size_t MAX_PERMUTATIONS = 100000;
std::vector<std::vector<MatchVectType>> nonUniqueMatches;
if (data->params.allowMultipleCoresInSameMol) {
nonUniqueMatches = splitNonUniqueMatches(tmatches, mol->getNumAtoms());
} else {
nonUniqueMatches.push_back(tmatches);
}
for (const auto &splitMatch : nonUniqueMatches) {
std::unique_ptr<ROMol> tMol;
for (const auto &tmatche : splitMatch) {
const bool replaceDummies = false;
const bool labelByIndex = true;
const bool requireDummyMatch = false;
// TODO see if we need replaceCoreAtomsWithMolMatches or can just use
// rcore->core
auto coreCopy = rcore->replaceCoreAtomsWithMolMatches(*mol, tmatche);
tMol.reset(replaceCore(*mol, *coreCopy, tmatche, replaceDummies,
labelByIndex, requireDummyMatch));
#ifdef VERBOSE
std::cerr << "Core Match core_idx " << core_idx << " idx "
<< data->matches.size() << ": " << MolToSmarts(*coreCopy)
<< std::endl;
#endif
if (tMol) {
#ifdef VERBOSE
std::cerr << "All Fragments " << MolToSmiles(*tMol) << std::endl;
#endif
R_DECOMP match;
// rlabel rgroups
MOL_SPTR_VECT fragments = MolOps::getMolFrags(*tMol, false);
std::set<int> coreAtomAnyMatched;
// get the sidechains
for (size_t i = 0; i < fragments.size(); ++i) {
const auto &newMol = fragments[i];
std::vector<int> rlabelsOnSideChain;
newMol->setProp<int>("core", core_idx);
newMol->setProp<int>("idx", data->matches.size());
newMol->setProp<int>("frag_idx", i);
#ifdef VERBOSE
std::cerr << "Fragment " << MolToSmiles(*newMol) << std::endl;
#endif
for (auto sideChainAtom : newMol->atoms()) {
if (sideChainAtom->getAtomicNum() != 0) {
// we are only interested in sidechain R group atoms
continue;
}
if (!sideChainAtom->hasProp(_rgroupInputDummy)) {
// this is the index of the core atom that the R group
// atom is attached to
unsigned int coreAtomIndex = sideChainAtom->getIsotope();
auto coreAtom = rcore->core->getAtomWithIdx(coreAtomIndex);
coreAtomAnyMatched.insert(coreAtomIndex);
int rlabel;
if (coreAtom->getPropIfPresent(RLABEL, rlabel)) {
std::vector<int> rlabelsOnSideChainAtom;
sideChainAtom->getPropIfPresent(SIDECHAIN_RLABELS,
rlabelsOnSideChainAtom);
rlabelsOnSideChainAtom.push_back(rlabel);
sideChainAtom->setProp(SIDECHAIN_RLABELS,
rlabelsOnSideChainAtom);
data->labels.insert(rlabel); // keep track of all labels used
rlabelsOnSideChain.push_back(rlabel);
if (const auto [bondIdx, end] =
newMol->getAtomBonds(sideChainAtom);
bondIdx != end) {
auto connectingBond = (*newMol)[*bondIdx];
if (connectingBond->getStereo() >
Bond::BondStereo::STEREOANY) {
// TODO: how to handle bond stereo on rgroups connected to
// core by stereo double bonds
connectingBond->setStereo(Bond::BondStereo::STEREOANY);
}
}
}
} else {
// restore input wildcard
sideChainAtom->clearProp(_rgroupInputDummy);
}
}
if (data->params.includeTargetMolInResults) {
setTargetAtomBondIndices(*newMol, true);
}
if (!rlabelsOnSideChain.empty()) {
#ifdef VERBOSE
std::string newCoreSmi = MolToSmiles(*newMol, true);
#endif
for (auto rlabel : rlabelsOnSideChain) {
ADD_MATCH(match, rlabel);
match[rlabel]->add(newMol, rlabelsOnSideChain);
#ifdef VERBOSE
std::cerr << "Fragment " << i << " R" << rlabel << " "
<< MolToSmiles(*newMol) << std::endl;
#endif
}
} else {
// special case, only one fragment
if (fragments.size() == 1) { // need to make a new core
// remove the sidechains
// GJ I think if we ever get here that it's really an error and I
// believe that I've fixed the case where this code was called.
// Still, I'm too scared to delete the block.
RWMol newCore(*mol);
for (const auto &mvpair : tmatche) {
const Atom *coreAtm = rcore->core->getAtomWithIdx(mvpair.first);
Atom *newCoreAtm = newCore.getAtomWithIdx(mvpair.second);
int rlabel;
if (coreAtm->getPropIfPresent(RLABEL, rlabel)) {
newCoreAtm->setProp<int>(RLABEL, rlabel);
}
newCoreAtm->setProp<bool>("keep", true);
}
newCore.beginBatchEdit();
for (const auto atom : newCore.atoms()) {
if (!atom->hasProp("keep")) {
newCore.removeAtom(atom);
}
}
newCore.commitBatchEdit();
if (newCore.getNumAtoms()) {
std::string newCoreSmi = MolToSmiles(newCore, true);
// add a new core if possible
auto newcore = data->newCores.find(newCoreSmi);
int core_idx = 0;
if (newcore == data->newCores.end()) {
core_idx = data->newCores[newCoreSmi] = data->newCoreLabel--;
data->cores[core_idx] = RCore(newCore);
return add(inmol);
}
}
}
}
}
if (!match.empty()) {
// this is the number of user-defined R labels associated with
// non-hydrogen substituents
auto numberUserGroupsInMatch = std::accumulate(
match.begin(), match.end(), 0,
[](int sum,
const std::pair<int, boost::shared_ptr<RGroupData>> &p) {
return p.first > 0 && !p.second->is_hydrogen ? ++sum : sum;
});
int numberMissingUserGroups =
rcore->numberUserRGroups - numberUserGroupsInMatch;
CHECK_INVARIANT(numberMissingUserGroups >= 0,
"Data error in missing user rgroup count");
const auto extractedCore =
rcore->extractCoreFromMolMatch(*mol, tmatche, params());
if (data->params.includeTargetMolInResults) {
setTargetAtomBondIndices(*extractedCore, false);
}
potentialMatches.emplace_back(core_idx, numberMissingUserGroups,
match, extractedCore);
if (data->params.includeTargetMolInResults) {
potentialMatches.back().setTargetMoleculeForHighlights(mol);
}
}
}
}
if (potentialMatches.empty()) {
BOOST_LOG(rdDebugLog)
<< "No attachment points in side chains" << std::endl;
return -2;
}
if (data->params.matchingStrategy != GA) {
size_t N = 1;
for (auto matche = data->matches.begin() + data->previousMatchSize;
matche != data->matches.end(); ++matche) {
size_t sz = matche->size();
N *= sz;
}
// Highly symmetric cores can lead to a very large number of
// permutations to test. Fall back to Greedy for the current chunk
// when the number is too high.
if (N * potentialMatches.size() > MAX_PERMUTATIONS) {
data->process(data->prunePermutations);
}
}
data->matches.push_back(std::move(potentialMatches));
}
if (!data->matches.empty()) {
if (data->params.matchingStrategy & Greedy ||
(data->params.matchingStrategy & GreedyChunks &&
data->matches.size() % data->params.chunkSize == 0)) {
data->process(data->prunePermutations);
}
}
return data->matches.size() - 1;
}
bool RGroupDecomposition::process() { return processAndScore().success; }
RGroupDecompositionProcessResult RGroupDecomposition::processAndScore() {
try {
const bool finalize = true;
return data->process(data->prunePermutations, finalize);
} catch (...) {
return RGroupDecompositionProcessResult(false, -1);
}
}
std::vector<std::string> RGroupDecomposition::getRGroupLabels() const {
// this is a bit of a cheat
RGroupColumns cols = getRGroupsAsColumns();
std::vector<std::string> labels;
for (auto it : cols) {
labels.push_back(it.first);
}
std::sort(labels.begin(), labels.end());
return labels;
}
RWMOL_SPTR RGroupDecomposition::outputCoreMolecule(
const RGroupMatch &match, const UsedLabelMap &usedLabelMap) const {
// this routine could probably be merged into RGroupDecompData::relabelCore
const auto &core = data->cores[match.core_idx];
if (!match.matchedCore) {
return core.labelledCore;
}
auto coreWithMatches = match.matchedCore;
#ifdef VERBOSE
std::cerr << "output core mol1 " << MolToSmarts(*coreWithMatches)
<< std::endl;
#endif
std::map<Atom *, int> retainedRGroups;
for (auto atomIdx = coreWithMatches->getNumAtoms(); atomIdx--;) {
auto atom = coreWithMatches->getAtomWithIdx(atomIdx);
if (atom->getAtomicNum()) {
continue;
}
auto label = data->getRlabel(atom);
// Always convert to hydrogen - then remove later if
// removeHydrogensPostMatch is set
Atom *nbrAtom = nullptr;
for (const auto &nbri :
boost::make_iterator_range(coreWithMatches->getAtomNeighbors(atom))) {
nbrAtom = (*coreWithMatches)[nbri];
break;
}
if (nbrAtom) {
const bool isUserDefinedLabel =
usedLabelMap.has(label) && usedLabelMap.isUserDefined(label);
const bool isUsedLabel =
usedLabelMap.has(label) && usedLabelMap.getIsUsed(label);
if (!isUsedLabel && (!isUserDefinedLabel ||
data->params.removeAllHydrogenRGroupsAndLabels)) {
// Always convert to hydrogen - then remove later if
// removeHydrogensPostMatch is set
atom->setAtomicNum(1);
atom->updatePropertyCache(false);
} else {
retainedRGroups[atom] = label;
}
}
}
#ifdef VERBOSE
std::cerr << "output core mol2 " << MolToSmiles(*coreWithMatches)
<< std::endl;
#endif
if (data->params.removeHydrogensPostMatch) {
RDLog::LogStateSetter blocker;
const MolOps::RemoveHsParameters rhp;
constexpr bool sanitize = false;
MolOps::removeHs(*coreWithMatches, rhp, sanitize);
coreWithMatches->updatePropertyCache(false);
}
if (coreWithMatches->getNumConformers() > 0) {
for (const auto &[atom, label] : retainedRGroups) {
if (usedLabelMap.has(label) && usedLabelMap.isUserDefined(label)) {
// coordinates of user defined R groups should already be copied over
continue;
}
const auto neighbor = *coreWithMatches->atomNeighbors(atom).begin();
const auto &mapping = data->finalRlabelMapping;
if (const auto oldLabel = std::find_if(
mapping.begin(), mapping.end(),
[label = label](const auto &p) { return p.second == label; });
oldLabel != mapping.end()) {
if (auto iter = match.rgroups.find(oldLabel->first);
iter != match.rgroups.end()) {
MolOps::setTerminalAtomCoords(*coreWithMatches, atom->getIdx(),
neighbor->getIdx());
}
}
}
}
if (!coreWithMatches->getRingInfo()->isInitialized()) {
MolOps::symmetrizeSSSR(*coreWithMatches);
}
#ifdef VERBOSE
std::cerr << "output core mol3 " << MolToSmiles(*coreWithMatches)
<< std::endl;
#endif
return coreWithMatches;
}
RGroupRows RGroupDecomposition::getRGroupsAsRows() const {
std::vector<RGroupMatch> permutation = data->GetCurrentBestPermutation();
RGroupRows groups;
auto usedLabelMap = UsedLabelMap(data->finalRlabelMapping);
for (auto it = permutation.begin(); it != permutation.end(); ++it) {
auto Rs_seen(usedLabelMap);
// make a new rgroup entry
groups.push_back(RGroupRow());
RGroupRow &out_rgroups = groups.back();
if (data->params.includeTargetMolInResults) {
out_rgroups.emplace(RGroupData::getMolLabel(),
it->getTargetMoleculeForHighlights(
data->params.removeHydrogensPostMatch));
}
const R_DECOMP &in_rgroups = it->rgroups;
for (const auto &rgroup : in_rgroups) {
const auto realLabel = data->finalRlabelMapping.find(rgroup.first);
CHECK_INVARIANT(realLabel != data->finalRlabelMapping.end(),
"unprocessed rlabel, please call process() first.");
Rs_seen.setIsUsed(realLabel->second);
out_rgroups.emplace(RGroupData::getRGroupLabel(realLabel->second),
rgroup.second->combinedMol);
}
out_rgroups.emplace(RGroupData::getCoreLabel(),
outputCoreMolecule(*it, Rs_seen));
}
return groups;
}
//! return rgroups in column order group[attachment_point][molidx] = ROMol
RGroupColumns RGroupDecomposition::getRGroupsAsColumns() const {
std::vector<RGroupMatch> permutation = data->GetCurrentBestPermutation();
RGroupColumns groups;
std::unordered_set<std::string> rGroupWithRealMol{RGroupData::getCoreLabel()};
if (data->params.includeTargetMolInResults) {
rGroupWithRealMol.insert(RGroupData::getMolLabel());
}
auto usedLabelMap = UsedLabelMap(data->finalRlabelMapping);
unsigned int molidx = 0;
for (auto it = permutation.begin(); it != permutation.end(); ++it, ++molidx) {
auto Rs_seen(usedLabelMap);
const R_DECOMP &in_rgroups = it->rgroups;
if (data->params.includeTargetMolInResults) {
groups[RGroupData::getMolLabel()].push_back(
it->getTargetMoleculeForHighlights(
data->params.removeHydrogensPostMatch));
}
for (const auto &rgroup : in_rgroups) {
const auto realLabel = data->finalRlabelMapping.find(rgroup.first);
CHECK_INVARIANT(realLabel != data->finalRlabelMapping.end(),
"unprocessed rlabel, please call process() first.");
CHECK_INVARIANT(rgroup.second->combinedMol->hasProp(done),
"Not done! Call process()");
CHECK_INVARIANT(!Rs_seen.getIsUsed(realLabel->second),
"R group label appears multiple times!");
Rs_seen.setIsUsed(realLabel->second);
auto r = RGroupData::getRGroupLabel(realLabel->second);
RGroupColumn &col = groups[r];
if (molidx && col.size() < molidx - 1) {
col.resize(molidx - 1);
}
col.push_back(rgroup.second->combinedMol);
rGroupWithRealMol.insert(r);
}
groups[RGroupData::getCoreLabel()].push_back(
outputCoreMolecule(*it, Rs_seen));
// add empty entries to columns where this molecule didn't appear
for (const auto &realLabel : data->finalRlabelMapping) {
if (!Rs_seen.getIsUsed(realLabel.second)) {
auto r = RGroupData::getRGroupLabel(realLabel.second);
groups[r].push_back(boost::make_shared<RWMol>());
}
}
}
// purge R-group entries that have no mols
for (auto it = groups.begin(); it != groups.end();) {
auto itToErase = groups.end();
if (!rGroupWithRealMol.count(it->first)) {
itToErase = it;
}
++it;
if (itToErase != groups.end()) {
groups.erase(itToErase);
}
}
return groups;
}
const RGroupDecompositionParameters &RGroupDecomposition::params() const {
return data->params;
}
namespace {
std::vector<unsigned int> Decomp(RGroupDecomposition &decomp,
const std::vector<ROMOL_SPTR> &mols) {
auto t0 = std::chrono::steady_clock::now();
std::vector<unsigned int> unmatched;
for (size_t i = 0; i < mols.size(); ++i) {
int v = decomp.add(*mols[i].get());
if (v == -1) {
unmatched.push_back(i);
}
checkForTimeout(t0, decomp.params().timeout);
}
decomp.process();
return unmatched;
}
} // namespace
unsigned int RGroupDecompose(const std::vector<ROMOL_SPTR> &cores,
const std::vector<ROMOL_SPTR> &mols,
RGroupRows &rows,
std::vector<unsigned int> *unmatchedIndices,
const RGroupDecompositionParameters &options) {
RGroupDecomposition decomp(cores, options);
std::vector<unsigned int> unmatched = Decomp(decomp, mols);
if (unmatchedIndices) {
*unmatchedIndices = unmatched;
}
rows = decomp.getRGroupsAsRows();
return mols.size() - unmatched.size();
}
unsigned int RGroupDecompose(const std::vector<ROMOL_SPTR> &cores,
const std::vector<ROMOL_SPTR> &mols,
RGroupColumns &columns,
std::vector<unsigned int> *unmatchedIndices,
const RGroupDecompositionParameters &options) {
RGroupDecomposition decomp(cores, options);
std::vector<unsigned int> unmatched = Decomp(decomp, mols);
if (unmatchedIndices) {
*unmatchedIndices = unmatched;
}
columns = decomp.getRGroupsAsColumns();
return mols.size() - unmatched.size();
}
} // namespace RDKit