Files
rdkit/Code/GraphMol/RGroupDecomposition/RGroupCore.cpp
Paolo Tosco 6c966b4fe5 RGD code cleanup (#7186)
* RGD code cleanup
- made an effort to give more meaningful names to variables (e.g., renamed most instances of attachment (point) to avoid ambiguity as attachment may be interpreted as either the R-group atom or its neighbor atom on the core, which are two different things)
- replaced the old school removeAtom() method with begin/commitBatchEdit()
- added std::move and std::make_move_iterator where relevant to avoid potential unintended copying
- replaced instances of container.size() == 0 and !container.size() with container.empty() for better clarity
- replaced std::map::find() with std::map::at() where the key was not needed
- replaced expensive std::find_if with more efficient alternative
- added some missing const keywords and added references to avoid copying where appropriate
- replaced for loops with modern implicit looping alternatives where convenient
- avoid calling MolToSmiles when VERBOSE is not defined as the result is anyway not used
- removed "oops, exponential is a pain" code snippet as I believe 1. it is never executed 2. it is not tested 3. I do not think it is correct
- removed check for data->matches.size() > 1 as I do not believe it is correct
- Use std::unique_ptr::reset instead of defining a new std::unique_ptr and moving it to the original one

* changes in response to review

* change in response to review

* replaced std::set with boost::dynamic_bitset to save time on std::set::insert and std::set::find

* make sure we do not go out of bounds

---------

Co-authored-by: ptosco <paolo.tosco@novartis.com>
2024-03-03 05:34:13 +01:00

834 lines
33 KiB
C++

//
// Copyright (c) 2017-2021, Novartis Institutes for BioMedical Research Inc.
// and other RDKit contributors
//
// @@ All Rights Reserved @@
// This file is part of the RDKit.
// The contents are covered by the terms of the BSD license
// which is included in the file license.txt, found at the root
// of the RDKit source tree.
//
#include "RGroupCore.h"
#include "GraphMol/SmilesParse/SmilesWrite.h"
#include "GraphMol/ChemTransforms/ChemTransforms.h"
#include "GraphMol/Substruct/SubstructUtils.h"
#include "GraphMol/TautomerQuery/TautomerQuery.h"
namespace RDKit {
namespace {
// From answer 12 in
// https://stackoverflow.com/questions/5279051/how-can-i-create-cartesian-product-of-vector-of-vectors
// by anumi
// modified to exclude duplicates
static std::vector<std::vector<int>> cartesianProduct(
const std::vector<std::vector<int>> &v, bool allowDuplicates) {
std::vector<std::vector<int>> s = {{}};
for (const auto &u : v) {
std::vector<std::vector<int>> r;
for (const auto &x : s) {
for (const auto y : u) {
// check for duplicates
if (allowDuplicates || std::find(x.begin(), x.end(), y) == x.end()) {
r.push_back(x);
r.back().push_back(y);
}
}
}
if (r.empty()) {
// unable to extend
return r;
}
s = std::move(r);
}
return s;
}
} // namespace
// move this to constructor if the create new core path can be removed from
// RGroupDecomposition::add
void RCore::init() {
findIndicesWithRLabel();
countUserRGroups();
buildMatchingMol();
}
void RCore::findIndicesWithRLabel() {
// Find all the core atoms that have user
// label and set their indices to 1 in core_atoms_with_user_labels
core_atoms_with_user_labels.resize(core->getNumAtoms());
for (const auto atom : core->atoms()) {
int label;
if (atom->getPropIfPresent(RLABEL, label) && label > 0) {
core_atoms_with_user_labels.set(atom->getIdx());
}
}
}
RWMOL_SPTR RCore::extractCoreFromMolMatch(
const ROMol &mol, const MatchVectType &match,
const RGroupDecompositionParameters &params) const {
auto extractedCore = boost::make_shared<RWMol>(mol);
boost::dynamic_bitset<> atomIndicesToKeep(mol.getNumAtoms());
std::vector<Bond *> newBonds;
std::map<Atom *, int> dummyAtomMap;
std::map<const Atom *, Atom *> molAtomMap;
for (auto &pair : match) {
const auto queryAtom = core->getAtomWithIdx(pair.first);
auto const targetAtom = extractedCore->getAtomWithIdx(pair.second);
if (int rLabel; queryAtom->getPropIfPresent(RLABEL, rLabel)) {
targetAtom->setProp(RLABEL, rLabel);
}
if (int rLabelType; queryAtom->getPropIfPresent(RLABEL_TYPE, rLabelType)) {
targetAtom->setProp(RLABEL_TYPE, rLabelType);
}
if (queryAtom->getAtomicNum() == 0 && queryAtom->hasProp(RLABEL) &&
queryAtom->getDegree() == 1) {
continue;
} else {
atomIndicesToKeep.set(pair.second);
molAtomMap[mol.getAtomWithIdx(pair.second)] = targetAtom;
int neighborNumber = -1;
#ifdef VERBOSE
std::cerr << "Atom Chirality In " << targetAtom->getChiralTag()
<< std::endl;
#endif
bool isChiral = targetAtom->getChiralTag() == Atom::CHI_TETRAHEDRAL_CCW ||
targetAtom->getChiralTag() == Atom::CHI_TETRAHEDRAL_CW;
if (isChiral && !params.substructmatchParams.useChirality) {
// if we're not doing chiral matching don't copy chirality to the
// extracted core.
targetAtom->setChiralTag(Atom::CHI_UNSPECIFIED);
isChiral = false;
}
// collect neighbors in vector, so we can add atoms while looping
std::vector<Atom *> targetNeighborAtoms;
for (auto targetNeighborAtom : extractedCore->atomNeighbors(targetAtom)) {
targetNeighborAtoms.push_back(targetNeighborAtom);
}
// explicit hydrogens to keep in extracted core to preserve chiralty
std::vector<int> hydrogensToAdd;
for (const auto targetNeighborAtom : targetNeighborAtoms) {
++neighborNumber;
auto targetNeighborIndex = targetNeighborAtom->getIdx();
auto queryNeighborMapping = std::find_if(
match.begin(), match.end(),
[this, targetNeighborIndex, pair](const auto &p) {
return p.second == static_cast<int>(targetNeighborIndex) &&
core->getBondBetweenAtoms(pair.first, p.first);
});
if (queryNeighborMapping == match.end()) {
if (targetNeighborAtom->getAtomicNum() == 1) {
// Hydrogen needed to define chirality is present in target but
// not mapped to core. Copy it to the extracted core
hydrogensToAdd.push_back(static_cast<int>(targetNeighborIndex));
molAtomMap[mol.getAtomWithIdx(targetNeighborIndex)] =
targetNeighborAtom;
} else if (isChiral) {
// There is a heavy sidechain in the decomp that is connected to
// the core by an unknown bond (onlyMatchAtRGroups = False and
// allowMultipleRGroupsOnUnlabelled = False).
// As there is no explicit target bond chriality is not preserved.
// In some cases we could handle chirality, but that has not been
// implemented (if there is only one free bond on the query that
// would be easy- or we could arbitrarily assign bonds).
isChiral = false;
targetAtom->setChiralTag(Atom::CHI_UNSPECIFIED);
}
continue;
}
const auto queryNeighbor =
core->getAtomWithIdx((*queryNeighborMapping).first);
if (queryNeighbor->getAtomicNum() == 0 &&
queryNeighbor->hasProp(RLABEL) && queryNeighbor->getDegree() == 1) {
auto newDummy = new Atom(*queryNeighbor);
dummyAtomMap[newDummy] = static_cast<int>(targetNeighborIndex);
newDummy->clearComputedProps();
const auto newDummyIdx =
extractedCore->addAtom(newDummy, false, true);
if (newDummyIdx >= atomIndicesToKeep.size()) {
atomIndicesToKeep.resize(newDummyIdx + 1);
}
atomIndicesToKeep.set(newDummyIdx);
auto connectingBond =
extractedCore
->getBondBetweenAtoms(pair.second, targetNeighborIndex)
->copy();
if (connectingBond->getStereo() > Bond::BondStereo::STEREOANY) {
// stereo double bonds
connectingBond->setStereo(Bond::BondStereo::STEREOANY);
}
if (connectingBond->getBeginAtomIdx() == targetNeighborIndex) {
connectingBond->setBeginAtomIdx(newDummyIdx);
} else {
connectingBond->setEndAtomIdx(newDummyIdx);
}
newBonds.push_back(connectingBond);
// Check to see if we are breaking a stereo bond definition, by removing one of the stereo atoms
// If so, set to the new atom
for (auto bond: extractedCore->atomBonds(targetAtom)) {
if (bond->getIdx() == connectingBond->getIdx()) {
continue;
}
if (bond->getStereo() > Bond::STEREOANY) {
auto &stereoAtoms = bond->getStereoAtoms();
for (int& stereoAtom : stereoAtoms) {
if (stereoAtom == static_cast<int>(targetNeighborIndex)) {
stereoAtom = static_cast<int>(newDummyIdx);
}
}
}
}
// Chirality parity stuff see RDKit::replaceCore in
// Code/GraphMol/ChemTransforms/ChemTransforms.cpp
if (isChiral) {
bool switchIt = false;
switch (extractedCore->getAtomDegree(targetAtom)) {
case 4:
if (!(neighborNumber % 2)) {
switchIt = true;
}
break;
case 3:
if (neighborNumber == 1) {
switchIt = true;
}
break;
}
// because this neighbor will be moved at the end of the neighbor
// list, we need to decrement the neighbor number in case additional
// dummy atoms are to be connected to the targetAtom
--neighborNumber;
if (switchIt) {
targetAtom->invertChirality();
}
}
}
}
for (const auto index : hydrogensToAdd) {
atomIndicesToKeep.set(index);
}
#ifdef VERBOSE
std::cerr << "Atom Chirality Out " << targetAtom->getChiralTag()
<< std::endl;
#endif
}
}
for (const auto newBond : newBonds) {
extractedCore->addBond(newBond, true);
}
// Now delete atom's that are not in the core.
extractedCore->beginBatchEdit();
boost::dynamic_bitset<> removedAtoms(extractedCore->getNumAtoms());
for (const auto atom : extractedCore->atoms()) {
if (!atomIndicesToKeep.test(atom->getIdx())) {
extractedCore->removeAtom(atom);
removedAtoms.set(atom->getIdx());
}
}
extractedCore->commitBatchEdit();
// Copy molecule coordinates to extracted core
details::updateSubMolConfs(mol, *extractedCore, removedAtoms);
for (auto citer = mol.beginConformers(); citer != mol.endConformers();
++citer) {
Conformer &newConf =
extractedCore->getConformer(static_cast<int>((*citer)->getId()));
for (const auto &[fst, snd] : dummyAtomMap) {
newConf.setAtomPos(fst->getIdx(), (*citer)->getAtomPos(snd));
}
}
// If the molecule has no coordinates and the core does, copy those over
if (!mol.getNumConformers() && core->getNumConformers()) {
ROMol molCopy(mol);
for (auto citer = core->beginConformers(); citer != core->endConformers();
++citer) {
const auto newConf = new Conformer(mol.getNumAtoms());
newConf->setId((*citer)->getId());
newConf->set3D((*citer)->is3D());
for (const auto &[fst, snd] : match) {
newConf->setAtomPos(snd, (*citer)->getAtomPos(fst));
}
molCopy.addConformer(newConf);
}
details::updateSubMolConfs(molCopy, *extractedCore, removedAtoms);
molCopy.clearConformers();
for (const auto atom : extractedCore->atoms()) {
if (isUserRLabel(*atom)) {
int rLabel = atom->getProp<int>(RLABEL);
for (const auto coreAtom : core->atoms()) {
if (int l; coreAtom->getPropIfPresent(RLABEL, l) && l == rLabel) {
int i = 0;
for (auto citer = core->beginConformers();
citer != core->endConformers(); ++citer, ++i) {
extractedCore->getConformer(i).setAtomPos(
atom->getIdx(), (*citer)->getAtomPos(coreAtom->getIdx()));
}
break;
}
}
}
}
}
// Copy over any stereo groups that lie in the extracted core
details::copyStereoGroups(molAtomMap, mol, *extractedCore);
extractedCore->clearComputedProps(true);
extractedCore->updatePropertyCache(false);
#ifdef VERBOSE
std::cerr << "Extracted core smiles " << MolToSmiles(*extractedCore)
<< std::endl;
std::cerr << "Extracted core smarts " << MolToSmarts(*extractedCore)
<< std::endl;
#endif
try {
unsigned int failed;
MolOps::sanitizeMol(*extractedCore, failed,
MolOps::SANITIZE_SYMMRINGS | MolOps::SANITIZE_CLEANUP);
} catch (const MolSanitizeException &) {
}
return extractedCore;
}
// Return a copy of core where dummy atoms are replaced by
// the respective matching atom in mol, while other atoms have
// their aromatic flag and formal charge copied from
// the respective matching atom in mol
ROMOL_SPTR RCore::replaceCoreAtomsWithMolMatches(
const ROMol &mol, const MatchVectType &match) const {
auto coreReplacedAtoms = boost::make_shared<RWMol>(*core);
for (const auto &p : match) {
auto atom = coreReplacedAtoms->getAtomWithIdx(p.first);
if (isAtomWithMultipleNeighborsOrNotDummyRGroupAttachment(*atom)) {
auto molAtom = mol.getAtomWithIdx(p.second);
replaceCoreAtom(*coreReplacedAtoms, *atom, *molAtom);
}
}
std::map<int, int> matchLookup(match.cbegin(), match.cend());
for (auto bond : coreReplacedAtoms->bonds()) {
if (bond->hasQuery()) {
const auto molBond =
mol.getBondBetweenAtoms(matchLookup[bond->getBeginAtomIdx()],
matchLookup[bond->getEndAtomIdx()]);
if (molBond == nullptr) {
// this can happen if we have a user-defined R group that is not
// matched in the query
CHECK_INVARIANT(bond->getBeginAtom()->getAtomicNum() == 0 ||
bond->getEndAtom()->getAtomicNum() == 0,
"Failed to find core bond in molecule");
} else {
Bond newBond(molBond->getBondType());
newBond.setIsAromatic(molBond->getIsAromatic());
coreReplacedAtoms->replaceBond(bond->getIdx(), &newBond, true);
}
}
}
#ifdef VERBOSE
std::cerr << "Original core smarts " << MolToSmarts(*core) << std::endl;
std::cerr << "Dummy replaced core smarts " << MolToSmarts(*coreReplacedAtoms)
<< std::endl;
#endif
if (mol.getNumConformers() > 0) {
// if the input structure has coordinates copy them to the core
if (!coreReplacedAtoms->getNumConformers()) {
coreReplacedAtoms->addConformer(
new Conformer(coreReplacedAtoms->getNumAtoms()));
}
auto &replacedConformer = coreReplacedAtoms->getConformer();
const auto &molConformer = mol.getConformer();
for (const auto &p : match) {
auto molPoint = molConformer.getAtomPos(p.second);
replacedConformer.setAtomPos(p.first, molPoint);
}
} else {
// otherwise, delete all core coordinates from the replaced core
coreReplacedAtoms->clearConformers();
}
return coreReplacedAtoms;
}
void RCore::replaceCoreAtom(RWMol &mol, Atom &atom, const Atom &other) const {
auto atomicNumber = other.getAtomicNum();
auto targetAtom = &atom;
bool wasDummy = (atom.getAtomicNum() == 0);
if (wasDummy || atom.hasQuery()) {
if (atom.hasQuery()) {
Atom newAtom(atomicNumber);
auto atomIdx = atom.getIdx();
mol.replaceAtom(atomIdx, &newAtom, false, true);
targetAtom = mol.getAtomWithIdx(atomIdx);
} else {
atom.setAtomicNum(atomicNumber);
}
}
targetAtom->setIsAromatic(other.getIsAromatic());
targetAtom->setFormalCharge(other.getFormalCharge());
if (wasDummy) {
targetAtom->setNoImplicit(true);
unsigned int numHs = 0;
const auto &otherMol = other.getOwningMol();
for (const auto &nbri :
boost::make_iterator_range(otherMol.getAtomNeighbors(&other))) {
const auto nbrAtom = otherMol[nbri];
if (nbrAtom->getAtomicNum() == 1) {
++numHs;
}
}
targetAtom->setNumExplicitHs(numHs + other.getTotalNumHs());
targetAtom->updatePropertyCache(false);
}
}
// matching the core to the target molecule is a two step process
// First match to a reduced representation (the core minus terminal user
// R-groups). Next, match the R-groups. We do this as the core may not be a
// substructure match for the molecule if a single molecule atom matches 2
// terminal user defined RGroup attachments (see
// https://github.com/rdkit/rdkit/pull/4002) buildMatchingMol() creates the
// reduced representation from the core and matchTerminalUserRGroups() adds in
// the terminal R Groups to the match
// Builds a matching molecule which is the core with terminal user R groups
// removed Also creates the data structures used in matching the R groups
void RCore::buildMatchingMol() {
matchingMol = boost::make_shared<RWMol>(*core);
terminalRGroupAtomToNeighbor.clear();
matchingMol->beginBatchEdit();
for (auto atom : matchingMol->atoms()) {
// keep track of the original core index in the matching molecule atom
atom->setProp<int>(RLABEL_CORE_INDEX, atom->getIdx());
// TODO for unlabelled core attachments if the heavy neighbor is not dummy
// then keep one attachment
if (atom->getAtomicNum() == 0 && atom->getDegree() == 1 &&
isDummyRGroupAttachment(*atom)) {
// remove terminal user R groups and map the index of the core neighbor
// atom to the index of the removed terminal R group
const int neighborIdx = *matchingMol->getAtomNeighbors(atom).first;
terminalRGroupAtomToNeighbor.emplace(atom->getIdx(), neighborIdx);
matchingMol->removeAtom(atom);
}
}
matchingMol->commitBatchEdit();
}
// Given a matching molecule substructure match to a target molecule, return
// core matches with terminal user R groups matched
std::vector<MatchVectType> RCore::matchTerminalUserRGroups(
const RWMol &target, MatchVectType match,
const SubstructMatchParameters &sssParams) const {
// Transform match indexed by matching molecule atoms to a map
// indexed by core atoms
std::transform(match.begin(), match.end(), match.begin(),
[this](const std::pair<int, int> &mapping) {
auto queryIdx =
this->matchingIndexToCoreIndex(mapping.first);
std::pair<int, int> newMapping(queryIdx, mapping.second);
return newMapping;
});
std::map<int, int> matchMap(match.cbegin(), match.cend());
std::vector<MatchVectType> allMappings;
if (terminalRGroupAtomToNeighbor.empty()) {
allMappings.push_back(std::move(match));
return allMappings;
}
// build a dynamic_bitset of target atoms currently mapped
boost::dynamic_bitset<> mappedTargetIdx(target.getNumAtoms());
for (const auto &pair : match) {
mappedTargetIdx.set(pair.second);
}
// Dummy atoms/r group attachments that cannot be mapped to target atoms
std::vector<int> missingDummies;
// A map of terminal dummies/R group attachment points to a list of possible
// target atoms that the R group can map to
std::map<int, std::vector<int>> availableMappingsForDummyMap;
boost::dynamic_bitset<> symmetricHydrogens(target.getNumAtoms());
// keep a count of target atoms and the number of connections to the core
std::map<int, int> targetAtomBondsToCoreCounts;
for (const auto coreAtom : core->atoms()) {
if (coreAtom->getAtomicNum() == 1) {
continue;
}
if (isTerminalRGroupWithUserLabel(coreAtom->getIdx())) {
continue;
}
std::vector<int> dummyIndexes;
for (auto neighbor : core->atomNeighbors(coreAtom)) {
if (isTerminalRGroupWithUserLabel(neighbor->getIdx())) {
dummyIndexes.push_back(neighbor->getIdx());
}
}
if (dummyIndexes.empty()) {
continue;
}
// Sort dummies based on user RLABEL (ascending) then unlabeled
// (descending as they are negative)
std::sort(dummyIndexes.begin(), dummyIndexes.end(), [this](int a, int b) {
auto dummy = core->getAtomWithIdx(a);
auto otherDummy = core->getAtomWithIdx(b);
auto l1 = dummy->getProp<int>(RLABEL);
auto l2 = otherDummy->getProp<int>(RLABEL);
return l1 > 0 && l2 > 0 ? l1 < l2 : l1 > l2;
});
// Find what target atoms will bond to these dummies
std::vector<std::vector<int>> neighborDummyLists;
for (auto dummyIndex : dummyIndexes) {
const int neighborIdx =
terminalRGroupAtomToNeighbor.at(dummyIndex);
const auto coreBond = core->getBondBetweenAtoms(dummyIndex, neighborIdx);
// find the atom in the target mapped to the neighbor in the core
const int targetIdx = matchMap[neighborIdx];
const auto targetAtom = target.getAtomWithIdx(targetIdx);
std::vector<int> available;
// now look for neighbors of that target atom that are not mapped to a
// core atom- the dummy atom can potentially be mapped to each of those
for (const auto &nbrIdx :
boost::make_iterator_range(target.getAtomNeighbors(targetAtom))) {
if (!mappedTargetIdx.test(nbrIdx)) {
const auto targetBond =
target.getBondBetweenAtoms(targetIdx, nbrIdx);
// check for bond compatibility
if (bondCompat(coreBond, targetBond, sssParams)) {
available.push_back(nbrIdx);
++targetAtomBondsToCoreCounts[nbrIdx];
}
}
}
if (available.size() > 1) {
bool allHydrogens = std::all_of(
available.begin(), available.end(), [&target](const int idx) {
return target.getAtomWithIdx(idx)->getAtomicNum() == 1;
});
if (allHydrogens) {
// If all neighbors are hydrogens we don't need to iterate through
// them- just assign the first free hydrogen. Could extend to cover
// symmetric groups in general
auto hydrogen = std::find_if(available.begin(), available.end(),
[&symmetricHydrogens](const int idx) {
return !symmetricHydrogens.test(idx);
});
int singleHydrogen =
hydrogen == available.end() ? *available.begin() : *hydrogen;
available = std::vector<int>{singleHydrogen};
symmetricHydrogens.set(singleHydrogen);
}
}
neighborDummyLists.push_back(available);
}
// Now search through all the dummies and see if we need to exclude any as
// it may not be possible to assign target atoms to all R groups. If that
// is the case exclude R Groups with higher labels
// use negative numbers to indicate a group is excluded
std::vector<std::vector<int>> neighborListsWithUnmapped(
neighborDummyLists.size());
int start = 0;
std::transform(neighborDummyLists.begin(), neighborDummyLists.end(),
neighborListsWithUnmapped.begin(),
[&start](std::vector<int> v) {
v.push_back(--start);
return v;
});
// could optimize this product as depth search to return as soon as we
// have a permutation of all positive numbers
auto cp = cartesianProduct(neighborListsWithUnmapped, false);
// now sort
bool foundAll = false;
std::sort(cp.begin(), cp.end(),
[&foundAll](const std::vector<int> a, std::vector<int> b) {
auto isNegative = [](int v) -> bool { return v < 0; };
// firstly to minimize number of unmapped dummies
const int numUnmappedA =
std::count_if(a.begin(), a.end(), isNegative);
const int numUnmappedB =
std::count_if(b.begin(), b.end(), isNegative);
if (numUnmappedA < numUnmappedB) {
return true;
}
if (numUnmappedB == 0 || numUnmappedA == 0) {
foundAll = true;
}
if (!foundAll && numUnmappedA == numUnmappedB && numUnmappedA) {
// in a tie prefer the permutation that excludes the r group
// with the lowest label
// don't need to sort these if we've found an all mapped
// combination
for (size_t i = 0; i < a.size(); ++i) {
const int v1 = a[i];
const int v2 = b[i];
if ((v1 > 0 && v2 > 0) || (v1 < 0 && v2 < 0)) {
continue;
}
if (v1 > 0 && v2 < 0) {
return true;
}
if (v1 < 0 && v2 > 0) {
break;
}
}
}
return false;
});
auto best = cp[0];
for (size_t i = 0; i < dummyIndexes.size(); i++) {
auto dummyIdx = dummyIndexes[i];
if (best[i] < 0) {
// We can't map this dummy to a target atom. That is OK if it is an
// unlabeled core attachment atom or a user R label connected to a
// query or wildcard atom
const auto dummy = core->getAtomWithIdx(dummyIdx);
if (dummy->hasProp(UNLABELED_CORE_ATTACHMENT)) {
missingDummies.push_back(dummyIdx);
} else if (isUserRLabel(*dummy) &&
(coreAtom->getAtomicNum() == 0 || coreAtom->hasQuery())) {
// https://github.com/rdkit/rdkit/issues/4505
missingDummies.push_back(dummyIdx);
} else {
return allMappings;
}
} else {
availableMappingsForDummyMap[dummyIdx] = neighborDummyLists[i];
}
}
}
if (availableMappingsForDummyMap.empty()) {
allMappings.push_back(std::move(match));
return allMappings;
}
std::vector<int> dummiesWithMapping;
std::vector<std::vector<int>> availableMappingsForDummy;
for (const auto &mapping : availableMappingsForDummyMap) {
dummiesWithMapping.push_back(mapping.first);
availableMappingsForDummy.push_back(mapping.second);
}
// enumerate over all available atoms using a cartesian product.
// only allow duplicates if a target atom can be bonded to more than one query
// atom
const bool allowDuplicates =
std::find_if(targetAtomBondsToCoreCounts.begin(),
targetAtomBondsToCoreCounts.end(),
[](const std::pair<int, int> &p) { return p.second > 1; }) !=
targetAtomBondsToCoreCounts.end();
const auto allAvailableMappings =
cartesianProduct(availableMappingsForDummy, allowDuplicates);
if (allAvailableMappings.empty()) {
allMappings.push_back(std::move(match));
return allMappings;
}
// the size of the final mapping
size_t size = allAvailableMappings[0].size() + match.size();
// these indices are needed for the whole molecule match check functor
std::unique_ptr<RWMol> checkCore;
std::map<size_t, size_t> coreToCheck;
const std::string indexProp("__core_index__");
bool hasMissing = !missingDummies.empty();
if (hasMissing) {
// if there are dummies that we can't map these need to be removed from the
// query before atom-by-atom matching. Create a copy of the query for that
// and use properties to map atoms back to the core
for (auto atom : core->atoms()) {
atom->setProp(indexProp, atom->getIdx());
}
checkCore = std::make_unique<RWMol>(*core);
std::sort(missingDummies.begin(), missingDummies.end(),
std::greater<int>());
for (int index : missingDummies) {
auto [nbrIdx, endNbrs] =
checkCore->getAtomNeighbors(checkCore->getAtomWithIdx(index));
auto neighborAtom = checkCore->getAtomWithIdx(*nbrIdx);
checkCore->removeAtom(index);
neighborAtom->updatePropertyCache(false);
}
size_t index = 0U;
for (const auto atom : checkCore->atoms()) {
auto coreIndex = atom->getProp<int>(indexProp);
coreToCheck[coreIndex] = index++;
}
for (auto atom : core->atoms()) {
atom->clearProp(indexProp);
}
}
auto queryIndices = new std::uint32_t[size];
auto targetIndices = new std::uint32_t[size];
for (size_t position = 0; position < match.size(); position++) {
const auto &pair = match[position];
auto queryIndex = hasMissing ? coreToCheck[pair.first] : pair.first;
queryIndices[position] = queryIndex;
targetIndices[position] = pair.second;
}
auto queryMatchingMol = hasMissing ? checkCore.get() : core.get();
MolMatchFinalCheckFunctor molMatchFunctor(*queryMatchingMol, target,
sssParams);
boost::dynamic_bitset<> targetBondsPresent(target.getNumBonds());
// Filter all available mappings removing those that violate chirality or have
// duplicate bonds
for (const auto &dummyMapping : allAvailableMappings) {
CHECK_INVARIANT(match.size() + dummyMapping.size() == size,
"Size error in dummy mapping");
auto duplicateBonds = false;
targetBondsPresent.reset();
for (size_t i = 0; i < dummyMapping.size(); i++) {
size_t position = match.size() + i;
auto queryIndex = hasMissing ? coreToCheck[dummiesWithMapping[i]]
: dummiesWithMapping[i];
queryIndices[position] = queryIndex;
targetIndices[position] = dummyMapping[i];
if (allowDuplicates) {
const int neighborIdx =
terminalRGroupAtomToNeighbor.at(dummiesWithMapping[i]);
const int targetNeighborIdx = matchMap[neighborIdx];
const auto targetBond =
target.getBondBetweenAtoms(dummyMapping[i], targetNeighborIdx);
CHECK_INVARIANT(targetBond != nullptr,
"Matching target bond not found");
const auto targetBondIdx = targetBond->getIdx();
// check for duplicates
if (targetBondsPresent[targetBondIdx]) {
duplicateBonds = true;
break;
}
targetBondsPresent[targetBondIdx] = 1;
}
}
// use MolMatchFinalCheckFunctor to check this match works with chirality
if (!duplicateBonds && molMatchFunctor(queryIndices, targetIndices)) {
MatchVectType matchWithDummy(match);
for (size_t i = 0; i < dummyMapping.size(); i++) {
matchWithDummy.emplace_back(dummiesWithMapping[i], dummyMapping[i]);
}
allMappings.push_back(std::move(matchWithDummy));
}
}
delete[] queryIndices;
delete[] targetIndices;
return allMappings;
}
// This function checks the bond environment is valid when onlyMatchAtRGroups
// is set. When this function is called targetRGroupIdx is the index of an atom
// in the target that is mapped to an R group. Validates that all core bonds to
// the R group are present - in certain circumstances there may be two
// core bonds to a target R group and when onlyMatchAtRGroups is set
// both bonds should be present (#4002).
bool RCore::checkAllBondsToRGroupPresent(
const ROMol &mol, const int targetRGroupIdx,
const std::vector<std::vector<int>> &targetToCoreIndices) const {
const auto targetRGroupAtom = mol.getAtomWithIdx(targetRGroupIdx);
std::set<int> coreNeighborIndices;
for (const auto &nbri :
boost::make_iterator_range(mol.getAtomNeighbors(targetRGroupAtom))) {
const auto &nbr = mol[nbri];
// could a neighbor to an r group attachment match another r group
// attachment? I don't think so.
if (nbr->getAtomicNum() >= 1) {
const auto &coreAtomIndices = targetToCoreIndices.at(nbri);
if (coreAtomIndices.empty()) {
continue;
}
CHECK_INVARIANT(coreAtomIndices.size() == 1,
"Target atom neighboring R group should have exactly one match in core");
auto coreAtomIdx = coreAtomIndices.front();
const auto coreAtom = core->getAtomWithIdx(coreAtomIdx);
// don't need to match a non terminal user R group
// if (!(coreAtom->getDegree() > 1 && isUserRLabel(*coreAtom))) {
if (!(coreAtom->getAtomicNum() == 0 && isUserRLabel(*coreAtom))) {
coreNeighborIndices.insert(coreAtomIdx);
}
}
}
CHECK_INVARIANT(
coreNeighborIndices.size() >= 1,
"Unable to find target atom(s) matching core for attachment point");
if (coreNeighborIndices.size() == 1) {
// currently this routine is only called when we know the attachment to
// one core atom exists.
return true;
}
// at this point we know the target atom is connected to two or more core
// atoms. Now check we have core R groups for each.
// There should be a map entry for a different R group for each of
// the core atoms.
const auto &coreRGroupIndices = targetToCoreIndices.at(targetRGroupIdx);
if (coreRGroupIndices.size() != coreNeighborIndices.size()) {
return false;
}
// check that there is a bond to an attachment point for every neighbor
std::set<int> bondIndices;
for (const auto coreNeighborIdx : coreNeighborIndices) {
for (const auto coreRGroupIdx : coreRGroupIndices) {
const auto bond = core->getBondBetweenAtoms(coreNeighborIdx, coreRGroupIdx);
if (bond) {
bondIndices.insert(bond->getIdx());
}
}
}
return bondIndices.size() == coreNeighborIndices.size();
}
// Convert a matching molecule index to a core index
int RCore::matchingIndexToCoreIndex(int matchingIndex) const {
auto atom = matchingMol->getAtomWithIdx(matchingIndex);
CHECK_INVARIANT(atom->hasProp(RLABEL_CORE_INDEX),
"Matched atom missing core index");
return atom->getProp<int>(RLABEL_CORE_INDEX);
}
// Create tautomer query for the matching mol on demand and cache for performance
// If the tautomer query cannot be created (because we can't kekulize the query)
// then nullptr will be returned and we revert to non-tautomer match
std::shared_ptr<TautomerQuery> RCore::getMatchingTautomerQuery() {
if (!checkedForTautomerQuery) {
try {
// Enumerate tautomers from a sanitized copy of the matching molecule
RWMol copy(*matchingMol);
// If the core has had rgroup labels removed when creating the matching mol
// then we need to update properties. Should a full sanitization be done?
// MolOps::sanitizeMol(*copy);
copy.updatePropertyCache(false);
std::shared_ptr<TautomerQuery> tautomerQuery(
TautomerQuery::fromMol(copy));
matchingTautomerQuery = tautomerQuery;
} catch (const MolSanitizeException &) {
matchingTautomerQuery = nullptr;
}
checkedForTautomerQuery = true;
}
return matchingTautomerQuery;
}
} // namespace RDKit