//
//  Copyright (c) 2017-2021, Novartis Institutes for BioMedical Research Inc.
//  and other RDKit contributors
//
//  All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
//       notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
//       copyright notice, this list of conditions and the following
//       disclaimer in the documentation and/or other materials provided
//       with the distribution.
//     * Neither the name of Novartis Institutes for BioMedical Research Inc.
//       nor the names of its contributors may be used to endorse or promote
//       products derived from this software without specific prior written
//       permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// #include "RGroupDecomp.h" #include "RGroupDecompData.h" #include #include #include #include #include #include #include #include #include #include "GraphMol/TautomerQuery/TautomerQuery.h" // #define VERBOSE 1 namespace RDKit { // Attachment Points // labeled cores => isotopes // atom mappings // atom indices => use -1 - atom index, range is [-1, ...., -num_atoms] const std::string RLABEL = "tempRlabel"; const std::string RLABEL_TYPE = "tempRlabelType"; const std::string RLABEL_CORE_INDEX = "rLabelCoreIndex"; const std::string SIDECHAIN_RLABELS = "sideChainRlabels"; const std::string done = "RLABEL_PROCESSED"; const std::string _rgroupInputDummy = "_rgroupInputDummy"; const std::string UNLABELED_CORE_ATTACHMENT = "unlabeledCoreAttachment"; namespace { static const std::string TARGET_ATOM_IDX = "__rgdTargetAtomIdx"; static const std::string TARGET_BOND_IDX = "__rgdTargetBondIdx"; void ADD_MATCH(R_DECOMP &match, int rlabel) { if (match.find(rlabel) == match.end()) { match[rlabel] = boost::make_shared(); } } } // namespace RGroupDecomposition::RGroupDecomposition( const ROMol &inputCore, const RGroupDecompositionParameters ¶ms) : data(new RGroupDecompData(inputCore, params)) {} RGroupDecomposition::RGroupDecomposition( const std::vector &cores, const RGroupDecompositionParameters ¶ms) : data(new RGroupDecompData(cores, params)) {} RGroupDecomposition::~RGroupDecomposition() { delete data; } void RGroupDecomposition::labelAtomBondIndices(RWMol &mol) { for (const auto targetAtom : mol.atoms()) { targetAtom->setProp(TARGET_ATOM_IDX, targetAtom->getIdx()); } for (const auto targetBond : mol.bonds()) { targetBond->setProp(TARGET_BOND_IDX, targetBond->getIdx()); } } void RGroupDecomposition::setTargetAtomBondIndices( ROMol &mol, bool includeBondsToRLabels) const { std::vector atomIndices(mol.getNumAtoms(), -1); std::vector bondIndices(mol.getNumBonds(), -1); int largestAtomIdx = -1; bool isHydrogen = RGroupData::isMolHydrogen(mol); for (const auto atom : mol.atoms()) { int 
targetAtomIdx; if (atom->getPropIfPresent(TARGET_ATOM_IDX, targetAtomIdx)) { atom->clearProp(TARGET_ATOM_IDX); if ((atom->getAtomicNum() == 1 && data->params.removeHydrogensPostMatch && !isHydrogen) || (atom->getAtomicNum() == 0 && !atom->hasProp(_rgroupInputDummy))) { continue; } int atomIdx = atom->getIdx(); atomIndices[atomIdx] = targetAtomIdx; largestAtomIdx = std::max(atomIdx, largestAtomIdx); } } atomIndices.resize(largestAtomIdx + 1); int largestBondIdx = -1; for (const auto bond : mol.bonds()) { int targetBondIdx; if (bond->getPropIfPresent(TARGET_BOND_IDX, targetBondIdx)) { bond->clearProp(TARGET_BOND_IDX); if ((bond->getBeginAtom()->getAtomicNum() == 1 || bond->getEndAtom()->getAtomicNum() == 1) && data->params.removeHydrogensPostMatch && !isHydrogen) { continue; } if (!includeBondsToRLabels && ((bond->getBeginAtom()->getAtomicNum() == 0 && !bond->getBeginAtom()->hasProp(_rgroupInputDummy)) || (bond->getEndAtom()->getAtomicNum() == 0 && !bond->getEndAtom()->hasProp(_rgroupInputDummy)))) { continue; } int bondIdx = bond->getIdx(); bondIndices[bondIdx] = targetBondIdx; largestBondIdx = std::max(bondIdx, largestBondIdx); } } bondIndices.resize(largestBondIdx + 1); std::vector highlightAtoms; highlightAtoms.reserve(atomIndices.size()); std::copy_if(atomIndices.begin(), atomIndices.end(), std::back_inserter(highlightAtoms), [](const auto atomIdx) { return atomIdx != -1; }); std::vector highlightBonds; highlightBonds.reserve(bondIndices.size()); std::copy_if(bondIndices.begin(), bondIndices.end(), std::back_inserter(highlightBonds), [](const auto bondIdx) { return bondIdx != -1; }); mol.setProp(common_properties::_rgroupTargetAtoms, highlightAtoms); mol.setProp(common_properties::_rgroupTargetBonds, highlightBonds); } int RGroupDecomposition::getMatchingCoreIdx( const ROMol &mol, std::vector *matches) { RWMol rwmol(mol); std::vector matchesTmp; const RCore *rcore; auto coreIdx = getMatchingCoreInternal(rwmol, rcore, matchesTmp); if (matches) { std::set 
uniqueMatches; int numAtoms = mol.getNumAtoms(); for (const auto &match : matchesTmp) { MatchVectType heavyMatch; heavyMatch.reserve(match.size()); std::copy_if( std::make_move_iterator(match.begin()), std::make_move_iterator(match.end()), std::back_inserter(heavyMatch), [numAtoms](const auto &pair) { return pair.second < numAtoms; }); std::sort(heavyMatch.begin(), heavyMatch.end()); uniqueMatches.insert(heavyMatch); } *matches = std::vector(uniqueMatches.begin(), uniqueMatches.end()); } return coreIdx; } int RGroupDecomposition::getMatchingCoreInternal( RWMol &mol, const RCore *&rcore, std::vector &matches) { rcore = nullptr; int core_idx = -1; const bool explicitOnly = false; const bool addCoords = true; MolOps::addHs(mol, explicitOnly, addCoords); std::vector tmatches; std::vector tmatches_filtered; // Find the first matching core (onlyMatchAtRGroups) // or the first core that requires the smallest number // of newly added labels and is a superstructure of // the first matching core int global_min_heavy_nbrs = -1; SubstructMatchParameters sssparams(params().substructmatchParams); sssparams.uniquify = false; sssparams.recursionPossible = true; for (auto &core : data->cores) { { // matching the core to the molecule is a two step process // First match to a reduced representation (the core minus terminal // R-groups). Next, match the R-groups. 
We do this as the core may not be // a substructure match for the molecule if a single molecule atom matches // 2 RGroup attachments (see https://github.com/rdkit/rdkit/pull/4002) // match the reduced representation: std::vector baseMatches; if (params().doTautomers) { // Here we are attempting to enumerate tautomers of the core if (auto tautomerQuery = core.second.getMatchingTautomerQuery(); tautomerQuery != nullptr) { // query atom indices from the tautomer query are the same as the // template matching molecule baseMatches = tautomerQuery->substructOf(mol, sssparams); } else { // However, if it is not possible to Kekulize the core, we revert back // to the non-tautomer matching. baseMatches = SubstructMatch(mol, *core.second.matchingMol, sssparams); } } else { baseMatches = SubstructMatch(mol, *core.second.matchingMol, sssparams); } tmatches.clear(); for (const auto &baseMatch : baseMatches) { // Match the R Groups // Important: there can be multiple core indices matching // the same target idx, because of #4002 auto matchesIncludingRGroups = core.second.matchTerminalUserRGroups(mol, baseMatch, sssparams); /* std::cerr << "baseMatch "; for (const auto &pair : baseMatch) std::cerr << "(" << pair.first <<"," << pair.second << "),"; std::cerr << std::endl; std::cerr << "matchesIncludingRGroups "; for (const auto &matchWithDummy : matchesIncludingRGroups) { for (const auto &pair : matchWithDummy) std::cerr << "(" << pair.first <<"," << pair.second << "),"; std::cerr << " /// "; } std::cerr << std::endl; */ tmatches.insert( tmatches.end(), std::make_move_iterator(matchesIncludingRGroups.cbegin()), std::make_move_iterator(matchesIncludingRGroups.cend())); } } if (tmatches.empty()) { continue; } std::vector tmatches_heavy_nbrs(tmatches.size(), 0); size_t i = 0; for (const auto &mv : tmatches) { bool passes_filter = data->params.onlyMatchAtRGroups; // targetToCoreIndices maps each atom idx in the molecule to a vector // of atom indices. 
This vector may be empty (if the atom in the molecule // has no match with core) or not. When not empty, it will most often // contain a single atom idx, corresponding to the matching index in the // core, as usually a core atom can only match a single molecule atom. // However, there is an important exception to this rule, i.e. when // the core bears a single R-group dummy at a certain position, while // the molecule has multiple substituents at the corresponding // position; in this case, the vector will contain the indices of the // root atom in all substituents which match a single R-group dummy on // the core. std::vector> targetToCoreIndices(mol.getNumAtoms()); for (const auto &match : mv) { targetToCoreIndices[match.second].push_back(match.first); } for (const auto &match : mv) { const auto atm = mol.getAtomWithIdx(match.second); // is this a labelled rgroup or not? if (!core.second.isCoreAtomUserLabelled(match.first)) { // nope... if any neighbor is not part of the substructure // check if it is a hydrogen; otherwise, if onlyMatchAtRGroups // is true, skip the match for (const auto &nbri : boost::make_iterator_range(mol.getAtomNeighbors(atm))) { const auto &nbr = mol[nbri]; if (nbr->getAtomicNum() != 1 && targetToCoreIndices.at(nbr->getIdx()).empty()) { if (data->params.onlyMatchAtRGroups) { passes_filter = false; break; } else { // for each match, we keep track of the number of // R labels that need to be added to match all // non-user-labelled R groups in this molecule // if we use this core for RGD ++tmatches_heavy_nbrs[i]; } } } } else if (core.second.isTerminalRGroupWithUserLabel(match.first) && data->params.onlyMatchAtRGroups && !core.second.checkAllBondsToRGroupPresent( mol, match.second, targetToCoreIndices)) { // labelled R-group passes_filter = false; } if (!passes_filter && data->params.onlyMatchAtRGroups) { break; } } if (passes_filter) { tmatches_filtered.push_back(std::move(mv)); } ++i; } if (!data->params.onlyMatchAtRGroups) { // 
tmatches_heavy_nbrs.size() = tmatches.size(), and // tmatches.size() cannot be empty, otherwise we should not be here // but let's check it in case something changes upstream CHECK_INVARIANT(!tmatches_heavy_nbrs.empty(), "tmatches_heavy_nbrs must not be empty"); int min_heavy_nbrs = *std::min_element(tmatches_heavy_nbrs.begin(), tmatches_heavy_nbrs.end()); if (!rcore || (min_heavy_nbrs < global_min_heavy_nbrs && !SubstructMatch(*core.second.core, *rcore->core, sssparams) .empty())) { i = 0; tmatches_filtered.clear(); for (const auto heavy_nbrs : tmatches_heavy_nbrs) { if (heavy_nbrs <= min_heavy_nbrs) { tmatches_filtered.push_back(std::move(tmatches[i])); } ++i; } global_min_heavy_nbrs = min_heavy_nbrs; rcore = &core.second; core_idx = core.first; if (global_min_heavy_nbrs == 0) { break; } } } else if (!tmatches_filtered.empty()) { rcore = &core.second; core_idx = core.first; break; } } if (rcore) { matches = std::move(tmatches_filtered); } return core_idx; } namespace { // Take the matches, all from the same molecule and split them so that // different atom sets are separated out. So that if a core hits // more than once in the molecule, both sets of R Groups will be // returned. 
std::vector> splitNonUniqueMatches( const std::vector &tmatches, unsigned int nAtoms) { std::vector> outMatches; std::vector> atomSets; for (const auto &match : tmatches) { boost::dynamic_bitset<> atomSet(nAtoms); for (const auto &mp : match) { atomSet.set(mp.second); } if (std::find(atomSets.begin(), atomSets.end(), atomSet) == atomSets.end()) { atomSets.push_back(atomSet); outMatches.push_back(std::vector(1, match)); } else { for (size_t i = 0; i < atomSets.size(); ++i) { if (atomSet == atomSets[i]) { outMatches[i].push_back(match); } } } } return outMatches; } } // namespace int RGroupDecomposition::add(const ROMol &inmol) { RWMOL_SPTR mol(new RWMol(inmol)); const RCore *rcore; std::vector tmatches; // Add Hs for better symmetrization auto core_idx = getMatchingCoreInternal(*mol, rcore, tmatches); if (rcore == nullptr) { BOOST_LOG(rdDebugLog) << "No core matches" << std::endl; return -1; } if (data->params.includeTargetMolInResults) { labelAtomBondIndices(*mol); } if (tmatches.size() > 1) { if (data->params.matchingStrategy == NoSymmetrization) { tmatches.resize(1); } else if (data->matches.size() == 0) { // Greedy strategy just grabs the first match and // takes the best matches from the rest if (data->params.matchingStrategy == Greedy) { tmatches.resize(1); } } } // mark any wildcards in input molecule: for (auto &atom : mol->atoms()) { if (atom->getAtomicNum() == 0) { atom->setProp(_rgroupInputDummy, true); // clean any existing R group numbers atom->setIsotope(0); atom->setAtomMapNum(0); atom->clearProp(common_properties::_MolFileRLabel); atom->setProp(common_properties::dummyLabel, "*"); } } // strategies // ========== // Exhaustive - saves all matches and optimizes later exhaustive // May never finish due to combinatorial complexity // Greedy - matches to *FIRST* available match // GreedyChunks - default - process every N chunks, unless // MAX_PERMUTATIONS is exceeded, in which case it falls back to // Greedy for the current chunk // Should probably scan 
all mols first to find match with // smallest number of matches... std::vector potentialMatches; constexpr size_t MAX_PERMUTATIONS = 100000; std::vector> nonUniqueMatches; if (data->params.allowMultipleCoresInSameMol) { nonUniqueMatches = splitNonUniqueMatches(tmatches, mol->getNumAtoms()); } else { nonUniqueMatches.push_back(tmatches); } for (const auto &splitMatch : nonUniqueMatches) { std::unique_ptr tMol; for (const auto &tmatche : splitMatch) { const bool replaceDummies = false; const bool labelByIndex = true; const bool requireDummyMatch = false; // TODO see if we need replaceCoreAtomsWithMolMatches or can just use // rcore->core auto coreCopy = rcore->replaceCoreAtomsWithMolMatches(*mol, tmatche); tMol.reset(replaceCore(*mol, *coreCopy, tmatche, replaceDummies, labelByIndex, requireDummyMatch)); #ifdef VERBOSE std::cerr << "Core Match core_idx " << core_idx << " idx " << data->matches.size() << ": " << MolToSmarts(*coreCopy) << std::endl; #endif if (tMol) { #ifdef VERBOSE std::cerr << "All Fragments " << MolToSmiles(*tMol) << std::endl; #endif R_DECOMP match; // rlabel rgroups MOL_SPTR_VECT fragments = MolOps::getMolFrags(*tMol, false); std::set coreAtomAnyMatched; // get the sidechains for (size_t i = 0; i < fragments.size(); ++i) { const auto &newMol = fragments[i]; std::vector rlabelsOnSideChain; newMol->setProp("core", core_idx); newMol->setProp("idx", data->matches.size()); newMol->setProp("frag_idx", i); #ifdef VERBOSE std::cerr << "Fragment " << MolToSmiles(*newMol) << std::endl; #endif for (auto sideChainAtom : newMol->atoms()) { if (sideChainAtom->getAtomicNum() != 0) { // we are only interested in sidechain R group atoms continue; } if (!sideChainAtom->hasProp(_rgroupInputDummy)) { // this is the index of the core atom that the R group // atom is attached to unsigned int coreAtomIndex = sideChainAtom->getIsotope(); auto coreAtom = rcore->core->getAtomWithIdx(coreAtomIndex); coreAtomAnyMatched.insert(coreAtomIndex); int rlabel; if 
(coreAtom->getPropIfPresent(RLABEL, rlabel)) { std::vector rlabelsOnSideChainAtom; sideChainAtom->getPropIfPresent(SIDECHAIN_RLABELS, rlabelsOnSideChainAtom); rlabelsOnSideChainAtom.push_back(rlabel); sideChainAtom->setProp(SIDECHAIN_RLABELS, rlabelsOnSideChainAtom); data->labels.insert(rlabel); // keep track of all labels used rlabelsOnSideChain.push_back(rlabel); if (const auto [bondIdx, end] = newMol->getAtomBonds(sideChainAtom); bondIdx != end) { auto connectingBond = (*newMol)[*bondIdx]; if (connectingBond->getStereo() > Bond::BondStereo::STEREOANY) { // TODO: how to handle bond stereo on rgroups connected to // core by stereo double bonds connectingBond->setStereo(Bond::BondStereo::STEREOANY); } } } } else { // restore input wildcard sideChainAtom->clearProp(_rgroupInputDummy); } } if (data->params.includeTargetMolInResults) { setTargetAtomBondIndices(*newMol, true); } if (!rlabelsOnSideChain.empty()) { #ifdef VERBOSE std::string newCoreSmi = MolToSmiles(*newMol, true); #endif for (auto rlabel : rlabelsOnSideChain) { ADD_MATCH(match, rlabel); match[rlabel]->add(newMol, rlabelsOnSideChain); #ifdef VERBOSE std::cerr << "Fragment " << i << " R" << rlabel << " " << MolToSmiles(*newMol) << std::endl; #endif } } else { // special case, only one fragment if (fragments.size() == 1) { // need to make a new core // remove the sidechains // GJ I think if we ever get here that it's really an error and I // believe that I've fixed the case where this code was called. // Still, I'm too scared to delete the block. 
RWMol newCore(*mol); for (const auto &mvpair : tmatche) { const Atom *coreAtm = rcore->core->getAtomWithIdx(mvpair.first); Atom *newCoreAtm = newCore.getAtomWithIdx(mvpair.second); int rlabel; if (coreAtm->getPropIfPresent(RLABEL, rlabel)) { newCoreAtm->setProp(RLABEL, rlabel); } newCoreAtm->setProp("keep", true); } newCore.beginBatchEdit(); for (const auto atom : newCore.atoms()) { if (!atom->hasProp("keep")) { newCore.removeAtom(atom); } } newCore.commitBatchEdit(); if (newCore.getNumAtoms()) { std::string newCoreSmi = MolToSmiles(newCore, true); // add a new core if possible auto newcore = data->newCores.find(newCoreSmi); int core_idx = 0; if (newcore == data->newCores.end()) { core_idx = data->newCores[newCoreSmi] = data->newCoreLabel--; data->cores[core_idx] = RCore(newCore); return add(inmol); } } } } } if (!match.empty()) { // this is the number of user-defined R labels associated with // non-hydrogen substituents auto numberUserGroupsInMatch = std::accumulate( match.begin(), match.end(), 0, [](int sum, const std::pair> &p) { return p.first > 0 && !p.second->is_hydrogen ? 
++sum : sum; }); int numberMissingUserGroups = rcore->numberUserRGroups - numberUserGroupsInMatch; CHECK_INVARIANT(numberMissingUserGroups >= 0, "Data error in missing user rgroup count"); const auto extractedCore = rcore->extractCoreFromMolMatch(*mol, tmatche, params()); if (data->params.includeTargetMolInResults) { setTargetAtomBondIndices(*extractedCore, false); } potentialMatches.emplace_back(core_idx, numberMissingUserGroups, match, extractedCore); if (data->params.includeTargetMolInResults) { potentialMatches.back().setTargetMoleculeForHighlights(mol); } } } } if (potentialMatches.empty()) { BOOST_LOG(rdDebugLog) << "No attachment points in side chains" << std::endl; return -2; } if (data->params.matchingStrategy != GA) { size_t N = 1; for (auto matche = data->matches.begin() + data->previousMatchSize; matche != data->matches.end(); ++matche) { size_t sz = matche->size(); N *= sz; } // Highly symmetric cores can lead to a very large number of // permutations to test. Fall back to Greedy for the current chunk // when the number is too high. if (N * potentialMatches.size() > MAX_PERMUTATIONS) { data->process(data->prunePermutations); } } data->matches.push_back(std::move(potentialMatches)); } if (!data->matches.empty()) { if (data->params.matchingStrategy & Greedy || (data->params.matchingStrategy & GreedyChunks && data->matches.size() % data->params.chunkSize == 0)) { data->process(data->prunePermutations); } } return data->matches.size() - 1; } bool RGroupDecomposition::process() { return processAndScore().success; } RGroupDecompositionProcessResult RGroupDecomposition::processAndScore() { try { const bool finalize = true; return data->process(data->prunePermutations, finalize); } catch (...) 
{ return RGroupDecompositionProcessResult(false, -1); } } std::vector RGroupDecomposition::getRGroupLabels() const { // this is a bit of a cheat RGroupColumns cols = getRGroupsAsColumns(); std::vector labels; for (auto it : cols) { labels.push_back(it.first); } std::sort(labels.begin(), labels.end()); return labels; } RWMOL_SPTR RGroupDecomposition::outputCoreMolecule( const RGroupMatch &match, const UsedLabelMap &usedLabelMap) const { // this routine could probably be merged into RGroupDecompData::relabelCore const auto &core = data->cores[match.core_idx]; if (!match.matchedCore) { return core.labelledCore; } auto coreWithMatches = match.matchedCore; #ifdef VERBOSE std::cerr << "output core mol1 " << MolToSmarts(*coreWithMatches) << std::endl; #endif std::map retainedRGroups; for (auto atomIdx = coreWithMatches->getNumAtoms(); atomIdx--;) { auto atom = coreWithMatches->getAtomWithIdx(atomIdx); if (atom->getAtomicNum()) { continue; } auto label = data->getRlabel(atom); // Always convert to hydrogen - then remove later if // removeHydrogensPostMatch is set Atom *nbrAtom = nullptr; for (const auto &nbri : boost::make_iterator_range(coreWithMatches->getAtomNeighbors(atom))) { nbrAtom = (*coreWithMatches)[nbri]; break; } if (nbrAtom) { const bool isUserDefinedLabel = usedLabelMap.has(label) && usedLabelMap.isUserDefined(label); const bool isUsedLabel = usedLabelMap.has(label) && usedLabelMap.getIsUsed(label); if (!isUsedLabel && (!isUserDefinedLabel || data->params.removeAllHydrogenRGroupsAndLabels)) { // Always convert to hydrogen - then remove later if // removeHydrogensPostMatch is set atom->setAtomicNum(1); atom->updatePropertyCache(false); } else { retainedRGroups[atom] = label; } } } #ifdef VERBOSE std::cerr << "output core mol2 " << MolToSmiles(*coreWithMatches) << std::endl; #endif if (data->params.removeHydrogensPostMatch) { RDLog::LogStateSetter blocker; const MolOps::RemoveHsParameters rhp; constexpr bool sanitize = false; MolOps::removeHs(*coreWithMatches, 
rhp, sanitize); coreWithMatches->updatePropertyCache(false); } if (coreWithMatches->getNumConformers() > 0) { for (const auto &[atom, label] : retainedRGroups) { if (usedLabelMap.has(label) && usedLabelMap.isUserDefined(label)) { // coordinates of user defined R groups should already be copied over continue; } const auto neighbor = *coreWithMatches->atomNeighbors(atom).begin(); const auto &mapping = data->finalRlabelMapping; if (const auto oldLabel = std::find_if( mapping.begin(), mapping.end(), [label = label](const auto &p) { return p.second == label; }); oldLabel != mapping.end()) { if (auto iter = match.rgroups.find(oldLabel->first); iter != match.rgroups.end()) { MolOps::setTerminalAtomCoords(*coreWithMatches, atom->getIdx(), neighbor->getIdx()); } } } } if (!coreWithMatches->getRingInfo()->isInitialized()) { MolOps::symmetrizeSSSR(*coreWithMatches); } #ifdef VERBOSE std::cerr << "output core mol3 " << MolToSmiles(*coreWithMatches) << std::endl; #endif return coreWithMatches; } RGroupRows RGroupDecomposition::getRGroupsAsRows() const { std::vector permutation = data->GetCurrentBestPermutation(); RGroupRows groups; auto usedLabelMap = UsedLabelMap(data->finalRlabelMapping); for (auto it = permutation.begin(); it != permutation.end(); ++it) { auto Rs_seen(usedLabelMap); // make a new rgroup entry groups.push_back(RGroupRow()); RGroupRow &out_rgroups = groups.back(); if (data->params.includeTargetMolInResults) { out_rgroups.emplace(RGroupData::getMolLabel(), it->getTargetMoleculeForHighlights( data->params.removeHydrogensPostMatch)); } const R_DECOMP &in_rgroups = it->rgroups; for (const auto &rgroup : in_rgroups) { const auto realLabel = data->finalRlabelMapping.find(rgroup.first); CHECK_INVARIANT(realLabel != data->finalRlabelMapping.end(), "unprocessed rlabel, please call process() first."); Rs_seen.setIsUsed(realLabel->second); out_rgroups.emplace(RGroupData::getRGroupLabel(realLabel->second), rgroup.second->combinedMol); } 
out_rgroups.emplace(RGroupData::getCoreLabel(), outputCoreMolecule(*it, Rs_seen)); } return groups; } //! return rgroups in column order group[attachment_point][molidx] = ROMol RGroupColumns RGroupDecomposition::getRGroupsAsColumns() const { std::vector permutation = data->GetCurrentBestPermutation(); RGroupColumns groups; std::unordered_set rGroupWithRealMol{RGroupData::getCoreLabel()}; if (data->params.includeTargetMolInResults) { rGroupWithRealMol.insert(RGroupData::getMolLabel()); } auto usedLabelMap = UsedLabelMap(data->finalRlabelMapping); unsigned int molidx = 0; for (auto it = permutation.begin(); it != permutation.end(); ++it, ++molidx) { auto Rs_seen(usedLabelMap); const R_DECOMP &in_rgroups = it->rgroups; if (data->params.includeTargetMolInResults) { groups[RGroupData::getMolLabel()].push_back( it->getTargetMoleculeForHighlights( data->params.removeHydrogensPostMatch)); } for (const auto &rgroup : in_rgroups) { const auto realLabel = data->finalRlabelMapping.find(rgroup.first); CHECK_INVARIANT(realLabel != data->finalRlabelMapping.end(), "unprocessed rlabel, please call process() first."); CHECK_INVARIANT(rgroup.second->combinedMol->hasProp(done), "Not done! 
Call process()"); CHECK_INVARIANT(!Rs_seen.getIsUsed(realLabel->second), "R group label appears multiple times!"); Rs_seen.setIsUsed(realLabel->second); auto r = RGroupData::getRGroupLabel(realLabel->second); RGroupColumn &col = groups[r]; if (molidx && col.size() < molidx - 1) { col.resize(molidx - 1); } col.push_back(rgroup.second->combinedMol); rGroupWithRealMol.insert(r); } groups[RGroupData::getCoreLabel()].push_back( outputCoreMolecule(*it, Rs_seen)); // add empty entries to columns where this molecule didn't appear for (const auto &realLabel : data->finalRlabelMapping) { if (!Rs_seen.getIsUsed(realLabel.second)) { auto r = RGroupData::getRGroupLabel(realLabel.second); groups[r].push_back(boost::make_shared()); } } } // purge R-group entries that have no mols for (auto it = groups.begin(); it != groups.end();) { auto itToErase = groups.end(); if (!rGroupWithRealMol.count(it->first)) { itToErase = it; } ++it; if (itToErase != groups.end()) { groups.erase(itToErase); } } return groups; } const RGroupDecompositionParameters &RGroupDecomposition::params() const { return data->params; } namespace { std::vector Decomp(RGroupDecomposition &decomp, const std::vector &mols) { auto t0 = std::chrono::steady_clock::now(); std::vector unmatched; for (size_t i = 0; i < mols.size(); ++i) { int v = decomp.add(*mols[i].get()); if (v == -1) { unmatched.push_back(i); } checkForTimeout(t0, decomp.params().timeout); } decomp.process(); return unmatched; } } // namespace unsigned int RGroupDecompose(const std::vector &cores, const std::vector &mols, RGroupRows &rows, std::vector *unmatchedIndices, const RGroupDecompositionParameters &options) { RGroupDecomposition decomp(cores, options); std::vector unmatched = Decomp(decomp, mols); if (unmatchedIndices) { *unmatchedIndices = unmatched; } rows = decomp.getRGroupsAsRows(); return mols.size() - unmatched.size(); } unsigned int RGroupDecompose(const std::vector &cores, const std::vector &mols, RGroupColumns &columns, std::vector 
*unmatchedIndices, const RGroupDecompositionParameters &options) { RGroupDecomposition decomp(cores, options); std::vector unmatched = Decomp(decomp, mols); if (unmatchedIndices) { *unmatchedIndices = unmatched; } columns = decomp.getRGroupsAsColumns(); return mols.size() - unmatched.size(); } } // namespace RDKit