// Copyright (c) 2017, Novartis Institutes for BioMedical Research Inc. // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: // // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above // copyright notice, this list of conditions and the following // disclaimer in the documentation and/or other materials provided // with the distribution. // * Neither the name of Novartis Institutes for BioMedical Research Inc. // nor the names of its contributors may be used to endorse or promote // products derived from this software without specific prior written // permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // #include "RGroupDecomp.h" #include #include #include #include #include #include #include #include #include #include #include #include //#define DEBUG namespace RDKit { // Attachment Points // labeled cores => isotopes // atom mappings // atom indices => use -1 - atom index, range is [-1, ...., -num_atoms] namespace { const std::string RLABEL = "tempRlabel"; const std::string SIDECHAIN_RLABELS = "sideChainRlabels"; const std::string done = "RLABEL_PROCESSED"; // Various places where rgroups can be labeled // the order of precedence enum class Labelling { RGROUP_LABELS, ISOTOPE_LABELS, ATOMMAP_LABELS, INDEX_LABELS, DUMMY_LABELS, INTERNAL_LABELS }; std::string labellingToString(Labelling type) { switch (type) { case Labelling::RGROUP_LABELS: return "RGroupLabels"; case Labelling::ISOTOPE_LABELS: return "IsotopeLabels"; case Labelling::ATOMMAP_LABELS: return "AtomMapLabels"; case Labelling::INDEX_LABELS: return "IndexLabels"; case Labelling::DUMMY_LABELS: return "DummyLabels"; case Labelling::INTERNAL_LABELS: return "InternalLabels"; } return "unknown"; } // Return the current set of rlabels for the molecule // Negative RLabels are ones not assigned by the user and are // either set to the index of the atom OR set to the index of // the atom from the core with the best MCS. // Positive rlabels are user defined (i.e. the user has specified // specific R1, R2 etc... std::map getRlabels(const RWMol &mol) { std::map atoms; for (auto atom : mol.atoms()) { if (atom->hasProp(RLABEL)) { int rlabel = atom->getProp(RLABEL); // user label CHECK_INVARIANT(atoms.find(rlabel) == atoms.end(), "Duplicate labels in rgroup core!"); atoms[rlabel] = atom; } } return atoms; } void clearInputLabels(Atom *atom) { // atom->setIsotope(0); Don't want to clear deuterium and things like that if // they aren't labels atom->setAtomMapNum(0); if (atom->hasProp(common_properties::_MolFileRLabel)) { atom->clearProp(common_properties::_MolFileRLabel); } } bool setLabel(Atom *atom, int label, std::set &labels, int &maxLabel, bool relabel, Labelling type) { if (type == Labelling::ISOTOPE_LABELS) { atom->setIsotope(0); } else if (type == Labelling::ATOMMAP_LABELS) { atom->setAtomMapNum(0); } else if (type == Labelling::RGROUP_LABELS) { if (atom->hasProp(common_properties::_MolFileRLabel)) { atom->clearProp(common_properties::_MolFileRLabel); atom->setIsotope(0); } } if (label) { if (labels.find(label) != labels.end()) { if (relabel) { if (type == Labelling::INTERNAL_LABELS) { std::cerr << "WARNING: relabelling existing label" << std::endl; } label = maxLabel + 1; } else { // XXX FIX me - get label id throw ValueErrorException( std::string("Duplicate label in input, current type is:") + labellingToString(type)); } } atom->setProp(RLABEL, label); labels.insert(label); maxLabel = label + 1; return true; } return false; } bool hasDummy(const RWMol &core) { for (RWMol::ConstAtomIterator atIt = core.beginAtoms(); atIt != core.endAtoms(); ++atIt) { if ((*atIt)->getAtomicNum() == 0) { return true; } } return false; } } // namespace unsigned int RGroupDecompositionParameters::autoGetLabels(const RWMol &core) { unsigned int autoLabels = 0; if (!onlyMatchAtRGroups) { autoLabels = AtomIndexLabels; } bool hasMDLRGroup = false; bool hasAtomMapNum = false; bool hasIsotopes = false; bool hasDummies = false; for (auto atm : core.atoms()) { if (atm->getIsotope()) { hasIsotopes = true; } if (atm->getAtomMapNum()) { hasAtomMapNum = true; } if (atm->hasProp(common_properties::_MolFileRLabel)) { hasMDLRGroup = true; } if (atm->getAtomicNum() == 0) { hasDummies = true; } } if (hasMDLRGroup) { return autoLabels | MDLRGroupLabels; } else if (hasAtomMapNum) { return autoLabels | AtomMapLabels; } else if (hasIsotopes) { return autoLabels | IsotopeLabels; } else if (hasDummies) { return autoLabels | DummyAtomLabels; } return autoLabels; } bool RGroupDecompositionParameters::prepareCore(RWMol &core, const RWMol *alignCore) { const bool relabel = labels & RelabelDuplicateLabels; unsigned int autoLabels = labels; if (labels == AutoDetect) { autoLabels = autoGetLabels(core); if (!autoLabels) { BOOST_LOG(rdWarningLog) << "RGroupDecomposition auto detect found no " "rgroups and onlyMatAtRgroups is set to true" << std::endl; return false; } } if (alignCore && (alignment & MCS)) { std::vector mols; mols.push_back(ROMOL_SPTR(new ROMol(core))); mols.push_back(ROMOL_SPTR(new ROMol(*alignCore))); MCSResult res = findMCS(mols); if (res.isCompleted()) { RWMol *m = SmartsToMol(res.SmartsString); if (m) { MatchVectType match1; MatchVectType match2; bool target_matched1 = SubstructMatch(core, *m, match1); bool target_matched2 = SubstructMatch(*alignCore, *m, match2); CHECK_INVARIANT(match1.size() == match2.size(), "Matches should be the same size in prepareCore"); if (target_matched1 && target_matched2) { for (size_t i = 0; i < match1.size(); ++i) { int queryAtomIdx1 = match1[i].first; int coreAtomIdx = match1[i].second; int queryAtomIdx2 = match2[i].first; int alignCoreAtomIdx = match2[i].second; CHECK_INVARIANT(queryAtomIdx1 == queryAtomIdx2, "query atoms aren't the same"); Atom *coreAtm = core.getAtomWithIdx(coreAtomIdx); const Atom *alignCoreAtm = alignCore->getAtomWithIdx(alignCoreAtomIdx); // clear up input rlabels coreAtm->setAtomMapNum(0); if (coreAtm->hasProp(common_properties::_MolFileRLabel)) { coreAtm->clearProp(common_properties::_MolFileRLabel); coreAtm->setIsotope(0); } if (alignCoreAtm->hasProp(RLABEL)) { int rlabel = alignCoreAtm->getProp(RLABEL); coreAtm->setProp(RLABEL, rlabel); } } } delete m; } } } std::set foundLabels; int maxLabel = 0; int nextOffset = 0; std::map atomToLabel; for (auto atom : core.atoms()) { bool found = false; if (atom->hasProp(RLABEL)) { if (setLabel(atom, atom->getProp(RLABEL), foundLabels, maxLabel, relabel, Labelling::INTERNAL_LABELS)) { found = true; } } if (!found && (autoLabels & MDLRGroupLabels)) { unsigned int rgroup; if (atom->getPropIfPresent( common_properties::_MolFileRLabel, rgroup)) { if (setLabel(atom, rdcast(rgroup), foundLabels, maxLabel, relabel, Labelling::RGROUP_LABELS)) { found = true; } } } if (!found && (autoLabels & IsotopeLabels) && atom->getIsotope() > 0) { if (setLabel(atom, rdcast(atom->getIsotope()), foundLabels, maxLabel, relabel, Labelling::ISOTOPE_LABELS)) { found = true; } } if (!found && (autoLabels & AtomMapLabels) && atom->getAtomMapNum() > 0) { if (setLabel(atom, rdcast(atom->getAtomMapNum()), foundLabels, maxLabel, relabel, Labelling::ATOMMAP_LABELS)) { found = true; } } if (!found && (autoLabels & DummyAtomLabels) && atom->getAtomicNum() == 0) { const bool forceRelabellingWithDummies = true; int defaultDummyStartLabel = 1; if (setLabel(atom, defaultDummyStartLabel, foundLabels, maxLabel, forceRelabellingWithDummies, Labelling::DUMMY_LABELS)) { found = true; } } // Unless there is an MCS match from above, we need to give different // RLABELS to each core so keep track of which labels // we have used (note that these are negative since they are // potential rgroups and haven't been assigned yet) if (!found && (autoLabels & AtomIndexLabels)) { if (setLabel(atom, indexOffset - atom->getIdx(), foundLabels, maxLabel, relabel, Labelling::INDEX_LABELS)) { nextOffset++; } found = true; } clearInputLabels(atom); int rlabel; if (atom->getPropIfPresent(RLABEL, rlabel)) { atomToLabel[atom->getIdx()] = rlabel; } } indexOffset -= nextOffset; MolOps::AdjustQueryParameters adjustParams; adjustParams.makeDummiesQueries = true; adjustParams.adjustDegree = false; adjustQueryProperties(core, &adjustParams); for (auto &it : atomToLabel) { core.getAtomWithIdx(it.first)->setProp(RLABEL, it.second); } return true; } namespace { // RGroup Class to hold the attached bits struct RGroupData { boost::shared_ptr combinedMol; std::vector> mols; // All the mols in the rgroup std::set smilesSet; // used for rgroup equivalence std::string smiles; // smiles for all the mols in the rgroup (with attachments) std::set attachments; // attachment points bool labelled; private: RGroupData(const RGroupData &rhs); public: RGroupData() : combinedMol(), mols(), smilesSet(), smiles(), attachments(), labelled(false) {} void add(boost::shared_ptr newMol, const std::vector &rlabel_attachments) { // some fragments can be add multiple times if they are cyclic for (auto &mol : mols) { if (newMol.get() == mol.get()) { return; } } labelled = false; std::copy(rlabel_attachments.begin(), rlabel_attachments.end(), std::inserter(attachments, attachments.end())); mols.push_back(newMol); std::string smi = MolToSmiles(*newMol, true); // REVIEW: we probably shouldn't be using a set here... the merging of // duplicates is likely not what we want smilesSet.insert(smi); if (!combinedMol.get()) { combinedMol = boost::shared_ptr(new RWMol(*mols[0].get())); } else { ROMol *m = combineMols(*combinedMol.get(), *newMol.get()); m->updateProps(*combinedMol.get()); combinedMol.reset(new RWMol(*m)); delete m; } smiles = getSmiles(); combinedMol->setProp(common_properties::internalRgroupSmiles, smiles); } std::map getNumBondsToRlabels() const { std::map rlabelsUsedCount; for (ROMol::AtomIterator atIt = combinedMol->beginAtoms(); atIt != combinedMol->endAtoms(); ++atIt) { Atom *atom = *atIt; int rlabel; if (atom->getPropIfPresent(RLABEL, rlabel)) { rlabelsUsedCount[rlabel] += 1; } } return rlabelsUsedCount; } bool isHydrogen() const { // is the rgroup all Hs for (const auto &mol : mols) { for (ROMol::AtomIterator atIt = mol->beginAtoms(); atIt != mol->endAtoms(); ++atIt) { if ((*atIt)->getAtomicNum() > 1) { return false; } } } return true; } private: std::string getSmiles() const { // compute the canonical smiles for the attachments std::string s; for (const auto &it : smilesSet) { if (s.length()) { s += "."; } s += it; } return s; } }; } // namespace namespace { typedef boost::shared_ptr RData; typedef std::map R_DECOMP; struct RGroupMatch { // RGroupMatch is the decomposition for a single molecule size_t core_idx; // index of the matching core R_DECOMP rgroups; // rlabel->RGroupData mapping RGroupMatch(size_t core_index, R_DECOMP input_rgroups) : core_idx(core_index), rgroups(std::move(input_rgroups)) {} }; void ADD_MATCH(R_DECOMP &match, int rlabel) { if (match.find(rlabel) == match.end()) { match[rlabel] = boost::make_shared(); } } struct CartesianProduct { std::vector permutation; std::vector sizes; size_t maxPermutations; size_t permutationCount; CartesianProduct(const std::vector &inputSizes) : permutation(inputSizes.size(), 0), sizes(inputSizes), permutationCount(0) { maxPermutations = 1; for (unsigned long size : sizes) { maxPermutations *= size; // may overflow.... } } bool next() { ++permutationCount; if (permutationCount == 1) { return true; } return increment(0); } bool increment(size_t rowToIncrement) { if (permutationCount > maxPermutations) { return false; } permutation[rowToIncrement] += 1; size_t max_index_of_row = sizes[rowToIncrement] - 1; if (permutation[rowToIncrement] > max_index_of_row) { permutation[rowToIncrement] = 0; return increment(rowToIncrement + 1); } return true; } }; // stupid total score double score(const std::vector &permutation, const std::vector> &matches, const std::set &labels) { double score = 1.; #ifdef DEBUG std::cerr << "---------------------------------------------------" << std::endl; std::cerr << "Scoring permutation " << " num matches: " << matches.size() << std::endl; #endif for (int l : labels) { #ifdef DEBUG std::cerr << "Label: " << l << std::endl; #endif std::map matchSet; std::map, int> linkerMatchSet; std::map onlyH; for (size_t m = 0; m < permutation.size(); ++m) { // for each molecule auto rg = matches[m][permutation[m]].rgroups.find(l); if (rg != matches[m][permutation[m]].rgroups.end()) { #ifdef DEBUG std::cerr << " combined: " << MolToSmiles(*rg->second->combinedMol) << std::endl; std::cerr << " RGroup: " << rg->second->smiles << " " << rg->second->smiles.find_first_not_of("0123456789[]*H:."); #endif matchSet[rg->second->smiles] += 1; // detect whether or not this is an H if (rg->second->smiles.find_first_not_of("0123456789[]*H:.") < rg->second->smiles.length()) { onlyH[rg->second->smiles] = 0; } else { onlyH[rg->second->smiles] = 1; } #ifdef DEBUG std::cerr << " " << rg->second->combinedMol->getNumAtoms(false) << " isH: " << onlyH[rg->second->smiles] << " score: " << matchSet[rg->second->smiles] << std::endl; #endif // XXX Use fragment counts to see if we are linking cycles? if (rg->second->smiles.find(".") == std::string::npos && rg->second->attachments.size() > 1) { linkerMatchSet[rg->second->attachments]++; #ifdef DEBUG std::cerr << " Linker Score: " << linkerMatchSet[rg->second->attachments]++ << std::endl; #endif } } } // get the counts for each rgroup found and sort in reverse order std::vector equivalentRGroupCount; for (std::map::const_iterator it = matchSet.begin(); it != matchSet.end(); ++it) { #ifdef DEBUG std::cerr << " equiv: " << it->first << " " << it->second << " " << permutation.size() << std::endl; #endif // if the rgroup is hydrogens, only consider if the group is all // hydrogen, otherwise score based on the non hydrogens if (onlyH[it->first]) { if (static_cast(it->second) == permutation.size()) { equivalentRGroupCount.push_back(static_cast(it->second)); } else { // hydrogens in a mixed group don't contribute to the score equivalentRGroupCount.push_back(0.0); } } else { equivalentRGroupCount.push_back(static_cast(it->second)); } } std::sort(equivalentRGroupCount.begin(), equivalentRGroupCount.end(), std::greater()); double tempScore = 1.; // score the sets from the largest to the smallest // each smaller set gets penalized (i+1) below // 1.0 is the perfect score for (size_t i = 0; i < equivalentRGroupCount.size(); ++i) { auto lscore = equivalentRGroupCount[i] / ((i + 1) * (double)matches.size()); if (lscore > 0) { tempScore *= lscore * lscore; } #ifdef DEBUG std::cerr << " lscore^2 " << i << ": " << lscore * lscore << std::endl; #endif } // overweight linkers with the same attachments points.... // because these belong to 2 rgroups we really want these to stay // ** this heuristic really should be taken care of above ** int maxLinkerMatches = 0; for (std::map, int>::const_iterator it = linkerMatchSet.begin(); it != linkerMatchSet.end(); ++it) { if (it->second > 1) { if (it->second > maxLinkerMatches) { maxLinkerMatches = it->second; } } } #ifdef DEBUG std::cerr << "Max Linker Matches :" << maxLinkerMatches << std::endl; #endif double increment = 1.0; // no change in score double linkerIncrement = 1.0; // no change in score if (maxLinkerMatches) { linkerIncrement = (double)(maxLinkerMatches) / (double)matches.size(); } else { increment = tempScore; } score += increment * linkerIncrement; #ifdef DEBUG std::cerr << "Increment: " << increment << " Linker_Increment: " << linkerIncrement << std::endl; std::cerr << "increment*linkerIncrement: " << increment * linkerIncrement << std::endl; std::cerr << "Score = " << score << std::endl; #endif } return score; } } // namespace const unsigned int EMPTY_CORE_LABEL = -100000; struct RGroupDecompData { // matches[mol_idx] == vector of potential matches std::map cores; std::map newCores; // new "cores" found along the way int newCoreLabel; RGroupDecompositionParameters params; std::vector> matches; std::set labels; std::vector permutation; std::map> userLabels; std::vector processedRlabels; std::map> labelledCores; std::map finalRlabelMapping; RGroupDecompData(const RWMol &inputCore, RGroupDecompositionParameters inputParams) : cores(), newCores(), newCoreLabel(EMPTY_CORE_LABEL), params(std::move(inputParams)) { cores[0] = inputCore; prepareCores(); } RGroupDecompData(const std::vector &inputCores, RGroupDecompositionParameters inputParams) : cores(), newCores(), newCoreLabel(EMPTY_CORE_LABEL), params(std::move(inputParams)) { for (size_t i = 0; i < inputCores.size(); ++i) { cores[i] = *inputCores[i].get(); } prepareCores(); } void prepareCores() { size_t idx = 0; for (auto coreIt = cores.begin(); coreIt != cores.end(); ++coreIt, ++idx) { RWMol *alignCore = coreIt->first ? &cores[0] : nullptr; CHECK_INVARIANT(params.prepareCore(coreIt->second, alignCore), "Could not prepare at least one core"); labelledCores[coreIt->first] = boost::shared_ptr(new RWMol(coreIt->second)); } } void setRlabel(Atom *atom, int rlabel) { PRECONDITION(rlabel != 0, "RLabels must be >0"); if (params.rgroupLabelling & AtomMap) { atom->setAtomMapNum(rlabel); } if (params.rgroupLabelling & MDLRGroup) { std::string dLabel = "R" + std::to_string(rlabel); atom->setProp(common_properties::dummyLabel, dLabel); setAtomRLabel(atom, rlabel); } if (params.rgroupLabelling & Isotope) { atom->setIsotope(rlabel + 1); } } void prune() { // prune all but the current "best" permutation of matches for (size_t mol_idx = 0; mol_idx < permutation.size(); ++mol_idx) { std::vector keepVector; keepVector.push_back(matches[mol_idx][permutation[mol_idx]]); matches[mol_idx] = keepVector; } permutation = std::vector(matches.size(), 0); } // Return the RGroups with the current "best" permutation // of matches. std::vector GetCurrentBestPermutation() const { const bool removeAllHydrogenRGroups = params.removeAllHydrogenRGroups; std::vector results; // std::map > result; for (size_t i = 0; i < permutation.size(); ++i) { CHECK_INVARIANT(i < matches.size(), "Best Permutation mol idx out of range"); CHECK_INVARIANT(permutation[i] < matches[i].size(), "Selected match at permutation out of range"); results.push_back(matches[i][permutation[i]]); } if (removeAllHydrogenRGroups) { // if a label is all hydrogens, remove it // This logic is a bit tricky, find all labels that have common cores // and analyze those sets independently. // i.e. if core 1 doesn't have R1 then don't analyze it in when looking // at label 1 std::map> labelCores; // map from label->cores for (auto &position : results) { int core_idx = position.core_idx; if (labelCores.find(core_idx) == labelCores.end()) { auto core = cores.find(core_idx); if (core != cores.end()) { for (auto rlabels : getRlabels(core->second)) { int rlabel = rlabels.first; labelCores[rlabel].insert(core_idx); } } } } for (int label : labels) { bool allH = true; for (auto &position : results) { R_DECOMP::const_iterator rgroup = position.rgroups.find(label); bool labelHasCore = labelCores[label].find(position.core_idx) != labelCores[label].end(); if (labelHasCore && (rgroup == position.rgroups.end() || !rgroup->second->isHydrogen())) { allH = false; break; } } if (allH) { for (auto &position : results) { position.rgroups.erase(label); } } } } return results; } class UsedLabels { public: std::set labels_used; bool add(int rlabel) { if (labels_used.find(rlabel) != labels_used.end()) { return false; } labels_used.insert(rlabel); return true; } int next() { int i=1; while (labels_used.find(i) != labels_used.end()) { ++i; } labels_used.insert(i); return i; } }; void relabelCore(RWMol &core, std::map &mappings, UsedLabels &used_labels, const std::set &indexLabels, std::map> extraAtomRLabels) { // Now remap to proper rlabel ids // if labels are positive, they come from User labels // if they are negative, they come from indices and should be // numbered *after* the user labels. // // Some indices are attached to multiple bonds, // these rlabels should be incrementally added last std::map atoms = getRlabels(core); // a core only has one labelled index // a secondary structure extraAtomRLabels contains the number // of bonds between this atom and the side chain // a sidechain atom has a vector of the attachments back to the // core that takes the place of numBondsToRlabel std::map> bondsToCore; std::vector> atomsToAdd; // adds -R if necessary // Deal with user supplied labels for (auto rlabels : atoms) { int userLabel = rlabels.first; if (userLabel < 0) { continue; // not a user specified label } Atom *atom = rlabels.second; mappings[userLabel] = userLabel; used_labels.add(userLabel); if (atom->getAtomicNum() == 0) { // add to existing dummy/rlabel setRlabel(atom, userLabel); } else { // adds new rlabel auto *newAt = new Atom(0); setRlabel(newAt, userLabel); atomsToAdd.push_back(std::make_pair(atom, newAt)); } } // Deal with non-user supplied labels for (auto newLabel : indexLabels) { auto atm = atoms.find(newLabel); if (atm == atoms.end()) { continue; } Atom * atom = atm->second; int rlabel; auto mapping = mappings.find(newLabel); if (mapping == mappings.end()) { rlabel = used_labels.next(); mappings[newLabel] = rlabel; } else { rlabel = mapping->second; } if (atom->getAtomicNum() == 0) { // add to dummy setRlabel(atom, rlabel); } else { auto *newAt = new Atom(0); setRlabel(newAt, rlabel); atomsToAdd.push_back(std::make_pair(atom, newAt)); } } // Deal with multiple bonds to the same label for (auto &extraAtomRLabel : extraAtomRLabels) { auto atm = atoms.find(extraAtomRLabel.first); if (atm == atoms.end()) { continue; // label not used in the rgroup } Atom *atom = atm->second; for (int &i : extraAtomRLabel.second) { int rlabel = used_labels.next(); i = rlabel; // Is this necessary? CHECK_INVARIANT( atom->getAtomicNum() > 1, "Multiple attachments to a dummy (or hydrogen) is weird."); auto *newAt = new Atom(0); setRlabel(newAt, rlabel); atomsToAdd.push_back(std::make_pair(atom, newAt)); } } for (auto &i : atomsToAdd) { core.addAtom(i.second, false, true); core.addBond(i.first, i.second, Bond::SINGLE); } core.updatePropertyCache(false); // this was github #1550 } void relabelRGroup(RGroupData &rgroup, const std::map &mappings) { PRECONDITION(rgroup.combinedMol.get(), "Unprocessed rgroup"); RWMol &mol = *rgroup.combinedMol.get(); if (rgroup.combinedMol->hasProp(done)) { rgroup.labelled = true; return; } mol.setProp(done, true); std::vector> atomsToAdd; // adds -R if necessary for (RWMol::AtomIterator atIt = mol.beginAtoms(); atIt != mol.endAtoms(); ++atIt) { Atom *atom = *atIt; if (atom->hasProp(SIDECHAIN_RLABELS)) { atom->setIsotope(0); const std::vector &rlabels = atom->getProp>(SIDECHAIN_RLABELS); // switch on atom mappings or rlabels.... for (int rlabel : rlabels) { auto label = mappings.find(rlabel); CHECK_INVARIANT(label != mappings.end(), "Unprocessed mapping"); if (atom->getAtomicNum() == 0) { setRlabel(atom, label->second); } else { auto *newAt = new Atom(0); setRlabel(newAt, label->second); atomsToAdd.push_back(std::make_pair(atom, newAt)); } } } } for (auto &i : atomsToAdd) { mol.addAtom(i.second, false, true); mol.addBond(i.first, i.second, Bond::SINGLE); } if (params.removeHydrogensPostMatch) { bool implicitOnly = false; bool updateExplicitCount = false; bool sanitize = false; MolOps::removeHs(mol, implicitOnly, updateExplicitCount, sanitize); } mol.updatePropertyCache(false); // this was github #1550 rgroup.labelled = true; } // relabel the core and sidechains using the specified user labels // if matches exist for non labelled atoms, these are added as well void relabel() { std::vector best = GetCurrentBestPermutation(); // get the labels used std::set userLabels; std::set indexLabels; // Go through all the RGroups and find out which labels were // actually used. // some atoms will have multiple attachment points, i.e. cycles // split these up into new rlabels if necessary // These are detected at match time // This vector will hold the extra (new) labels required std::map> extraAtomRLabels; for (auto &it : best) { for (auto &rgroup : it.rgroups) { if (rgroup.first >= 0) { userLabels.insert(rgroup.first); } if (rgroup.first < 0) { indexLabels.insert(rgroup.first); } std::map rlabelsUsedInRGroup = rgroup.second->getNumBondsToRlabels(); for (auto &numBondsUsed : rlabelsUsedInRGroup) { // Make space for the extra labels if (numBondsUsed.second > 1) { // multiple rgroup bonds to same atom extraAtomRLabels[numBondsUsed.first].resize(numBondsUsed.second - 1); } } } } // Assign final RGroup labels to the cores and propagate these to // the scaffold finalRlabelMapping.clear(); UsedLabels used_labels; for (std::map::const_iterator coreIt = cores.begin(); coreIt != cores.end(); ++coreIt) { boost::shared_ptr labelledCore(new RWMol(coreIt->second)); labelledCores[coreIt->first] = labelledCore; relabelCore(*labelledCore.get(), finalRlabelMapping, used_labels, indexLabels, extraAtomRLabels); } for (auto &it : best) { for (auto &rgroup : it.rgroups) { relabelRGroup(*rgroup.second, finalRlabelMapping); } } } // compute the number of rgroups that would be added if we // accepted this permutation int compute_num_added_rgroups(std::vector &tied_permutation) { int num_added_rgroups = 0; for (int label : labels) { if (label <= 0) { // label is not user supplied for (size_t m = 0; m < tied_permutation.size(); ++m) { // for each molecule auto rg = matches[m][tied_permutation[m]].rgroups.find(label); if (rg != matches[m][tied_permutation[m]].rgroups.end()) { if(rg->second->smiles.find_first_not_of("0123456789[]*H:.") < rg->second->smiles.length()) { num_added_rgroups +=1;//= label; break; } } } } } return num_added_rgroups; } bool process(bool pruneMatches, bool finalize = false) { if (matches.size() == 0) { return false; } // Exhaustive search, get the MxN matrix size_t M = matches.size(); // Number of molecules std::vector permutations; size_t N = 1; for (size_t m = 0; m < M; ++m) { size_t sz = matches[m].size(); // # permutations for molecule m permutations.push_back(sz); N *= sz; } permutation = std::vector(permutations.size(), 0); // run through all possible matches and score each // set double best_score = 0; std::vector best_permutation = permutation; std::vector> ties; size_t count = 0; #ifdef DEBUG std::cerr << "Processing" << std::endl; #endif CartesianProduct iterator(permutations); // Iterates through the permutation idx, i.e. // [m1_permutation_idx, m2_permutation_idx, m3_permutation_idx] while (iterator.next()) { if (count > N) { throw ValueErrorException("Next did not finish"); } #ifdef DEBUG std::cerr << "**************************************************" << std::endl; #endif double newscore = score(iterator.permutation, matches, labels); if (fabs(newscore - best_score) < 1e-6) { // heuristic to overcome floating point comparison issues ties.push_back(iterator.permutation); } else if (newscore > best_score) { #ifdef DEBUG std::cerr << " ===> current best:" << newscore << ">" << best_score << std::endl; #endif ties.clear(); ties.push_back(iterator.permutation); best_score = newscore; best_permutation = iterator.permutation; } } if(ties.size() > 1) { // choose one that doesn't add rgroups inappropriately // an rgroup is added when // (1) the label is <=0 // (2) the group has any substituent with heavy atoms // XXX Might be more efficient to add this comp to the score function int smallest_added_rgroups = 100000000; for(auto tied_permutation: ties) { int num_added_rgroups = compute_num_added_rgroups(tied_permutation); if(num_added_rgroups < smallest_added_rgroups) { smallest_added_rgroups = num_added_rgroups; best_permutation = tied_permutation; } } } permutation = best_permutation; if (pruneMatches || finalize) { prune(); } if (finalize) { relabel(); } return true; } }; RGroupDecomposition::RGroupDecomposition( const ROMol &inputCore, const RGroupDecompositionParameters ¶ms) : data(new RGroupDecompData(inputCore, params)) {} RGroupDecomposition::RGroupDecomposition( const std::vector &cores, const RGroupDecompositionParameters ¶ms) : data(new RGroupDecompData(cores, params)) {} RGroupDecomposition::~RGroupDecomposition() { delete data; } int RGroupDecomposition::add(const ROMol &inmol) { // get the sidechains if possible // Add hs for better symmetrization RWMol mol(inmol); bool explicitOnly = false; bool addCoords = true; MolOps::addHs(mol, explicitOnly, addCoords); int core_idx = 0; const RWMol *core = nullptr; std::vector tmatches; // Find the first matching core. for (std::map::const_iterator coreIt = data->cores.begin(); coreIt != data->cores.end(); ++coreIt) { { const bool uniquify = false; const bool recursionPossible = true; const bool useChirality = true; SubstructMatch(mol, coreIt->second, tmatches, uniquify, recursionPossible, useChirality); } if (data->params.onlyMatchAtRGroups) { // First find all the core atoms that have user // label and but their indices into core_atoms_with_user_labels std::set core_atoms_with_user_labels; for (auto atom : coreIt->second.atoms()) { if (atom->hasProp(RLABEL)) { core_atoms_with_user_labels.insert(atom->getIdx()); } } std::vector tmatches_filtered; for (auto &mv : tmatches) { bool passes_filter = true; boost::dynamic_bitset<> target_match_indices(mol.getNumAtoms()); for (auto &match : mv) { target_match_indices[match.second] = 1; } for (auto &match : mv) { const Atom *atm = mol.getAtomWithIdx(match.second); // is this a labelled rgroup or not? if (core_atoms_with_user_labels.find(match.first) == core_atoms_with_user_labels.end()) { // nope... if any neighbor is not part of the substructure // make sure we are a hydrogen, otherwise, skip the match for (const auto &nbri : boost::make_iterator_range(mol.getAtomNeighbors(atm))) { const auto &nbr = mol[nbri]; if (nbr->getAtomicNum() != 1 && !target_match_indices[nbr->getIdx()]) { passes_filter = false; break; } } } if (!passes_filter) { break; } } if (passes_filter) { tmatches_filtered.push_back(mv); } } tmatches = tmatches_filtered; } if (!tmatches.size()) { continue; } else { if (tmatches.size() > 1) { if (data->matches.size() == 0) { // Greedy strategy just grabs the first match and // takes the best matches from the rest if (data->params.matchingStrategy == Greedy) { tmatches.resize(1); } } } core = &coreIt->second; core_idx = coreIt->first; break; } } if (core == nullptr) { return -1; } // strategies // ========== // Exhaustive - saves all matches and optimizes later exhaustive // May never finish due to combinatorial complexity // Greedy - matches to *FIRST* available match // GreedyChunks - default - process every N chunks // Should probably scan all mols first to find match with // smallest number of matches... size_t size = data->matches.size(); std::vector potentialMatches; for (auto &tmatche : tmatches) { boost::scoped_ptr tMol; { const bool replaceDummies = false; const bool labelByIndex = true; const bool requireDummyMatch = false; tMol.reset(replaceCore(mol, *core, tmatche, replaceDummies, labelByIndex, requireDummyMatch)); } if (tMol) { R_DECOMP match; // rlabel rgroups MOL_SPTR_VECT fragments = MolOps::getMolFrags(*tMol, false); for (size_t i = 0; i < fragments.size(); ++i) { std::vector attachments; boost::shared_ptr &newMol = fragments[i]; newMol->setProp("core", core_idx); newMol->setProp("idx", size); newMol->setProp("frag_idx", i); for (ROMol::AtomIterator atIt = newMol->beginAtoms(); atIt != newMol->endAtoms(); ++atIt) { Atom *tmp = *atIt; unsigned int elno = tmp->getAtomicNum(); if (elno == 0) { unsigned int index = tmp->getIsotope(); // this is the index into the core // it messes up when there are multiple ? int rlabel; if (core->getAtomWithIdx(index)->getPropIfPresent(RLABEL, rlabel)) { std::vector rlabelsOnSideChain; tmp->getPropIfPresent(SIDECHAIN_RLABELS, rlabelsOnSideChain); rlabelsOnSideChain.push_back(rlabel); tmp->setProp(SIDECHAIN_RLABELS, rlabelsOnSideChain); data->labels.insert(rlabel); // keep track of all labels used attachments.push_back(rlabel); } } } if (attachments.size() > 0) { // reject multiple attachments? // what to do with labelled cores ? std::string newCoreSmi = MolToSmiles(*newMol, true); for (size_t attach_idx = 0; attach_idx < attachments.size(); ++attach_idx) { int rlabel = attachments[attach_idx]; ADD_MATCH(match, rlabel); match[rlabel]->add(newMol, attachments); } } else { // special case, only one fragment if (fragments.size() == 1) { // need to make a new core // remove the sidechains RWMol newCore(mol); for (MatchVectType::const_iterator mvit = tmatche.begin(); mvit != tmatche.end(); ++mvit) { const Atom *coreAtm = core->getAtomWithIdx(mvit->first); Atom *newCoreAtm = newCore.getAtomWithIdx(mvit->second); int rlabel; if (coreAtm->getPropIfPresent(RLABEL, rlabel)) { newCoreAtm->setProp(RLABEL, rlabel); } newCoreAtm->setProp("keep", true); } for (int aIdx = newCore.getNumAtoms() - 1; aIdx >= 0; --aIdx) { Atom *atom = newCore.getAtomWithIdx(aIdx); if (!atom->hasProp("keep")) { newCore.removeAtom(atom); } } if (newCore.getNumAtoms()) { std::string newCoreSmi = MolToSmiles(newCore, true); // add a new core if possible auto newcore = data->newCores.find(newCoreSmi); int core_idx = 0; if (newcore == data->newCores.end()) { core_idx = data->newCores[newCoreSmi] = data->newCoreLabel--; data->cores[core_idx] = newCore; return add(inmol); } } } } } if (match.size()) { potentialMatches.push_back(RGroupMatch(core_idx, match)); } } } if (potentialMatches.size() == 0) { BOOST_LOG(rdWarningLog) << "No attachment points in side chains" << std::endl; return -1; } size_t N = 1; for (auto &matche : data->matches) { size_t sz = matche.size(); N *= sz; } // oops, exponential is a pain if (N * potentialMatches.size() > 100000) { data->permutation = std::vector(data->matches.size(), 0); data->process(true); } data->matches.push_back(potentialMatches); data->permutation = std::vector(data->matches.size(), 0); if (size) { if (data->params.matchingStrategy & Greedy || (data->params.matchingStrategy & GreedyChunks && size > 1 && size % data->params.chunkSize == 0)) { data->process(true); } } return data->matches.size() - 1; } bool RGroupDecomposition::process() { try { const bool prune = true; const bool finalize = true; return data->process(prune, finalize); } catch (...) { return false; } } std::vector RGroupDecomposition::getRGroupLabels() const { // this is a bit of a cheat RGroupColumns cols = getRGroupsAsColumns(); std::vector labels; for(auto it : cols) { labels.push_back(it.first); } std::sort(labels.begin(), labels.end()); return labels; } RGroupRows RGroupDecomposition::getRGroupsAsRows() const { std::vector permutation = data->GetCurrentBestPermutation(); RGroupRows groups; int molidx = 0; for (auto it = permutation.begin(); it != permutation.end(); ++it, ++molidx) { // make a new rgroup entry groups.push_back(RGroupRow()); RGroupRow &out_rgroups = groups.back(); out_rgroups["Core"] = data->labelledCores[it->core_idx]; R_DECOMP &in_rgroups = it->rgroups; for (R_DECOMP::const_iterator rgroup = in_rgroups.begin(); rgroup != in_rgroups.end(); ++rgroup) { std::map::const_iterator realLabel = data->finalRlabelMapping.find(rgroup->first); CHECK_INVARIANT(realLabel != data->finalRlabelMapping.end(), "unprocessed rlabel, please call process() first."); out_rgroups[std::string("R") + std::to_string(realLabel->second)] = rgroup->second->combinedMol; } } return groups; } //! return rgroups in column order group[attachment_point][molidx] = ROMol RGroupColumns RGroupDecomposition::getRGroupsAsColumns() const { std::vector permutation = data->GetCurrentBestPermutation(); RGroupColumns groups; // collect the list of all possible RGroups: std::map rgrp_pos_map; unsigned int ridx = 0; for (const auto rl : data->finalRlabelMapping) { rgrp_pos_map[rl.second] = ridx++; } unsigned int molidx = 0; for (auto it = permutation.begin(); it != permutation.end(); ++it, ++molidx) { boost::dynamic_bitset<> Rs_seen(rgrp_pos_map.size()); R_DECOMP &in_rgroups = it->rgroups; groups["Core"].push_back(data->labelledCores[it->core_idx]); for (R_DECOMP::const_iterator rgroup = in_rgroups.begin(); rgroup != in_rgroups.end(); ++rgroup) { std::map::const_iterator realLabel = data->finalRlabelMapping.find(rgroup->first); CHECK_INVARIANT(realLabel != data->finalRlabelMapping.end(), "unprocessed rlabel, please call process() first."); CHECK_INVARIANT(rgroup->second->combinedMol->hasProp(done), "Not done! Call process()"); Rs_seen.set(rgrp_pos_map[realLabel->second]); std::string r = std::string("R") + std::to_string(realLabel->second); RGroupColumn &col = groups[r]; if (molidx && col.size() < (size_t)(molidx - 1)) { col.resize(molidx - 1); } col.push_back(rgroup->second->combinedMol); } // add empty entries to columns where this molecule didn't appear for (const auto rpr : rgrp_pos_map) { if (!Rs_seen[rpr.second]) { std::string r = std::string("R") + std::to_string(rpr.first); groups[r].push_back(boost::make_shared()); } } } return groups; } namespace { std::vector Decomp(RGroupDecomposition &decomp, const std::vector &mols) { std::vector unmatched; for (size_t i = 0; i < mols.size(); ++i) { int v = decomp.add(*mols[i].get()); if (v == -1) { unmatched.push_back(i); } } decomp.process(); return unmatched; } } // namespace unsigned int RGroupDecompose(const std::vector &cores, const std::vector &mols, RGroupRows &rows, std::vector *unmatchedIndices, const RGroupDecompositionParameters &options) { RGroupDecomposition decomp(cores, options); std::vector unmatched = Decomp(decomp, mols); if (unmatchedIndices) { *unmatchedIndices = unmatched; } rows = decomp.getRGroupsAsRows(); return mols.size() - unmatched.size(); } unsigned int RGroupDecompose(const std::vector &cores, const std::vector &mols, RGroupColumns &columns, std::vector *unmatchedIndices, const RGroupDecompositionParameters &options) { RGroupDecomposition decomp(cores, options); std::vector unmatched = Decomp(decomp, mols); if (unmatchedIndices) { *unmatchedIndices = unmatched; } columns = decomp.getRGroupsAsColumns(); return mols.size() - unmatched.size(); } } // namespace RDKit