Files
rdkit/Code/GraphMol/RGroupDecomposition/RGroupDecomp.cpp

1416 lines
47 KiB
C++

// Copyright (c) 2017, Novartis Institutes for BioMedical Research Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following
// disclaimer in the documentation and/or other materials provided
// with the distribution.
// * Neither the name of Novartis Institutes for BioMedical Research Inc.
// nor the names of its contributors may be used to endorse or promote
// products derived from this software without specific prior written
// permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
#include "RGroupDecomp.h"
#include <GraphMol/RDKitBase.h>
#include <GraphMol/Substruct/SubstructMatch.h>
#include <GraphMol/SmilesParse/SmilesWrite.h>
#include <GraphMol/SmilesParse/SmartsWrite.h>
#include <GraphMol/SmilesParse/SmilesParse.h>
#include <GraphMol/ChemTransforms/ChemTransforms.h>
#include <GraphMol/FMCS/FMCS.h>
#include <boost/scoped_ptr.hpp>
#include <boost/dynamic_bitset.hpp>
#include <set>
#include <utility>
#include <vector>
//#define DEBUG
namespace RDKit {
// Attachment Points
// labeled cores => isotopes
// atom mappings
// atom indices => use -1 - atom index, range is [-1, ...., -num_atoms]
namespace {
const std::string RLABEL = "tempRlabel";
const std::string SIDECHAIN_RLABELS = "sideChainRlabels";
const std::string done = "RLABEL_PROCESSED";
// Various places where rgroups can be labeled
// the order of precedence
enum class Labelling {
RGROUP_LABELS,
ISOTOPE_LABELS,
ATOMMAP_LABELS,
INDEX_LABELS,
DUMMY_LABELS,
INTERNAL_LABELS
};
std::string labellingToString(Labelling type) {
switch (type) {
case Labelling::RGROUP_LABELS:
return "RGroupLabels";
case Labelling::ISOTOPE_LABELS:
return "IsotopeLabels";
case Labelling::ATOMMAP_LABELS:
return "AtomMapLabels";
case Labelling::INDEX_LABELS:
return "IndexLabels";
case Labelling::DUMMY_LABELS:
return "DummyLabels";
case Labelling::INTERNAL_LABELS:
return "InternalLabels";
}
return "unknown";
}
// Return the current set of rlabels for the molecule
// Negative RLabels are ones not assigned by the user and are
// either set to the index of the atom OR set to the index of
// the atom from the core with the best MCS.
// Positive rlabels are user defined (i.e. the user has specified
// specific R1, R2 etc...
std::map<int, Atom *> getRlabels(const RWMol &mol) {
std::map<int, Atom *> atoms;
for (auto atom : mol.atoms()) {
if (atom->hasProp(RLABEL)) {
int rlabel = atom->getProp<int>(RLABEL); // user label
CHECK_INVARIANT(atoms.find(rlabel) == atoms.end(),
"Duplicate labels in rgroup core!");
atoms[rlabel] = atom;
}
}
return atoms;
}
void clearInputLabels(Atom *atom) {
// atom->setIsotope(0); Don't want to clear deuterium and things like that if
// they aren't labels
atom->setAtomMapNum(0);
if (atom->hasProp(common_properties::_MolFileRLabel)) {
atom->clearProp(common_properties::_MolFileRLabel);
}
}
bool setLabel(Atom *atom, int label, std::set<int> &labels, int &maxLabel,
bool relabel, Labelling type) {
if (type == Labelling::ISOTOPE_LABELS) {
atom->setIsotope(0);
} else if (type == Labelling::ATOMMAP_LABELS) {
atom->setAtomMapNum(0);
} else if (type == Labelling::RGROUP_LABELS) {
if (atom->hasProp(common_properties::_MolFileRLabel)) {
atom->clearProp(common_properties::_MolFileRLabel);
atom->setIsotope(0);
}
}
if (label) {
if (labels.find(label) != labels.end()) {
if (relabel) {
if (type == Labelling::INTERNAL_LABELS)
std::cerr << "WARNING: relabelling existing label" << std::endl;
label = maxLabel + 1;
} else
// XXX FIX me - get label id
throw ValueErrorException(
std::string("Duplicate label in input, current type is:") +
labellingToString(type));
}
atom->setProp<int>(RLABEL, label);
labels.insert(label);
maxLabel = label + 1;
return true;
}
return false;
}
bool hasDummy(const RWMol &core) {
for (RWMol::ConstAtomIterator atIt = core.beginAtoms();
atIt != core.endAtoms(); ++atIt) {
if ((*atIt)->getAtomicNum() == 0) return true;
}
return false;
}
} // namespace
unsigned int RGroupDecompositionParameters::autoGetLabels(const RWMol &core) {
unsigned int autoLabels = 0;
if (!onlyMatchAtRGroups) autoLabels = AtomIndexLabels;
bool hasMDLRGroup = false;
bool hasAtomMapNum = false;
bool hasIsotopes = false;
bool hasDummies = false;
for (auto atm : core.atoms()) {
if (atm->getIsotope()) hasIsotopes = true;
if (atm->getAtomMapNum()) hasAtomMapNum = true;
if (atm->hasProp(common_properties::_MolFileRLabel)) hasMDLRGroup = true;
if (atm->getAtomicNum() == 0) hasDummies = true;
}
if (hasMDLRGroup)
return autoLabels | MDLRGroupLabels;
else if (hasAtomMapNum)
return autoLabels | AtomMapLabels;
else if (hasIsotopes)
return autoLabels | IsotopeLabels;
else if (hasDummies)
return autoLabels | DummyAtomLabels;
return autoLabels;
}
bool RGroupDecompositionParameters::prepareCore(RWMol &core,
const RWMol *alignCore) {
const bool relabel = labels & RelabelDuplicateLabels;
unsigned int autoLabels = labels;
if (labels == AutoDetect) {
autoLabels = autoGetLabels(core);
if (!autoLabels) {
BOOST_LOG(rdWarningLog) << "RGroupDecomposition auto detect found no "
"rgroups and onlyMatAtRgroups is set to true"
<< std::endl;
return false;
}
}
if (alignCore && (alignment & MCS)) {
std::vector<ROMOL_SPTR> mols;
mols.push_back(ROMOL_SPTR(new ROMol(core)));
mols.push_back(ROMOL_SPTR(new ROMol(*alignCore)));
MCSResult res = findMCS(mols);
if (res.isCompleted()) {
RWMol *m = SmartsToMol(res.SmartsString);
if (m) {
MatchVectType match1;
MatchVectType match2;
bool target_matched1 = SubstructMatch(core, *m, match1);
bool target_matched2 = SubstructMatch(*alignCore, *m, match2);
CHECK_INVARIANT(match1.size() == match2.size(),
"Matches should be the same size in prepareCore");
if (target_matched1 && target_matched2) {
for (size_t i = 0; i < match1.size(); ++i) {
int queryAtomIdx1 = match1[i].first;
int coreAtomIdx = match1[i].second;
int queryAtomIdx2 = match2[i].first;
int alignCoreAtomIdx = match2[i].second;
CHECK_INVARIANT(queryAtomIdx1 == queryAtomIdx2,
"query atoms aren't the same");
Atom *coreAtm = core.getAtomWithIdx(coreAtomIdx);
const Atom *alignCoreAtm =
alignCore->getAtomWithIdx(alignCoreAtomIdx);
// clear up input rlabels
coreAtm->setAtomMapNum(0);
if (coreAtm->hasProp(common_properties::_MolFileRLabel)) {
coreAtm->clearProp(common_properties::_MolFileRLabel);
coreAtm->setIsotope(0);
}
if (alignCoreAtm->hasProp(RLABEL)) {
int rlabel = alignCoreAtm->getProp<int>(RLABEL);
coreAtm->setProp(RLABEL, rlabel);
}
}
}
delete m;
}
}
}
std::set<int> foundLabels;
int maxLabel = 0;
int nextOffset = 0;
std::map<int, int> atomToLabel;
for (auto atom : core.atoms()) {
bool found = false;
if (atom->hasProp(RLABEL)) {
if (setLabel(atom, atom->getProp<int>(RLABEL), foundLabels, maxLabel,
relabel, Labelling::INTERNAL_LABELS))
found = true;
}
if (!found && (autoLabels & MDLRGroupLabels)) {
unsigned int rgroup;
if (atom->getPropIfPresent<unsigned int>(
common_properties::_MolFileRLabel, rgroup)) {
if (setLabel(atom, rdcast<int>(rgroup), foundLabels, maxLabel, relabel,
Labelling::RGROUP_LABELS))
found = true;
}
}
if (!found && (autoLabels & IsotopeLabels) && atom->getIsotope() > 0) {
if (setLabel(atom, rdcast<int>(atom->getIsotope()), foundLabels, maxLabel,
relabel, Labelling::ISOTOPE_LABELS))
found = true;
}
if (!found && (autoLabels & AtomMapLabels) && atom->getAtomMapNum() > 0) {
if (setLabel(atom, rdcast<int>(atom->getAtomMapNum()), foundLabels,
maxLabel, relabel, Labelling::ATOMMAP_LABELS))
found = true;
}
if (!found && (autoLabels & DummyAtomLabels) && atom->getAtomicNum() == 0) {
const bool forceRelabellingWithDummies = true;
int defaultDummyStartLabel = 1;
if (setLabel(atom, defaultDummyStartLabel, foundLabels, maxLabel,
forceRelabellingWithDummies, Labelling::DUMMY_LABELS))
found = true;
}
// Unless there is an MCS match from above, we need to give different
// RLABELS to each core so keep track of which labels
// we have used (note that these are negative since they are
// potential rgroups and haven't been assigned yet)
if (!found && (autoLabels & AtomIndexLabels)) {
if (setLabel(atom, indexOffset - atom->getIdx(), foundLabels, maxLabel,
relabel, Labelling::INDEX_LABELS))
nextOffset++;
found = true;
}
clearInputLabels(atom);
int rlabel;
if (atom->getPropIfPresent(RLABEL, rlabel)) {
atomToLabel[atom->getIdx()] = rlabel;
}
}
indexOffset -= nextOffset;
MolOps::AdjustQueryParameters adjustParams;
adjustParams.makeDummiesQueries = true;
adjustParams.adjustDegree = false;
adjustQueryProperties(core, &adjustParams);
for (auto &it : atomToLabel) {
core.getAtomWithIdx(it.first)->setProp(RLABEL, it.second);
}
return true;
}
namespace {
// RGroup Class to hold the attached bits
struct RGroupData {
boost::shared_ptr<RWMol> combinedMol;
std::vector<boost::shared_ptr<ROMol>> mols; // All the mols in the rgroup
std::set<std::string> smilesSet; // used for rgroup equivalence
std::string
smiles; // smiles for all the mols in the rgroup (with attachments)
std::set<int> attachments; // attachment points
bool labelled;
private:
RGroupData(const RGroupData &rhs);
public:
RGroupData()
: combinedMol(),
mols(),
smilesSet(),
smiles(),
attachments(),
labelled(false) {}
void add(boost::shared_ptr<ROMol> newMol,
const std::vector<int> &rlabel_attachments) {
// some fragments can be add multiple times if they are cyclic
for (auto &mol : mols) {
if (newMol.get() == mol.get()) return;
}
labelled = false;
std::copy(rlabel_attachments.begin(), rlabel_attachments.end(),
std::inserter(attachments, attachments.end()));
mols.push_back(newMol);
std::string smi = MolToSmiles(*newMol, true);
// REVIEW: we probably shouldn't be using a set here... the merging of
// duplicates is likely not what we want
smilesSet.insert(smi);
if (!combinedMol.get()) {
combinedMol = boost::shared_ptr<RWMol>(new RWMol(*mols[0].get()));
} else {
ROMol *m = combineMols(*combinedMol.get(), *newMol.get());
m->updateProps(*combinedMol.get());
combinedMol.reset(new RWMol(*m));
delete m;
}
smiles = getSmiles();
combinedMol->setProp(common_properties::internalRgroupSmiles, smiles);
}
std::map<int, int> getNumBondsToRlabels() const {
std::map<int, int> rlabelsUsedCount;
for (ROMol::AtomIterator atIt = combinedMol->beginAtoms();
atIt != combinedMol->endAtoms(); ++atIt) {
Atom *atom = *atIt;
int rlabel;
if (atom->getPropIfPresent<int>(RLABEL, rlabel))
rlabelsUsedCount[rlabel] += 1;
}
return rlabelsUsedCount;
}
bool isHydrogen() const { // is the rgroup all Hs
for (const auto &mol : mols) {
for (ROMol::AtomIterator atIt = mol->beginAtoms();
atIt != mol->endAtoms(); ++atIt) {
if ((*atIt)->getAtomicNum() > 1) return false;
}
}
return true;
}
private:
std::string getSmiles()
const { // compute the canonical smiles for the attachments
std::string s;
for (const auto &it : smilesSet) {
if (s.length()) s += ".";
s += it;
}
return s;
}
};
} // namespace
namespace {
typedef boost::shared_ptr<RGroupData> RData;
typedef std::map<int, RData> R_DECOMP;
struct RGroupMatch {
// RGroupMatch is the decomposition for a single molecule
size_t core_idx; // index of the matching core
R_DECOMP rgroups; // rlabel->RGroupData mapping
RGroupMatch(size_t core_index, R_DECOMP input_rgroups)
: core_idx(core_index), rgroups(std::move(input_rgroups)) {}
};
void ADD_MATCH(R_DECOMP &match, int rlabel) {
if (match.find(rlabel) == match.end())
match[rlabel] = boost::make_shared<RGroupData>();
}
struct CartesianProduct {
std::vector<size_t> permutation;
std::vector<size_t> sizes;
size_t maxPermutations;
size_t permutationCount;
CartesianProduct(const std::vector<size_t> &inputSizes)
: permutation(inputSizes.size(), 0),
sizes(inputSizes),
permutationCount(0) {
maxPermutations = 1;
for (unsigned long size : sizes)
maxPermutations *= size; // may overflow....
}
bool next() {
++permutationCount;
if (permutationCount == 1) {
return true;
}
return increment(0);
}
bool increment(size_t rowToIncrement) {
if (permutationCount > maxPermutations) return false;
permutation[rowToIncrement] += 1;
size_t max_index_of_row = sizes[rowToIncrement] - 1;
if (permutation[rowToIncrement] > max_index_of_row) {
permutation[rowToIncrement] = 0;
return increment(rowToIncrement + 1);
}
return true;
}
};
// stupid total score
double score(const std::vector<size_t> &permutation,
const std::vector<std::vector<RGroupMatch>> &matches,
const std::set<int> &labels) {
double score = 1.;
#ifdef DEBUG
std::cerr << "---------------------------------------------------"
<< std::endl;
std::cerr << "Scoring permutation "
<< " num matches: " << matches.size() << std::endl;
#endif
for (int l : labels) {
#ifdef DEBUG
std::cerr << "Label: " << l << std::endl;
#endif
std::map<std::string, int> matchSet;
std::map<std::set<int>, int> linkerMatchSet;
std::map<std::string, int> onlyH;
for (size_t m = 0; m < permutation.size(); ++m) { // for each molecule
auto rg = matches[m][permutation[m]].rgroups.find(l);
if (rg != matches[m][permutation[m]].rgroups.end()) {
#ifdef DEBUG
std::cerr << " combined: " << MolToSmiles(*rg->second->combinedMol)
<< std::endl;
std::cerr << " RGroup: " << rg->second->smiles << " "
<< rg->second->smiles.find_first_not_of("0123456789[]*H:.");
#endif
matchSet[rg->second->smiles] += 1;
// detect whether or not this is an H
if (rg->second->smiles.find_first_not_of("0123456789[]*H:.") <
rg->second->smiles.length()) {
onlyH[rg->second->smiles] = 0;
} else {
onlyH[rg->second->smiles] = 1;
}
#ifdef DEBUG
std::cerr << " " << rg->second->combinedMol->getNumAtoms(false)
<< " isH: " << onlyH[rg->second->smiles]
<< " score: " << matchSet[rg->second->smiles] << std::endl;
#endif
// XXX Use fragment counts to see if we are linking cycles?
if (rg->second->smiles.find(".") == std::string::npos &&
rg->second->attachments.size() > 1) {
linkerMatchSet[rg->second->attachments]++;
#ifdef DEBUG
std::cerr << " Linker Score: "
<< linkerMatchSet[rg->second->attachments]++ << std::endl;
#endif
}
}
}
// get the counts for each rgroup found and sort in reverse order
std::vector<float> equivalentRGroupCount;
for (std::map<std::string, int>::const_iterator it = matchSet.begin();
it != matchSet.end(); ++it) {
#ifdef DEBUG
std::cerr << " equiv: " << it->first << " " << it->second << " "
<< permutation.size() << std::endl;
#endif
// if the rgroup is hydrogens, only consider if the group is all
// hydrogen, otherwise score based on the non hydrogens
if (onlyH[it->first]) {
if (static_cast<size_t>(it->second) == permutation.size()) {
equivalentRGroupCount.push_back(static_cast<float>(it->second));
} else {
// hydrogens in a mixed group don't contribute to the score
equivalentRGroupCount.push_back(0.0);
}
} else {
equivalentRGroupCount.push_back(static_cast<float>(it->second));
}
}
std::sort(equivalentRGroupCount.begin(), equivalentRGroupCount.end(),
std::greater<float>());
double tempScore = 1.;
// score the sets from the largest to the smallest
// each smaller set gets penalized (i+1) below
// 1.0 is the perfect score
for (size_t i = 0; i < equivalentRGroupCount.size(); ++i) {
auto lscore =
equivalentRGroupCount[i] / ((i + 1) * (double)matches.size());
if (lscore > 0) tempScore *= lscore * lscore;
#ifdef DEBUG
std::cerr << " lscore^2 " << i << ": " << lscore * lscore << std::endl;
#endif
}
// overweight linkers with the same attachments points....
// because these belong to 2 rgroups we really want these to stay
// ** this heuristic really should be taken care of above **
int maxLinkerMatches = 0;
for (std::map<std::set<int>, int>::const_iterator it =
linkerMatchSet.begin();
it != linkerMatchSet.end(); ++it) {
if (it->second > 1) {
if (it->second > maxLinkerMatches) maxLinkerMatches = it->second;
}
}
#ifdef DEBUG
std::cerr << "Max Linker Matches :" << maxLinkerMatches << std::endl;
#endif
double increment = 1.0; // no change in score
double linkerIncrement = 1.0; // no change in score
if (maxLinkerMatches) {
linkerIncrement = (double)(maxLinkerMatches) / (double)matches.size();
} else {
increment = tempScore;
}
score += increment * linkerIncrement;
#ifdef DEBUG
std::cerr << "Increment: " << increment
<< " Linker_Increment: " << linkerIncrement << std::endl;
std::cerr << "increment*linkerIncrement: " << increment * linkerIncrement
<< std::endl;
std::cerr << "Score = " << score << std::endl;
#endif
}
return score;
}
} // namespace
const unsigned int EMPTY_CORE_LABEL = -100000;
struct RGroupDecompData {
// matches[mol_idx] == vector of potential matches
std::map<int, RWMol> cores;
std::map<std::string, int> newCores; // new "cores" found along the way
int newCoreLabel;
RGroupDecompositionParameters params;
std::vector<std::vector<RGroupMatch>> matches;
std::set<int> labels;
std::vector<size_t> permutation;
std::map<int, std::vector<int>> userLabels;
std::vector<int> processedRlabels;
std::map<int, boost::shared_ptr<RWMol>> labelledCores;
std::map<int, int> finalRlabelMapping;
RGroupDecompData(const RWMol &inputCore,
RGroupDecompositionParameters inputParams)
: cores(),
newCores(),
newCoreLabel(EMPTY_CORE_LABEL),
params(std::move(inputParams)) {
cores[0] = inputCore;
prepareCores();
}
RGroupDecompData(const std::vector<ROMOL_SPTR> &inputCores,
RGroupDecompositionParameters inputParams)
: cores(),
newCores(),
newCoreLabel(EMPTY_CORE_LABEL),
params(std::move(inputParams)) {
for (size_t i = 0; i < inputCores.size(); ++i) {
cores[i] = *inputCores[i].get();
}
prepareCores();
}
void prepareCores() {
size_t idx = 0;
for (auto coreIt = cores.begin(); coreIt != cores.end(); ++coreIt, ++idx) {
RWMol *alignCore = coreIt->first ? &cores[0] : nullptr;
CHECK_INVARIANT(params.prepareCore(coreIt->second, alignCore),
"Could not prepare at least one core");
labelledCores[coreIt->first] =
boost::shared_ptr<RWMol>(new RWMol(coreIt->second));
}
}
void setRlabel(Atom *atom, int rlabel) {
PRECONDITION(rlabel != 0, "RLabels must be >0");
if (params.rgroupLabelling & AtomMap) {
atom->setAtomMapNum(rlabel);
}
if (params.rgroupLabelling & MDLRGroup) {
std::string dLabel = "R" + std::to_string(rlabel);
atom->setProp(common_properties::dummyLabel, dLabel);
setAtomRLabel(atom, rlabel);
}
if (params.rgroupLabelling & Isotope) {
atom->setIsotope(rlabel + 1);
}
}
void prune() { // prune all but the current "best" permutation of matches
for (size_t mol_idx = 0; mol_idx < permutation.size(); ++mol_idx) {
std::vector<RGroupMatch> keepVector;
keepVector.push_back(matches[mol_idx][permutation[mol_idx]]);
matches[mol_idx] = keepVector;
}
permutation = std::vector<size_t>(matches.size(), 0);
}
// Return the RGroups with the current "best" permutation
// of matches.
std::vector<RGroupMatch> GetCurrentBestPermutation() const {
const bool removeAllHydrogenRGroups = params.removeAllHydrogenRGroups;
std::vector<RGroupMatch> results; // std::map<int, RGroup> > result;
for (size_t i = 0; i < permutation.size(); ++i) {
CHECK_INVARIANT(i < matches.size(),
"Best Permutation mol idx out of range");
CHECK_INVARIANT(permutation[i] < matches[i].size(),
"Selected match at permutation out of range");
results.push_back(matches[i][permutation[i]]);
}
if (removeAllHydrogenRGroups) {
// if a label is all hydrogens, remove it
// This logic is a bit tricky, find all labels that have common cores
// and analyze those sets independently.
// i.e. if core 1 doesn't have R1 then don't analyze it in when looking
// at label 1
std::map<int, std::set<int>> labelCores; // map from label->cores
for (auto &position : results) {
int core_idx = position.core_idx;
if (labelCores.find(core_idx) == labelCores.end()) {
auto core = cores.find(core_idx);
if (core != cores.end()) {
for (auto rlabels : getRlabels(core->second)) {
int rlabel = rlabels.first;
labelCores[rlabel].insert(core_idx);
}
}
}
}
for (int label : labels) {
bool allH = true;
for (auto &position : results) {
R_DECOMP::const_iterator rgroup = position.rgroups.find(label);
bool labelHasCore = labelCores[label].find(position.core_idx) !=
labelCores[label].end();
if (labelHasCore && (rgroup == position.rgroups.end() ||
!rgroup->second->isHydrogen())) {
allH = false;
break;
}
}
if (allH) {
for (auto &position : results) {
position.rgroups.erase(label);
}
}
}
}
return results;
}
class UsedLabels
{
public:
std::set<int> labels_used;
bool add(int rlabel) {
if (labels_used.find(rlabel) != labels_used.end()) {
return false;
}
labels_used.insert(rlabel);
return true;
}
int next() {
int i=1;
while( labels_used.find(i) != labels_used.end() )
++i;
labels_used.insert(i);
return i;
}
};
void relabelCore(RWMol &core, std::map<int, int> &mappings,
UsedLabels &used_labels,
const std::set<int> &indexLabels,
std::map<int, std::vector<int>> extraAtomRLabels) {
// Now remap to proper rlabel ids
// if labels are positive, they come from User labels
// if they are negative, they come from indices and should be
// numbered *after* the user labels.
//
// Some indices are attached to multiple bonds,
// these rlabels should be incrementally added last
std::map<int, Atom *> atoms = getRlabels(core);
// a core only has one labelled index
// a secondary structure extraAtomRLabels contains the number
// of bonds between this atom and the side chain
// a sidechain atom has a vector of the attachments back to the
// core that takes the place of numBondsToRlabel
std::map<int, std::vector<int>> bondsToCore;
std::vector<std::pair<Atom *, Atom *>> atomsToAdd; // adds -R if necessary
// Deal with user supplied labels
for (auto rlabels : atoms) {
int userLabel = rlabels.first;
if (userLabel < 0) continue; // not a user specified label
Atom *atom = rlabels.second;
mappings[userLabel] = userLabel;
used_labels.add(userLabel);
if (atom->getAtomicNum() == 0) { // add to existing dummy/rlabel
setRlabel(atom, userLabel);
} else { // adds new rlabel
auto *newAt = new Atom(0);
setRlabel(newAt, userLabel);
atomsToAdd.push_back(std::make_pair(atom, newAt));
}
}
// Deal with non-user supplied labels
for (auto newLabel : indexLabels) {
auto atm = atoms.find(newLabel);
if (atm == atoms.end()) {
continue;
}
Atom * atom = atm->second;
int rlabel;
auto mapping = mappings.find(newLabel);
if (mapping == mappings.end()) {
rlabel = used_labels.next();
mappings[newLabel] = rlabel;
} else {
rlabel = mapping->second;
}
if (atom->getAtomicNum() == 0) { // add to dummy
setRlabel(atom, rlabel);
} else {
auto *newAt = new Atom(0);
setRlabel(newAt, rlabel);
atomsToAdd.push_back(std::make_pair(atom, newAt));
}
}
// Deal with multiple bonds to the same label
for (auto &extraAtomRLabel : extraAtomRLabels) {
auto atm = atoms.find(extraAtomRLabel.first);
if (atm == atoms.end()) continue; // label not used in the rgroup
Atom *atom = atm->second;
for (size_t i = 0; i < extraAtomRLabel.second.size(); ++i) {
int rlabel = used_labels.next();
extraAtomRLabel.second[i] = rlabel;
// Is this necessary?
CHECK_INVARIANT(
atom->getAtomicNum() > 1,
"Multiple attachements to a dummy (or hydrogen) is weird.");
auto *newAt = new Atom(0);
setRlabel(newAt, rlabel);
atomsToAdd.push_back(std::make_pair(atom, newAt));
}
}
for (auto &i : atomsToAdd) {
core.addAtom(i.second, false, true);
core.addBond(i.first, i.second, Bond::SINGLE);
}
core.updatePropertyCache(false); // this was github #1550
}
void relabelRGroup(RGroupData &rgroup, const std::map<int, int> &mappings) {
PRECONDITION(rgroup.combinedMol.get(), "Unprocessed rgroup");
RWMol &mol = *rgroup.combinedMol.get();
if (rgroup.combinedMol->hasProp(done)) {
rgroup.labelled = true;
return;
}
mol.setProp(done, true);
std::vector<std::pair<Atom *, Atom *>> atomsToAdd; // adds -R if necessary
for (RWMol::AtomIterator atIt = mol.beginAtoms(); atIt != mol.endAtoms();
++atIt) {
Atom *atom = *atIt;
if (atom->hasProp(SIDECHAIN_RLABELS)) {
atom->setIsotope(0);
const std::vector<int> &rlabels =
atom->getProp<std::vector<int>>(SIDECHAIN_RLABELS);
// switch on atom mappings or rlabels....
for (int rlabel : rlabels) {
auto label = mappings.find(rlabel);
CHECK_INVARIANT(label != mappings.end(), "Unprocessed mapping");
if (atom->getAtomicNum() == 0) {
setRlabel(atom, label->second);
} else {
auto *newAt = new Atom(0);
setRlabel(newAt, label->second);
atomsToAdd.push_back(std::make_pair(atom, newAt));
}
}
}
}
for (auto &i : atomsToAdd) {
mol.addAtom(i.second, false, true);
mol.addBond(i.first, i.second, Bond::SINGLE);
}
if (params.removeHydrogensPostMatch) {
bool implicitOnly = false;
bool updateExplicitCount = false;
bool sanitize = false;
MolOps::removeHs(mol, implicitOnly, updateExplicitCount, sanitize);
}
mol.updatePropertyCache(false); // this was github #1550
rgroup.labelled = true;
}
// relabel the core and sidechains using the specified user labels
// if matches exist for non labelled atoms, these are added as well
void relabel() {
std::vector<RGroupMatch> best = GetCurrentBestPermutation();
// get the labels used
std::set<int> userLabels;
std::set<int> indexLabels;
// Go through all the RGroups and find out which labels were
// actually used.
// some atoms will have multiple attachment points, i.e. cycles
// split these up into new rlabels if necessary
// These are detected at match time
// This vector will hold the extra (new) labels required
std::map<int, std::vector<int>> extraAtomRLabels;
for (auto &it : best) {
for (auto rit = it.rgroups.begin(); rit != it.rgroups.end(); ++rit) {
if (rit->first >= 0) userLabels.insert(rit->first);
if (rit->first < 0) indexLabels.insert(rit->first);
std::map<int, int> rlabelsUsedInRGroup =
rit->second->getNumBondsToRlabels();
for (auto &numBondsUsed : rlabelsUsedInRGroup) {
// Make space for the extra labels
if (numBondsUsed.second > 1) { // multiple rgroup bonds to same atom
extraAtomRLabels[numBondsUsed.first].resize(numBondsUsed.second -
1);
}
}
}
}
// Assign final RGroup labels to the cores and propogate these to
// the scaffold
finalRlabelMapping.clear();
UsedLabels used_labels;
for (std::map<int, RWMol>::const_iterator coreIt = cores.begin();
coreIt != cores.end(); ++coreIt) {
boost::shared_ptr<RWMol> labelledCore(new RWMol(coreIt->second));
labelledCores[coreIt->first] = labelledCore;
relabelCore(*labelledCore.get(), finalRlabelMapping, used_labels, indexLabels,
extraAtomRLabels);
}
for (auto &it : best) {
for (auto rit = it.rgroups.begin(); rit != it.rgroups.end(); ++rit) {
relabelRGroup(*rit->second, finalRlabelMapping);
}
}
}
// compute the number of rgroups that would be added if we
// accepted this permutation
int compute_num_added_rgroups(std::vector<size_t> &tied_permutation) {
int num_added_rgroups = 0;
for (int label : labels) {
if (label <= 0) { // label is not user supplied
for (size_t m = 0; m < tied_permutation.size(); ++m) { // for each molecule
auto rg = matches[m][tied_permutation[m]].rgroups.find(label);
if (rg != matches[m][tied_permutation[m]].rgroups.end()) {
if(rg->second->smiles.find_first_not_of("0123456789[]*H:.") <
rg->second->smiles.length()) {
num_added_rgroups +=1;//= label;
break;
}
}
}
}
}
return num_added_rgroups;
}
bool process(bool pruneMatches, bool finalize = false) {
if (matches.size() == 0) return false;
// Exhaustive search, get the MxN matrix
size_t M = matches.size(); // Number of molecules
std::vector<size_t> permutations;
size_t N = 1;
for (size_t m = 0; m < M; ++m) {
size_t sz = matches[m].size(); // # permutations for molecule m
permutations.push_back(sz);
N *= sz;
}
permutation = std::vector<size_t>(permutations.size(), 0);
// run through all possible matches and score each
// set
double best_score = 0;
std::vector<size_t> best_permutation = permutation;
std::vector<std::vector<size_t>> ties;
size_t count = 0;
#ifdef DEBUG
std::cerr << "Processing" << std::endl;
#endif
CartesianProduct iterator(permutations);
// Iterates through the permutation idx, i.e.
// [m1_permutation_idx, m2_permutation_idx, m3_permutation_idx]
while (iterator.next()) {
if (count > N) throw ValueErrorException("Next did not finish");
#ifdef DEBUG
std::cerr << "**************************************************"
<< std::endl;
#endif
double newscore = score(iterator.permutation, matches, labels);
if (fabs(newscore - best_score) < 1e-6) { // heuristic to overcome floating point comparison issues
ties.push_back(iterator.permutation);
}
else if (newscore > best_score) {
#ifdef DEBUG
std::cerr << " ===> current best:" << newscore << ">" << best_score
<< std::endl;
#endif
ties.clear();
ties.push_back(iterator.permutation);
best_score = newscore;
best_permutation = iterator.permutation;
}
}
if(ties.size() > 1) {
// choose one that doesn't add rgroups inappropriately
// an rgroup is added when
// (1) the label is <=0
// (2) the group has any substituent with heavy atoms
// XXX Might be more efficient to add this comp to the score function
int smallest_added_rgroups = 100000000;
for(auto tied_permutation: ties) {
int num_added_rgroups = compute_num_added_rgroups(tied_permutation);
if(num_added_rgroups < smallest_added_rgroups) {
smallest_added_rgroups = num_added_rgroups;
best_permutation = tied_permutation;
}
}
}
permutation = best_permutation;
if (pruneMatches || finalize) {
prune();
}
if (finalize) {
relabel();
}
return true;
}
};
RGroupDecomposition::RGroupDecomposition(
const ROMol &inputCore, const RGroupDecompositionParameters &params)
: data(new RGroupDecompData(inputCore, params)) {}
RGroupDecomposition::RGroupDecomposition(
const std::vector<ROMOL_SPTR> &cores,
const RGroupDecompositionParameters &params)
: data(new RGroupDecompData(cores, params)) {}
RGroupDecomposition::~RGroupDecomposition() { delete data; }
int RGroupDecomposition::add(const ROMol &inmol) {
// get the sidechains if possible
// Add hs for better symmeterization
RWMol mol(inmol);
bool explicitOnly = false;
bool addCoords = true;
MolOps::addHs(mol, explicitOnly, addCoords);
int core_idx = 0;
const RWMol *core = nullptr;
std::vector<MatchVectType> tmatches;
// Find the first matching core.
for (std::map<int, RWMol>::const_iterator coreIt = data->cores.begin();
coreIt != data->cores.end(); ++coreIt) {
{
const bool uniquify = false;
const bool recursionPossible = false;
const bool useChirality = true;
SubstructMatch(mol, coreIt->second, tmatches, uniquify, recursionPossible,
useChirality);
}
if (data->params.onlyMatchAtRGroups) {
// First find all the core atoms that have user
// label and but their indices into core_atoms_with_user_labels
std::set<int> core_atoms_with_user_labels;
for (auto atom : coreIt->second.atoms()) {
if (atom->hasProp(RLABEL)) {
core_atoms_with_user_labels.insert(atom->getIdx());
}
}
std::vector<MatchVectType> tmatches_filtered;
for (auto &mv : tmatches) {
bool passes_filter = true;
boost::dynamic_bitset<> target_match_indices(mol.getNumAtoms());
for (auto &match : mv) {
target_match_indices[match.second] = 1;
}
for (auto &match : mv) {
const Atom *atm = mol.getAtomWithIdx(match.second);
// is this a labelled rgroup or not?
if (core_atoms_with_user_labels.find(match.first) ==
core_atoms_with_user_labels.end()) {
// nope... if any neighbor is not part of the substructure
// make sure we are a hydrogen, otherwise, skip the match
for (const auto &nbri :
boost::make_iterator_range(mol.getAtomNeighbors(atm))) {
const auto &nbr = mol[nbri];
if (nbr->getAtomicNum() != 1 &&
!target_match_indices[nbr->getIdx()]) {
passes_filter = false;
break;
}
}
}
if (!passes_filter) break;
}
if (passes_filter) {
tmatches_filtered.push_back(mv);
}
}
tmatches = tmatches_filtered;
}
if (!tmatches.size()) {
continue;
} else {
if (tmatches.size() > 1) {
if (data->matches.size() == 0) {
// Greedy strategy just grabs the first match and
// takes the best matches from the rest
if (data->params.matchingStrategy == Greedy) tmatches.resize(1);
}
}
core = &coreIt->second;
core_idx = coreIt->first;
break;
}
}
if (core == nullptr) return -1;
// strategies
// ==========
// Exhaustive - saves all matches and optimizes later exhaustive
// May never finish due to combinitorial complexity
// Greedy - matches to *FIRST* available match
// GreedyChunks - default - process every N chunks
// Should probably scan all mols first to find match with
// smallest number of matches...
size_t size = data->matches.size();
std::vector<RGroupMatch> potentialMatches;
for (auto &tmatche : tmatches) {
boost::scoped_ptr<ROMol> tMol;
{
const bool replaceDummies = false;
const bool labelByIndex = true;
const bool requireDummyMatch = false;
tMol.reset(replaceCore(mol, *core, tmatche, replaceDummies, labelByIndex,
requireDummyMatch));
}
if (tMol) {
R_DECOMP match;
// rlabel rgroups
MOL_SPTR_VECT fragments = MolOps::getMolFrags(*tMol, false);
for (size_t i = 0; i < fragments.size(); ++i) {
std::vector<int> attachments;
boost::shared_ptr<ROMol> &newMol = fragments[i];
newMol->setProp<int>("core", core_idx);
newMol->setProp<int>("idx", size);
newMol->setProp<int>("frag_idx", i);
for (ROMol::AtomIterator atIt = newMol->beginAtoms();
atIt != newMol->endAtoms(); ++atIt) {
Atom *tmp = *atIt;
unsigned int elno = tmp->getAtomicNum();
if (elno == 0) {
unsigned int index =
tmp->getIsotope(); // this is the index into the core
// it messes up when there are multiple ?
int rlabel;
if (core->getAtomWithIdx(index)->getPropIfPresent(RLABEL, rlabel)) {
std::vector<int> rlabelsOnSideChain;
tmp->getPropIfPresent(SIDECHAIN_RLABELS, rlabelsOnSideChain);
rlabelsOnSideChain.push_back(rlabel);
tmp->setProp(SIDECHAIN_RLABELS, rlabelsOnSideChain);
data->labels.insert(rlabel); // keep track of all labels used
attachments.push_back(rlabel);
}
}
}
if (attachments.size() > 0) {
// reject multiple attachments?
// what to do with labelled cores ?
std::string newCoreSmi = MolToSmiles(*newMol, true);
for (size_t attach_idx = 0; attach_idx < attachments.size();
++attach_idx) {
int rlabel = attachments[attach_idx];
ADD_MATCH(match, rlabel);
match[rlabel]->add(newMol, attachments);
}
} else {
// special case, only one fragment
if (fragments.size() == 1) { // need to make a new core
// remove the sidechains
RWMol newCore(mol);
for (MatchVectType::const_iterator mvit = tmatche.begin();
mvit != tmatche.end(); ++mvit) {
const Atom *coreAtm = core->getAtomWithIdx(mvit->first);
Atom *newCoreAtm = newCore.getAtomWithIdx(mvit->second);
int rlabel;
if (coreAtm->getPropIfPresent(RLABEL, rlabel)) {
newCoreAtm->setProp<int>(RLABEL, rlabel);
}
newCoreAtm->setProp<bool>("keep", true);
}
for (int aIdx = newCore.getNumAtoms() - 1; aIdx >= 0; --aIdx) {
Atom *atom = newCore.getAtomWithIdx(aIdx);
if (!atom->hasProp("keep")) newCore.removeAtom(atom);
}
if (newCore.getNumAtoms()) {
std::string newCoreSmi = MolToSmiles(newCore, true);
// add a new core if possible
auto newcore = data->newCores.find(newCoreSmi);
int core_idx = 0;
if (newcore == data->newCores.end()) {
core_idx = data->newCores[newCoreSmi] = data->newCoreLabel--;
data->cores[core_idx] = newCore;
return add(inmol);
}
}
}
}
}
if (match.size()) {
potentialMatches.push_back(RGroupMatch(core_idx, match));
}
}
}
if (potentialMatches.size() == 0) {
BOOST_LOG(rdWarningLog)
<< "No attachment points in side chains" << std::endl;
return -1;
}
size_t N = 1;
for (auto &matche : data->matches) {
size_t sz = matche.size();
N *= sz;
}
// oops, exponential is a pain
if (N * potentialMatches.size() > 100000) {
data->permutation = std::vector<size_t>(data->matches.size(), 0);
data->process(true);
}
data->matches.push_back(potentialMatches);
data->permutation = std::vector<size_t>(data->matches.size(), 0);
if (size) {
if (data->params.matchingStrategy & Greedy ||
(data->params.matchingStrategy & GreedyChunks && size > 1 &&
size % data->params.chunkSize == 0))
data->process(true);
}
return data->matches.size() - 1;
}
bool RGroupDecomposition::process() {
try {
const bool prune = true;
const bool finalize = true;
return data->process(prune, finalize);
} catch (...) {
return false;
}
}
std::vector<std::string> RGroupDecomposition::getRGroupLabels() const {
// this is a bit of a cheat
RGroupColumns cols = getRGroupsAsColumns();
std::vector<std::string> labels;
for(auto it : cols) {
labels.push_back(it.first);
}
std::sort(labels.begin(), labels.end());
return labels;
}
RGroupRows RGroupDecomposition::getRGroupsAsRows() const {
std::vector<RGroupMatch> permutation = data->GetCurrentBestPermutation();
RGroupRows groups;
int molidx = 0;
for (auto it = permutation.begin(); it != permutation.end(); ++it, ++molidx) {
// make a new rgroup entry
groups.push_back(RGroupRow());
RGroupRow &out_rgroups = groups.back();
out_rgroups["Core"] = data->labelledCores[it->core_idx];
R_DECOMP &in_rgroups = it->rgroups;
for (R_DECOMP::const_iterator rgroup = in_rgroups.begin();
rgroup != in_rgroups.end(); ++rgroup) {
std::map<int, int>::const_iterator realLabel =
data->finalRlabelMapping.find(rgroup->first);
CHECK_INVARIANT(realLabel != data->finalRlabelMapping.end(),
"unprocessed rlabel, please call process() first.");
out_rgroups[std::string("R") + std::to_string(realLabel->second)] =
rgroup->second->combinedMol;
}
}
return groups;
}
//! return rgroups in column order group[attachment_point][molidx] = ROMol
RGroupColumns RGroupDecomposition::getRGroupsAsColumns() const {
std::vector<RGroupMatch> permutation = data->GetCurrentBestPermutation();
RGroupColumns groups;
// collect the list of all possible RGroups:
std::map<int, size_t> rgrp_pos_map;
unsigned int ridx = 0;
for (const auto rl : data->finalRlabelMapping) {
rgrp_pos_map[rl.second] = ridx++;
}
unsigned int molidx = 0;
for (auto it = permutation.begin(); it != permutation.end(); ++it, ++molidx) {
boost::dynamic_bitset<> Rs_seen(rgrp_pos_map.size());
R_DECOMP &in_rgroups = it->rgroups;
groups["Core"].push_back(data->labelledCores[it->core_idx]);
for (R_DECOMP::const_iterator rgroup = in_rgroups.begin();
rgroup != in_rgroups.end(); ++rgroup) {
std::map<int, int>::const_iterator realLabel =
data->finalRlabelMapping.find(rgroup->first);
CHECK_INVARIANT(realLabel != data->finalRlabelMapping.end(),
"unprocessed rlabel, please call process() first.");
CHECK_INVARIANT(rgroup->second->combinedMol->hasProp(done),
"Not done! Call process()");
Rs_seen.set(rgrp_pos_map[realLabel->second]);
std::string r = std::string("R") + std::to_string(realLabel->second);
RGroupColumn &col = groups[r];
if (molidx && col.size() < (size_t)(molidx - 1)) col.resize(molidx - 1);
col.push_back(rgroup->second->combinedMol);
}
// add empty entries to columns where this molecule didn't appear
for (const auto rpr : rgrp_pos_map) {
if (!Rs_seen[rpr.second]) {
std::string r = std::string("R") + std::to_string(rpr.first);
groups[r].push_back(boost::make_shared<RWMol>());
}
}
}
return groups;
}
namespace {
std::vector<unsigned int> Decomp(RGroupDecomposition &decomp,
const std::vector<ROMOL_SPTR> &mols) {
std::vector<unsigned int> unmatched;
for (size_t i = 0; i < mols.size(); ++i) {
int v = decomp.add(*mols[i].get());
if (v == -1) unmatched.push_back(i);
}
decomp.process();
return unmatched;
}
} // namespace
unsigned int RGroupDecompose(const std::vector<ROMOL_SPTR> &cores,
const std::vector<ROMOL_SPTR> &mols,
RGroupRows &rows,
std::vector<unsigned int> *unmatchedIndices,
const RGroupDecompositionParameters &options) {
RGroupDecomposition decomp(cores, options);
std::vector<unsigned int> unmatched = Decomp(decomp, mols);
if (unmatchedIndices) *unmatchedIndices = unmatched;
rows = decomp.getRGroupsAsRows();
return mols.size() - unmatched.size();
}
unsigned int RGroupDecompose(const std::vector<ROMOL_SPTR> &cores,
const std::vector<ROMOL_SPTR> &mols,
RGroupColumns &columns,
std::vector<unsigned int> *unmatchedIndices,
const RGroupDecompositionParameters &options) {
RGroupDecomposition decomp(cores, options);
std::vector<unsigned int> unmatched = Decomp(decomp, mols);
if (unmatchedIndices) *unmatchedIndices = unmatched;
columns = decomp.getRGroupsAsColumns();
return mols.size() - unmatched.size();
}
} // namespace RDKit