mirror of
https://github.com/rdkit/rdkit.git
synced 2026-06-04 21:54:27 +08:00
1017 lines
34 KiB
C++
1017 lines
34 KiB
C++
// Copyright (c) 2017, Novartis Institutes for BioMedical Research Inc.
|
|
// All rights reserved.
|
|
//
|
|
// Redistribution and use in source and binary forms, with or without
|
|
// modification, are permitted provided that the following conditions are
|
|
// met:
|
|
//
|
|
// * Redistributions of source code must retain the above copyright
|
|
// notice, this list of conditions and the following disclaimer.
|
|
// * Redistributions in binary form must reproduce the above
|
|
// copyright notice, this list of conditions and the following
|
|
// disclaimer in the documentation and/or other materials provided
|
|
// with the distribution.
|
|
// * Neither the name of Novartis Institutes for BioMedical Research Inc.
|
|
// nor the names of its contributors may be used to endorse or promote
|
|
// products derived from this software without specific prior written
|
|
// permission.
|
|
//
|
|
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
|
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
|
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
|
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
|
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
|
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
|
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
|
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
|
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
//
|
|
#include "RGroupDecomp.h"
|
|
|
|
#include <GraphMol/RDKitBase.h>
|
|
#include <GraphMol/Substruct/SubstructMatch.h>
|
|
#include <GraphMol/SmilesParse/SmilesWrite.h>
|
|
#include <GraphMol/SmilesParse/SmartsWrite.h>
|
|
#include <GraphMol/SmilesParse/SmilesParse.h>
|
|
#include <GraphMol/ChemTransforms/ChemTransforms.h>
|
|
#include <GraphMol/FMCS/FMCS.h>
|
|
#include <boost/scoped_ptr.hpp>
|
|
#include <set>
|
|
#include <vector>
|
|
|
|
//#define DEBUG
|
|
|
|
namespace RDKit {
|
|
|
|
// Attachment Points
|
|
// labeled cores => isotopes
|
|
// atom mappings
|
|
// atom indices => use -1 - atom index, range is [-1, ...., -num_atoms]
|
|
namespace {
|
|
const std::string RLABEL = "tempRlabel";
|
|
const std::string SIDECHAIN_RLABELS = "sideChainRlabels";
|
|
|
|
bool setLabel(Atom *atom, int label, std::set<int> &labels, int &maxLabel,
|
|
bool relabel, const std::string &type) {
|
|
if (type == "IsotopeLabels") {
|
|
atom->setIsotope(0);
|
|
}
|
|
|
|
if (label) {
|
|
if (labels.find(label) != labels.end()) {
|
|
if (relabel)
|
|
label = maxLabel + 1;
|
|
else
|
|
// XXX FIX me - get label id
|
|
throw ValueErrorException(
|
|
std::string("Duplicate label in input, current type is:") + type);
|
|
}
|
|
|
|
atom->setProp<int>(RLABEL, label);
|
|
labels.insert(label);
|
|
maxLabel = label + 1;
|
|
return true;
|
|
}
|
|
return false;
|
|
}
|
|
|
|
bool hasDummy(const RWMol &core) {
|
|
for (RWMol::ConstAtomIterator atIt = core.beginAtoms();
|
|
atIt != core.endAtoms(); ++atIt) {
|
|
if ((*atIt)->getAtomicNum() == 0) return true;
|
|
}
|
|
return false;
|
|
}
|
|
}
|
|
|
|
bool RGroupDecompositionParameters::prepareCore(RWMol &core,
|
|
const RWMol *alignCore) {
|
|
const bool relabel = labels & RelabelDuplicateLabels;
|
|
if (alignCore && (alignment & MCS)) {
|
|
std::vector<ROMOL_SPTR> mols;
|
|
mols.push_back(ROMOL_SPTR(new ROMol(core)));
|
|
mols.push_back(ROMOL_SPTR(new ROMol(*alignCore)));
|
|
MCSResult res = findMCS(mols);
|
|
if (res.isCompleted()) {
|
|
RWMol *m = SmartsToMol(res.SmartsString);
|
|
if (m) {
|
|
MatchVectType match1;
|
|
MatchVectType match2;
|
|
|
|
bool target_matched1 = SubstructMatch(core, *m, match1);
|
|
bool target_matched2 = SubstructMatch(*alignCore, *m, match2);
|
|
CHECK_INVARIANT(match1.size() == match2.size(),
|
|
"Matches should be the same size in prepareCore");
|
|
|
|
if (target_matched1 && target_matched2) {
|
|
for (size_t i = 0; i < match1.size(); ++i) {
|
|
int queryAtomIdx1 = match1[i].first;
|
|
int coreAtomIdx = match1[i].second;
|
|
int queryAtomIdx2 = match2[i].first;
|
|
int alignCoreAtomIdx = match2[i].second;
|
|
CHECK_INVARIANT(queryAtomIdx1 == queryAtomIdx2,
|
|
"query atoms aren't the same");
|
|
const Atom *coreAtm = core.getAtomWithIdx(coreAtomIdx);
|
|
const Atom *alignCoreAtm =
|
|
alignCore->getAtomWithIdx(alignCoreAtomIdx);
|
|
int rlabel = alignCoreAtm->getProp<int>(RLABEL);
|
|
coreAtm->setProp(RLABEL, rlabel);
|
|
}
|
|
}
|
|
delete m;
|
|
}
|
|
}
|
|
}
|
|
|
|
std::set<int> foundLabels;
|
|
|
|
int maxLabel = 0;
|
|
int nextOffset = 0;
|
|
std::map<int, int> atomToLabel;
|
|
|
|
for (RWMol::AtomIterator atIt = core.beginAtoms(); atIt != core.endAtoms();
|
|
++atIt) {
|
|
Atom *atom = *atIt;
|
|
bool found = false;
|
|
|
|
if (atom->hasProp(RLABEL)) found = true;
|
|
|
|
if (!found && (labels & IsotopeLabels)) {
|
|
if (setLabel(atom, rdcast<int>(atom->getIsotope()), foundLabels, maxLabel,
|
|
relabel, "IsotopeLabels"))
|
|
found = true;
|
|
}
|
|
|
|
if (!found && (labels & AtomMapLabels)) {
|
|
if (setLabel(atom, rdcast<int>(atom->getAtomMapNum()), foundLabels,
|
|
maxLabel, relabel, "AtomMapLabels"))
|
|
found = true;
|
|
}
|
|
|
|
if (!found && (labels & AtomIndexLabels)) {
|
|
if (setLabel(atom, indexOffset - atom->getIdx(), foundLabels, maxLabel,
|
|
relabel, "IndexLabels"))
|
|
nextOffset++;
|
|
found = true;
|
|
}
|
|
|
|
int rlabel;
|
|
if (atom->getPropIfPresent(RLABEL, rlabel)) {
|
|
atomToLabel[atom->getIdx()] = rlabel;
|
|
}
|
|
}
|
|
|
|
indexOffset -= nextOffset;
|
|
|
|
MolOps::AdjustQueryParameters adjustParams;
|
|
adjustParams.makeDummiesQueries = true;
|
|
adjustParams.adjustDegree = onlyMatchAtRGroups
|
|
? MolOps::AdjustDegree::HeavyDegree
|
|
: MolOps::AdjustDegree::NoAdjust;
|
|
adjustQueryProperties(core, &adjustParams);
|
|
|
|
for (std::map<int, int>::iterator it = atomToLabel.begin();
|
|
it != atomToLabel.end(); ++it)
|
|
core.getAtomWithIdx(it->first)->setProp(RLABEL, it->second);
|
|
|
|
return true;
|
|
}
|
|
|
|
namespace {
|
|
// RGroup Class to hold the attached bits
|
|
struct RGroupData {
|
|
boost::shared_ptr<RWMol> combinedMol;
|
|
std::vector<boost::shared_ptr<ROMol> > mols; // All the mols in the rgroup
|
|
std::set<std::string> smilesSet; // used for rgroup equivalence
|
|
std::string
|
|
smiles; // smiles for all the mols in the rgroup (with attachments)
|
|
std::set<int> attachments; // attachment points
|
|
|
|
void add(boost::shared_ptr<ROMol> newMol,
|
|
const std::vector<int> &rlabel_attachments) {
|
|
// some fragments can be add multiple times if they are cyclic
|
|
for (size_t i = 0; i < mols.size(); ++i) {
|
|
if (newMol.get() == mols[i].get()) return;
|
|
}
|
|
|
|
std::copy(rlabel_attachments.begin(), rlabel_attachments.end(),
|
|
std::inserter(attachments, attachments.end()));
|
|
|
|
mols.push_back(newMol);
|
|
std::string smi = MolToSmiles(*newMol, true);
|
|
smilesSet.insert(smi);
|
|
if (!combinedMol.get()) {
|
|
combinedMol = boost::shared_ptr<RWMol>(new RWMol(*mols[0].get()));
|
|
} else {
|
|
ROMol *m = combineMols(*combinedMol.get(), *newMol.get());
|
|
combinedMol.reset(new RWMol(*m));
|
|
delete m;
|
|
}
|
|
smiles = getSmiles();
|
|
}
|
|
|
|
std::map<int, int> getNumBondsToRlabels() const {
|
|
std::map<int, int> rlabelsUsedCount;
|
|
|
|
for (ROMol::AtomIterator atIt = combinedMol->beginAtoms();
|
|
atIt != combinedMol->endAtoms(); ++atIt) {
|
|
Atom *atom = *atIt;
|
|
int rlabel;
|
|
if (atom->getPropIfPresent<int>(RLABEL, rlabel))
|
|
rlabelsUsedCount[rlabel] += 1;
|
|
}
|
|
return rlabelsUsedCount;
|
|
}
|
|
|
|
bool isHydrogen() const { // is the rgroup all Hs
|
|
for (size_t i = 0; i < mols.size(); ++i) {
|
|
for (ROMol::AtomIterator atIt = mols[i]->beginAtoms();
|
|
atIt != mols[i]->endAtoms(); ++atIt) {
|
|
if ((*atIt)->getAtomicNum() > 1) return false;
|
|
}
|
|
}
|
|
return true;
|
|
}
|
|
|
|
private:
|
|
std::string getSmiles()
|
|
const { // compute the canonical smiles for the attachments
|
|
std::string s;
|
|
for (std::set<std::string>::const_iterator it = smilesSet.begin();
|
|
it != smilesSet.end(); ++it) {
|
|
s += *it;
|
|
}
|
|
return s;
|
|
}
|
|
};
|
|
}
|
|
|
|
namespace {
|
|
struct RGroupMatch {
|
|
size_t core_idx;
|
|
std::map<int, RGroupData> rgroups;
|
|
RGroupMatch(size_t core_index, const std::map<int, RGroupData> &input_rgroups)
|
|
: core_idx(core_index), rgroups(input_rgroups) {}
|
|
};
|
|
|
|
struct CartesianProduct {
|
|
std::vector<size_t> permutation;
|
|
std::vector<size_t> sizes;
|
|
size_t maxPermutations;
|
|
size_t permutationCount;
|
|
CartesianProduct(const std::vector<size_t> &inputSizes)
|
|
: permutation(inputSizes.size(), 0),
|
|
sizes(inputSizes),
|
|
permutationCount(0) {
|
|
maxPermutations = 1;
|
|
for (size_t i = 0; i < sizes.size(); ++i)
|
|
maxPermutations *= sizes[i]; // may overflow....
|
|
}
|
|
|
|
bool next() {
|
|
++permutationCount;
|
|
if (permutationCount == 1) {
|
|
return true;
|
|
}
|
|
|
|
return increment(0);
|
|
}
|
|
|
|
bool increment(size_t rowToIncrement) {
|
|
if (permutationCount > maxPermutations) return false;
|
|
|
|
permutation[rowToIncrement] += 1;
|
|
size_t max_index_of_row = sizes[rowToIncrement] - 1;
|
|
if (permutation[rowToIncrement] > max_index_of_row) {
|
|
permutation[rowToIncrement] = 0;
|
|
return increment(rowToIncrement + 1);
|
|
}
|
|
return true;
|
|
}
|
|
};
|
|
|
|
// stupid total score
|
|
double score(const std::vector<size_t> &permutation,
|
|
const std::vector<std::vector<RGroupMatch> > &matches,
|
|
const std::set<int> &labels) {
|
|
double score = 1.;
|
|
|
|
#ifdef DEBUG
|
|
std::cerr << "---------------------------------------------------"
|
|
<< std::endl;
|
|
std::cerr << "Scoring permutation " << std::endl;
|
|
#endif
|
|
|
|
for (std::set<int>::const_iterator label = labels.begin();
|
|
label != labels.end(); ++label) {
|
|
int l = *label;
|
|
#ifdef DEBUG
|
|
std::cerr << "Label: " << l << std::endl;
|
|
#endif
|
|
std::map<std::string, int> matchSet;
|
|
std::map<std::set<int>, int> linkerMatchSet;
|
|
// std::map<std::vector<int>, int > attachMatch;
|
|
for (size_t m = 0; m < permutation.size(); ++m) { // for each molecule
|
|
std::map<int, RGroupData>::const_iterator rg =
|
|
matches[m][permutation[m]].rgroups.find(l);
|
|
if (rg != matches[m][permutation[m]].rgroups.end()) {
|
|
#ifdef DEBUG
|
|
std::cerr << " RGroup: " << rg->second.smiles;
|
|
#endif
|
|
matchSet[rg->second.smiles]++;
|
|
#ifdef DEBUG
|
|
std::cerr << " score: " << matchSet[rg->second.smiles] << std::endl;
|
|
#endif
|
|
// XXX Use fragment counts to see if we are linking cycles?
|
|
if (rg->second.smiles.find(".") == std::string::npos &&
|
|
rg->second.attachments.size() > 1) {
|
|
linkerMatchSet[rg->second.attachments]++;
|
|
#ifdef DEBUG
|
|
std::cerr << " Linker Score: "
|
|
<< linkerMatchSet[rg->second.attachments]++ << std::endl;
|
|
#endif
|
|
}
|
|
}
|
|
}
|
|
|
|
// get the counts for each rgroup found and sort in reverse order
|
|
std::vector<int> equivalentRGroupCount;
|
|
|
|
for (std::map<std::string, int>::const_iterator it = matchSet.begin();
|
|
it != matchSet.end(); ++it) {
|
|
equivalentRGroupCount.push_back(it->second);
|
|
}
|
|
std::sort(equivalentRGroupCount.begin(), equivalentRGroupCount.end(),
|
|
std::greater<int>());
|
|
|
|
double tempScore = 1.;
|
|
// score the sets from the largest to the smallest
|
|
// each smaller set gets penalized (i+1) below
|
|
// 1.0 is the perfect score
|
|
for (size_t i = 0; i < equivalentRGroupCount.size(); ++i) {
|
|
tempScore *=
|
|
equivalentRGroupCount[i] / ((i + 1) * (double)matches.size());
|
|
}
|
|
|
|
// overweight linkers with the same attachments points....
|
|
// because these belong to 2 rgroups we really want these to stay
|
|
// ** this heuristic really should be taken care of above **
|
|
int maxLinkerMatches = 0;
|
|
for (std::map<std::set<int>, int>::const_iterator it =
|
|
linkerMatchSet.begin();
|
|
it != linkerMatchSet.end(); ++it) {
|
|
if (it->second > 1) {
|
|
if (it->second > maxLinkerMatches) maxLinkerMatches = it->second;
|
|
}
|
|
}
|
|
#ifdef DEBUG
|
|
std::cerr << "Max Linker Matches :" << maxLinkerMatches << std::endl;
|
|
#endif
|
|
double increment = 1.0; // no change in score
|
|
double linkerIncrement = 1.0; // no change in score
|
|
if (maxLinkerMatches) {
|
|
linkerIncrement = (double)(maxLinkerMatches) / (double)matches.size();
|
|
} else {
|
|
increment = tempScore;
|
|
}
|
|
|
|
score *= increment * linkerIncrement;
|
|
#ifdef DEBUG
|
|
std::cerr << "Increment: " << increment
|
|
<< " Linker_Increment: " << linkerIncrement << std::endl;
|
|
std::cerr << "increment*linkerIncrement: " << increment * linkerIncrement
|
|
<< std::endl;
|
|
std::cerr << "Score = " << score << std::endl;
|
|
#endif
|
|
}
|
|
|
|
return score;
|
|
}
|
|
}
|
|
|
|
const unsigned int EMPTY_CORE_LABEL = -100000;
|
|
|
|
struct RGroupDecompData {
|
|
// matches[mol_idx] == vector of potential matches
|
|
std::map<int, RWMol> cores;
|
|
std::map<std::string, int> newCores; // new "cores" found along the way
|
|
int newCoreLabel;
|
|
RGroupDecompositionParameters params;
|
|
|
|
std::vector<std::vector<RGroupMatch> > matches;
|
|
std::set<int> labels;
|
|
std::vector<size_t> permutation;
|
|
std::map<int, std::vector<int> > userLabels;
|
|
|
|
std::vector<int> processedRlabels;
|
|
std::map<int, boost::shared_ptr<RWMol> > labelledCores;
|
|
|
|
std::map<int, int> finalRlabelMapping;
|
|
|
|
RGroupDecompData(const RWMol &inputCore,
|
|
const RGroupDecompositionParameters &inputParams)
|
|
: cores(),
|
|
newCores(),
|
|
newCoreLabel(EMPTY_CORE_LABEL),
|
|
params(inputParams) {
|
|
cores[0] = inputCore;
|
|
prepareCores();
|
|
}
|
|
|
|
RGroupDecompData(const std::vector<ROMOL_SPTR> &inputCores,
|
|
const RGroupDecompositionParameters &inputParams)
|
|
: cores(),
|
|
newCores(),
|
|
newCoreLabel(EMPTY_CORE_LABEL),
|
|
params(inputParams) {
|
|
for (size_t i = 0; i < inputCores.size(); ++i) {
|
|
cores[i] = *inputCores[i].get();
|
|
}
|
|
|
|
prepareCores();
|
|
}
|
|
|
|
void prepareCores() {
|
|
size_t idx = 0;
|
|
for (std::map<int, RWMol>::iterator coreIt = cores.begin();
|
|
coreIt != cores.end(); ++coreIt, ++idx) {
|
|
RWMol *alignCore = coreIt->first ? &cores[0] : 0;
|
|
params.prepareCore(coreIt->second, alignCore);
|
|
labelledCores[coreIt->first] =
|
|
boost::shared_ptr<RWMol>(new RWMol(coreIt->second));
|
|
}
|
|
}
|
|
|
|
void setRlabel(Atom *atom, int rlabel) {
|
|
// XXX Fix me - use parameters to decide what to do. Currenty does
|
|
// everything
|
|
if (params.rgroupLabelling & AtomMap) atom->setAtomMapNum(rlabel);
|
|
|
|
if (params.rgroupLabelling & MDLRGroup) {
|
|
std::string dLabel = "R" + boost::lexical_cast<std::string>(rlabel);
|
|
atom->setProp(common_properties::dummyLabel, dLabel);
|
|
setAtomRLabel(atom, rlabel);
|
|
}
|
|
|
|
if (params.rgroupLabelling & Isotope) atom->setIsotope(rlabel);
|
|
}
|
|
|
|
void prune() { // prune all but the current "best" permutation of matches
|
|
for (size_t mol_idx = 0; mol_idx < permutation.size(); ++mol_idx) {
|
|
std::vector<RGroupMatch> keepVector;
|
|
keepVector.push_back(matches[mol_idx][permutation[mol_idx]]);
|
|
matches[mol_idx] = keepVector;
|
|
}
|
|
permutation = std::vector<size_t>(matches.size(), 0);
|
|
}
|
|
|
|
// Return the RGroups with the current "best" permutation
|
|
// of matches.
|
|
std::vector<RGroupMatch> GetCurrentBestPermutation() const {
|
|
const bool removeAllHydrogenRGroups = params.removeAllHydrogenRGroups;
|
|
|
|
std::vector<RGroupMatch> result; // std::map<int, RGroup> > result;
|
|
for (size_t i = 0; i < permutation.size(); ++i) {
|
|
PRECONDITION(i < matches.size(), "Best Permutation mol idx out of range");
|
|
PRECONDITION(permutation[i] < matches[i].size(),
|
|
"Selected match at permutation out of range");
|
|
result.push_back(matches[i][permutation[i]]);
|
|
}
|
|
|
|
if (removeAllHydrogenRGroups) {
|
|
// if a label is all hydrogens, remove it
|
|
for (std::set<int>::const_iterator lit = labels.begin();
|
|
lit != labels.end(); ++lit) {
|
|
bool allH = true;
|
|
for (size_t i = 0; i < result.size(); ++i) {
|
|
std::map<int, RGroupData>::const_iterator rgroup =
|
|
result[i].rgroups.find(*lit);
|
|
if (rgroup == result[i].rgroups.end() ||
|
|
!rgroup->second.isHydrogen()) {
|
|
allH = false;
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (allH) {
|
|
for (size_t i = 0; i < result.size(); ++i) {
|
|
result[i].rgroups.erase(*lit);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
return result;
|
|
}
|
|
|
|
void relabelCore(RWMol &mol, std::map<int, int> &mappings,
|
|
const std::set<int> &userLabels,
|
|
const std::set<int> &indexLabels,
|
|
std::map<int, std::vector<int> > extraAtomRLabels) {
|
|
// Now remap to proper rlabel ids
|
|
// if labels are positive, they come from User labels
|
|
// if they are negative, they come from indices and should be
|
|
// numbered *after* the user labels.
|
|
//
|
|
// Some indices are attached to multiple bonds,
|
|
// these rlabels should be incrementally added last
|
|
int count = 0;
|
|
std::map<int, Atom *> atoms;
|
|
|
|
// a core only has one labelled index
|
|
// a secondary structure extraAtomRLabels contains the number
|
|
// of bonds between this atom and the side chain
|
|
|
|
// a sidechain atom has a vector of the attachments back to the
|
|
// core that takes the place of numBondsToRlabel
|
|
|
|
std::map<int, std::vector<int> > bondsToCore;
|
|
|
|
for (RWMol::AtomIterator atIt = mol.beginAtoms(); atIt != mol.endAtoms();
|
|
++atIt) {
|
|
Atom *atom = *atIt;
|
|
|
|
if (atom->hasProp(RLABEL)) {
|
|
int rlabel = (*atIt)->getProp<int>(RLABEL); // user label
|
|
PRECONDITION(atoms.find(rlabel) == atoms.end(),
|
|
"Duplicate labels in rgroup core!");
|
|
atoms[rlabel] = *atIt;
|
|
}
|
|
}
|
|
|
|
std::vector<std::pair<Atom *, Atom *> > atomsToAdd; // adds -R if necessary
|
|
|
|
// Deal with user supplied labels
|
|
for (std::set<int>::iterator it = userLabels.begin();
|
|
it != userLabels.end(); ++it) {
|
|
std::map<int, Atom *>::iterator atm = atoms.find(*it);
|
|
if (atm == atoms.end()) continue; // label not used in the rgroup
|
|
Atom *atom = atm->second;
|
|
mappings[*it] = ++count;
|
|
|
|
if (atom->getAtomicNum() == 0) { // add to existing dummy/rlabel
|
|
setRlabel(atom, count);
|
|
} else { // adds new rlabel
|
|
Atom *newAt = new Atom(0);
|
|
setRlabel(newAt, count);
|
|
atomsToAdd.push_back(std::make_pair(atom, newAt));
|
|
}
|
|
}
|
|
|
|
// Deal with non-user supplied labels
|
|
for (std::set<int>::iterator it = indexLabels.begin();
|
|
it != indexLabels.end(); ++it) {
|
|
std::map<int, Atom *>::iterator atm = atoms.find(*it);
|
|
if (atm == atoms.end()) continue; // label not used in the rgroup
|
|
|
|
Atom *atom = atm->second;
|
|
mappings[*it] = ++count;
|
|
if (atom->getAtomicNum() == 0) { // add to dummy
|
|
setRlabel(atom, count);
|
|
} else {
|
|
Atom *newAt = new Atom(0);
|
|
setRlabel(newAt, count);
|
|
atomsToAdd.push_back(std::make_pair(atom, newAt));
|
|
}
|
|
}
|
|
|
|
// Deal with multiple bonds to the same label
|
|
for (std::map<int, std::vector<int> >::iterator extraAttachments =
|
|
extraAtomRLabels.begin();
|
|
extraAttachments != extraAtomRLabels.end(); ++extraAttachments) {
|
|
std::map<int, Atom *>::iterator atm = atoms.find(extraAttachments->first);
|
|
if (atm == atoms.end()) continue; // label not used in the rgroup
|
|
Atom *atom = atm->second;
|
|
|
|
for (size_t i = 0; i < extraAttachments->second.size(); ++i) {
|
|
extraAttachments->second[i] = ++count;
|
|
// Is this necessary?
|
|
PRECONDITION(
|
|
atom->getAtomicNum() > 1,
|
|
"Multiple attachements to a dummy (or hydrogen) is weird.");
|
|
Atom *newAt = new Atom(0);
|
|
setRlabel(newAt, count);
|
|
atomsToAdd.push_back(std::make_pair(atom, newAt));
|
|
}
|
|
}
|
|
|
|
for (size_t i = 0; i < atomsToAdd.size(); ++i) {
|
|
mol.addAtom(atomsToAdd[i].second, false, true);
|
|
mol.addBond(atomsToAdd[i].first, atomsToAdd[i].second, Bond::SINGLE);
|
|
}
|
|
}
|
|
|
|
void relabelRGroup(RGroupData &rgroup, const std::map<int, int> &mappings) {
|
|
PRECONDITION(rgroup.combinedMol.get(), "Unprocessed rgroup");
|
|
const std::string done = "RLABEL_PROCESSED";
|
|
if (rgroup.combinedMol->hasProp(done)) return;
|
|
|
|
rgroup.combinedMol->setProp(done, true);
|
|
|
|
RWMol &mol = *rgroup.combinedMol.get();
|
|
|
|
std::vector<std::pair<Atom *, Atom *> > atomsToAdd; // adds -R if necessary
|
|
|
|
for (RWMol::AtomIterator atIt = mol.beginAtoms(); atIt != mol.endAtoms();
|
|
++atIt) {
|
|
Atom *atom = *atIt;
|
|
if (atom->hasProp(SIDECHAIN_RLABELS)) {
|
|
atom->setIsotope(0);
|
|
const std::vector<int> &rlabels =
|
|
atom->getProp<std::vector<int> >(SIDECHAIN_RLABELS);
|
|
// switch on atom mappings or rlabels....
|
|
|
|
for (std::vector<int>::const_iterator rlabel = rlabels.begin();
|
|
rlabel != rlabels.end(); ++rlabel) {
|
|
std::map<int, int>::const_iterator label = mappings.find(*rlabel);
|
|
PRECONDITION(label != mappings.end(), "Unprocessed mapping");
|
|
|
|
if (atom->getAtomicNum() == 0) {
|
|
setRlabel(atom, label->second);
|
|
} else {
|
|
Atom *newAt = new Atom(0);
|
|
setRlabel(newAt, label->second);
|
|
atomsToAdd.push_back(std::make_pair(atom, newAt));
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
for (size_t i = 0; i < atomsToAdd.size(); ++i) {
|
|
mol.addAtom(atomsToAdd[i].second, false, true);
|
|
mol.addBond(atomsToAdd[i].first, atomsToAdd[i].second, Bond::SINGLE);
|
|
}
|
|
}
|
|
|
|
// relabel the core and sidechains using the specified user labels
|
|
// if matches exist for non labelled atoms, these are added as well
|
|
void relabel() {
|
|
std::vector<RGroupMatch> best = GetCurrentBestPermutation();
|
|
|
|
// get the labels used
|
|
std::set<int> userLabels;
|
|
std::set<int> indexLabels;
|
|
|
|
// Go through all the RGroups and find out which labels were
|
|
// actually used.
|
|
|
|
// some atoms will have multiple attachment points, i.e. cycles
|
|
// split these up into new rlabels if necessary
|
|
// These are detected at match time
|
|
// This vector will hold the extra (new) labels required
|
|
std::map<int, std::vector<int> > extraAtomRLabels;
|
|
|
|
for (std::vector<RGroupMatch>::iterator it = best.begin(); it != best.end();
|
|
++it) {
|
|
for (std::map<int, RGroupData>::iterator rit = it->rgroups.begin();
|
|
rit != it->rgroups.end(); ++rit) {
|
|
if (rit->first >= 0) userLabels.insert(rit->first);
|
|
if (rit->first < 0) indexLabels.insert(rit->first);
|
|
|
|
std::map<int, int> rlabelsUsedInRGroup =
|
|
rit->second.getNumBondsToRlabels();
|
|
for (std::map<int, int>::iterator numBondsUsed =
|
|
rlabelsUsedInRGroup.begin();
|
|
numBondsUsed != rlabelsUsedInRGroup.end(); ++numBondsUsed) {
|
|
// Make space for the extra labels
|
|
if (numBondsUsed->second > 1) { // multiple
|
|
extraAtomRLabels[numBondsUsed->first].resize(numBondsUsed->second -
|
|
1);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
finalRlabelMapping.clear();
|
|
for (std::map<int, RWMol>::const_iterator coreIt = cores.begin();
|
|
coreIt != cores.end(); ++coreIt) {
|
|
boost::shared_ptr<RWMol> labelledCore(new RWMol(coreIt->second));
|
|
labelledCores[coreIt->first] = labelledCore;
|
|
|
|
relabelCore(*labelledCore.get(), finalRlabelMapping, userLabels,
|
|
indexLabels, extraAtomRLabels);
|
|
}
|
|
|
|
const std::string done = "RLABEL_PROCESSED";
|
|
for (std::vector<RGroupMatch>::iterator it = best.begin(); it != best.end();
|
|
++it) {
|
|
for (std::map<int, RGroupData>::iterator rit = it->rgroups.begin();
|
|
rit != it->rgroups.end(); ++rit) {
|
|
relabelRGroup(rit->second, finalRlabelMapping);
|
|
}
|
|
}
|
|
}
|
|
|
|
bool process(bool pruneMatches, bool finalize = false) {
|
|
if (matches.size() == 0) return false;
|
|
|
|
// Exhaustive search, get the MxN matrix
|
|
size_t M = matches.size();
|
|
std::vector<size_t> permutations;
|
|
size_t N = 1;
|
|
|
|
for (size_t m = 0; m < M; ++m) {
|
|
size_t sz = matches[m].size();
|
|
permutations.push_back(sz);
|
|
N *= sz;
|
|
}
|
|
|
|
permutation = std::vector<size_t>(permutations.size(), 0);
|
|
|
|
// run through all possible matches and score each
|
|
// set
|
|
double best_score = 0;
|
|
std::vector<size_t> best_permutation = permutation;
|
|
size_t count = 0;
|
|
#ifdef DEBUG
|
|
std::cerr << "Processing" << std::endl;
|
|
#endif
|
|
CartesianProduct iterator(permutations);
|
|
while (iterator.next()) {
|
|
if (count > N) throw ValueErrorException("Next did not finish");
|
|
#ifdef DEBUG
|
|
std::cerr << "**************************************************"
|
|
<< std::endl;
|
|
#endif
|
|
double newscore = score(iterator.permutation, matches, labels);
|
|
if (newscore > best_score) {
|
|
#ifdef DEBUG
|
|
std::cerr << " ===> current best:" << newscore << ">" << best_score
|
|
<< std::endl;
|
|
#endif
|
|
best_score = newscore;
|
|
best_permutation = iterator.permutation;
|
|
}
|
|
}
|
|
|
|
permutation = best_permutation;
|
|
if (pruneMatches || finalize) {
|
|
prune();
|
|
}
|
|
|
|
if (finalize) {
|
|
relabel();
|
|
}
|
|
|
|
return true;
|
|
}
|
|
};
|
|
|
|
RGroupDecomposition::RGroupDecomposition(
|
|
const ROMol &inputCore, const RGroupDecompositionParameters ¶ms)
|
|
: data(new RGroupDecompData(inputCore, params)) {}
|
|
|
|
RGroupDecomposition::RGroupDecomposition(
|
|
const std::vector<ROMOL_SPTR> &cores,
|
|
const RGroupDecompositionParameters ¶ms)
|
|
: data(new RGroupDecompData(cores, params)) {}
|
|
|
|
RGroupDecomposition::~RGroupDecomposition() { delete data; }
|
|
|
|
int RGroupDecomposition::add(const ROMol &inmol) {
|
|
// get the sidechains if possible
|
|
// Add hs for better symmeterization
|
|
RWMol mol(inmol);
|
|
MolOps::addHs(mol);
|
|
|
|
int core_idx = 0;
|
|
const RWMol *core = 0;
|
|
std::vector<MatchVectType> tmatches;
|
|
|
|
// Find the first matching core.
|
|
for (std::map<int, RWMol>::const_iterator coreIt = data->cores.begin();
|
|
coreIt != data->cores.end(); ++coreIt) {
|
|
{
|
|
const bool uniquify = false;
|
|
const bool recursionPossible = false;
|
|
const bool useChirality = true;
|
|
SubstructMatch(mol, coreIt->second, tmatches, uniquify, recursionPossible,
|
|
useChirality);
|
|
}
|
|
|
|
if (!tmatches.size()) {
|
|
continue;
|
|
} else {
|
|
if (tmatches.size() > 1) {
|
|
if (data->matches.size() == 0) {
|
|
// Greedy strategy just grabs the first match and
|
|
// takes the best matches from the rest
|
|
if (data->params.matchingStrategy == Greedy) tmatches.resize(1);
|
|
}
|
|
}
|
|
core = &coreIt->second;
|
|
core_idx = coreIt->first;
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (core == 0) return -1;
|
|
|
|
// strategies
|
|
// ==========
|
|
// Exhaustive - saves all matches and optimizes later exhaustive
|
|
// May never finish due to combinitorial complexity
|
|
// Greedy - matches to *FIRST* available match
|
|
// GreedyChunks - default - process every N chunks
|
|
|
|
// Should probably scan all mols first to find match with
|
|
// smallest number of matches...
|
|
|
|
std::vector<RGroupMatch> potentialMatches;
|
|
|
|
for (size_t match_idx = 0; match_idx < tmatches.size(); ++match_idx) {
|
|
boost::scoped_ptr<ROMol> tMol;
|
|
{
|
|
const bool replaceDummies = false;
|
|
const bool labelByIndex = true;
|
|
const bool requireDummyMatch = false;
|
|
tMol.reset(replaceCore(mol, *core, tmatches[match_idx], replaceDummies,
|
|
labelByIndex, requireDummyMatch));
|
|
}
|
|
|
|
if (tMol) {
|
|
std::map<int, RGroupData> match;
|
|
// rlabel rgroups
|
|
MOL_SPTR_VECT fragments = MolOps::getMolFrags(*tMol, false);
|
|
for (size_t i = 0; i < fragments.size(); ++i) {
|
|
std::vector<int> attachments;
|
|
boost::shared_ptr<ROMol> &newMol = fragments[i];
|
|
for (ROMol::AtomIterator atIt = newMol->beginAtoms();
|
|
atIt != newMol->endAtoms(); ++atIt) {
|
|
Atom *tmp = *atIt;
|
|
unsigned int elno = tmp->getAtomicNum();
|
|
if (elno == 0) {
|
|
unsigned int index =
|
|
tmp->getIsotope(); // this is the index into the core
|
|
// it messes up when there are multiple ?
|
|
int rlabel;
|
|
if (core->getAtomWithIdx(index)->getPropIfPresent(RLABEL, rlabel)) {
|
|
std::vector<int> rlabelsOnSideChain;
|
|
tmp->getPropIfPresent(SIDECHAIN_RLABELS, rlabelsOnSideChain);
|
|
rlabelsOnSideChain.push_back(rlabel);
|
|
tmp->setProp(SIDECHAIN_RLABELS, rlabelsOnSideChain);
|
|
|
|
data->labels.insert(rlabel); // keep track of all labels used
|
|
attachments.push_back(rlabel);
|
|
}
|
|
}
|
|
}
|
|
|
|
if (attachments.size() > 0) {
|
|
// reject multiple attachments?
|
|
// what to do with labelled cores ?
|
|
std::string newCoreSmi = MolToSmiles(*newMol, true);
|
|
|
|
for (size_t attach_idx = 0; attach_idx < attachments.size();
|
|
++attach_idx) {
|
|
int rlabel = attachments[attach_idx];
|
|
match[rlabel].add(newMol, attachments);
|
|
}
|
|
} else {
|
|
// special case, only one fragment
|
|
if (fragments.size() == 1) { // need to make a new core
|
|
// remove the sidechains
|
|
RWMol newCore(mol);
|
|
|
|
for (MatchVectType::const_iterator mvit =
|
|
tmatches[match_idx].begin();
|
|
mvit != tmatches[match_idx].end(); ++mvit) {
|
|
const Atom *coreAtm = core->getAtomWithIdx(mvit->first);
|
|
Atom *newCoreAtm = newCore.getAtomWithIdx(mvit->second);
|
|
int rlabel;
|
|
if (coreAtm->getPropIfPresent(RLABEL, rlabel)) {
|
|
newCoreAtm->setProp<int>(RLABEL, rlabel);
|
|
}
|
|
newCoreAtm->setProp<bool>("keep", true);
|
|
}
|
|
|
|
for (int aIdx = newCore.getNumAtoms() - 1; aIdx >= 0; --aIdx) {
|
|
Atom *atom = newCore.getAtomWithIdx(aIdx);
|
|
if (!atom->hasProp("keep")) newCore.removeAtom(atom);
|
|
}
|
|
if (newCore.getNumAtoms()) {
|
|
std::string newCoreSmi = MolToSmiles(newCore, true);
|
|
// add a new core if possible
|
|
std::map<std::string, int>::iterator newcore =
|
|
data->newCores.find(newCoreSmi);
|
|
int core_idx = 0;
|
|
if (newcore == data->newCores.end()) {
|
|
core_idx = data->newCores[newCoreSmi] = data->newCoreLabel--;
|
|
data->cores[core_idx] = newCore;
|
|
return add(inmol);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
if (match.size()) {
|
|
potentialMatches.push_back(RGroupMatch(core_idx, match));
|
|
}
|
|
}
|
|
}
|
|
if (potentialMatches.size() == 0) {
|
|
BOOST_LOG(rdWarningLog) << "No attachment points in side chains"
|
|
<< std::endl;
|
|
|
|
return -1;
|
|
}
|
|
|
|
size_t N = 1;
|
|
for (size_t m = 0; m < data->matches.size(); ++m) {
|
|
size_t sz = data->matches[m].size();
|
|
N *= sz;
|
|
}
|
|
// oops, exponential is a pain
|
|
if (N * potentialMatches.size() > 100000) {
|
|
data->permutation = std::vector<size_t>(data->matches.size(), 0);
|
|
data->process(true);
|
|
}
|
|
|
|
data->matches.push_back(potentialMatches);
|
|
data->permutation = std::vector<size_t>(data->matches.size(), 0);
|
|
|
|
size_t size = data->matches.size();
|
|
|
|
if (size) {
|
|
if (data->params.matchingStrategy & Greedy ||
|
|
(data->params.matchingStrategy & GreedyChunks && size > 1 &&
|
|
size % data->params.chunkSize == 0))
|
|
data->process(true);
|
|
}
|
|
return data->matches.size() - 1;
|
|
}
|
|
|
|
bool RGroupDecomposition::process() {
|
|
try {
|
|
const bool prune = true;
|
|
const bool finalize = true;
|
|
return data->process(prune, finalize);
|
|
} catch (...) {
|
|
return false;
|
|
}
|
|
}
|
|
|
|
RGroupRows RGroupDecomposition::getRGroupsAsRows() const {
|
|
std::vector<RGroupMatch> permutation = data->GetCurrentBestPermutation();
|
|
|
|
RGroupRows groups;
|
|
|
|
int molidx = 0;
|
|
for (std::vector<RGroupMatch>::iterator it = permutation.begin();
|
|
it != permutation.end(); ++it, ++molidx) {
|
|
// make a new rgroup entry
|
|
groups.push_back(RGroupRow());
|
|
RGroupRow &out_rgroups = groups.back();
|
|
out_rgroups["Core"] = data->labelledCores[it->core_idx];
|
|
|
|
std::map<int, RGroupData> &in_rgroups = it->rgroups;
|
|
|
|
for (std::map<int, RGroupData>::const_iterator rgroup = in_rgroups.begin();
|
|
rgroup != in_rgroups.end(); ++rgroup) {
|
|
std::map<int, int>::const_iterator realLabel =
|
|
data->finalRlabelMapping.find(rgroup->first);
|
|
PRECONDITION(realLabel != data->finalRlabelMapping.end(),
|
|
"unprocessed rlabel, please call process() first.");
|
|
out_rgroups[std::string("R") +
|
|
boost::lexical_cast<std::string>(realLabel->second)] =
|
|
rgroup->second.combinedMol;
|
|
}
|
|
}
|
|
return groups;
|
|
}
|
|
//! return rgroups in column order group[attachment_point][molidx] = ROMol
|
|
RGroupColumns RGroupDecomposition::getRGroupsAsColumns() const {
|
|
std::vector<RGroupMatch> permutation = data->GetCurrentBestPermutation();
|
|
|
|
RGroupColumns groups;
|
|
|
|
unsigned int molidx = 0;
|
|
for (std::vector<RGroupMatch>::iterator it = permutation.begin();
|
|
it != permutation.end(); ++it, ++molidx) {
|
|
std::map<int, RGroupData> &in_rgroups = it->rgroups;
|
|
groups["Core"].push_back(data->labelledCores[it->core_idx]);
|
|
|
|
for (std::map<int, RGroupData>::const_iterator rgroup = in_rgroups.begin();
|
|
rgroup != in_rgroups.end(); ++rgroup) {
|
|
std::map<int, int>::const_iterator realLabel =
|
|
data->finalRlabelMapping.find(rgroup->first);
|
|
PRECONDITION(realLabel != data->finalRlabelMapping.end(),
|
|
"unprocessed rlabel, please call process() first.");
|
|
std::string r = std::string("R") +
|
|
boost::lexical_cast<std::string>(realLabel->second);
|
|
RGroupColumn &col = groups[r];
|
|
if (molidx && col.size() < (size_t)(molidx - 1)) col.resize(molidx - 1);
|
|
col.push_back(rgroup->second.combinedMol);
|
|
}
|
|
}
|
|
// Now make all columns equal
|
|
for (std::map<std::string, RGroupColumn>::iterator it = groups.begin();
|
|
it != groups.end(); ++it) {
|
|
if (it->second.size() != molidx) it->second.resize(molidx);
|
|
}
|
|
return groups;
|
|
}
|
|
}
|