Files
rdkit/Code/GraphMol/ChemTransforms/MolFragmenter.cpp
Greg Landrum b0f140a55c Fixes #511
2016-03-29 16:56:15 +02:00

546 lines
18 KiB
C++

// $Id$
//
// Copyright (C) 2013 Greg Landrum
//
// @@ All Rights Reserved @@
// This file is part of the RDKit.
// The contents are covered by the terms of the BSD license
// which is included in the file license.txt, found at the root
// of the RDKit source tree.
//
#include "MolFragmenter.h"
#include <RDGeneral/utils.h>
#include <RDGeneral/Invariant.h>
#include <RDGeneral/RDLog.h>
#include <RDGeneral/Exceptions.h>
#include <GraphMol/RDKitBase.h>
#include <boost/dynamic_bitset.hpp>
#include <boost/tokenizer.hpp>
#include <boost/algorithm/string/trim.hpp>
#include <boost/algorithm/string.hpp>
#include <boost/cstdint.hpp>
#include <vector>
#include <algorithm>
#include <GraphMol/SmilesParse/SmilesParse.h>
#include <GraphMol/Substruct/SubstructMatch.h>
#include <RDGeneral/StreamOps.h>
#include <RDGeneral/BoostStartInclude.h>
#include <boost/flyweight.hpp>
#include <boost/flyweight/no_tracking.hpp>
#include <boost/functional/hash.hpp>
#include <RDGeneral/BoostEndInclude.h>
#include <sstream>
namespace RDKit {
namespace MolFragmenter {
std::size_t hash_value(const FragmenterBondType &fbt) {
size_t res = boost::hash<int>()((int)fbt.bondType);
boost::hash_combine(res, fbt.atom1Label);
boost::hash_combine(res, fbt.atom2Label);
return res;
}
bool operator==(const FragmenterBondType &v1, const FragmenterBondType &v2) {
return (v1.atom1Label == v2.atom1Label) && (v1.atom2Label == v2.atom2Label) &&
(v1.bondType == v2.bondType);
}
void constructFragmenterAtomTypes(
std::istream *inStream, std::map<unsigned int, std::string> &defs,
const std::string &comment, bool validate,
std::map<unsigned int, ROMOL_SPTR> *environs) {
PRECONDITION(inStream, "no stream");
defs.clear();
unsigned int line = 0;
while (!inStream->eof()) {
++line;
std::string tempStr = getLine(inStream);
if (tempStr == "" || tempStr.find(comment) == 0) continue;
std::vector<std::string> tokens;
boost::split(tokens, tempStr, boost::is_any_of(" \t"),
boost::token_compress_on);
if (tokens.size() < 2) {
BOOST_LOG(rdWarningLog) << "line " << line << " is too short"
<< std::endl;
continue;
}
unsigned int idx = boost::lexical_cast<unsigned int>(tokens[0]);
if (defs.find(idx) != defs.end()) {
BOOST_LOG(rdWarningLog)
<< "definition #" << idx
<< " encountered more than once. Using the first occurance."
<< std::endl;
continue;
}
if (validate || environs) {
ROMol *p = SmartsToMol(tokens[1]);
if (!p) {
BOOST_LOG(rdWarningLog) << "cannot convert SMARTS " << tokens[1]
<< " to molecule at line " << line << std::endl;
continue;
}
if (!environs) {
delete p;
} else {
(*environs)[idx] = ROMOL_SPTR(p);
}
}
defs[idx] = tokens[1];
}
}
void constructFragmenterAtomTypes(
const std::string &str, std::map<unsigned int, std::string> &defs,
const std::string &comment, bool validate,
std::map<unsigned int, ROMOL_SPTR> *environs) {
std::stringstream istr(str);
constructFragmenterAtomTypes(&istr, defs, comment, validate, environs);
}
void constructBRICSAtomTypes(std::map<unsigned int, std::string> &defs,
std::map<unsigned int, ROMOL_SPTR> *environs) {
/*
After some discussion, the L2 definitions ("N.pl3" in the original
paper) have been removed and incorporated into a (almost) general
purpose amine definition in L5 ("N.sp3" in the paper).
The problem is one of consistency.
Based on the original definitions you should get the following
fragmentations:
C1CCCCC1NC(=O)C -> C1CCCCC1N[2*].[1*]C(=O)C
c1ccccc1NC(=O)C -> c1ccccc1[16*].[2*]N[2*].[1*]C(=O)C
This difference just didn't make sense to us. By switching to
the unified definition we end up with:
C1CCCCC1NC(=O)C -> C1CCCCC1[15*].[5*]N[5*].[1*]C(=O)C
c1ccccc1NC(=O)C -> c1ccccc1[16*].[5*]N[5*].[1*]C(=O)C
*/
const std::string BRICSdefs =
"1 [C;D3]([#0,#6,#7,#8])(=O)\n\
3 [O;D2]-;!@[#0,#6,#1]\n\
5 [N;!D1;!$(N=*);!$(N-[!#6;!#16;!#0;!#1]);!$([N;R]@[C;R]=O)]\n\
9 [n;+0;$(n(:[c,n,o,s]):[c,n,o,s])]\n\
10 [N;R;$(N(@C(=O))@[C,N,O,S])]\n\
11 [S;D2](-;!@[#0,#6])\n\
12 [S;D4]([#6,#0])(=O)(=O)\n\
6 [C;D3;!R](=O)-;!@[#0,#6,#7,#8]\n\
13 [C;$(C(-;@[C,N,O,S])-;@[N,O,S])]\n\
14 [c;$(c(:[c,n,o,s]):[n,o,s])]\n\
15 [C;$(C(-;@C)-;@C)]\n\
4 [C;!D1;!$(C=*)]-;!@[#6]\n\
7 [C;D2,D3]-[#6]\n\
8 [C;!R;!D1;!$(C!-*)]\n\
16 [c;$(c(:c):c)]";
constructFragmenterAtomTypes(BRICSdefs, defs, "//", true, environs);
}
void constructFragmenterBondTypes(
std::istream *inStream,
const std::map<unsigned int, std::string> &atomTypes,
std::vector<FragmenterBondType> &defs, const std::string &comment,
bool validate, bool labelByConnector) {
PRECONDITION(inStream, "no stream");
defs.clear();
defs.resize(0);
unsigned int line = 0;
while (!inStream->eof()) {
++line;
std::string tempStr = getLine(inStream);
if (tempStr == "" || tempStr.find(comment) == 0) continue;
std::vector<std::string> tokens;
boost::split(tokens, tempStr, boost::is_any_of(" \t"),
boost::token_compress_on);
if (tokens.size() < 3) {
BOOST_LOG(rdWarningLog) << "line " << line << " is too short"
<< std::endl;
continue;
}
unsigned int idx1 = boost::lexical_cast<unsigned int>(tokens[0]);
if (atomTypes.find(idx1) == atomTypes.end()) {
BOOST_LOG(rdWarningLog) << "atom type #" << idx1 << " not recognized."
<< std::endl;
continue;
}
unsigned int idx2 = boost::lexical_cast<unsigned int>(tokens[1]);
if (atomTypes.find(idx2) == atomTypes.end()) {
BOOST_LOG(rdWarningLog) << "atom type #" << idx2 << " not recognized."
<< std::endl;
continue;
}
std::string sma1 = atomTypes.find(idx1)->second;
std::string sma2 = atomTypes.find(idx2)->second;
std::string smarts = "[$(" + sma1 + ")]" + tokens[2] + "[$(" + sma2 + ")]";
ROMol *p = SmartsToMol(smarts);
if (validate) {
if (!p) {
BOOST_LOG(rdWarningLog) << "cannot convert SMARTS " << smarts
<< " to molecule at line " << line << std::endl;
continue;
}
}
FragmenterBondType fbt;
fbt.atom1Type = idx1;
fbt.atom2Type = idx2;
if (labelByConnector) {
fbt.atom1Label = idx1;
fbt.atom2Label = idx2;
} else {
fbt.atom1Label = idx2;
fbt.atom2Label = idx1;
}
if (p) {
// for the purposes of replacing the bond, we'll use just the first
// character to set the bond type (if we recognize it):
switch (tokens[2][0]) {
case '-':
fbt.bondType = Bond::SINGLE;
break;
case '=':
fbt.bondType = Bond::DOUBLE;
break;
case '#':
fbt.bondType = Bond::TRIPLE;
break;
case ':':
fbt.bondType = Bond::AROMATIC;
break;
default:
fbt.bondType = p->getBondWithIdx(0)->getBondType();
}
fbt.query = ROMOL_SPTR(p);
} else {
fbt.bondType = Bond::UNSPECIFIED;
fbt.query = ROMOL_SPTR();
}
defs.push_back(fbt);
}
}
void constructFragmenterBondTypes(
const std::string &str,
const std::map<unsigned int, std::string> &atomTypes,
std::vector<FragmenterBondType> &defs, const std::string &comment,
bool validate, bool labelByConnector) {
std::stringstream istr(str);
constructFragmenterBondTypes(&istr, atomTypes, defs, comment, validate,
labelByConnector);
}
void constructBRICSBondTypes(std::vector<FragmenterBondType> &defs) {
const std::string BRICSdefs =
"// L1\n\
1 3 -;!@\n\
1 5 -;!@\n\
1 10 -;!@\n\
// L3 \n\
3 4 -;!@\n\
3 13 -;!@\n\
3 14 -;!@\n\
3 15 -;!@\n\
3 16 -;!@\n\
// L4\n\
4 5 -;!@\n\
4 11 -;!@\n\
// L5\n\
5 12 -;!@\n\
5 14 -;!@\n\
5 16 -;!@\n\
5 13 -;!@\n\
5 15 -;!@\n\
// L6\n\
6 13 -;!@\n\
6 14 -;!@\n\
6 15 -;!@\n\
6 16 -;!@\n\
// L7\n\
7 7 =;!@\n\
// L8\n\
8 9 -;!@\n\
8 10 -;!@\n\
8 13 -;!@\n\
8 14 -;!@\n\
8 15 -;!@\n\
8 16 -;!@\n\
// L9\n\
9 13 -;!@ // not in original paper\n\
9 14 -;!@ // not in original paper\n\
9 15 -;!@\n\
9 16 -;!@\n\
// L10\n\
10 13 -;!@\n\
10 14 -;!@\n\
10 15 -;!@\n\
10 16 -;!@\n\
// L11\n\
11 13 -;!@\n\
11 14 -;!@\n\
11 15 -;!@\n\
11 16 -;!@\n\
// L12\n\
// none left\n\
// L13\n\
13 14 -;!@\n\
13 15 -;!@\n\
13 16 -;!@\n\
// L14\n\
14 14 -;!@ // not in original paper\n\
14 15 -;!@\n\
14 16 -;!@\n\
// L15\n\
15 16 -;!@\n\
// L16\n\
16 16 -;!@ // not in original paper";
std::map<unsigned int, std::string> atTypes;
constructBRICSAtomTypes(atTypes);
constructFragmenterBondTypes(BRICSdefs, atTypes, defs, "//", true, false);
}
namespace {
boost::uint64_t nextBitCombo(boost::uint64_t v) {
// code from:
// http://graphics.stanford.edu/~seander/bithacks.html#NextBitPermutation
boost::uint64_t t = (v | (v - 1)) + 1;
return t | ((((t & -t) / (v & -v)) >> 1) - 1);
}
}
void fragmentOnSomeBonds(
const ROMol &mol, const std::vector<unsigned int> &bondIndices,
std::vector<ROMOL_SPTR> &resMols, unsigned int maxToCut, bool addDummies,
const std::vector<std::pair<unsigned int, unsigned int> > *dummyLabels,
const std::vector<Bond::BondType> *bondTypes,
std::vector<std::vector<unsigned int> > *nCutsPerAtom) {
PRECONDITION((!dummyLabels || dummyLabels->size() == bondIndices.size()),
"bad dummyLabel vector");
PRECONDITION((!bondTypes || bondTypes->size() == bondIndices.size()),
"bad bondType vector");
if (bondIndices.size() > 63)
throw ValueErrorException("currently can only fragment on up to 63 bonds");
if (!maxToCut || !mol.getNumAtoms() || !bondIndices.size()) return;
boost::uint64_t state = (0x1 << maxToCut) - 1;
boost::uint64_t stop = 0x1 << bondIndices.size();
std::vector<unsigned int> fragmentHere(maxToCut);
std::vector<std::pair<unsigned int, unsigned int> > *dummyLabelsHere = NULL;
if (dummyLabels) {
dummyLabelsHere =
new std::vector<std::pair<unsigned int, unsigned int> >(maxToCut);
}
std::vector<Bond::BondType> *bondTypesHere = NULL;
if (bondTypes) {
bondTypesHere = new std::vector<Bond::BondType>(maxToCut);
}
while (state < stop) {
unsigned int nSeen = 0;
for (unsigned int i = 0; i < bondIndices.size() && nSeen < maxToCut; ++i) {
if (state & (0x1 << i)) {
fragmentHere[nSeen] = bondIndices[i];
if (dummyLabelsHere) (*dummyLabelsHere)[nSeen] = (*dummyLabels)[i];
if (bondTypesHere) (*bondTypesHere)[nSeen] = (*bondTypes)[i];
++nSeen;
}
}
std::vector<unsigned int> *lCutsPerAtom = 0;
if (nCutsPerAtom) {
nCutsPerAtom->push_back(std::vector<unsigned int>(mol.getNumAtoms()));
lCutsPerAtom = &(nCutsPerAtom->back());
}
ROMol *nm = fragmentOnBonds(mol, fragmentHere, addDummies, dummyLabelsHere,
bondTypesHere, lCutsPerAtom);
resMols.push_back(ROMOL_SPTR(nm));
state = nextBitCombo(state);
}
delete dummyLabelsHere;
delete bondTypesHere;
}
namespace {
void checkChiralityPostMove(const ROMol &mol, const Atom *oAt, Atom *nAt,
const Bond *bond) {
INT_LIST newOrder;
ROMol::OEDGE_ITER beg, end;
boost::tie(beg, end) = mol.getAtomBonds(oAt);
while (beg != end) {
const BOND_SPTR obond = mol[*beg];
++beg;
if (obond.get() == bond) {
continue;
}
newOrder.push_back(obond->getIdx());
}
newOrder.push_back(bond->getIdx());
unsigned int nSwaps = oAt->getPerturbationOrder(newOrder);
if (nSwaps % 2) nAt->invertChirality();
}
}
ROMol *fragmentOnBonds(
const ROMol &mol, const std::vector<unsigned int> &bondIndices,
bool addDummies,
const std::vector<std::pair<unsigned int, unsigned int> > *dummyLabels,
const std::vector<Bond::BondType> *bondTypes,
std::vector<unsigned int> *nCutsPerAtom) {
PRECONDITION((!dummyLabels || dummyLabels->size() == bondIndices.size()),
"bad dummyLabel vector");
PRECONDITION((!bondTypes || bondTypes->size() == bondIndices.size()),
"bad bondType vector");
PRECONDITION((!nCutsPerAtom || nCutsPerAtom->size() == mol.getNumAtoms()),
"bad nCutsPerAtom vector");
if (nCutsPerAtom) {
BOOST_FOREACH (unsigned int &nCuts, *nCutsPerAtom) { nCuts = 0; }
}
RWMol *res = new RWMol(mol);
if (!mol.getNumAtoms()) return res;
std::vector<Bond *> bondsToRemove;
bondsToRemove.reserve(bondIndices.size());
BOOST_FOREACH (unsigned int bondIdx, bondIndices) {
bondsToRemove.push_back(res->getBondWithIdx(bondIdx));
}
for (unsigned int i = 0; i < bondsToRemove.size(); ++i) {
const Bond *bond = bondsToRemove[i];
unsigned int bidx = bond->getBeginAtomIdx();
unsigned int eidx = bond->getEndAtomIdx();
Bond::BondType bT = bond->getBondType();
res->removeBond(bidx, eidx);
if (nCutsPerAtom) {
(*nCutsPerAtom)[bidx] += 1;
(*nCutsPerAtom)[eidx] += 1;
}
if (addDummies) {
Atom *at1, *at2;
at1 = new Atom(0);
at2 = new Atom(0);
if (dummyLabels) {
at1->setIsotope((*dummyLabels)[i].first);
at2->setIsotope((*dummyLabels)[i].second);
} else {
at1->setIsotope(bidx);
at2->setIsotope(eidx);
}
unsigned int idx1 = res->addAtom(at1, false, true);
if (bondTypes) bT = (*bondTypes)[i];
res->addBond(eidx, at1->getIdx(), bT);
unsigned int idx2 = res->addAtom(at2, false, true);
res->addBond(bidx, at2->getIdx(), bT);
// figure out if we need to change the stereo tags on the atoms:
if (mol.getAtomWithIdx(bidx)->getChiralTag() ==
Atom::CHI_TETRAHEDRAL_CCW ||
mol.getAtomWithIdx(bidx)->getChiralTag() ==
Atom::CHI_TETRAHEDRAL_CW) {
checkChiralityPostMove(mol, mol.getAtomWithIdx(bidx),
res->getAtomWithIdx(bidx),
mol.getBondBetweenAtoms(bidx, eidx));
}
if (mol.getAtomWithIdx(eidx)->getChiralTag() ==
Atom::CHI_TETRAHEDRAL_CCW ||
mol.getAtomWithIdx(eidx)->getChiralTag() ==
Atom::CHI_TETRAHEDRAL_CW) {
checkChiralityPostMove(mol, mol.getAtomWithIdx(eidx),
res->getAtomWithIdx(eidx),
mol.getBondBetweenAtoms(bidx, eidx));
}
for (ROMol::ConformerIterator confIt = res->beginConformers();
confIt != res->endConformers(); ++confIt) {
Conformer *conf = (*confIt).get();
conf->setAtomPos(idx1, conf->getAtomPos(bidx));
conf->setAtomPos(idx2, conf->getAtomPos(eidx));
}
} else {
// was github issue 429
Atom *tatom = res->getAtomWithIdx(bidx);
if (tatom->getIsAromatic() && tatom->getAtomicNum() != 6) {
tatom->setNumExplicitHs(tatom->getNumExplicitHs() + 1);
}
tatom = res->getAtomWithIdx(eidx);
if (tatom->getIsAromatic() && tatom->getAtomicNum() != 6) {
tatom->setNumExplicitHs(tatom->getNumExplicitHs() + 1);
}
}
}
res->clearComputedProps();
return static_cast<ROMol *>(res);
}
ROMol *fragmentOnBonds(const ROMol &mol,
const std::vector<FragmenterBondType> &bondPatterns,
const std::map<unsigned int, ROMOL_SPTR> *atomEnvirons,
std::vector<unsigned int> *nCutsPerAtom) {
PRECONDITION((!nCutsPerAtom || nCutsPerAtom->size() == mol.getNumAtoms()),
"bad nCutsPerAtom vector");
std::vector<unsigned int> bondIndices;
std::vector<std::pair<unsigned int, unsigned int> > dummyLabels;
std::vector<Bond::BondType> bondTypes;
std::map<unsigned int, bool> environsMatch;
if (atomEnvirons) {
for (std::map<unsigned int, ROMOL_SPTR>::const_iterator iter =
atomEnvirons->begin();
iter != atomEnvirons->end(); ++iter) {
MatchVectType mv;
environsMatch[iter->first] = SubstructMatch(mol, *(iter->second), mv);
}
}
boost::dynamic_bitset<> bondsUsed(mol.getNumBonds(), 0);
// the bond definitions are organized (more or less) general -> specific, so
// loop
// over them backwards
BOOST_REVERSE_FOREACH(const FragmenterBondType &fbt, bondPatterns) {
if (fbt.query->getNumAtoms() != 2 || fbt.query->getNumBonds() != 1) {
BOOST_LOG(rdErrorLog)
<< "fragmentation queries must have 2 atoms and 1 bond" << std::endl;
continue;
}
if (atomEnvirons &&
(!environsMatch[fbt.atom1Type] || !environsMatch[fbt.atom2Type]))
continue;
// std::cerr<<" >>> "<<fbt.atom1Label<<" "<<fbt.atom2Label<<std::endl;
std::vector<MatchVectType> bondMatches;
SubstructMatch(mol, *fbt.query.get(), bondMatches);
BOOST_FOREACH (const MatchVectType &mv, bondMatches) {
const Bond *bond = mol.getBondBetweenAtoms(mv[0].second, mv[1].second);
// std::cerr<<" "<<bond->getIdx()<<std::endl;
TEST_ASSERT(bond);
if (bondsUsed[bond->getIdx()]) {
// BOOST_LOG(rdWarningLog)<<"bond #"<<bond->getIdx()<<" matched multiple
// times in decomposition. Later matches ignored."<<std::endl;
continue;
}
bondsUsed.set(bond->getIdx());
bondIndices.push_back(bond->getIdx());
if (bond->getBeginAtomIdx() == static_cast<unsigned int>(mv[0].second)) {
dummyLabels.push_back(std::make_pair(fbt.atom1Label, fbt.atom2Label));
} else {
dummyLabels.push_back(std::make_pair(fbt.atom2Label, fbt.atom1Label));
}
bondTypes.push_back(fbt.bondType);
}
}
return fragmentOnBonds(mol, bondIndices, true, &dummyLabels, &bondTypes,
nCutsPerAtom);
}
boost::flyweight<std::vector<FragmenterBondType>,
boost::flyweights::no_tracking>
bondPatterns;
boost::flyweight<std::map<unsigned int, ROMOL_SPTR>,
boost::flyweights::no_tracking>
atomEnvs;
ROMol *fragmentOnBRICSBonds(const ROMol &mol) {
if (bondPatterns.get().size() == 0) {
std::map<unsigned int, std::string> adefs;
std::map<unsigned int, ROMOL_SPTR> aenvs;
constructBRICSAtomTypes(adefs, &aenvs);
atomEnvs = aenvs;
std::vector<FragmenterBondType> tbondPatterns;
constructBRICSBondTypes(tbondPatterns);
bondPatterns = tbondPatterns;
}
return fragmentOnBonds(mol, bondPatterns, &(atomEnvs.get()));
}
} // end of namespace MolFragmenter
} // end of namespace RDKit