Files
rdkit/Code/GraphMol/ReducedGraphs/ReducedGraphs.cpp
2015-10-18 10:01:00 -04:00

239 lines
9.0 KiB
C++

//
// Copyright (C) 2013 Greg Landrum
//
// @@ All Rights Reserved @@
// This file is part of the RDKit.
// The contents are covered by the terms of the BSD license
// which is included in the file license.txt, found at the root
// of the RDKit source tree.
//
#include <GraphMol/RDKitBase.h>
#include <GraphMol/SmilesParse/SmilesParse.h>
#include <GraphMol/SmilesParse/SmilesWrite.h>
#include <GraphMol/Substruct/SubstructMatch.h>
#include <Numerics/Vector.h>
#include "ReducedGraphs.h"
#include <boost/flyweight.hpp>
#include <boost/flyweight/key_value.hpp>
#include <boost/flyweight/no_tracking.hpp>
#include <boost/ref.hpp>
//#define VERBOSE_FINGERPRINTING 1
namespace RDKit{
namespace {
// FIX: this is duplicated here and in the MorganFingerprints code
class ss_matcher {
public:
ss_matcher() {};
ss_matcher(const std::string &pattern){
RDKit::RWMol *p=RDKit::SmartsToMol(pattern);
TEST_ASSERT(p);
m_matcher.reset(p);
};
//const RDKit::ROMOL_SPTR &getMatcher() const { return m_matcher; };
const RDKit::ROMol *getMatcher() const { return m_matcher.get(); };
private:
RDKit::ROMOL_SPTR m_matcher;
};
}
namespace ReducedGraphs {
// Definitions for feature points adapted from:
// Gobbi and Poppinger, Biotech. Bioeng. _61_ 47-54 (1998)
const int nFeatures=5;
const char *smartsPatterns[nFeatures]={
"[$([N;!H0;v3,v4&+1]),\
$([O,S;H1;+0]),\
n&H1&+0]", // Donor
"[$([O,S;H1;v2;!$(*-*=[O,N,P,S])]),\
$([O;H0;v2]),\
$([O,S;v1;-]),\
$([N;v3;!$(N-*=[O,N,P,S])]),\
n&H0&+0,\
$([o;+0;!$([o]:n);!$([o]:c:n)])]", // Acceptor
"[#7;+,\
$([N;H2&+0][$([C,a]);!$([C,a](=O))]),\
$([N;H1&+0]([$([C,a]);!$([C,a](=O))])[$([C,a]);!$([C,a](=O))]),\
$([N;H0&+0]([C;!$(C(=O))])([C;!$(C(=O))])[C;!$(C(=O))])]", // Positive
"[$([C,S](=[O,S,P])-[O;H1,-1])]", // Negative
"[$([C;D3,D4](-[CH3])-[CH3]),$([S;D2](-C)-C)]" // Hydrophobic, note that this needs to be last. // FIX: not at all sure that this is right
};
std::vector<std::string> defaultFeatureSmarts(smartsPatterns,smartsPatterns+nFeatures);
typedef boost::flyweight<boost::flyweights::key_value<std::string,ss_matcher>,boost::flyweights::no_tracking > pattern_flyweight;
void getErGAtomTypes(const ROMol &mol,
std::vector<boost::dynamic_bitset<> > &types,
std::vector<const ROMol *> *patterns=0){
unsigned int nAtoms=mol.getNumAtoms();
std::vector<const ROMol *> featureMatchers;
if(!patterns){
featureMatchers.reserve(defaultFeatureSmarts.size());
for(std::vector<std::string>::const_iterator smaIt=defaultFeatureSmarts.begin();
smaIt!=defaultFeatureSmarts.end();++smaIt){
const ROMol *matcher=pattern_flyweight(*smaIt).get().getMatcher();
CHECK_INVARIANT(matcher,"bad smarts");
featureMatchers.push_back(matcher);
}
patterns=&featureMatchers;
}
types.resize(patterns->size());
for(unsigned int i=0;i<patterns->size();++i){
types[i].resize(nAtoms);
types[i].reset();
//unsigned int mask=1<<i;
std::vector<MatchVectType> matchVect;
// to maintain thread safety, we have to copy the pattern
// molecules:
SubstructMatch(mol,ROMol(*(*patterns)[i],true),matchVect);
for(std::vector<MatchVectType>::const_iterator mvIt=matchVect.begin();
mvIt!=matchVect.end();++mvIt){
types[i].set((*mvIt)[0].second);
}
}
} // end of getAtomTypes;
RDNumeric::DoubleVector *getErGFingerprint(const ROMol &mol,
std::vector<boost::dynamic_bitset<> > *atomTypes,
double fuzzIncrement,
unsigned int minPath,
unsigned int maxPath){
ROMol *rg=generateMolExtendedReducedGraph(mol,atomTypes);
#ifdef VERBOSE_FINGERPRINTING
rg->updatePropertyCache(false);
std::cerr<<" reduced graph smiles: "<<MolToSmiles(*rg,false,false,-1,false)<<std::endl;
#endif
RDNumeric::DoubleVector *res=generateErGFingerprintForReducedGraph(*rg,atomTypes,fuzzIncrement,minPath,maxPath);
delete rg;
return res;
}
RDNumeric::DoubleVector *generateErGFingerprintForReducedGraph(const ROMol &mol,
std::vector<boost::dynamic_bitset<> > *atomTypes,
double fuzzIncrement,
unsigned int minPath,
unsigned int maxPath){
PRECONDITION(maxPath>minPath,"maxPath<=minPath");
// FIX: this isn't doing the special handling for flip/flop bits
unsigned int nTypes=nFeatures;
if(atomTypes) nTypes = atomTypes->size();
nTypes += 1; // need the extra one for aromatic features
unsigned int nBins = maxPath-minPath;
unsigned int vSize = (nTypes*(nTypes+1))/2 * (maxPath-minPath+1);
RDNumeric::DoubleVector *res=new RDNumeric::DoubleVector(vSize,0.0);
// we need the topological distance matrix:
double *dm=MolOps::getDistanceMat(mol);
// cache the atom type vectors:
std::vector<std::list<int> > tvs;
tvs.reserve(mol.getNumAtoms());
for(ROMol::ConstAtomIterator atIt=mol.beginAtoms();atIt!=mol.endAtoms();++atIt){
const std::list<int> &tv=(*atIt)->getProp<std::list<int> >("_ErGAtomTypes");
tvs.push_back(tv);
}
// now loop and set the bits between features
for(unsigned int i=0;i<mol.getNumAtoms();++i){
if(tvs[i].empty()) continue;
#ifdef VERBOSE_FINGERPRINTING
std::cerr<<" atom: "<<i<<" ";
std::copy(tvs[i].begin(),tvs[i].end(),std::ostream_iterator<int>(std::cerr,", "));
std::cerr<<std::endl;
#endif
for(unsigned int j=i+1;j<mol.getNumAtoms();++j){
if(tvs[j].empty()) continue;
int dist = int(dm[i*mol.getNumAtoms()+j]);
if(dist<minPath || dist>maxPath) continue;
BOOST_FOREACH(int ti,tvs[i]){
BOOST_FOREACH(int tj,tvs[j]){
int ijMin=std::min(ti,tj);
int ijMax=std::max(ti,tj);
int block;
if(ijMin>0){
block= ijMin*nTypes - (ijMin*(ijMin+1))/2;
} else {
block = 0;
}
block += ijMax;
unsigned int bin=block*nBins + (dist-minPath);
(*res)[bin] += 1;
if(dist>minPath && fuzzIncrement>0) (*res)[bin-1] += fuzzIncrement;
if(dist<maxPath && fuzzIncrement>0) (*res)[bin+1] += fuzzIncrement;
}
}
}
}
return res;
}
ROMol *generateMolExtendedReducedGraph(const ROMol &mol,
std::vector<boost::dynamic_bitset<> > *atomTypes
){
std::vector<boost::dynamic_bitset<> > *latomTypes=0;
if(!atomTypes){
latomTypes = new std::vector<boost::dynamic_bitset<> >();
atomTypes = latomTypes;
getErGAtomTypes(mol,*atomTypes);
}
RWMol *res = new RWMol(mol);
const int aliphaticFlag = atomTypes->size()-1; // the last type
const int aromaticFlag = atomTypes->size();
for(ROMol::AtomIterator atIt=res->beginAtoms();atIt!=res->endAtoms();++atIt){
std::list<int> tv;
tv.clear();
for(unsigned int i=0;i<atomTypes->size();++i){
if((*atomTypes)[i][(*atIt)->getIdx()]) tv.push_back(i);
}
(*atIt)->setProp("_ErGAtomTypes",tv);
}
// start by adding dummies at the ring centroids
BOOST_FOREACH(const INT_VECT &ring,mol.getRingInfo()->atomRings()){
if(ring.size()<8){
int nIdx=res->addAtom(new Atom(0),false,true);
int nAromatic=0,nSP2=0;
BOOST_FOREACH(int idx,ring){
res->addBond(idx,nIdx,Bond::SINGLE);
if(mol.getAtomWithIdx(idx)->getIsAromatic()){
++nAromatic;
} else if(mol.getAtomWithIdx(idx)->getHybridization()==Atom::SP2) {
++nSP2;
}
}
std::list<int> tv;
if(nAromatic>=2 || nSP2 >= ring.size()/2) tv.push_back(aromaticFlag);
else tv.push_back(aliphaticFlag);
res->getAtomWithIdx(nIdx)->setProp("_ErGAtomTypes",tv);
}
}
// now remove any degree-two ring atoms that have no features:
for(int i=mol.getNumAtoms()-1;i>=0;--i){
if(mol.getRingInfo()->numAtomRings(i) && mol.getAtomWithIdx(i)->getDegree()==2 &&
res->getAtomWithIdx(i)->getProp<std::list<int> >("_ErGAtomTypes").empty()) {
res->removeAtom(i);
}
}
// FIX: still need to do the "highly fused rings" simplification for things like adamantane
if(latomTypes) delete latomTypes;
return res;
}
} // end of namespace ReducedGraphs
} // end of namespace RDKit