Files
rdkit/Code/GraphMol/Subgraphs/Subgraphs.cpp
2007-11-14 06:09:52 +00:00

515 lines
18 KiB
C++
Executable File

// $Id$
//
// Copyright (C) 2003-2006 Rational Discovery LLC
//
// @@ All Rights Reserved @@
//
#include <GraphMol/RDKitBase.h>
#include "Subgraphs.h"
#include "SubgraphUtils.h"
#include <RDGeneral/utils.h>
#include <iostream>
#include <algorithm>
namespace RDKit {
namespace Subgraphs {
INT_INT_VECT_MAP getNbrsList(const ROMol &mol, bool useHs) {
int nAtoms = mol.getNumAtoms();
// create a list of neighbors for each bond
// The python version (subgraph.py) did not take care of atoms being
// hydrogens (and if a user would like to ignore them)
// a list of list of bonds is no longer appropriate here
// some bond IDs may be missing to index the list on.
// so using an associative container.
INT_INT_VECT_MAP nbrs;
for (int i = 0; i < nAtoms; i++) {
const Atom *atom = mol.getAtomWithIdx(i);
// if are at a hydrogen and we are not interested in bonds connecting to them
// move on
if( atom->getAtomicNum()!=1 || useHs ){
ROMol::OEDGE_ITER bIt1,end;
ROMol::GRAPH_MOL_BOND_PMAP::const_type pMap = mol.getBondPMap();
boost::tie(bIt1,end) = mol.getAtomBonds(atom);
while(bIt1!=end){
Bond *bond1 = pMap[*bIt1];
// if this bond connect to a hydrogen and we are not interested
// in it ignore
if( bond1->getOtherAtom(atom)->getAtomicNum() != 1 || useHs ){
int bid1 = bond1->getIdx();
if (nbrs.find(bid1) == nbrs.end()) {
INT_VECT nlst;
nbrs[bid1] = nlst;
}
ROMol::OEDGE_ITER bIt2;
bIt2 = mol.getAtomBonds(atom).first;
while(bIt2 != end){
Bond *bond2 = pMap[*bIt2];
int bid2 = bond2->getIdx();
if (bid1 != bid2 &&
(bond2->getOtherAtom(atom)->getAtomicNum() != 1 || useHs ) ){
nbrs[bid1].push_back(bid2); //FIX: pathListType should probably be container of pointers ??
}
++bIt2;
}
}
++bIt1;
}
}
}
return nbrs;
}
// Everything passed here by reference
void recurseWalk(INT_INT_VECT_MAP &nbrs, // neighbors for each bond
PATH_TYPE &spath, // the current path to be build upon
INT_VECT &cands, // neighbors of current path
unsigned int targetLen, // the maximum subgraph len we are interested in
INT_VECT forbidden, // bonds that have been covered already
// we don't want reference passing for forbidden,
// it gets altered through the processand we want
// fresh start everytime we buble back up to "FindAllSubGraphs"
PATH_LIST &res // the final list of subgraphs
)
{
// end case for recursion
if (spath.size() == targetLen) {
res.push_back(spath);
return;
}
// if the path is already bigger than target length don't do anything
if (spath.size() > targetLen) {
return;
}
// we have the cndidates that can be used to add to the existing path
// try extending the subgraphs
while (cands.size() != 0) {
int next = cands.back(); // start with the last one in the candidate list
cands.pop_back();
//cands.erase(remove(cands.begin(), cands.end(), next), cands.end());
if (std::find(forbidden.begin(), forbidden.end(), next) == forbidden.end()) {
// this bond should not appear in the later subgraphs
forbidden.push_back(next);
// update a local stack before the next recursive call
INT_VECT tstack = cands;
for (INT_VECT::iterator bid=nbrs[next].begin(); bid != nbrs[next].end(); bid++) {
if (std::find(forbidden.begin(), forbidden.end(), *bid) == forbidden.end()) {
tstack.push_back(*bid);
}
}
PATH_TYPE tpath = spath;
tpath.push_back(next);
recurseWalk(nbrs,tpath, tstack, targetLen, forbidden, res);
}
}
}
// Everything passed here by reference
void recurseWalkRange(INT_INT_VECT_MAP &nbrs, // neighbors for each bond
PATH_TYPE &spath, // the current path to be build upon
INT_VECT &cands, // neighbors of current path
unsigned int lowerLen, // lower limit of the subgraph lengths we are interested in
unsigned int upperLen, // the maximum subgraph len we are interested in
INT_VECT forbidden, // bonds that have been covered already
// we don't want reference passing for forbidden,
// it gets altered through the processand we want
// fresh start everytime we buble back up to "FindAllSubGraphs"
INT_PATH_LIST_MAP &res // the final list of subgraphs
)
{
unsigned int nsize = spath.size();
if ((nsize >= lowerLen) && (nsize <= upperLen)) {
if (res.find(nsize) == res.end()) {
PATH_LIST ordern;
res[nsize] = ordern;
}
res[nsize].push_back(spath);
}
// end case for recursion
if (nsize == upperLen) {
return;
}
// if the path is already bigger than desired size
if (nsize > upperLen) {
return;
}
// we have the cndidates that can be used to add to the existing path
// try extending the subgraphs
while (cands.size() != 0) {
int next = cands.back(); // start with the last one in the candidate list
cands.pop_back();
//cands.erase(remove(cands.begin(), cands.end(), next), cands.end());
if (std::find(forbidden.begin(), forbidden.end(), next) == forbidden.end()) {
// this bond should not appear in the later subgraphs
forbidden.push_back(next);
// update a local stack before the next recursive call
INT_VECT tstack = cands;
for (INT_VECT::iterator bid=nbrs[next].begin(); bid != nbrs[next].end(); bid++) {
if (std::find(forbidden.begin(), forbidden.end(), *bid) == forbidden.end()) {
tstack.push_back(*bid);
}
}
PATH_TYPE tpath = spath;
tpath.push_back(next);
recurseWalkRange(nbrs,tpath, tstack, lowerLen, upperLen, forbidden, res);
}
}
}
void dumpVIV(VECT_INT_VECT v){
VECT_INT_VECT::iterator i;
INT_VECT::iterator j;
for(i=v.begin();i!=v.end();i++){
for(j=i->begin();j!=i->end();j++){
std::cout << *j << " ";
}
std::cout << std::endl;
}
}
PATH_LIST
extendPaths(int *adjMat,unsigned int dim,const PATH_LIST &paths,int allowRingClosures=-1)
{
PRECONDITION(adjMat,"no matrix");
//
// extend each of the currently active paths by adding
// a single adjacent index to the end of each
//
PATH_LIST res;
PATH_LIST::const_iterator path;
for(path=paths.begin(); path!=paths.end(); path++){
unsigned int endIdx = (*path)[path->size()-1];
unsigned int iTab = endIdx*dim;
for(unsigned int otherIdx = 0; otherIdx < dim; otherIdx++){
if( adjMat[iTab+otherIdx] == 1){
// test 1: make sure the new atom is not already
// in the path
PATH_TYPE::const_iterator loc;
loc = std::find(path->begin(),path->end(),otherIdx);
// The two conditions for adding the atom are:
// 1) it's not there already
// 2) it's there, but ring closures are allowed and this
// will be the last addition to the path.
if ( loc == path->end() ){
// the easy case
PATH_TYPE newPath=*path;
newPath.push_back(otherIdx);
res.push_back(newPath);
} else if (allowRingClosures>2 &&
path->size()==allowRingClosures-1) {
// We *might* be adding the atom, but we need to make sure
// that we're not just duplicating the second to last
// element of the path:
PATH_TYPE::const_reverse_iterator rIt=path->rbegin();
rIt++;
if( *rIt != otherIdx ){
INT_VECT newPath=*path;
newPath.push_back(otherIdx);
res.push_back(newPath);
}
}
}
}
}
return res;
}
PATH_LIST
pathFinderHelper(int *adjMat,unsigned int dim,unsigned int targetLen)
{
PRECONDITION(adjMat,"no matrix");
// finds all paths of length N using an adjacency matrix,
// which is constructed elsewhere
int startLength=0;
PATH_LIST paths;
paths.clear();
// start a path at each possible index
for(unsigned int i=0;i<dim;i++){
PATH_TYPE tPath;
tPath.push_back(i);
paths.push_back(tPath);
}
// and build them up one index at a time:
startLength = 1;
for(unsigned int length=startLength;length<targetLen;length++){
// extend each path:
paths = extendPaths(adjMat,dim,paths,targetLen);
}
PATH_LIST newPaths;
// now loop through the paths to make sure that we don't
// have any that are reverse duplicates:
for(PATH_LIST::iterator path=paths.begin(); path != paths.end(); path++ ){
if( std::find(newPaths.begin(),newPaths.end(),*path) == newPaths.end() ){
PATH_TYPE revPath(*path);
std::reverse(revPath.begin(),revPath.end());
if( std::find(newPaths.begin(),newPaths.end(),revPath) == newPaths.end() ){
// this one is a keeper;
newPaths.push_back(*path);
}
}
}
return newPaths;
}
} // end of Subgraphs namespace
PATH_LIST findAllSubgraphsOfLengthN (const ROMol &mol, unsigned int targetLen,
bool useHs){
/*********************************************
FIX: Lots of issues here:
- pathListType is defined as a container of "pathType", should it be a container
of "pointers to pathtype"
- to make few things clear it might be useful to typdef a "subgraphListType"
even if it is exactly same as the "pathListType", just to not confuse between
path vs. subgraph definitions
- To make it consistent with the python version of this function in "subgraph.py"
it return a "list of paths" instead of a "list of list of paths" (see
"GetPathsUpTolength" in "molgraphs.cpp")
****************************************************************************/
INT_VECT forbidden;
// this should be the only dependence on mol object:
INT_INT_VECT_MAP nbrs = Subgraphs::getNbrsList(mol, useHs);
// Start path at each bond
PATH_LIST res;
INT_VECT cands;
PATH_TYPE spath;
// start paths at each bond:
INT_INT_VECT_MAP::iterator nbi;
for (nbi = nbrs.begin(); nbi != nbrs.end(); nbi++) {
// don't come back to this bond in the later subgraphs
int i = (*nbi).first;
if (! (std::find(forbidden.begin(), forbidden.end(), i) == forbidden.end())) {
continue;
}
forbidden.push_back(i);
// start the recursive path building with the current bond
spath.clear();
spath.push_back(i);
// neighbors of this bond are the next candidates
cands = nbrs[i];
// now call teh recursive function
// little bit different from the python version
// the result list of paths is passed as a reference, instead of on the fly
// appending
Subgraphs::recurseWalk(nbrs, spath, cands, targetLen, forbidden, res);
}
nbrs.clear();
return res;
}
INT_PATH_LIST_MAP findAllSubgraphsOfLengthsMtoN(const ROMol &mol, unsigned int lowerLen,
unsigned int upperLen, bool useHs){
CHECK_INVARIANT(lowerLen <= upperLen, "");
INT_VECT forbidden;
INT_INT_VECT_MAP nbrs = Subgraphs::getNbrsList(mol, useHs);
// Start path at each bond
INT_PATH_LIST_MAP res;
for (unsigned int idx = lowerLen; idx <= upperLen; idx++) {
PATH_LIST ordern;
res[idx] = ordern;
}
INT_VECT cands;
PATH_TYPE spath;
// start paths at each bond:
INT_INT_VECT_MAP::iterator nbi;
for (nbi = nbrs.begin(); nbi != nbrs.end(); nbi++) {
// don't come back to this bond in the later subgraphs
int i = (*nbi).first;
if (! (std::find(forbidden.begin(), forbidden.end(), i) == forbidden.end())) {
continue;
}
forbidden.push_back(i);
// start the recursive path building with the current bond
spath.clear();
spath.push_back(i);
// neighbors of this bond are the next candidates
cands = nbrs[i];
// now call teh recursive function
// little bit different from the python version
// the result list of paths is passed as a reference, instead of on the fly
// appending
Subgraphs::recurseWalkRange(nbrs, spath, cands, lowerLen, upperLen, forbidden, res);
}
nbrs.clear();
return res; //FIX : need some verbose testing code here
}
PATH_LIST findUniqueSubgraphsOfLengthN (const ROMol &mol, unsigned int targetLen,
bool useHs,bool useBO)
{
// start by finding all subgraphs, then uniquify
PATH_LIST allSubgraphs=findAllSubgraphsOfLengthN(mol,targetLen,useHs);
PATH_LIST res = Subgraphs::uniquifyPaths(mol,allSubgraphs,useBO);
return res;
}
// ----------------------------------------------
//
// You may find yourself wondering: "what's the difference between
// a subgraph and path?" Well, let me tell you: there's a big
// diffference!
//
// Subgraphs are potentially branched, whereas paths (in our
// terminology at least) cannot be. So, the following graph:
//
// C--0--C--1--C--3--C
// |
// 2
// |
// C
// has 3 subgraphs of length 3: (0,1,2),(0,1,3),(2,1,3)
// but only 2 paths of length 3: (0,1,3),(2,1,3)
//
//
// ----------------------------------------------
//
// Args:
//
// adjMat: the adjacency matrix
// dim: number of rows in the adjacency matrix
// paths: paths to be extended
// allowRingClosures: if > 2, paths will be allowed where the final
// point is a duplicate (closes rings). In this case,
// allowRingClosures should equal the target path length.
//
PATH_LIST
findAllPathsOfLengthN(const ROMol &mol,unsigned int targetLen,bool useBonds,
bool useHs) {
//
// We can't be clever here and just use the bond adjacency matrix
// to solve this problem when useBonds is true. This is because
// the bond adjacency matrices for the molecules C1CC1 and CC(C)C
// are indistinguishable. In the second case, t-butane (and
// anything else with a T junction), we'll get some subgraphs mixed
// in with the paths. So we have to construct paths of atoms and
// then convert them into bond paths.
//
int *adjMat,dim;
PATH_LIST res;
res.clear();
dim = mol.getNumAtoms();
adjMat = new int[dim*dim];
memset((void *)adjMat,0,dim*dim*sizeof(int));
// generate the adjacency matrix by hand by looping over the bonds
ROMol::ConstBondIterator bondIt;
for(bondIt=mol.beginBonds();bondIt!=mol.endBonds();bondIt++){
Atom *beg=(*bondIt)->getBeginAtom();
Atom *end=(*bondIt)->getEndAtom();
// check for H, which we might be skipping
if(useHs || (beg->getAtomicNum()!=1 && end->getAtomicNum()!=1)){
adjMat[beg->getIdx()*dim+end->getIdx()] = 1;
adjMat[end->getIdx()*dim+beg->getIdx()] = 1;
}
}
// if we're using bonds, we'll need to find paths of length N+1,
// then convert them
if(useBonds) targetLen+=1;
// find the paths themselves
PATH_LIST atomPaths=Subgraphs::pathFinderHelper(adjMat,dim,targetLen);
// clean up the adjacency matrix
delete [] adjMat;
PATH_LIST bondPaths;
PATH_LIST::const_iterator vivI;
PATH_TYPE::const_iterator ivI;
//
// convert the atom paths into bond paths:
// We need to do this to be able to uniquify paths in a
// straightforward manner, see below.
//
//
if(useBonds || targetLen>1){
//bondPaths.reserve(atomPaths.size());
for(vivI=atomPaths.begin();vivI!=atomPaths.end();vivI++){
const PATH_TYPE &resi=*vivI;
PATH_TYPE locV;
locV.reserve(targetLen);
for(unsigned int j=0;j<targetLen-1;j++){
const Bond *bond=mol.getBondBetweenAtoms(resi[j],resi[j+1]);
locV.push_back(bond->getIdx());
}
bondPaths.push_back(locV);
}
}
//--------------------------------------------------------
// loop through all the paths we have and make sure that there are
// no duplicates (duplicate = contains identical bond indices)
//
// We'll use the standard product-of-primes trick to compute
// invariants for each path in order to identify duples.
//
// We need to use the bond paths for this duplicate finding
// because, in rings, there can be many paths which share atom
// indices but which have different bond compositions. For example,
// there is only one "atom unique" path of length 5 bonds (6 atoms)
// through a 6-ring, but there are six bond paths.
//
std::vector<double> invars;
INT_VECT keepers;
if(useBonds || targetLen>1){
PATH_LIST::const_iterator bondPath,atomPath;
for(bondPath=bondPaths.begin(),atomPath=atomPaths.begin();
bondPath!=bondPaths.end();
bondPath++,atomPath++){
double invar=1.0;
for(ivI=bondPath->begin();ivI!=bondPath->end();ivI++){
invar *= firstThousandPrimes[*ivI];
}
if(std::find(invars.begin(),invars.end(),invar)==invars.end()){
invars.push_back(invar);
// this is a new one:
if(useBonds){
res.push_back(*bondPath);
}
else{
res.push_back(*atomPath);
}
}
}
} else {
res = atomPaths;
}
return res;
}
} // end of RDKit namespace