// $Id$ // // Copyright (C) 2003-2006 Rational Discovery LLC // // @@ All Rights Reserved @@ // #include #include "Subgraphs.h" #include "SubgraphUtils.h" #include #include #include namespace Subgraphs { using namespace RDKit; INT_INT_VECT_MAP getNbrsList(const ROMol &mol, bool useHs) { int nAtoms = mol.getNumAtoms(); // create a list of neighbors for each bond // The python version (subgraph.py) did not take care of atoms being // hydrogens (and if a user would like to ignore them) // a list of list of bonds is no longer appropriate here // some bond IDs may be missing to index the list on. // so using an associative container. INT_INT_VECT_MAP nbrs; for (int i = 0; i < nAtoms; i++) { const Atom *atom = mol.getAtomWithIdx(i); // if are at a hydrogen and we are not interested in bonds connecting to them // move on if( atom->getAtomicNum()!=1 || useHs ){ ROMol::OEDGE_ITER bIt1,end; ROMol::GRAPH_MOL_BOND_PMAP::const_type pMap = mol.getBondPMap(); boost::tie(bIt1,end) = mol.getAtomBonds(atom); while(bIt1!=end){ Bond *bond1 = pMap[*bIt1]; // if this bond connect to a hydrogen and we are not interested // in it ignore if( bond1->getOtherAtom(atom)->getAtomicNum() != 1 || useHs ){ int bid1 = bond1->getIdx(); if (nbrs.find(bid1) == nbrs.end()) { INT_VECT nlst; nbrs[bid1] = nlst; } ROMol::OEDGE_ITER bIt2; bIt2 = mol.getAtomBonds(atom).first; while(bIt2 != end){ Bond *bond2 = pMap[*bIt2]; int bid2 = bond2->getIdx(); if (bid1 != bid2 && (bond2->getOtherAtom(atom)->getAtomicNum() != 1 || useHs ) ){ //if (nbrs.find(bid1) == nbrs.end()) { //INT_VECT nlst; //nbrs[bid1] = nlst; //} nbrs[bid1].push_back(bid2); //FIX: pathListType should probably be container of pointers ?? } bIt2++; } } bIt1++; } } } return nbrs; } // Everything passed here by reference void recurseWalk(INT_INT_VECT_MAP &nbrs, // neighbors for each bond PATH_TYPE &spath, // the current path to be build upon INT_VECT &cands, // neighbors of current path unsigned int targetLen, // the maximum subgraph len we are interested in INT_VECT forbidden, // bonds that have been covered already // we don't want reference passing for forbidden, // it gets altered through the processand we want // fresh start everytime we buble back up to "FindAllSubGraphs" PATH_LIST &res // the final list of subgraphs ) { // end case for recursion if (spath.size() == targetLen) { res.push_back(spath); return; } // if the path is already bigger than target length don't do anything if (spath.size() > targetLen) { return; } // we have the cndidates that can be used to add to the existing path // try extending the subgraphs while (cands.size() != 0) { int next = cands.back(); // start with the last one in the candidate list cands.pop_back(); //cands.erase(remove(cands.begin(), cands.end(), next), cands.end()); if (std::find(forbidden.begin(), forbidden.end(), next) == forbidden.end()) { // this bond should not appear in the later subgraphs forbidden.push_back(next); // update a local stack before the next recursive call INT_VECT tstack = cands; for (INT_VECT::iterator bid=nbrs[next].begin(); bid != nbrs[next].end(); bid++) { if (std::find(forbidden.begin(), forbidden.end(), *bid) == forbidden.end()) { tstack.push_back(*bid); } } PATH_TYPE tpath = spath; tpath.push_back(next); recurseWalk(nbrs,tpath, tstack, targetLen, forbidden, res); } } } // Everything passed here by reference void recurseWalkRange(INT_INT_VECT_MAP &nbrs, // neighbors for each bond PATH_TYPE &spath, // the current path to be build upon INT_VECT &cands, // neighbors of current path unsigned int lowerLen, // lower limit of the subgraph lengths we are interested in unsigned int upperLen, // the maximum subgraph len we are interested in INT_VECT forbidden, // bonds that have been covered already // we don't want reference passing for forbidden, // it gets altered through the processand we want // fresh start everytime we buble back up to "FindAllSubGraphs" INT_PATH_LIST_MAP &res // the final list of subgraphs ) { unsigned int nsize = spath.size(); if ((nsize >= lowerLen) && (nsize <= upperLen)) { if (res.find(nsize) == res.end()) { PATH_LIST ordern; res[nsize] = ordern; } res[nsize].push_back(spath); } // end case for recursion if (nsize == upperLen) { return; } // if the path is already bigger than desired size if (nsize > upperLen) { return; } // we have the cndidates that can be used to add to the existing path // try extending the subgraphs while (cands.size() != 0) { int next = cands.back(); // start with the last one in the candidate list cands.pop_back(); //cands.erase(remove(cands.begin(), cands.end(), next), cands.end()); if (std::find(forbidden.begin(), forbidden.end(), next) == forbidden.end()) { // this bond should not appear in the later subgraphs forbidden.push_back(next); // update a local stack before the next recursive call INT_VECT tstack = cands; for (INT_VECT::iterator bid=nbrs[next].begin(); bid != nbrs[next].end(); bid++) { if (std::find(forbidden.begin(), forbidden.end(), *bid) == forbidden.end()) { tstack.push_back(*bid); } } PATH_TYPE tpath = spath; tpath.push_back(next); recurseWalkRange(nbrs,tpath, tstack, lowerLen, upperLen, forbidden, res); } } } void dumpVIV(VECT_INT_VECT v){ VECT_INT_VECT::iterator i; INT_VECT::iterator j; for(i=v.begin();i!=v.end();i++){ for(j=i->begin();j!=i->end();j++){ std::cout << *j << " "; } std::cout << std::endl; } } PATH_LIST extendPaths(int *adjMat,unsigned int dim,const PATH_LIST &paths,int allowRingClosures=-1) { PRECONDITION(adjMat,"no matrix"); // // extend each of the currently active paths by adding // a single adjacent index to the end of each // PATH_LIST res; PATH_LIST::const_iterator path; for(path=paths.begin(); path!=paths.end(); path++){ unsigned int endIdx = (*path)[path->size()-1]; unsigned int iTab = endIdx*dim; for(unsigned int otherIdx = 0; otherIdx < dim; otherIdx++){ if( adjMat[iTab+otherIdx] == 1){ // test 1: make sure the new atom is not already // in the path PATH_TYPE::const_iterator loc; loc = std::find(path->begin(),path->end(),otherIdx); // The two conditions for adding the atom are: // 1) it's not there already // 2) it's there, but ring closures are allowed and this // will be the last addition to the path. if ( loc == path->end() ){ // the easy case PATH_TYPE newPath=*path; newPath.push_back(otherIdx); res.push_back(newPath); } else if (allowRingClosures>2 && path->size()==allowRingClosures-1) { // We *might* be adding the atom, but we need to make sure // that we're not just duplicating the second to last // element of the path: PATH_TYPE::const_reverse_iterator rIt=path->rbegin(); rIt++; if( *rIt != otherIdx ){ INT_VECT newPath=*path; newPath.push_back(otherIdx); res.push_back(newPath); } } } } } return res; } PATH_LIST pathFinderHelper(int *adjMat,unsigned int dim,unsigned int targetLen) { PRECONDITION(adjMat,"no matrix"); // finds all paths of length N using an adjacency matrix, // which is constructed elsewhere int startLength=0; PATH_LIST paths; paths.clear(); // start a path at each possible index for(unsigned int i=0;i>>>>>> len: " << length+1 << std::endl;; //dumpVIV(paths); } PATH_LIST newPaths; // now loop through the paths to make sure that we don't // have any that are reverse duplicates: for(PATH_LIST::iterator path=paths.begin(); path != paths.end(); path++ ){ if( std::find(newPaths.begin(),newPaths.end(),*path) == newPaths.end() ){ PATH_TYPE revPath(*path); std::reverse(revPath.begin(),revPath.end()); if( std::find(newPaths.begin(),newPaths.end(),revPath) == newPaths.end() ){ // this one is a keeper; newPaths.push_back(*path); } } } return newPaths; } } // end of Subgraphs namespace namespace RDKit{ PATH_LIST findAllSubgraphsOfLengthN (const ROMol &mol, unsigned int targetLen, bool useHs, bool verbose) { /********************************************* FIX: Lots of issues here: - pathListType is defined as a container of "pathType", should it be a container of "pointers to pathtype" - to make few things clear it might be useful to typdef a "subgraphListType" even if it is exactly same as the "pathListType", just to not confuse between path vs. subgraph definitions - To make it consistent with the python version of this function in "subgraph.py" it return a "list of paths" instead of a "list of list of paths" (see "GetPathsUpTolength" in "molgraphs.cpp") ****************************************************************************/ INT_VECT forbidden; // this should be the only dependence on mol object: INT_INT_VECT_MAP nbrs = Subgraphs::getNbrsList(mol, useHs); // Start path at each bond PATH_LIST res; INT_VECT cands; PATH_TYPE spath; // start paths at each bond: INT_INT_VECT_MAP::iterator nbi; for (nbi = nbrs.begin(); nbi != nbrs.end(); nbi++) { // don't come back to this bond in the later subgraphs int i = (*nbi).first; if (! (std::find(forbidden.begin(), forbidden.end(), i) == forbidden.end())) { continue; } forbidden.push_back(i); // start the recursive path building with the current bond spath.clear(); spath.push_back(i); // neighbors of this bond are the next candidates cands = nbrs[i]; // now call teh recursive function // little bit different from the python version // the result list of paths is passed as a reference, instead of on the fly // appending Subgraphs::recurseWalk(nbrs, spath, cands, targetLen, forbidden, res); } nbrs.clear(); return res; //FIX : need some verbose testing code here } INT_PATH_LIST_MAP findAllSubgraphsOfLengthsMtoN(const ROMol &mol, unsigned int lowerLen, unsigned int upperLen, bool useHs, bool verbose) { CHECK_INVARIANT(lowerLen <= upperLen, ""); INT_VECT forbidden; INT_INT_VECT_MAP nbrs = Subgraphs::getNbrsList(mol, useHs); // Start path at each bond INT_PATH_LIST_MAP res; for (unsigned int idx = lowerLen; idx <= upperLen; idx++) { PATH_LIST ordern; res[idx] = ordern; } INT_VECT cands; PATH_TYPE spath; // start paths at each bond: INT_INT_VECT_MAP::iterator nbi; for (nbi = nbrs.begin(); nbi != nbrs.end(); nbi++) { // don't come back to this bond in the later subgraphs int i = (*nbi).first; if (! (std::find(forbidden.begin(), forbidden.end(), i) == forbidden.end())) { continue; } forbidden.push_back(i); // start the recursive path building with the current bond spath.clear(); spath.push_back(i); // neighbors of this bond are the next candidates cands = nbrs[i]; // now call teh recursive function // little bit different from the python version // the result list of paths is passed as a reference, instead of on the fly // appending Subgraphs::recurseWalkRange(nbrs, spath, cands, lowerLen, upperLen, forbidden, res); } nbrs.clear(); return res; //FIX : need some verbose testing code here } PATH_LIST findUniqueSubgraphsOfLengthN (const ROMol &mol, unsigned int targetLen, bool useHs,bool useBO) { // start by finding all subgraphs, then uniquify PATH_LIST allSubgraphs=findAllSubgraphsOfLengthN(mol,targetLen,useHs); PATH_LIST res = Subgraphs::uniquifyPaths(mol,allSubgraphs,useBO); return res; } // ---------------------------------------------- // // You may find yourself wondering: "what's the difference between // a subgraph and path?" Well, let me tell you: there's a big // diffference! // // Subgraphs are potentially branched, whereas paths (in our // terminology at least) cannot be. So, the following graph: // // C--0--C--1--C--3--C // | // 2 // | // C // has 3 subgraphs of length 3: (0,1,2),(0,1,3),(2,1,3) // but only 2 paths of length 3: (0,1,3),(2,1,3) // // // ---------------------------------------------- // // Args: // // adjMat: the adjacency matrix // dim: number of rows in the adjacency matrix // paths: paths to be extended // allowRingClosures: if > 2, paths will be allowed where the final // point is a duplicate (closes rings). In this case, // allowRingClosures should equal the target path length. // PATH_LIST findAllPathsOfLengthN(const ROMol &mol,unsigned int targetLen,bool useBonds, bool useHs) { // // We can't be clever here and just use the bond adjacency matrix // to solve this problem when useBonds is true. This is because // the bond adjacency matrices for the molecules C1CC1 and CC(C)C // are indistinguishable. In the second case, t-butane (and // anything else with a T junction), we'll get some subgraphs mixed // in with the paths. So we have to construct paths of atoms and // then convert them into bond paths. // int *adjMat,dim; PATH_LIST res; res.clear(); dim = mol.getNumAtoms(); adjMat = new int[dim*dim]; memset((void *)adjMat,0,dim*dim*sizeof(int)); // generate the adjacency matrix by hand by looping over the bonds ROMol::ConstBondIterator bondIt; for(bondIt=mol.beginBonds();bondIt!=mol.endBonds();bondIt++){ Atom *beg=(*bondIt)->getBeginAtom(); Atom *end=(*bondIt)->getEndAtom(); // check for H, which we might be skipping if(useHs || (beg->getAtomicNum()!=1 && end->getAtomicNum()!=1)){ adjMat[beg->getIdx()*dim+end->getIdx()] = 1; adjMat[end->getIdx()*dim+beg->getIdx()] = 1; } } // if we're using bonds, we'll need to find paths of length N+1, // then convert them if(useBonds) targetLen+=1; // find the paths themselves PATH_LIST atomPaths=Subgraphs::pathFinderHelper(adjMat,dim,targetLen); // clean up the adjacency matrix delete [] adjMat; PATH_LIST bondPaths; PATH_LIST::const_iterator vivI; PATH_TYPE::const_iterator ivI; // // convert the atom paths into bond paths: // We need to do this to be able to uniquify paths in a // straightforward manner, see below. // // if(useBonds || targetLen>1){ //bondPaths.reserve(atomPaths.size()); for(vivI=atomPaths.begin();vivI!=atomPaths.end();vivI++){ const PATH_TYPE &resi=*vivI; PATH_TYPE locV; locV.reserve(targetLen); for(unsigned int j=0;jgetIdx()); } bondPaths.push_back(locV); } } //-------------------------------------------------------- // loop through all the paths we have and make sure that there are // no duplicates (duplicate = contains identical bond indices) // // We'll use the standard product-of-primes trick to compute // invariants for each path in order to identify duples. // // We need to use the bond paths for this duplicate finding // because, in rings, there can be many paths which share atom // indices but which have different bond compositions. For example, // there is only one "atom unique" path of length 5 bonds (6 atoms) // through a 6-ring, but there are six bond paths. // std::vector invars; INT_VECT keepers; if(useBonds || targetLen>1){ PATH_LIST::const_iterator bondPath,atomPath; for(bondPath=bondPaths.begin(),atomPath=atomPaths.begin(); bondPath!=bondPaths.end(); bondPath++,atomPath++){ double invar=1.0; for(ivI=bondPath->begin();ivI!=bondPath->end();ivI++){ invar *= firstThousandPrimes[*ivI]; } if(std::find(invars.begin(),invars.end(),invar)==invars.end()){ invars.push_back(invar); // this is a new one: if(useBonds){ res.push_back(*bondPath); } else{ res.push_back(*atomPath); } } } } else { res = atomPaths; } return res; } } // end of namespace