// // Copyright (C) 2003-2022 Greg Landrum and other RDKit contributors // // @@ All Rights Reserved @@ // This file is part of the RDKit. // The contents are covered by the terms of the BSD license // which is included in the file license.txt, found at the root // of the RDKit source tree. // #include #include "Subgraphs.h" #include "SubgraphUtils.h" #include #include #include #include #include namespace RDKit { namespace Subgraphs { void getNbrsList(const ROMol &mol, bool useHs, INT_INT_VECT_MAP &nbrs) { nbrs.clear(); int nAtoms = mol.getNumAtoms(); // create a list of neighbors for each bond // The python version (subgraph.py) did not take care of atoms being // hydrogens (and if a user would like to ignore them) // a list of list of bonds is no longer appropriate here // some bond IDs may be missing to index the list on. // so using an associative container. for (int i = 0; i < nAtoms; i++) { const Atom *atom = mol.getAtomWithIdx(i); // if are at a hydrogen and we are not interested in bonds connecting to // them // move on if (useHs || atom->getAtomicNum() != 1) { ROMol::OEDGE_ITER bIt1, end; boost::tie(bIt1, end) = mol.getAtomBonds(atom); while (bIt1 != end) { const Bond *bond1 = mol[*bIt1]; // if this bond connect to a hydrogen and we are not interested // in it ignore if (useHs || bond1->getOtherAtom(atom)->getAtomicNum() != 1) { int bid1 = bond1->getIdx(); if (nbrs.find(bid1) == nbrs.end()) { INT_VECT nlst; nbrs[bid1] = nlst; } ROMol::OEDGE_ITER bIt2 = mol.getAtomBonds(atom).first; while (bIt2 != end) { const Bond *bond2 = mol[*bIt2]; int bid2 = bond2->getIdx(); if (bid1 != bid2 && (useHs || bond2->getOtherAtom(atom)->getAtomicNum() != 1)) { nbrs[bid1].push_back(bid2); // FIX: pathListType should probably // be container of pointers ?? } ++bIt2; } } ++bIt1; } } } } // Everything passed here by reference void recurseWalk( INT_INT_VECT_MAP &nbrs, // neighbors for each bond PATH_TYPE &spath, // the current path to be build upon INT_VECT &cands, // neighbors of current path unsigned int targetLen, // the maximum subgraph len we are interested in boost::dynamic_bitset<> forbidden, // bonds that have been covered already // we don't want reference passing for forbidden, // it gets altered through the processand we want // fresh start every time we buble back up to "FindAllSubGraphs" PATH_LIST &res // the final list of subgraphs ) { // end case for recursion if (spath.size() == targetLen) { res.push_back(spath); return; } // if the path is already bigger than target length don't do anything if (spath.size() > targetLen) { return; } // we have the candidates that can be used to add to the existing path // try extending the subgraphs while (cands.size() != 0) { int next = cands.back(); // start with the last one in the candidate list cands.pop_back(); // cands.erase(remove(cands.begin(), cands.end(), next), cands.end()); if (!forbidden[next]) { // this bond should not appear in the later subgraphs forbidden[next] = 1; // update a local stack before the next recursive call INT_VECT tstack = cands; for (int &bid : nbrs[next]) { if (!forbidden[bid]) { tstack.push_back(bid); } } PATH_TYPE tpath = spath; tpath.push_back(next); recurseWalk(nbrs, tpath, tstack, targetLen, forbidden, res); } } } // Everything passed here by reference void recurseWalkRange( INT_INT_VECT_MAP &nbrs, // neighbors for each bond PATH_TYPE &spath, // the current path to be build upon INT_VECT &cands, // neighbors of current path unsigned int lowerLen, // lower limit of the subgraph lengths we are interested in unsigned int upperLen, // the maximum subgraph len we are interested in boost::dynamic_bitset<> forbidden, // bonds that have been covered already // we don't want reference passing for forbidden, // it gets altered through the processand we want // fresh start every time we buble back up to "FindAllSubGraphs" INT_PATH_LIST_MAP &res // the final list of subgraphs ) { unsigned int nsize = spath.size(); if ((nsize >= lowerLen) && (nsize <= upperLen)) { // if (res.find(nsize) == res.end()) { // PATH_LIST ordern; // res[nsize] = ordern; //} res[nsize].push_back(spath); } // end case for recursion if (nsize == upperLen) { return; } // if the path is already bigger than desired size if (nsize > upperLen) { return; } // we have the candidates that can be used to add to the existing path // try extending the subgraphs while (cands.size() != 0) { int next = cands.back(); // start with the last one in the candidate list cands.pop_back(); // cands.erase(remove(cands.begin(), cands.end(), next), cands.end()); if (!forbidden[next]) { // this bond should not appear in the later subgraphs forbidden[next] = 1; // update a local stack before the next recursive call INT_VECT tstack = cands; for (int &bid : nbrs[next]) { if (!forbidden[bid]) { tstack.push_back(bid); } } PATH_TYPE tpath = spath; tpath.push_back(next); recurseWalkRange(nbrs, tpath, tstack, lowerLen, upperLen, forbidden, res); } } } PATH_LIST extendPaths(int *adjMat, unsigned int dim, const PATH_LIST &paths, int allowRingClosures = -1, double *distMat = nullptr) { PRECONDITION(adjMat, "no matrix"); // // extend each of the currently active paths by adding // a single adjacent index to the end of each // PATH_LIST res; PATH_LIST::const_iterator path; for (path = paths.begin(); path != paths.end(); ++path) { unsigned int endIdx = (*path)[path->size() - 1]; unsigned int iTab = endIdx * dim; for (unsigned int otherIdx = 0; otherIdx < dim; otherIdx++) { if (adjMat[iTab + otherIdx] == 1) { if (distMat && distMat[path->front() * dim + otherIdx] - path->size() < -0.001) { continue; } // test 1: make sure the new atom is not already // in the path auto loc = std::find(path->begin(), path->end(), static_cast(otherIdx)); // The two conditions for adding the atom are: // 1) it's not there already // 2) it's there, but ring closures are allowed and this // will be the last addition to the path. if (loc == path->end()) { // the easy case // PATH_TYPE newPath=*path; // newPath.push_back(otherIdx); // res.push_back(newPath); res.push_back(*path); res.rbegin()->push_back(otherIdx); } else if (allowRingClosures > 2 && static_cast(path->size()) == allowRingClosures - 1) { // We *might* be adding the atom, but we need to make sure // that we're not just duplicating the second to last // element of the path: auto rIt = path->rbegin(); rIt++; if (*rIt != static_cast(otherIdx)) { // PATH_TYPE newPath=*path; // newPath.push_back(otherIdx); // res.push_back(newPath); res.push_back(*path); res.rbegin()->push_back(otherIdx); } } } } } return res; } INT_PATH_LIST_MAP pathFinderHelper(int *adjMat, unsigned int dim, unsigned int minLen, unsigned int maxLen, int rootedAtAtom, double *distMat) { PRECONDITION(adjMat, "no matrix"); PRECONDITION(minLen <= maxLen, "bad lengths provided"); // finds all paths of length N using an adjacency matrix, // which is constructed elsewhere INT_PATH_LIST_MAP res; PATH_LIST paths; paths.clear(); if (rootedAtAtom < 0) { // start a path at each possible index for (unsigned int i = 0; i < dim; i++) { PATH_TYPE tPath; tPath.push_back(i); paths.push_back(tPath); } } else if (rootedAtAtom < static_cast(dim)) { // only start a path at the atom of interest: PATH_TYPE tPath; tPath.push_back(rootedAtAtom); paths.push_back(tPath); } else { return res; } // and build them up one index at a time: for (unsigned int length = 1; length < maxLen; length++) { // extend each path: if (length >= minLen) { res[length] = paths; } paths = extendPaths(adjMat, dim, paths, maxLen, distMat); } res[maxLen] = paths; return res; } } // namespace Subgraphs PATH_LIST findAllSubgraphsOfLengthN(const ROMol &mol, unsigned int targetLen, bool useHs, int rootedAtAtom) { /********************************************* FIX: Lots of issues here: - pathListType is defined as a container of "pathType", should it be a container of "pointers to pathtype" - to make few things clear it might be useful to typedef a "subgraphListType" even if it is exactly same as the "pathListType", just to not confuse between path vs. subgraph definitions - To make it consistent with the python version of this function in "subgraph.py" it return a "list of paths" instead of a "list of list of paths" (see "GetPathsUpTolength" in "molgraphs.cpp") ****************************************************************************/ boost::dynamic_bitset<> forbidden(mol.getNumBonds()); // this should be the only dependence on mol object: INT_INT_VECT_MAP nbrs; Subgraphs::getNbrsList(mol, useHs, nbrs); // Start path at each bond PATH_LIST res; // start paths at each bond: for (auto nbi = nbrs.begin(); nbi != nbrs.end(); ++nbi) { // don't come back to this bond in the later subgraphs int i = (*nbi).first; // if we're only returning paths rooted at a particular atom, check now // that this bond involves that atom: if (rootedAtAtom >= 0 && mol.getBondWithIdx(i)->getBeginAtomIdx() != static_cast(rootedAtAtom) && mol.getBondWithIdx(i)->getEndAtomIdx() != static_cast(rootedAtAtom)) { continue; } if (forbidden[i]) { continue; } forbidden[i] = 1; // start the recursive path building with the current bond PATH_TYPE spath; spath.clear(); spath.push_back(i); // neighbors of this bond are the next candidates INT_VECT cands = nbrs[i]; // now call the recursive function // little bit different from the python version // the result list of paths is passed as a reference, instead of on the fly // appending Subgraphs::recurseWalk(nbrs, spath, cands, targetLen, forbidden, res); } nbrs.clear(); return res; } INT_PATH_LIST_MAP findAllSubgraphsOfLengthsMtoN(const ROMol &mol, unsigned int lowerLen, unsigned int upperLen, bool useHs, int rootedAtAtom) { PRECONDITION(lowerLen <= upperLen, ""); boost::dynamic_bitset<> forbidden(mol.getNumBonds()); INT_INT_VECT_MAP nbrs; Subgraphs::getNbrsList(mol, useHs, nbrs); // Start path at each bond INT_PATH_LIST_MAP res; for (unsigned int idx = lowerLen; idx <= upperLen; idx++) { PATH_LIST ordern; res[idx] = ordern; } // start paths at each bond: for (auto nbi = nbrs.begin(); nbi != nbrs.end(); nbi++) { int i = (*nbi).first; // if we're only returning paths rooted at a particular atom, check now // that this bond involves that atom: if (rootedAtAtom >= 0 && mol.getBondWithIdx(i)->getBeginAtomIdx() != static_cast(rootedAtAtom) && mol.getBondWithIdx(i)->getEndAtomIdx() != static_cast(rootedAtAtom)) { continue; } // don't come back to this bond in the later subgraphs if (forbidden[i]) { continue; } forbidden[i] = 1; // start the recursive path building with the current bond PATH_TYPE spath; spath.clear(); spath.push_back(i); // neighbors of this bond are the next candidates INT_VECT cands = nbrs[i]; // now call the recursive function // little bit different from the python version // the result list of paths is passed as a reference, instead of on the fly // appending Subgraphs::recurseWalkRange(nbrs, spath, cands, lowerLen, upperLen, forbidden, res); } nbrs.clear(); return res; // FIX : need some verbose testing code here } PATH_LIST findUniqueSubgraphsOfLengthN(const ROMol &mol, unsigned int targetLen, bool useHs, bool useBO, int rootedAtAtom) { // start by finding all subgraphs, then uniquify PATH_LIST allSubgraphs = findAllSubgraphsOfLengthN(mol, targetLen, useHs, rootedAtAtom); PATH_LIST res = Subgraphs::uniquifyPaths(mol, allSubgraphs, useBO); return res; } // ---------------------------------------------- // // You may find yourself wondering: "what's the difference between // a subgraph and path?" Well, let me tell you: there's a big // difference! // // Subgraphs are potentially branched, whereas paths (in our // terminology at least) cannot be. So, the following graph: // // C--0--C--1--C--3--C // | // 2 // | // C // has 3 subgraphs of length 3: (0,1,2),(0,1,3),(2,1,3) // but only 2 paths of length 3: (0,1,3),(2,1,3) // // // ---------------------------------------------- // // Args: // // adjMat: the adjacency matrix // dim: number of rows in the adjacency matrix // paths: paths to be extended // allowRingClosures: if > 2, paths will be allowed where the final // point is a duplicate (closes rings). In this case, // allowRingClosures should equal the target path length. // INT_PATH_LIST_MAP findAllPathsOfLengthsMtoN(const ROMol &mol, unsigned int lowerLen, unsigned int upperLen, bool useBonds, bool useHs, int rootedAtAtom, bool onlyShortestPaths) { // // We can't be clever here and just use the bond adjacency matrix // to solve this problem when useBonds is true. This is because // the bond adjacency matrices for the molecules C1CC1 and CC(C)C // are indistinguishable. In the second case, t-butane (and // anything else with a T junction), we'll get some subgraphs mixed // in with the paths. So we have to construct paths of atoms and // then convert them into bond paths. // PRECONDITION(lowerLen <= upperLen, ""); // the molecule owns the distance matrix pointer (if we need to get it) double *distMat = onlyShortestPaths ? MolOps::getDistanceMat(mol) : nullptr; int *adjMat, dim; dim = mol.getNumAtoms(); adjMat = new int[dim * dim]; memset((void *)adjMat, 0, dim * dim * sizeof(int)); if (!distMat) { // generate the adjacency matrix by hand by looping over the bonds for (const auto bond : mol.bonds()) { Atom *beg = bond->getBeginAtom(); Atom *end = bond->getEndAtom(); // check for H, which we might be skipping if (useHs || (beg->getAtomicNum() != 1 && end->getAtomicNum() != 1)) { adjMat[beg->getIdx() * dim + end->getIdx()] = 1; adjMat[end->getIdx() * dim + beg->getIdx()] = 1; } } } else { // if we have the distance matrix, we can just loop over that: for (auto i = 0; i < dim; ++i) { for (auto j = i + 1; j < dim; ++j) { if (fabs(distMat[i * dim + j] - 1) < 1e-4) { adjMat[i * dim + j] = 1; adjMat[j * dim + i] = 1; } } } } // if we're using bonds, we'll need to find paths of length N+1, // then convert them if (useBonds) { ++lowerLen; ++upperLen; } // find the paths themselves INT_PATH_LIST_MAP atomPaths = Subgraphs::pathFinderHelper( adjMat, dim, lowerLen, upperLen, rootedAtAtom, distMat); // clean up the adjacency matrix delete[] adjMat; INT_PATH_LIST_MAP res; // //-------------------------------------------------------- // loop through all the paths we have and make sure that there are // no duplicates (duplicate = contains identical bond indices) // // We need to use the bond paths for this duplicate finding // because, in rings, there can be many paths which share atom // indices but which have different bond compositions. For example, // there is only one "atom unique" path of length 5 bonds (6 atoms) // through a 6-ring, but there are six bond paths. // if (!useBonds && lowerLen >= 1) { res[1] = atomPaths[1]; } if (useBonds || upperLen > 1) { for (unsigned int i = lowerLen; i <= upperLen; ++i) { if (i <= 1) { continue; } std::vector> invars; for (PATH_LIST::const_iterator vivI = atomPaths[i].begin(); vivI != atomPaths[i].end(); ++vivI) { boost::dynamic_bitset<> invar(mol.getNumBonds()); const PATH_TYPE &resi = *vivI; PATH_TYPE locV; locV.reserve(i); for (unsigned int j = 0; j < i - 1; j++) { const Bond *bond = mol.getBondBetweenAtoms(resi[j], resi[j + 1]); locV.push_back(bond->getIdx()); invar.set(bond->getIdx()); } if (std::find(invars.begin(), invars.end(), invar) == invars.end()) { invars.push_back(invar); if (useBonds) { res[i - 1].push_back(locV); } else { res[i].push_back(resi); } } } } } return res; } PATH_LIST findAllPathsOfLengthN(const ROMol &mol, unsigned int targetLen, bool useBonds, bool useHs, int rootedAtAtom, bool onlyShortestPaths) { return findAllPathsOfLengthsMtoN(mol, targetLen, targetLen, useBonds, useHs, rootedAtAtom, onlyShortestPaths)[targetLen]; } PATH_TYPE findAtomEnvironmentOfRadiusN( const ROMol &mol, unsigned int radius, unsigned int rootedAtAtom, bool useHs, bool enforceSize, std::unordered_map *atomMap) { if (rootedAtAtom >= mol.getNumAtoms()) { throw ValueErrorException("bad atom index"); } if (atomMap) { atomMap->clear(); } PATH_TYPE res; if (atomMap) { (*atomMap)[rootedAtAtom] = 0; } if (radius == 0) { return res; } // Return empty path if radius=0 std::list> nbrStack; ROMol::OEDGE_ITER beg, end; boost::tie(beg, end) = mol.getAtomBonds(mol.getAtomWithIdx(rootedAtAtom)); while (beg != end) { const Bond *bond = mol[*beg]; if (useHs || mol.getAtomWithIdx(bond->getOtherAtomIdx(rootedAtAtom)) ->getAtomicNum() != 1) { nbrStack.emplace_back(rootedAtAtom, bond->getIdx()); } ++beg; } boost::dynamic_bitset<> bondsIn(mol.getNumBonds()); unsigned int i; for (i = 0; i < radius; ++i) { if (nbrStack.empty()) { break; } std::list> nextLayer; while (!nbrStack.empty()) { int bondIdx, startAtom; boost::tie(startAtom, bondIdx) = nbrStack.front(); nbrStack.pop_front(); if (!bondsIn.test(bondIdx)) { bondsIn.set(bondIdx); res.push_back(bondIdx); // add the next set of neighbors: int oAtom = mol.getBondWithIdx(bondIdx)->getOtherAtomIdx(startAtom); if (atomMap) { if (atomMap->find(oAtom) == atomMap->end()) { (*atomMap)[oAtom] = i + 1; } else { (*atomMap)[oAtom] = std::min(atomMap->at(oAtom), i + 1); } } // if we're going to do another iteration, then push the neighbors from // this round onto the stack if (i < radius - 1) { for (const auto bond : mol.atomBonds(mol.getAtomWithIdx(oAtom))) { if (!bondsIn.test(bond->getIdx())) { if (useHs || mol.getAtomWithIdx(bond->getOtherAtomIdx(oAtom)) ->getAtomicNum() != 1) { nextLayer.emplace_back(oAtom, bond->getIdx()); } } } } } } nbrStack = std::move(nextLayer); } if (i != radius && enforceSize) { // If there are no paths found with the requested radius, user can choose // whether or not to return nothing in this case. If enforceSize=true, this // is similar to the previous bahviour (return an empty path/vector). // Otherwise, it collect every path within the requested radius. This is // similar to maxPath(mol, res) <= radius. res.clear(); res.resize(0); if (atomMap) { atomMap->clear(); } } return res; } } // namespace RDKit