rdkit/Code/GraphMol/Subgraphs/SubgraphUtils.cpp

// $Id$
//
//  Copyright (C) 2003-2013 Greg Landrum and Rational Discovery LLC
//
//   @@ All Rights Reserved @@
//  This file is part of the RDKit.
//  The contents are covered by the terms of the BSD license
//  which is included in the file license.txt, found at the root
//  of the RDKit source tree.
//
#include "SubgraphUtils.h"
#include "Subgraphs.h"
#include <RDGeneral/utils.h>
#include <GraphMol/RDKitBase.h>
#include <GraphMol/Subset.h>
#include <GraphMol/QueryAtom.h>
#include <GraphMol/QueryBond.h>
#include <algorithm>
#include <iostream>
#include <map>
#include <RDGeneral/hash/hash.hpp>

namespace RDKit {
namespace Subgraphs {
ROMol *pathToSubmol(const ROMol &mol, const PATH_TYPE &path, bool useQuery) {
  INT_MAP_INT aIdxMap;
  return pathToSubmol(mol, path, useQuery, aIdxMap);
}

ROMol *pathToSubmol(const ROMol &mol, const PATH_TYPE &path, bool useQuery,
                    INT_MAP_INT &atomIdxMap) {
  SubsetOptions options;
  options.copyAsQuery = useQuery;
  options.copyCoordinates = true;
  options.method = SubsetMethod::BONDS;

  std::vector<unsigned int> upath{path.begin(), path.end()};
  SubsetInfo subsetInfo;
  auto res = copyMolSubset(mol, upath, subsetInfo, options);
  atomIdxMap.clear();
  for (auto mapping : subsetInfo.atomMapping) {
    atomIdxMap[mapping.first] = mapping.second;
  }

  return res.release();
}

PATH_TYPE bondListFromAtomList(const ROMol &mol, const PATH_TYPE &atomIds) {
  PATH_TYPE bids;
  unsigned int natms = atomIds.size();
  if (natms <= 1) {
    return bids;  // FIX: should probably throw an exception
  }
  for (unsigned int i = 0; i < natms; i++) {
    for (unsigned int j = i + 1; j < natms; j++) {
      const Bond *bnd = mol.getBondBetweenAtoms(atomIds[i], atomIds[j]);
      if (bnd) {
        int bid = bnd->getIdx();
        bids.push_back(bid);
      }
    }
  }
  return bids;
}

using std::int32_t;
using std::uint32_t;
DiscrimTuple calcPathDiscriminators(const ROMol &mol, const PATH_TYPE &path,
                                    bool useBO,
                                    std::vector<std::uint32_t> *extraInvars) {
  if (extraInvars) {
    CHECK_INVARIANT(extraInvars->size() == mol.getNumAtoms(),
                    "bad extra invars");
  }

  // Start by collecting the atoms in the path and their degrees
  std::vector<int32_t> atomsUsed(mol.getNumAtoms(),
                                 -1);  // map from atom index->path index
  std::vector<const Atom *> atoms;     // to contain the atoms in the path
  std::vector<uint32_t> pathDegrees;   // degrees of each atom *in the path*
  for (int pathIter : path) {
    const Bond *bond = mol.getBondWithIdx(pathIter);
    if (atomsUsed[bond->getBeginAtomIdx()] < 0) {
      atomsUsed[bond->getBeginAtomIdx()] = static_cast<int>(atoms.size());
      atoms.push_back(bond->getBeginAtom());
      pathDegrees.push_back(1);
    } else {
      pathDegrees[atomsUsed[bond->getBeginAtomIdx()]] += 1;
    }
    if (atomsUsed[bond->getEndAtomIdx()] < 0) {
      atomsUsed[bond->getEndAtomIdx()] = static_cast<int>(atoms.size());
      atoms.push_back(bond->getEndAtom());
      pathDegrees.push_back(1);
    } else {
      pathDegrees[atomsUsed[bond->getEndAtomIdx()]] += 1;
    }
  }

  // Calculate the atomic invariants
  unsigned int nAtoms = atoms.size();
  std::vector<uint32_t> invars(nAtoms);
  for (unsigned int i = 0; i < nAtoms; ++i) {
    const Atom *atom = atoms[i];
    uint32_t invar = atom->getAtomicNum();
    gboost::hash_combine(invar, pathDegrees[i]);
    gboost::hash_combine(invar, atom->getFormalCharge());
    int deltaMass = static_cast<int>(
        atom->getMass() -
        PeriodicTable::getTable()->getAtomicWeight(atom->getAtomicNum()));
    gboost::hash_combine(invar, deltaMass);
    if (atom->getIsAromatic()) {
      gboost::hash_combine(invar, 1);
    }
    if (extraInvars) {
      gboost::hash_combine(invar, (*extraInvars)[atom->getIdx()]);
    }
    invars[i] = invar;
  }

  // now do the Morgan iterations:
  // the most number of cycles we need for the atoms on the edges
  // to feel each other is pathSize/2
  // EFF: it may be worth revisiting this at some point to see
  // if the iteration count can be even smaller (and if it
  // makes a difference in runtime)
  unsigned int nCycles = path.size() / 2 + 1;
  gboost::hash<std::vector<uint32_t>> vectHasher;
  for (unsigned int cycle = 0; cycle < nCycles; ++cycle) {
    // let each atom feel it's neighbors:
    std::vector<std::vector<uint32_t>> locInvars(nAtoms);
    for (int pathIter : path) {
      const Bond *bond = mol.getBondWithIdx(pathIter);
      uint32_t v1 = invars[atomsUsed[bond->getBeginAtomIdx()]];
      uint32_t v2 = invars[atomsUsed[bond->getEndAtomIdx()]];
      if (useBO) {
        gboost::hash_combine(v1, static_cast<uint32_t>(bond->getBondType()));
        gboost::hash_combine(v2, static_cast<uint32_t>(bond->getBondType()));
      }
      locInvars[atomsUsed[bond->getBeginAtomIdx()]].push_back(v2);
      locInvars[atomsUsed[bond->getEndAtomIdx()]].push_back(v1);
    }
    // we need to sort by the neighbor invariants to be order
    // independent:
    for (unsigned int i = 0; i < nAtoms; ++i) {
      std::sort(locInvars[i].begin(), locInvars[i].end());
      invars[i] = vectHasher(locInvars[i]);
    }
  }

  // again, a sort for order independence:
  std::sort(invars.begin(), invars.end());
  uint32_t pathInvar = vectHasher(invars);

  // also include the path size (bond count) and number of atoms
  // in the discriminator
  return std::make_tuple(pathInvar, path.size(), nAtoms);
}

//
// This is intended for use on either subgraphs or paths.
//  The entries in PATH_LIST should refer to bonds though (not
//  atoms)
//
PATH_LIST uniquifyPaths(const ROMol &mol, const PATH_LIST &allPaths,
                        bool useBO) {
  PATH_LIST res;
  std::vector<DiscrimTuple> discrimsSeen;
  for (const auto &allPath : allPaths) {
    DiscrimTuple discrims = calcPathDiscriminators(mol, allPath, useBO);
    if (std::find(discrimsSeen.begin(), discrimsSeen.end(), discrims) ==
        discrimsSeen.end()) {
      discrimsSeen.push_back(discrims);
      res.push_back(allPath);
    }
  }
  return res;
}
}  // end of namespace Subgraphs
}  // end of namespace RDKit