rdkit/Code/GraphMol/ChemTransforms/MolFragmenter.cpp

// $Id$
//
//  Copyright (C) 2013 Greg Landrum
//
//   @@ All Rights Reserved @@
//  This file is part of the RDKit.
//  The contents are covered by the terms of the BSD license
//  which is included in the file license.txt, found at the root
//  of the RDKit source tree.
//

#include "MolFragmenter.h"
#include <RDGeneral/utils.h>
#include <RDGeneral/Invariant.h>
#include <RDGeneral/RDLog.h>
#include <RDGeneral/Exceptions.h>
#include <GraphMol/RDKitBase.h>
#include <boost/dynamic_bitset.hpp>
#include <boost/tokenizer.hpp>
#include <boost/algorithm/string/trim.hpp>
#include <boost/algorithm/string.hpp>
#include <boost/cstdint.hpp>
#include <vector>
#include <algorithm>
#include <GraphMol/SmilesParse/SmilesParse.h>
#include <GraphMol/Substruct/SubstructMatch.h>
#include <RDGeneral/StreamOps.h>

#include <RDGeneral/BoostStartInclude.h>
#include <boost/flyweight.hpp>
#include <boost/flyweight/no_tracking.hpp>
#include <boost/functional/hash.hpp>
#include <RDGeneral/BoostEndInclude.h>

#include <sstream>

namespace RDKit {
namespace MolFragmenter {
std::size_t hash_value(const FragmenterBondType &fbt) {
  size_t res = boost::hash<int>()((int)fbt.bondType);
  boost::hash_combine(res, fbt.atom1Label);
  boost::hash_combine(res, fbt.atom2Label);
  return res;
}
bool operator==(const FragmenterBondType &v1, const FragmenterBondType &v2) {
  return (v1.atom1Label == v2.atom1Label) && (v1.atom2Label == v2.atom2Label) &&
         (v1.bondType == v2.bondType);
}
void constructFragmenterAtomTypes(
    std::istream *inStream, std::map<unsigned int, std::string> &defs,
    const std::string &comment, bool validate,
    std::map<unsigned int, ROMOL_SPTR> *environs) {
  PRECONDITION(inStream, "no stream");
  defs.clear();
  unsigned int line = 0;
  while (!inStream->eof()) {
    ++line;
    std::string tempStr = getLine(inStream);
    if (tempStr == "" || tempStr.find(comment) == 0) continue;
    std::vector<std::string> tokens;
    boost::split(tokens, tempStr, boost::is_any_of(" \t"),
                 boost::token_compress_on);
    if (tokens.size() < 2) {
      BOOST_LOG(rdWarningLog) << "line " << line << " is too short"
                              << std::endl;
      continue;
    }
    unsigned int idx = boost::lexical_cast<unsigned int>(tokens[0]);
    if (defs.find(idx) != defs.end()) {
      BOOST_LOG(rdWarningLog)
          << "definition #" << idx
          << " encountered more than once. Using the first occurance."
          << std::endl;
      continue;
    }
    if (validate || environs) {
      ROMol *p = SmartsToMol(tokens[1]);
      if (!p) {
        BOOST_LOG(rdWarningLog) << "cannot convert SMARTS " << tokens[1]
                                << " to molecule at line " << line << std::endl;
        continue;
      }
      if (!environs) {
        delete p;
      } else {
        (*environs)[idx] = ROMOL_SPTR(p);
      }
    }
    defs[idx] = tokens[1];
  }
}
void constructFragmenterAtomTypes(
    const std::string &str, std::map<unsigned int, std::string> &defs,
    const std::string &comment, bool validate,
    std::map<unsigned int, ROMOL_SPTR> *environs) {
  std::stringstream istr(str);
  constructFragmenterAtomTypes(&istr, defs, comment, validate, environs);
}
void constructBRICSAtomTypes(std::map<unsigned int, std::string> &defs,
                             std::map<unsigned int, ROMOL_SPTR> *environs) {
  /*
     After some discussion, the L2 definitions ("N.pl3" in the original
     paper) have been removed and incorporated into a (almost) general
     purpose amine definition in L5 ("N.sp3" in the paper).

     The problem is one of consistency.
        Based on the original definitions you should get the following
        fragmentations:
          C1CCCCC1NC(=O)C -> C1CCCCC1N[2*].[1*]C(=O)C
          c1ccccc1NC(=O)C -> c1ccccc1[16*].[2*]N[2*].[1*]C(=O)C
        This difference just didn't make sense to us. By switching to
        the unified definition we end up with:
          C1CCCCC1NC(=O)C -> C1CCCCC1[15*].[5*]N[5*].[1*]C(=O)C
          c1ccccc1NC(=O)C -> c1ccccc1[16*].[5*]N[5*].[1*]C(=O)C
  */
  const std::string BRICSdefs =
      "1 [C;D3]([#0,#6,#7,#8])(=O)\n\
3 [O;D2]-;!@[#0,#6,#1]\n\
5 [N;!D1;!$(N=*);!$(N-[!#6;!#16;!#0;!#1]);!$([N;R]@[C;R]=O)]\n\
9 [n;+0;$(n(:[c,n,o,s]):[c,n,o,s])]\n\
10 [N;R;$(N(@C(=O))@[C,N,O,S])]\n\
11 [S;D2](-;!@[#0,#6])\n\
12 [S;D4]([#6,#0])(=O)(=O)\n\
6 [C;D3;!R](=O)-;!@[#0,#6,#7,#8]\n\
13 [C;$(C(-;@[C,N,O,S])-;@[N,O,S])]\n\
14 [c;$(c(:[c,n,o,s]):[n,o,s])]\n\
15 [C;$(C(-;@C)-;@C)]\n\
4 [C;!D1;!$(C=*)]-;!@[#6]\n\
7 [C;D2,D3]-[#6]\n\
8 [C;!R;!D1;!$(C!-*)]\n\
16 [c;$(c(:c):c)]";
  constructFragmenterAtomTypes(BRICSdefs, defs, "//", true, environs);
}

void constructFragmenterBondTypes(
    std::istream *inStream,
    const std::map<unsigned int, std::string> &atomTypes,
    std::vector<FragmenterBondType> &defs, const std::string &comment,
    bool validate, bool labelByConnector) {
  PRECONDITION(inStream, "no stream");
  defs.clear();
  defs.resize(0);

  unsigned int line = 0;
  while (!inStream->eof()) {
    ++line;
    std::string tempStr = getLine(inStream);
    if (tempStr == "" || tempStr.find(comment) == 0) continue;
    std::vector<std::string> tokens;
    boost::split(tokens, tempStr, boost::is_any_of(" \t"),
                 boost::token_compress_on);
    if (tokens.size() < 3) {
      BOOST_LOG(rdWarningLog) << "line " << line << " is too short"
                              << std::endl;
      continue;
    }
    unsigned int idx1 = boost::lexical_cast<unsigned int>(tokens[0]);
    if (atomTypes.find(idx1) == atomTypes.end()) {
      BOOST_LOG(rdWarningLog) << "atom type #" << idx1 << " not recognized."
                              << std::endl;
      continue;
    }
    unsigned int idx2 = boost::lexical_cast<unsigned int>(tokens[1]);
    if (atomTypes.find(idx2) == atomTypes.end()) {
      BOOST_LOG(rdWarningLog) << "atom type #" << idx2 << " not recognized."
                              << std::endl;
      continue;
    }
    std::string sma1 = atomTypes.find(idx1)->second;
    std::string sma2 = atomTypes.find(idx2)->second;
    std::string smarts = "[$(" + sma1 + ")]" + tokens[2] + "[$(" + sma2 + ")]";
    ROMol *p = SmartsToMol(smarts);
    if (validate) {
      if (!p) {
        BOOST_LOG(rdWarningLog) << "cannot convert SMARTS " << smarts
                                << " to molecule at line " << line << std::endl;
        continue;
      }
    }
    FragmenterBondType fbt;
    fbt.atom1Type = idx1;
    fbt.atom2Type = idx2;
    if (labelByConnector) {
      fbt.atom1Label = idx1;
      fbt.atom2Label = idx2;
    } else {
      fbt.atom1Label = idx2;
      fbt.atom2Label = idx1;
    }
    if (p) {
      // for the purposes of replacing the bond, we'll use just the first
      // character to set the bond type (if we recognize it):
      switch (tokens[2][0]) {
        case '-':
          fbt.bondType = Bond::SINGLE;
          break;
        case '=':
          fbt.bondType = Bond::DOUBLE;
          break;
        case '#':
          fbt.bondType = Bond::TRIPLE;
          break;
        case ':':
          fbt.bondType = Bond::AROMATIC;
          break;
        default:
          fbt.bondType = p->getBondWithIdx(0)->getBondType();
      }
      fbt.query = ROMOL_SPTR(p);
    } else {
      fbt.bondType = Bond::UNSPECIFIED;
      fbt.query = ROMOL_SPTR();
    }
    defs.push_back(fbt);
  }
}

void constructFragmenterBondTypes(
    const std::string &str,
    const std::map<unsigned int, std::string> &atomTypes,
    std::vector<FragmenterBondType> &defs, const std::string &comment,
    bool validate, bool labelByConnector) {
  std::stringstream istr(str);
  constructFragmenterBondTypes(&istr, atomTypes, defs, comment, validate,
                               labelByConnector);
}
void constructBRICSBondTypes(std::vector<FragmenterBondType> &defs) {
  const std::string BRICSdefs =
      "// L1\n\
1 3 -;!@\n\
1 5 -;!@\n\
1 10 -;!@\n\
// L3 \n\
3 4 -;!@\n\
3 13 -;!@\n\
3 14 -;!@\n\
3 15 -;!@\n\
3 16 -;!@\n\
// L4\n\
4 5 -;!@\n\
4 11 -;!@\n\
// L5\n\
5 12 -;!@\n\
5 14 -;!@\n\
5 16 -;!@\n\
5 13 -;!@\n\
5 15 -;!@\n\
// L6\n\
6 13 -;!@\n\
6 14 -;!@\n\
6 15 -;!@\n\
6 16 -;!@\n\
// L7\n\
7 7 =;!@\n\
// L8\n\
8 9 -;!@\n\
8 10 -;!@\n\
8 13 -;!@\n\
8 14 -;!@\n\
8 15 -;!@\n\
8 16 -;!@\n\
// L9\n\
9 13 -;!@ // not in original paper\n\
9 14 -;!@ // not in original paper\n\
9 15 -;!@\n\
9 16 -;!@\n\
// L10\n\
10 13 -;!@\n\
10 14 -;!@\n\
10 15 -;!@\n\
10 16 -;!@\n\
// L11\n\
11 13 -;!@\n\
11 14 -;!@\n\
11 15 -;!@\n\
11 16 -;!@\n\
// L12\n\
// none left\n\
// L13\n\
13 14 -;!@\n\
13 15 -;!@\n\
13 16 -;!@\n\
// L14\n\
14 14 -;!@ // not in original paper\n\
14 15 -;!@\n\
14 16 -;!@\n\
// L15\n\
15 16 -;!@\n\
// L16\n\
16 16 -;!@ // not in original paper";
  std::map<unsigned int, std::string> atTypes;
  constructBRICSAtomTypes(atTypes);
  constructFragmenterBondTypes(BRICSdefs, atTypes, defs, "//", true, false);
}

namespace {
boost::uint64_t nextBitCombo(boost::uint64_t v) {
  // code from:
  // http://graphics.stanford.edu/~seander/bithacks.html#NextBitPermutation
  boost::uint64_t t = (v | (v - 1)) + 1;
  return t | ((((t & -t) / (v & -v)) >> 1) - 1);
}
}

void fragmentOnSomeBonds(
    const ROMol &mol, const std::vector<unsigned int> &bondIndices,
    std::vector<ROMOL_SPTR> &resMols, unsigned int maxToCut, bool addDummies,
    const std::vector<std::pair<unsigned int, unsigned int> > *dummyLabels,
    const std::vector<Bond::BondType> *bondTypes,
    std::vector<std::vector<unsigned int> > *nCutsPerAtom) {
  PRECONDITION((!dummyLabels || dummyLabels->size() == bondIndices.size()),
               "bad dummyLabel vector");
  PRECONDITION((!bondTypes || bondTypes->size() == bondIndices.size()),
               "bad bondType vector");
  if (bondIndices.size() > 63)
    throw ValueErrorException("currently can only fragment on up to 63 bonds");
  if (!maxToCut || !mol.getNumAtoms() || !bondIndices.size()) return;

  boost::uint64_t state = (0x1 << maxToCut) - 1;
  boost::uint64_t stop = 0x1 << bondIndices.size();
  std::vector<unsigned int> fragmentHere(maxToCut);
  std::vector<std::pair<unsigned int, unsigned int> > *dummyLabelsHere = NULL;
  if (dummyLabels) {
    dummyLabelsHere =
        new std::vector<std::pair<unsigned int, unsigned int> >(maxToCut);
  }
  std::vector<Bond::BondType> *bondTypesHere = NULL;
  if (bondTypes) {
    bondTypesHere = new std::vector<Bond::BondType>(maxToCut);
  }
  while (state < stop) {
    unsigned int nSeen = 0;
    for (unsigned int i = 0; i < bondIndices.size() && nSeen < maxToCut; ++i) {
      if (state & (0x1 << i)) {
        fragmentHere[nSeen] = bondIndices[i];
        if (dummyLabelsHere) (*dummyLabelsHere)[nSeen] = (*dummyLabels)[i];
        if (bondTypesHere) (*bondTypesHere)[nSeen] = (*bondTypes)[i];
        ++nSeen;
      }
    }
    std::vector<unsigned int> *lCutsPerAtom = 0;
    if (nCutsPerAtom) {
      nCutsPerAtom->push_back(std::vector<unsigned int>(mol.getNumAtoms()));
      lCutsPerAtom = &(nCutsPerAtom->back());
    }
    ROMol *nm = fragmentOnBonds(mol, fragmentHere, addDummies, dummyLabelsHere,
                                bondTypesHere, lCutsPerAtom);
    resMols.push_back(ROMOL_SPTR(nm));

    state = nextBitCombo(state);
  }
  delete dummyLabelsHere;
  delete bondTypesHere;
}

namespace {
void checkChiralityPostMove(const ROMol &mol, const Atom *oAt, Atom *nAt,
                            const Bond *bond) {
  INT_LIST newOrder;
  ROMol::OEDGE_ITER beg, end;
  boost::tie(beg, end) = mol.getAtomBonds(oAt);
  while (beg != end) {
    const BOND_SPTR obond = mol[*beg];
    ++beg;
    if (obond.get() == bond) {
      continue;
    }
    newOrder.push_back(obond->getIdx());
  }
  newOrder.push_back(bond->getIdx());
  unsigned int nSwaps = oAt->getPerturbationOrder(newOrder);
  if (nSwaps % 2) nAt->invertChirality();
}
}

ROMol *fragmentOnBonds(
    const ROMol &mol, const std::vector<unsigned int> &bondIndices,
    bool addDummies,
    const std::vector<std::pair<unsigned int, unsigned int> > *dummyLabels,
    const std::vector<Bond::BondType> *bondTypes,
    std::vector<unsigned int> *nCutsPerAtom) {
  PRECONDITION((!dummyLabels || dummyLabels->size() == bondIndices.size()),
               "bad dummyLabel vector");
  PRECONDITION((!bondTypes || bondTypes->size() == bondIndices.size()),
               "bad bondType vector");
  PRECONDITION((!nCutsPerAtom || nCutsPerAtom->size() == mol.getNumAtoms()),
               "bad nCutsPerAtom vector");
  if (nCutsPerAtom) {
    BOOST_FOREACH (unsigned int &nCuts, *nCutsPerAtom) { nCuts = 0; }
  }
  RWMol *res = new RWMol(mol);
  if (!mol.getNumAtoms()) return res;

  std::vector<Bond *> bondsToRemove;
  bondsToRemove.reserve(bondIndices.size());
  BOOST_FOREACH (unsigned int bondIdx, bondIndices) {
    bondsToRemove.push_back(res->getBondWithIdx(bondIdx));
  }
  for (unsigned int i = 0; i < bondsToRemove.size(); ++i) {
    const Bond *bond = bondsToRemove[i];
    unsigned int bidx = bond->getBeginAtomIdx();
    unsigned int eidx = bond->getEndAtomIdx();
    Bond::BondType bT = bond->getBondType();
    res->removeBond(bidx, eidx);
    if (nCutsPerAtom) {
      (*nCutsPerAtom)[bidx] += 1;
      (*nCutsPerAtom)[eidx] += 1;
    }
    if (addDummies) {
      Atom *at1, *at2;
      at1 = new Atom(0);
      at2 = new Atom(0);
      if (dummyLabels) {
        at1->setIsotope((*dummyLabels)[i].first);
        at2->setIsotope((*dummyLabels)[i].second);
      } else {
        at1->setIsotope(bidx);
        at2->setIsotope(eidx);
      }
      unsigned int idx1 = res->addAtom(at1, false, true);
      if (bondTypes) bT = (*bondTypes)[i];
      res->addBond(eidx, at1->getIdx(), bT);
      unsigned int idx2 = res->addAtom(at2, false, true);
      res->addBond(bidx, at2->getIdx(), bT);

      // figure out if we need to change the stereo tags on the atoms:
      if (mol.getAtomWithIdx(bidx)->getChiralTag() ==
              Atom::CHI_TETRAHEDRAL_CCW ||
          mol.getAtomWithIdx(bidx)->getChiralTag() ==
              Atom::CHI_TETRAHEDRAL_CW) {
        checkChiralityPostMove(mol, mol.getAtomWithIdx(bidx),
                               res->getAtomWithIdx(bidx),
                               mol.getBondBetweenAtoms(bidx, eidx));
      }
      if (mol.getAtomWithIdx(eidx)->getChiralTag() ==
              Atom::CHI_TETRAHEDRAL_CCW ||
          mol.getAtomWithIdx(eidx)->getChiralTag() ==
              Atom::CHI_TETRAHEDRAL_CW) {
        checkChiralityPostMove(mol, mol.getAtomWithIdx(eidx),
                               res->getAtomWithIdx(eidx),
                               mol.getBondBetweenAtoms(bidx, eidx));
      }

      for (ROMol::ConformerIterator confIt = res->beginConformers();
           confIt != res->endConformers(); ++confIt) {
        Conformer *conf = (*confIt).get();
        conf->setAtomPos(idx1, conf->getAtomPos(bidx));
        conf->setAtomPos(idx2, conf->getAtomPos(eidx));
      }
    } else {
      // was github issue 429
      Atom *tatom = res->getAtomWithIdx(bidx);
      if (tatom->getIsAromatic() && tatom->getAtomicNum() != 6) {
        tatom->setNumExplicitHs(tatom->getNumExplicitHs() + 1);
      }
      tatom = res->getAtomWithIdx(eidx);
      if (tatom->getIsAromatic() && tatom->getAtomicNum() != 6) {
        tatom->setNumExplicitHs(tatom->getNumExplicitHs() + 1);
      }
    }
  }
  res->clearComputedProps();
  return static_cast<ROMol *>(res);
}

ROMol *fragmentOnBonds(const ROMol &mol,
                       const std::vector<FragmenterBondType> &bondPatterns,
                       const std::map<unsigned int, ROMOL_SPTR> *atomEnvirons,
                       std::vector<unsigned int> *nCutsPerAtom) {
  PRECONDITION((!nCutsPerAtom || nCutsPerAtom->size() == mol.getNumAtoms()),
               "bad nCutsPerAtom vector");
  std::vector<unsigned int> bondIndices;
  std::vector<std::pair<unsigned int, unsigned int> > dummyLabels;
  std::vector<Bond::BondType> bondTypes;

  std::map<unsigned int, bool> environsMatch;
  if (atomEnvirons) {
    for (std::map<unsigned int, ROMOL_SPTR>::const_iterator iter =
             atomEnvirons->begin();
         iter != atomEnvirons->end(); ++iter) {
      MatchVectType mv;
      environsMatch[iter->first] = SubstructMatch(mol, *(iter->second), mv);
    }
  }

  boost::dynamic_bitset<> bondsUsed(mol.getNumBonds(), 0);
  // the bond definitions are organized (more or less) general -> specific, so
  // loop
  // over them backwards
  BOOST_REVERSE_FOREACH(const FragmenterBondType &fbt, bondPatterns) {
    if (fbt.query->getNumAtoms() != 2 || fbt.query->getNumBonds() != 1) {
      BOOST_LOG(rdErrorLog)
          << "fragmentation queries must have 2 atoms and 1 bond" << std::endl;
      continue;
    }
    if (atomEnvirons &&
        (!environsMatch[fbt.atom1Type] || !environsMatch[fbt.atom2Type]))
      continue;
    // std::cerr<<"  >>> "<<fbt.atom1Label<<" "<<fbt.atom2Label<<std::endl;
    std::vector<MatchVectType> bondMatches;
    SubstructMatch(mol, *fbt.query.get(), bondMatches);
    BOOST_FOREACH (const MatchVectType &mv, bondMatches) {
      const Bond *bond = mol.getBondBetweenAtoms(mv[0].second, mv[1].second);
      // std::cerr<<"          "<<bond->getIdx()<<std::endl;
      TEST_ASSERT(bond);
      if (bondsUsed[bond->getIdx()]) {
        // BOOST_LOG(rdWarningLog)<<"bond #"<<bond->getIdx()<<" matched multiple
        // times in decomposition. Later matches ignored."<<std::endl;
        continue;
      }
      bondsUsed.set(bond->getIdx());
      bondIndices.push_back(bond->getIdx());
      if (bond->getBeginAtomIdx() == static_cast<unsigned int>(mv[0].second)) {
        dummyLabels.push_back(std::make_pair(fbt.atom1Label, fbt.atom2Label));
      } else {
        dummyLabels.push_back(std::make_pair(fbt.atom2Label, fbt.atom1Label));
      }
      bondTypes.push_back(fbt.bondType);
    }
  }
  return fragmentOnBonds(mol, bondIndices, true, &dummyLabels, &bondTypes,
                         nCutsPerAtom);
}

boost::flyweight<std::vector<FragmenterBondType>,
                 boost::flyweights::no_tracking>
    bondPatterns;
boost::flyweight<std::map<unsigned int, ROMOL_SPTR>,
                 boost::flyweights::no_tracking>
    atomEnvs;
ROMol *fragmentOnBRICSBonds(const ROMol &mol) {
  if (bondPatterns.get().size() == 0) {
    std::map<unsigned int, std::string> adefs;
    std::map<unsigned int, ROMOL_SPTR> aenvs;
    constructBRICSAtomTypes(adefs, &aenvs);
    atomEnvs = aenvs;
    std::vector<FragmenterBondType> tbondPatterns;
    constructBRICSBondTypes(tbondPatterns);
    bondPatterns = tbondPatterns;
  }
  return fragmentOnBonds(mol, bondPatterns, &(atomEnvs.get()));
}

}  // end of namespace MolFragmenter
}  // end of namespace RDKit