// $Id$ // // Copyright (C) 2013 Greg Landrum // // @@ All Rights Reserved @@ // This file is part of the RDKit. // The contents are covered by the terms of the BSD license // which is included in the file license.txt, found at the root // of the RDKit source tree. // #include "MolFragmenter.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include namespace RDKit { namespace MolFragmenter { std::size_t hash_value(const FragmenterBondType &fbt) { size_t res = boost::hash()((int)fbt.bondType); boost::hash_combine(res, fbt.atom1Label); boost::hash_combine(res, fbt.atom2Label); return res; } bool operator==(const FragmenterBondType &v1, const FragmenterBondType &v2) { return (v1.atom1Label == v2.atom1Label) && (v1.atom2Label == v2.atom2Label) && (v1.bondType == v2.bondType); } void constructFragmenterAtomTypes( std::istream *inStream, std::map &defs, const std::string &comment, bool validate, std::map *environs) { PRECONDITION(inStream, "no stream"); defs.clear(); unsigned int line = 0; while (!inStream->eof()) { ++line; std::string tempStr = getLine(inStream); if (tempStr == "" || tempStr.find(comment) == 0) continue; std::vector tokens; boost::split(tokens, tempStr, boost::is_any_of(" \t"), boost::token_compress_on); if (tokens.size() < 2) { BOOST_LOG(rdWarningLog) << "line " << line << " is too short" << std::endl; continue; } unsigned int idx = boost::lexical_cast(tokens[0]); if (defs.find(idx) != defs.end()) { BOOST_LOG(rdWarningLog) << "definition #" << idx << " encountered more than once. Using the first occurance." << std::endl; continue; } if (validate || environs) { ROMol *p = SmartsToMol(tokens[1]); if (!p) { BOOST_LOG(rdWarningLog) << "cannot convert SMARTS " << tokens[1] << " to molecule at line " << line << std::endl; continue; } if (!environs) { delete p; } else { (*environs)[idx] = ROMOL_SPTR(p); } } defs[idx] = tokens[1]; } } void constructFragmenterAtomTypes( const std::string &str, std::map &defs, const std::string &comment, bool validate, std::map *environs) { std::stringstream istr(str); constructFragmenterAtomTypes(&istr, defs, comment, validate, environs); } void constructBRICSAtomTypes(std::map &defs, std::map *environs) { /* After some discussion, the L2 definitions ("N.pl3" in the original paper) have been removed and incorporated into a (almost) general purpose amine definition in L5 ("N.sp3" in the paper). The problem is one of consistency. Based on the original definitions you should get the following fragmentations: C1CCCCC1NC(=O)C -> C1CCCCC1N[2*].[1*]C(=O)C c1ccccc1NC(=O)C -> c1ccccc1[16*].[2*]N[2*].[1*]C(=O)C This difference just didn't make sense to us. By switching to the unified definition we end up with: C1CCCCC1NC(=O)C -> C1CCCCC1[15*].[5*]N[5*].[1*]C(=O)C c1ccccc1NC(=O)C -> c1ccccc1[16*].[5*]N[5*].[1*]C(=O)C */ const std::string BRICSdefs = "1 [C;D3]([#0,#6,#7,#8])(=O)\n\ 3 [O;D2]-;!@[#0,#6,#1]\n\ 5 [N;!D1;!$(N=*);!$(N-[!#6;!#16;!#0;!#1]);!$([N;R]@[C;R]=O)]\n\ 9 [n;+0;$(n(:[c,n,o,s]):[c,n,o,s])]\n\ 10 [N;R;$(N(@C(=O))@[C,N,O,S])]\n\ 11 [S;D2](-;!@[#0,#6])\n\ 12 [S;D4]([#6,#0])(=O)(=O)\n\ 6 [C;D3;!R](=O)-;!@[#0,#6,#7,#8]\n\ 13 [C;$(C(-;@[C,N,O,S])-;@[N,O,S])]\n\ 14 [c;$(c(:[c,n,o,s]):[n,o,s])]\n\ 15 [C;$(C(-;@C)-;@C)]\n\ 4 [C;!D1;!$(C=*)]-;!@[#6]\n\ 7 [C;D2,D3]-[#6]\n\ 8 [C;!R;!D1;!$(C!-*)]\n\ 16 [c;$(c(:c):c)]"; constructFragmenterAtomTypes(BRICSdefs, defs, "//", true, environs); } void constructFragmenterBondTypes( std::istream *inStream, const std::map &atomTypes, std::vector &defs, const std::string &comment, bool validate, bool labelByConnector) { PRECONDITION(inStream, "no stream"); defs.clear(); defs.resize(0); unsigned int line = 0; while (!inStream->eof()) { ++line; std::string tempStr = getLine(inStream); if (tempStr == "" || tempStr.find(comment) == 0) continue; std::vector tokens; boost::split(tokens, tempStr, boost::is_any_of(" \t"), boost::token_compress_on); if (tokens.size() < 3) { BOOST_LOG(rdWarningLog) << "line " << line << " is too short" << std::endl; continue; } unsigned int idx1 = boost::lexical_cast(tokens[0]); if (atomTypes.find(idx1) == atomTypes.end()) { BOOST_LOG(rdWarningLog) << "atom type #" << idx1 << " not recognized." << std::endl; continue; } unsigned int idx2 = boost::lexical_cast(tokens[1]); if (atomTypes.find(idx2) == atomTypes.end()) { BOOST_LOG(rdWarningLog) << "atom type #" << idx2 << " not recognized." << std::endl; continue; } std::string sma1 = atomTypes.find(idx1)->second; std::string sma2 = atomTypes.find(idx2)->second; std::string smarts = "[$(" + sma1 + ")]" + tokens[2] + "[$(" + sma2 + ")]"; ROMol *p = SmartsToMol(smarts); if (validate) { if (!p) { BOOST_LOG(rdWarningLog) << "cannot convert SMARTS " << smarts << " to molecule at line " << line << std::endl; continue; } } FragmenterBondType fbt; fbt.atom1Type = idx1; fbt.atom2Type = idx2; if (labelByConnector) { fbt.atom1Label = idx1; fbt.atom2Label = idx2; } else { fbt.atom1Label = idx2; fbt.atom2Label = idx1; } if (p) { // for the purposes of replacing the bond, we'll use just the first // character to set the bond type (if we recognize it): switch (tokens[2][0]) { case '-': fbt.bondType = Bond::SINGLE; break; case '=': fbt.bondType = Bond::DOUBLE; break; case '#': fbt.bondType = Bond::TRIPLE; break; case ':': fbt.bondType = Bond::AROMATIC; break; default: fbt.bondType = p->getBondWithIdx(0)->getBondType(); } fbt.query = ROMOL_SPTR(p); } else { fbt.bondType = Bond::UNSPECIFIED; fbt.query = ROMOL_SPTR(); } defs.push_back(fbt); } } void constructFragmenterBondTypes( const std::string &str, const std::map &atomTypes, std::vector &defs, const std::string &comment, bool validate, bool labelByConnector) { std::stringstream istr(str); constructFragmenterBondTypes(&istr, atomTypes, defs, comment, validate, labelByConnector); } void constructBRICSBondTypes(std::vector &defs) { const std::string BRICSdefs = "// L1\n\ 1 3 -;!@\n\ 1 5 -;!@\n\ 1 10 -;!@\n\ // L3 \n\ 3 4 -;!@\n\ 3 13 -;!@\n\ 3 14 -;!@\n\ 3 15 -;!@\n\ 3 16 -;!@\n\ // L4\n\ 4 5 -;!@\n\ 4 11 -;!@\n\ // L5\n\ 5 12 -;!@\n\ 5 14 -;!@\n\ 5 16 -;!@\n\ 5 13 -;!@\n\ 5 15 -;!@\n\ // L6\n\ 6 13 -;!@\n\ 6 14 -;!@\n\ 6 15 -;!@\n\ 6 16 -;!@\n\ // L7\n\ 7 7 =;!@\n\ // L8\n\ 8 9 -;!@\n\ 8 10 -;!@\n\ 8 13 -;!@\n\ 8 14 -;!@\n\ 8 15 -;!@\n\ 8 16 -;!@\n\ // L9\n\ 9 13 -;!@ // not in original paper\n\ 9 14 -;!@ // not in original paper\n\ 9 15 -;!@\n\ 9 16 -;!@\n\ // L10\n\ 10 13 -;!@\n\ 10 14 -;!@\n\ 10 15 -;!@\n\ 10 16 -;!@\n\ // L11\n\ 11 13 -;!@\n\ 11 14 -;!@\n\ 11 15 -;!@\n\ 11 16 -;!@\n\ // L12\n\ // none left\n\ // L13\n\ 13 14 -;!@\n\ 13 15 -;!@\n\ 13 16 -;!@\n\ // L14\n\ 14 14 -;!@ // not in original paper\n\ 14 15 -;!@\n\ 14 16 -;!@\n\ // L15\n\ 15 16 -;!@\n\ // L16\n\ 16 16 -;!@ // not in original paper"; std::map atTypes; constructBRICSAtomTypes(atTypes); constructFragmenterBondTypes(BRICSdefs, atTypes, defs, "//", true, false); } namespace { boost::uint64_t nextBitCombo(boost::uint64_t v) { // code from: // http://graphics.stanford.edu/~seander/bithacks.html#NextBitPermutation boost::uint64_t t = (v | (v - 1)) + 1; return t | ((((t & -t) / (v & -v)) >> 1) - 1); } } void fragmentOnSomeBonds( const ROMol &mol, const std::vector &bondIndices, std::vector &resMols, unsigned int maxToCut, bool addDummies, const std::vector > *dummyLabels, const std::vector *bondTypes, std::vector > *nCutsPerAtom) { PRECONDITION((!dummyLabels || dummyLabels->size() == bondIndices.size()), "bad dummyLabel vector"); PRECONDITION((!bondTypes || bondTypes->size() == bondIndices.size()), "bad bondType vector"); if (bondIndices.size() > 63) throw ValueErrorException("currently can only fragment on up to 63 bonds"); if (!maxToCut || !mol.getNumAtoms() || !bondIndices.size()) return; boost::uint64_t state = (0x1 << maxToCut) - 1; boost::uint64_t stop = 0x1 << bondIndices.size(); std::vector fragmentHere(maxToCut); std::vector > *dummyLabelsHere = NULL; if (dummyLabels) { dummyLabelsHere = new std::vector >(maxToCut); } std::vector *bondTypesHere = NULL; if (bondTypes) { bondTypesHere = new std::vector(maxToCut); } while (state < stop) { unsigned int nSeen = 0; for (unsigned int i = 0; i < bondIndices.size() && nSeen < maxToCut; ++i) { if (state & (0x1 << i)) { fragmentHere[nSeen] = bondIndices[i]; if (dummyLabelsHere) (*dummyLabelsHere)[nSeen] = (*dummyLabels)[i]; if (bondTypesHere) (*bondTypesHere)[nSeen] = (*bondTypes)[i]; ++nSeen; } } std::vector *lCutsPerAtom = 0; if (nCutsPerAtom) { nCutsPerAtom->push_back(std::vector(mol.getNumAtoms())); lCutsPerAtom = &(nCutsPerAtom->back()); } ROMol *nm = fragmentOnBonds(mol, fragmentHere, addDummies, dummyLabelsHere, bondTypesHere, lCutsPerAtom); resMols.push_back(ROMOL_SPTR(nm)); state = nextBitCombo(state); } delete dummyLabelsHere; delete bondTypesHere; } namespace { void checkChiralityPostMove(const ROMol &mol, const Atom *oAt, Atom *nAt, const Bond *bond) { INT_LIST newOrder; ROMol::OEDGE_ITER beg, end; boost::tie(beg, end) = mol.getAtomBonds(oAt); while (beg != end) { const BOND_SPTR obond = mol[*beg]; ++beg; if (obond.get() == bond) { continue; } newOrder.push_back(obond->getIdx()); } newOrder.push_back(bond->getIdx()); unsigned int nSwaps = oAt->getPerturbationOrder(newOrder); if (nSwaps % 2) nAt->invertChirality(); } } ROMol *fragmentOnBonds( const ROMol &mol, const std::vector &bondIndices, bool addDummies, const std::vector > *dummyLabels, const std::vector *bondTypes, std::vector *nCutsPerAtom) { PRECONDITION((!dummyLabels || dummyLabels->size() == bondIndices.size()), "bad dummyLabel vector"); PRECONDITION((!bondTypes || bondTypes->size() == bondIndices.size()), "bad bondType vector"); PRECONDITION((!nCutsPerAtom || nCutsPerAtom->size() == mol.getNumAtoms()), "bad nCutsPerAtom vector"); if (nCutsPerAtom) { BOOST_FOREACH (unsigned int &nCuts, *nCutsPerAtom) { nCuts = 0; } } RWMol *res = new RWMol(mol); if (!mol.getNumAtoms()) return res; std::vector bondsToRemove; bondsToRemove.reserve(bondIndices.size()); BOOST_FOREACH (unsigned int bondIdx, bondIndices) { bondsToRemove.push_back(res->getBondWithIdx(bondIdx)); } for (unsigned int i = 0; i < bondsToRemove.size(); ++i) { const Bond *bond = bondsToRemove[i]; unsigned int bidx = bond->getBeginAtomIdx(); unsigned int eidx = bond->getEndAtomIdx(); Bond::BondType bT = bond->getBondType(); res->removeBond(bidx, eidx); if (nCutsPerAtom) { (*nCutsPerAtom)[bidx] += 1; (*nCutsPerAtom)[eidx] += 1; } if (addDummies) { Atom *at1, *at2; at1 = new Atom(0); at2 = new Atom(0); if (dummyLabels) { at1->setIsotope((*dummyLabels)[i].first); at2->setIsotope((*dummyLabels)[i].second); } else { at1->setIsotope(bidx); at2->setIsotope(eidx); } unsigned int idx1 = res->addAtom(at1, false, true); if (bondTypes) bT = (*bondTypes)[i]; res->addBond(eidx, at1->getIdx(), bT); unsigned int idx2 = res->addAtom(at2, false, true); res->addBond(bidx, at2->getIdx(), bT); // figure out if we need to change the stereo tags on the atoms: if (mol.getAtomWithIdx(bidx)->getChiralTag() == Atom::CHI_TETRAHEDRAL_CCW || mol.getAtomWithIdx(bidx)->getChiralTag() == Atom::CHI_TETRAHEDRAL_CW) { checkChiralityPostMove(mol, mol.getAtomWithIdx(bidx), res->getAtomWithIdx(bidx), mol.getBondBetweenAtoms(bidx, eidx)); } if (mol.getAtomWithIdx(eidx)->getChiralTag() == Atom::CHI_TETRAHEDRAL_CCW || mol.getAtomWithIdx(eidx)->getChiralTag() == Atom::CHI_TETRAHEDRAL_CW) { checkChiralityPostMove(mol, mol.getAtomWithIdx(eidx), res->getAtomWithIdx(eidx), mol.getBondBetweenAtoms(bidx, eidx)); } for (ROMol::ConformerIterator confIt = res->beginConformers(); confIt != res->endConformers(); ++confIt) { Conformer *conf = (*confIt).get(); conf->setAtomPos(idx1, conf->getAtomPos(bidx)); conf->setAtomPos(idx2, conf->getAtomPos(eidx)); } } else { // was github issue 429 Atom *tatom = res->getAtomWithIdx(bidx); if (tatom->getIsAromatic() && tatom->getAtomicNum() != 6) { tatom->setNumExplicitHs(tatom->getNumExplicitHs() + 1); } tatom = res->getAtomWithIdx(eidx); if (tatom->getIsAromatic() && tatom->getAtomicNum() != 6) { tatom->setNumExplicitHs(tatom->getNumExplicitHs() + 1); } } } res->clearComputedProps(); return static_cast(res); } ROMol *fragmentOnBonds(const ROMol &mol, const std::vector &bondPatterns, const std::map *atomEnvirons, std::vector *nCutsPerAtom) { PRECONDITION((!nCutsPerAtom || nCutsPerAtom->size() == mol.getNumAtoms()), "bad nCutsPerAtom vector"); std::vector bondIndices; std::vector > dummyLabels; std::vector bondTypes; std::map environsMatch; if (atomEnvirons) { for (std::map::const_iterator iter = atomEnvirons->begin(); iter != atomEnvirons->end(); ++iter) { MatchVectType mv; environsMatch[iter->first] = SubstructMatch(mol, *(iter->second), mv); } } boost::dynamic_bitset<> bondsUsed(mol.getNumBonds(), 0); // the bond definitions are organized (more or less) general -> specific, so // loop // over them backwards BOOST_REVERSE_FOREACH(const FragmenterBondType &fbt, bondPatterns) { if (fbt.query->getNumAtoms() != 2 || fbt.query->getNumBonds() != 1) { BOOST_LOG(rdErrorLog) << "fragmentation queries must have 2 atoms and 1 bond" << std::endl; continue; } if (atomEnvirons && (!environsMatch[fbt.atom1Type] || !environsMatch[fbt.atom2Type])) continue; // std::cerr<<" >>> "< bondMatches; SubstructMatch(mol, *fbt.query.get(), bondMatches); BOOST_FOREACH (const MatchVectType &mv, bondMatches) { const Bond *bond = mol.getBondBetweenAtoms(mv[0].second, mv[1].second); // std::cerr<<" "<getIdx()<getIdx()]) { // BOOST_LOG(rdWarningLog)<<"bond #"<getIdx()<<" matched multiple // times in decomposition. Later matches ignored."<getIdx()); bondIndices.push_back(bond->getIdx()); if (bond->getBeginAtomIdx() == static_cast(mv[0].second)) { dummyLabels.push_back(std::make_pair(fbt.atom1Label, fbt.atom2Label)); } else { dummyLabels.push_back(std::make_pair(fbt.atom2Label, fbt.atom1Label)); } bondTypes.push_back(fbt.bondType); } } return fragmentOnBonds(mol, bondIndices, true, &dummyLabels, &bondTypes, nCutsPerAtom); } boost::flyweight, boost::flyweights::no_tracking> bondPatterns; boost::flyweight, boost::flyweights::no_tracking> atomEnvs; ROMol *fragmentOnBRICSBonds(const ROMol &mol) { if (bondPatterns.get().size() == 0) { std::map adefs; std::map aenvs; constructBRICSAtomTypes(adefs, &aenvs); atomEnvs = aenvs; std::vector tbondPatterns; constructBRICSBondTypes(tbondPatterns); bondPatterns = tbondPatterns; } return fragmentOnBonds(mol, bondPatterns, &(atomEnvs.get())); } } // end of namespace MolFragmenter } // end of namespace RDKit