// // Copyright (C) 2013-2025 Greg Landrum and other RDKit contributors // // @@ All Rights Reserved @@ // This file is part of the RDKit. // The contents are covered by the terms of the BSD license // which is included in the file license.txt, found at the root // of the RDKit source tree. // #include "MolFragmenter.h" #include #include #include #include #include #include #include #include #include #include #include "ChemTransforms.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include namespace RDKit { namespace MolFragmenter { std::size_t hash_value(const FragmenterBondType &fbt) { size_t res = boost::hash()((int)fbt.bondType); boost::hash_combine(res, fbt.atom1Label); boost::hash_combine(res, fbt.atom2Label); return res; } bool operator==(const FragmenterBondType &v1, const FragmenterBondType &v2) { return (v1.atom1Label == v2.atom1Label) && (v1.atom2Label == v2.atom2Label) && (v1.bondType == v2.bondType); } void constructFragmenterAtomTypes( std::istream *inStream, std::map &defs, const std::string &comment, bool validate, std::map *environs) { PRECONDITION(inStream, "no stream"); defs.clear(); unsigned int line = 0; while (!inStream->eof() && !inStream->fail()) { ++line; std::string tempStr = getLine(inStream); if (tempStr == "" || tempStr.find(comment) == 0) { continue; } std::vector tokens; boost::split(tokens, tempStr, boost::is_any_of(" \t"), boost::token_compress_on); if (tokens.size() < 2) { BOOST_LOG(rdWarningLog) << "line " << line << " is too short" << std::endl; continue; } auto idx = boost::lexical_cast(tokens[0]); if (defs.find(idx) != defs.end()) { BOOST_LOG(rdWarningLog) << "definition #" << idx << " encountered more than once. Using the first occurrence." << std::endl; continue; } if (validate || environs) { ROMol *p = SmartsToMol(tokens[1]); if (!p) { BOOST_LOG(rdWarningLog) << "cannot convert SMARTS " << tokens[1] << " to molecule at line " << line << std::endl; continue; } if (!environs) { delete p; } else { (*environs)[idx] = ROMOL_SPTR(p); } } defs[idx] = tokens[1]; } } void constructFragmenterAtomTypes( const std::string &str, std::map &defs, const std::string &comment, bool validate, std::map *environs) { std::stringstream istr(str); constructFragmenterAtomTypes(&istr, defs, comment, validate, environs); } void constructBRICSAtomTypes(std::map &defs, std::map *environs) { /* After some discussion, the L2 definitions ("N.pl3" in the original paper) have been removed and incorporated into a (almost) general purpose amine definition in L5 ("N.sp3" in the paper). The problem is one of consistency. Based on the original definitions you should get the following fragmentations: C1CCCCC1NC(=O)C -> C1CCCCC1N[2*].[1*]C(=O)C c1ccccc1NC(=O)C -> c1ccccc1[16*].[2*]N[2*].[1*]C(=O)C This difference just didn't make sense to us. By switching to the unified definition we end up with: C1CCCCC1NC(=O)C -> C1CCCCC1[15*].[5*]N[5*].[1*]C(=O)C c1ccccc1NC(=O)C -> c1ccccc1[16*].[5*]N[5*].[1*]C(=O)C */ const std::string BRICSdefs = "1 [C;D3]([#0,#6,#7,#8])(=O)\n\ 3 [O;D2]-;!@[#0,#6,#1]\n\ 5 [N;!D1;!$(N=*);!$(N-[!#6;!#16;!#0;!#1]);!$([N;R]@[C;R]=O)]\n\ 9 [n;+0;$(n(:[c,n,o,s]):[c,n,o,s])]\n\ 10 [N;R;$(N(@C(=O))@[C,N,O,S])]\n\ 11 [S;D2](-;!@[#0,#6])\n\ 12 [S;D4]([#6,#0])(=O)(=O)\n\ 6 [C;D3;!R](=O)-;!@[#0,#6,#7,#8]\n\ 13 [C;$(C(-;@[C,N,O,S])-;@[N,O,S])]\n\ 14 [c;$(c(:[c,n,o,s]):[n,o,s])]\n\ 15 [C;$(C(-;@C)-;@C)]\n\ 4 [C;!D1;!$(C=*)]-;!@[#6]\n\ 7 [C;D2,D3]-[#6]\n\ 8 [C;!R;!D1;!$(C!-*)]\n\ 16 [c;$(c(:c):c)]"; constructFragmenterAtomTypes(BRICSdefs, defs, "//", true, environs); } void constructFragmenterBondTypes( std::istream *inStream, const std::map &atomTypes, std::vector &defs, const std::string &comment, bool validate, bool labelByConnector) { PRECONDITION(inStream, "no stream"); defs.clear(); defs.resize(0); unsigned int line = 0; while (!inStream->eof() && !inStream->fail()) { ++line; std::string tempStr = getLine(inStream); if (tempStr == "" || tempStr.find(comment) == 0) { continue; } std::vector tokens; boost::split(tokens, tempStr, boost::is_any_of(" \t"), boost::token_compress_on); if (tokens.size() < 3) { BOOST_LOG(rdWarningLog) << "line " << line << " is too short" << std::endl; continue; } auto idx1 = boost::lexical_cast(tokens[0]); if (atomTypes.find(idx1) == atomTypes.end()) { BOOST_LOG(rdWarningLog) << "atom type #" << idx1 << " not recognized." << std::endl; continue; } auto idx2 = boost::lexical_cast(tokens[1]); if (atomTypes.find(idx2) == atomTypes.end()) { BOOST_LOG(rdWarningLog) << "atom type #" << idx2 << " not recognized." << std::endl; continue; } std::string sma1 = atomTypes.find(idx1)->second; std::string sma2 = atomTypes.find(idx2)->second; std::string smarts = "[$(" + sma1 + ")]" + tokens[2] + "[$(" + sma2 + ")]"; ROMol *p = SmartsToMol(smarts); if (validate) { if (!p) { BOOST_LOG(rdWarningLog) << "cannot convert SMARTS " << smarts << " to molecule at line " << line << std::endl; continue; } } FragmenterBondType fbt; fbt.atom1Type = idx1; fbt.atom2Type = idx2; if (labelByConnector) { fbt.atom1Label = idx1; fbt.atom2Label = idx2; } else { fbt.atom1Label = idx2; fbt.atom2Label = idx1; } if (p) { // for the purposes of replacing the bond, we'll use just the first // character to set the bond type (if we recognize it): switch (tokens[2][0]) { case '-': fbt.bondType = Bond::SINGLE; break; case '=': fbt.bondType = Bond::DOUBLE; break; case '#': fbt.bondType = Bond::TRIPLE; break; case ':': fbt.bondType = Bond::AROMATIC; break; default: fbt.bondType = p->getBondWithIdx(0)->getBondType(); } fbt.query = ROMOL_SPTR(p); } else { fbt.bondType = Bond::UNSPECIFIED; fbt.query = ROMOL_SPTR(); } defs.push_back(fbt); } } void constructFragmenterBondTypes( const std::string &str, const std::map &atomTypes, std::vector &defs, const std::string &comment, bool validate, bool labelByConnector) { std::stringstream istr(str); constructFragmenterBondTypes(&istr, atomTypes, defs, comment, validate, labelByConnector); } void constructBRICSBondTypes(std::vector &defs) { const std::string BRICSdefs = "// L1\n\ 1 3 -;!@\n\ 1 5 -;!@\n\ 1 10 -;!@\n\ // L3 \n\ 3 4 -;!@\n\ 3 13 -;!@\n\ 3 14 -;!@\n\ 3 15 -;!@\n\ 3 16 -;!@\n\ // L4\n\ 4 5 -;!@\n\ 4 11 -;!@\n\ // L5\n\ 5 12 -;!@\n\ 5 14 -;!@\n\ 5 16 -;!@\n\ 5 13 -;!@\n\ 5 15 -;!@\n\ // L6\n\ 6 13 -;!@\n\ 6 14 -;!@\n\ 6 15 -;!@\n\ 6 16 -;!@\n\ // L7\n\ 7 7 =;!@\n\ // L8\n\ 8 9 -;!@\n\ 8 10 -;!@\n\ 8 13 -;!@\n\ 8 14 -;!@\n\ 8 15 -;!@\n\ 8 16 -;!@\n\ // L9\n\ 9 13 -;!@ // not in original paper\n\ 9 14 -;!@ // not in original paper\n\ 9 15 -;!@\n\ 9 16 -;!@\n\ // L10\n\ 10 13 -;!@\n\ 10 14 -;!@\n\ 10 15 -;!@\n\ 10 16 -;!@\n\ // L11\n\ 11 13 -;!@\n\ 11 14 -;!@\n\ 11 15 -;!@\n\ 11 16 -;!@\n\ // L12\n\ // none left\n\ // L13\n\ 13 14 -;!@\n\ 13 15 -;!@\n\ 13 16 -;!@\n\ // L14\n\ 14 14 -;!@ // not in original paper\n\ 14 15 -;!@\n\ 14 16 -;!@\n\ // L15\n\ 15 16 -;!@\n\ // L16\n\ 16 16 -;!@ // not in original paper"; std::map atTypes; constructBRICSAtomTypes(atTypes); constructFragmenterBondTypes(BRICSdefs, atTypes, defs, "//", true, false); } namespace { std::uint64_t nextBitCombo(std::uint64_t v) { // code from: // http://graphics.stanford.edu/~seander/bithacks.html#NextBitPermutation std::uint64_t t = (v | (v - 1)) + 1; return t | ((((t & -t) / (v & -v)) >> 1) - 1); } } // namespace void fragmentOnSomeBonds( const ROMol &mol, const std::vector &bondIndices, std::vector &resMols, unsigned int maxToCut, bool addDummies, const std::vector> *dummyLabels, const std::vector *bondTypes, std::vector> *nCutsPerAtom) { PRECONDITION((!dummyLabels || dummyLabels->size() >= bondIndices.size()), "bad dummyLabel vector"); PRECONDITION((!bondTypes || bondTypes->size() == bondIndices.size()), "bad bondType vector"); if (bondIndices.size() > 63) { throw ValueErrorException("currently can only fragment on up to 63 bonds"); } if (!maxToCut || !mol.getNumAtoms() || !bondIndices.size()) { return; } std::uint64_t state = (0x1L << maxToCut) - 1; std::uint64_t stop = 0x1L << bondIndices.size(); std::vector fragmentHere(maxToCut); std::vector> *dummyLabelsHere = nullptr; if (dummyLabels) { dummyLabelsHere = new std::vector>(maxToCut); } std::vector *bondTypesHere = nullptr; if (bondTypes) { bondTypesHere = new std::vector(maxToCut); } while (state < stop) { unsigned int nSeen = 0; for (unsigned int i = 0; i < bondIndices.size() && nSeen < maxToCut; ++i) { if (state & (0x1L << i)) { fragmentHere[nSeen] = bondIndices[i]; if (dummyLabelsHere) { (*dummyLabelsHere)[nSeen] = (*dummyLabels)[i]; } if (bondTypesHere) { (*bondTypesHere)[nSeen] = (*bondTypes)[i]; } ++nSeen; } } std::vector *lCutsPerAtom = nullptr; if (nCutsPerAtom) { nCutsPerAtom->push_back(std::vector(mol.getNumAtoms())); lCutsPerAtom = &(nCutsPerAtom->back()); } ROMol *nm = fragmentOnBonds(mol, fragmentHere, addDummies, dummyLabelsHere, bondTypesHere, lCutsPerAtom); resMols.emplace_back(nm); state = nextBitCombo(state); } delete dummyLabelsHere; delete bondTypesHere; } namespace { void checkChiralityPostMove(const ROMol &mol, const Atom *oAt, Atom *nAt, const Bond *bond) { static const std::string newBondOrder = "_newBondOrder"; INT_LIST newOrder; INT_LIST incomingOrder; const int check_bond_index = static_cast(bond->getIdx()); // since we may call this function more than once, we need to keep track of // whether or not we've already been called and what the new atom order is. // we do this with a property. // this was github #1734 if (nAt->getPropIfPresent(newBondOrder, incomingOrder)) { for (int bidx : incomingOrder) { if (bidx != check_bond_index) { newOrder.push_back(bidx); } } } else { for (auto obond : mol.atomBonds(oAt)) { if (obond != bond) { newOrder.push_back(obond->getIdx()); } } } newOrder.push_back(bond->getIdx()); nAt->setProp(newBondOrder, newOrder, true); unsigned int nSwaps = oAt->getPerturbationOrder(newOrder); // std::copy(newOrder.begin(), newOrder.end(), // std::ostream_iterator(std::cerr, ", ")); // std::cerr << std::endl; // std::cerr<<"ccpm: "<getIdx()<<"->"<getIdx()<<" bond: // "<getIdx()<<" swaps: "<setChiralTag(oAt->getChiralTag()); if (nSwaps % 2) { nAt->invertChirality(); } } constexpr const char *molfragSaveStereo = "_molfragSaveStereo"; std::vector>> getNbrBondStereo( RWMol &mol, const Bond *bnd) { PRECONDITION(bnd, "null bond"); // loop over neighboring double bonds and remove their stereo atom std::vector>> res; const auto bgn = bnd->getBeginAtom(); const auto end = bnd->getEndAtom(); for (const auto *atom : {bgn, end}) { for (auto obnd : mol.atomBonds(atom)) { if (obnd->getIdx() != bnd->getIdx() && !obnd->getStereoAtoms().empty()) { obnd->setProp(molfragSaveStereo, obnd->getStereo()); res.emplace_back(obnd, obnd->getStereoAtoms()); } } } return res; } } // namespace ROMol *fragmentOnBonds( const ROMol &mol, const std::vector &bondIndices, bool addDummies, const std::vector> *dummyLabels, const std::vector *bondTypes, std::vector *nCutsPerAtom) { PRECONDITION((!dummyLabels || dummyLabels->size() >= bondIndices.size()), "bad dummyLabel vector"); PRECONDITION((!bondTypes || bondTypes->size() == bondIndices.size()), "bad bondType vector"); PRECONDITION((!nCutsPerAtom || nCutsPerAtom->size() == mol.getNumAtoms()), "bad nCutsPerAtom vector"); if (nCutsPerAtom) { for (auto &nCuts : *nCutsPerAtom) { nCuts = 0; } } auto *res = new RWMol(mol); if (!mol.getNumAtoms()) { return res; } std::vector bondsToRemove; bondsToRemove.reserve(bondIndices.size()); for (auto bondIdx : bondIndices) { bondsToRemove.push_back(res->getBondWithIdx(bondIdx)); } for (unsigned int i = 0; i < bondsToRemove.size(); ++i) { const Bond *bond = bondsToRemove[i]; unsigned int bidx = bond->getBeginAtomIdx(); unsigned int eidx = bond->getEndAtomIdx(); Bond::BondType bT = bond->getBondType(); Bond::BondDir bD = bond->getBondDir(); unsigned int bondidx; auto nbr_bond_stereo = getNbrBondStereo(*res, bond); res->removeBond(bidx, eidx); if (nCutsPerAtom) { (*nCutsPerAtom)[bidx] += 1; (*nCutsPerAtom)[eidx] += 1; } if (addDummies) { Atom *at1, *at2; at1 = new Atom(0); at2 = new Atom(0); if (dummyLabels) { at1->setIsotope((*dummyLabels)[i].first); at2->setIsotope((*dummyLabels)[i].second); } else { at1->setIsotope(bidx); at2->setIsotope(eidx); } unsigned int idx1 = res->addAtom(at1, false, true); if (bondTypes) { bT = (*bondTypes)[i]; } bondidx = res->addBond(at1->getIdx(), eidx, bT) - 1; // the dummy replaces the original start atom, so the // direction will be ok as long as it's one of the // states associated with double bond stereo if (bD == Bond::ENDDOWNRIGHT || bD == Bond::ENDUPRIGHT) { res->getBondWithIdx(bondidx)->setBondDir(bD); } unsigned int idx2 = res->addAtom(at2, false, true); bondidx = res->addBond(bidx, at2->getIdx(), bT) - 1; // this bond starts at the same atom, so its direction should always be // correct: res->getBondWithIdx(bondidx)->setBondDir(bD); // restore stereo atoms for (auto &stereo_atoms : nbr_bond_stereo) { std::replace(stereo_atoms.second.begin(), stereo_atoms.second.end(), bidx, idx1); std::replace(stereo_atoms.second.begin(), stereo_atoms.second.end(), eidx, idx2); stereo_atoms.first->getStereoAtoms().swap(stereo_atoms.second); stereo_atoms.first->setStereo( stereo_atoms.first->getProp(molfragSaveStereo)); stereo_atoms.first->clearProp(molfragSaveStereo); } // figure out if we need to change the stereo tags on the atoms: if (mol.getAtomWithIdx(bidx)->getChiralTag() == Atom::CHI_TETRAHEDRAL_CCW || mol.getAtomWithIdx(bidx)->getChiralTag() == Atom::CHI_TETRAHEDRAL_CW) { checkChiralityPostMove(mol, mol.getAtomWithIdx(bidx), res->getAtomWithIdx(bidx), mol.getBondBetweenAtoms(bidx, eidx)); } if (mol.getAtomWithIdx(eidx)->getChiralTag() == Atom::CHI_TETRAHEDRAL_CCW || mol.getAtomWithIdx(eidx)->getChiralTag() == Atom::CHI_TETRAHEDRAL_CW) { checkChiralityPostMove(mol, mol.getAtomWithIdx(eidx), res->getAtomWithIdx(eidx), mol.getBondBetweenAtoms(bidx, eidx)); } for (auto confIt = res->beginConformers(); confIt != res->endConformers(); ++confIt) { Conformer *conf = (*confIt).get(); conf->setAtomPos(idx1, conf->getAtomPos(bidx)); conf->setAtomPos(idx2, conf->getAtomPos(eidx)); } } else { // was github issues 429, 6034 for (auto idx : {bidx, eidx}) { if (auto tatom = res->getAtomWithIdx(idx); tatom->getNoImplicit() || (tatom->getIsAromatic() && tatom->getAtomicNum() != 6)) { tatom->setNumExplicitHs(tatom->getNumExplicitHs() + 1); } else { tatom->updatePropertyCache(false); } } } } res->clearComputedProps(); return static_cast(res); } ROMol *fragmentOnBonds(const ROMol &mol, const std::vector &bondPatterns, const std::map *atomEnvirons, std::vector *nCutsPerAtom) { PRECONDITION((!nCutsPerAtom || nCutsPerAtom->size() == mol.getNumAtoms()), "bad nCutsPerAtom vector"); std::vector bondIndices; std::vector> dummyLabels; std::vector bondTypes; std::map environsMatch; if (atomEnvirons) { for (const auto &atomEnviron : *atomEnvirons) { MatchVectType mv; environsMatch[atomEnviron.first] = SubstructMatch(mol, *(atomEnviron.second), mv); } } boost::dynamic_bitset<> bondsUsed(mol.getNumBonds(), 0); // the bond definitions are organized (more or less) general -> specific, so // loop over them backwards for (const auto &fbt : boost::adaptors::reverse(bondPatterns)) { if (fbt.query->getNumAtoms() != 2 || fbt.query->getNumBonds() != 1) { BOOST_LOG(rdErrorLog) << "fragmentation queries must have 2 atoms and 1 bond" << std::endl; continue; } if (atomEnvirons && (!environsMatch[fbt.atom1Type] || !environsMatch[fbt.atom2Type])) { continue; } // std::cerr<<" >>> "< bondMatches; SubstructMatch(mol, *fbt.query.get(), bondMatches); for (const auto &mv : bondMatches) { const Bond *bond = mol.getBondBetweenAtoms(mv[0].second, mv[1].second); // std::cerr<<" "<getIdx()<getIdx()]) { // BOOST_LOG(rdWarningLog)<<"bond #"<getIdx()<<" matched multiple // times in decomposition. Later matches ignored."<getIdx()); bondIndices.push_back(bond->getIdx()); if (bond->getBeginAtomIdx() == static_cast(mv[0].second)) { dummyLabels.emplace_back(fbt.atom1Label, fbt.atom2Label); } else { dummyLabels.emplace_back(fbt.atom2Label, fbt.atom1Label); } bondTypes.push_back(fbt.bondType); } } return fragmentOnBonds(mol, bondIndices, true, &dummyLabels, &bondTypes, nCutsPerAtom); } boost::flyweight, boost::flyweights::no_tracking> bondPatterns; boost::flyweight, boost::flyweights::no_tracking> atomEnvs; ROMol *fragmentOnBRICSBonds(const ROMol &mol) { if (bondPatterns.get().size() == 0) { std::map adefs; std::map aenvs; constructBRICSAtomTypes(adefs, &aenvs); atomEnvs = aenvs; std::vector tbondPatterns; constructBRICSBondTypes(tbondPatterns); bondPatterns = tbondPatterns; } return fragmentOnBonds(mol, bondPatterns, &(atomEnvs.get())); } } // namespace MolFragmenter namespace { const unsigned int NOLABEL = std::numeric_limits::max(); // Get the atom label - this might be useful as a util class unsigned int get_label(const Atom *a, const MolzipParams &p) { PRECONDITION(a, "bad atom in MolZip::get_label") unsigned int idx = NOLABEL; switch (p.label) { case MolzipLabel::AtomMapNumber: if (a->getAtomicNum() == 0) { auto mapno = a->getAtomMapNum(); return mapno ? mapno : NOLABEL; } break; case MolzipLabel::Isotope: if (a->getAtomicNum() == 0) { auto iso = a->getIsotope(); return iso ? iso : NOLABEL; } break; case MolzipLabel::AtomType: idx = std::distance(p.atomSymbols.begin(), std::find(p.atomSymbols.begin(), p.atomSymbols.end(), a->getSymbol())); if (idx == p.atomSymbols.size()) { idx = NOLABEL; } break; case MolzipLabel::FragmentOnBonds: // shouldn't ever get here CHECK_INVARIANT( 0, "FragmentOnBonds is not an atom label, it is an atom index"); break; case MolzipLabel::AtomProperty: a->getPropIfPresent(p.atomProperty, idx); break; default: CHECK_INVARIANT(0, "bogus MolZipLabel value in MolZip::get_label"); } return idx; } // Return the connected atom // n.b. There can be only one connection from a mapped atom Atom *get_other_atom(Atom *a) { PRECONDITION(a, "null atom in MolZip::get_other_atom"); auto &m = a->getOwningMol(); if (m.getAtomDegree(a) != 1) { return nullptr; } return m[*m.getAtomNeighbors(a).first]; } int num_swaps_to_interconvert(std::vector &orders) { int nswaps = 0; std::vector seen(orders.size()); for (size_t i = 0; i < orders.size(); ++i) { if (!seen[i]) { auto j = i; while (orders[j] != i) { j = orders[j]; CHECK_INVARIANT( j < orders.size(), "molzip: bond index outside of number of bonds for atom") seen[j] = true; nswaps++; } } } return nswaps; } // Simple bookkeeping class to bond attachments and handle stereo struct ZipBond { Atom *a = nullptr; // atom being bonded Atom *a_dummy = nullptr; // Labelled atom, i.e. [*:1]-C will bond the C to something Atom *b = nullptr; // atom being bonded Atom *b_dummy = nullptr; // Labelled atom, i.e. [*:1]-O will bond the O to something Atom *a_link = nullptr; // Link bonds have six atoms, [*:a][*b].[*:a]C.[*:b]D, four // dummies and two atoms Atom *b_link = nullptr; bool isLinker = false; // is this a straight linker bond Bond::BondType linkerBondType; // The linker bond type // Backup the original chirality mark_chirality must be called first // as it checks the datastructure for validity; void mark_chirality() const { PRECONDITION(a, "Must have a begin atom to bond"); PRECONDITION(b, "Must have an end atom to bond"); PRECONDITION(a_dummy, "Must have a begin dummy atom"); PRECONDITION(b_dummy, "Must have an end dummy atom"); mark(a, a_dummy, b); mark(b, b_dummy, a); } // bond a<->b for now only use single bonds // XXX FIX ME take the highest bond order. bool bond(RWMol &newmol, const MolzipParams ¶ms) const { if (!a || !b || !a_dummy || !b_dummy) { BOOST_LOG(rdWarningLog) << "Incomplete atom labelling, cannot make bond" << std::endl; return false; } // Fragment on bonds allows multiple links to the same atom // i.e. C.[1C].[1C] // otherwise throw an invariant error CHECK_INVARIANT( params.label == MolzipLabel::FragmentOnBonds || !a->getOwningMol().getBondBetweenAtoms(a->getIdx(), b->getIdx()), "molzip: zipped Bond already exists, perhaps labels are duplicated"); if (!a->getOwningMol().getBondBetweenAtoms(a->getIdx(), b->getIdx())) { CHECK_INVARIANT(&a->getOwningMol() == &newmol, "Owning mol is not the combined molecule!!"); if (isLinker) { // This is the easy bit, just link a and b, and schedule a_dummy and // b_dummy // for deletion CHECK_INVARIANT( a && b && a_dummy && b_dummy && a_link && b_link, "molzip: Link Bond is missing one or more labelled atoms"); newmol.addBond(a, b, linkerBondType); a_link->setProp("__molzip_used", true); b_link->setProp("__molzip_used", true); } else { auto bnd = newmol.getBondBetweenAtoms(a->getIdx(), a_dummy->getIdx()); CHECK_INVARIANT( bnd != nullptr, "molzip: begin atom and specified dummy atom connection " "are not bonded.") auto bond_type_a = bnd->getBondType(); auto bond_dir_a = bnd->getBondDir(); auto a_is_start = bnd->getBeginAtom() == a; bnd = newmol.getBondBetweenAtoms(b->getIdx(), b_dummy->getIdx()); CHECK_INVARIANT(bnd != nullptr, "molzip: end atom and specified dummy connection atom " "are not bonded.") auto bond_type_b = bnd->getBondType(); auto bond_dir_b = bnd->getBondDir(); auto b_is_start = bnd->getBeginAtom() == b; unsigned int bnd_idx = 0; // Fusion bond-dir logic table // a-* b-* => a-b // < = wedge // a<* b-* => a* b-* => a>b // a-* b>* => a a>b Bond::BondDir bond_dir{Bond::BondDir::NONE}; auto start = a; auto end = b; if (bond_dir_a != Bond::BondDir::NONE && bond_dir_b != Bond::BondDir::NONE) { // are we consistent between the two bond orders check for the case of // fragment on bonds where a<* and b>* or a>* and b<* when < is either // a hash or wedge bond but not both. bool consistent_directions = false; if (bond_dir_a == bond_dir_b) { if ((a_is_start != b_is_start)) { consistent_directions = true; } } if (!consistent_directions) { BOOST_LOG(rdWarningLog) << "inconsistent bond directions when merging fragments, ignoring..." << std::endl; bond_dir_a = bond_dir_b = Bond::BondDir::NONE; } else { bond_dir_b = Bond::BondDir::NONE; } } if (bond_dir_a != Bond::BondDir::NONE) { if (!a_is_start) { start = b; end = a; } bond_dir = bond_dir_a; } else if (bond_dir_b != Bond::BondDir::NONE) { if (b_is_start) { start = b; end = a; } bond_dir = bond_dir_b; } if (bond_type_a != Bond::BondType::SINGLE) { bnd_idx = newmol.addBond(start, end, bond_type_a); } else if (bond_type_b != Bond::BondType::SINGLE) { bnd_idx = newmol.addBond(start, end, bond_type_b); } else { bnd_idx = newmol.addBond(start, end, Bond::BondType::SINGLE); } newmol.getBondWithIdx(bnd_idx - 1)->setBondDir(bond_dir); } } a_dummy->setProp("__molzip_used", true); b_dummy->setProp("__molzip_used", true); return true; } // Restore the marked chirality (mark_chirality must be called first) void restore_chirality(std::set &already_checked) const { PRECONDITION(a, "Must have a begin atom to bond"); PRECONDITION(b, "Must have an end atom to bond"); PRECONDITION(a_dummy, "Must have a begin dummy atom"); PRECONDITION(b_dummy, "Must have an end dummy atom"); if (already_checked.find(a) == already_checked.end()) { restore(a); already_checked.insert(a); } if (already_checked.find(b) == already_checked.end()) { restore(b); already_checked.insert(b); } // now do bond stereo std::string mark = "__molzip_bond_stereo_mark"; for (auto *bond : a->getOwningMol().bonds()) { if (bond->hasProp(mark)) { std::vector atoms; for (auto *atom : bond->getProp>(mark)) { atoms.push_back(rdcast(atom->getIdx())); } bond->getStereoAtoms().swap(atoms); bond->setStereo( bond->getProp("__molzip_bond_stereo")); } } } private: // Mark the original order of the nbr atoms including the dummy // The goal is to copy the dummy chiral order over to the // atom being bonded void mark(Atom *chiral_atom, Atom *dummy_atom, Atom *new_atom) const { if (chiral_atom->getChiralTag()) { std::string mark = "__molzip_mark_" + std::to_string(chiral_atom->getIdx()); chiral_atom->setProp("__molzip_chiral_mark", mark); int order = 0; auto &m = chiral_atom->getOwningMol(); for (auto nbrIdx : boost::make_iterator_range(m.getAtomNeighbors(chiral_atom))) { m[nbrIdx]->setProp(mark, order); ++order; } new_atom->setProp(mark, dummy_atom->getProp(mark)); } // check bond stereo auto &m = chiral_atom->getOwningMol(); for (auto bond : m.atomBonds(chiral_atom)) { if (bond->getStereo() != Bond::BondStereo::STEREONONE) { std::string mark = "__molzip_bond_stereo_mark"; std::vector atoms; bool has_dummy = false; for (auto idx : bond->getStereoAtoms()) { if (static_cast(idx) == dummy_atom->getIdx()) { atoms.push_back(new_atom); has_dummy = true; } else { atoms.push_back(m.getAtomWithIdx(idx)); } } if (has_dummy) { bond->setProp(mark, atoms); bond->setProp("__molzip_bond_stereo", bond->getStereo()); } } } } // Restore the atom's chirality by comparing the original order // to the current void restore(Atom *chiral_atom) const { if (!chiral_atom->getChiralTag()) { return; } std::string mark = chiral_atom->getProp("__molzip_chiral_mark"); // std::vector orders1; std::vector orders2; auto &m = chiral_atom->getOwningMol(); for (auto nbrIdx : boost::make_iterator_range(m.getAtomNeighbors(chiral_atom))) { orders2.push_back(m[nbrIdx]->getProp(mark)); } if (num_swaps_to_interconvert(orders2) % 2 == 1) { chiral_atom->invertChirality(); } } }; } // namespace static const std::string indexPropName("__zipIndex"); std::unique_ptr molzip( const ROMol &a, const ROMol &b, const MolzipParams ¶ms, std::optional> &attachmentMapping) { if (attachmentMapping) { attachmentMapping->clear(); } std::unique_ptr newmol; if (b.getNumAtoms()) { newmol.reset(static_cast(combineMols(a, b))); } else { newmol.reset(new RWMol(a)); } std::map mappings; std::map> mappings_by_atom; // Linker bonds resolve to the same zip bond by using the // lowest label. I.e. // [*:1][*:2] sets the link bond to label 1 // so this sets linkerBonds[1] == linkerBonds[2] = the same ZipBond std::map linkerBonds; std::vector deletions; if (params.label == MolzipLabel::FragmentOnBonds) { for (auto *atom : newmol->atoms()) { if (atom->getAtomicNum() == 0) { auto molno = atom->getIsotope(); auto attached_atom = get_other_atom(atom); auto &bond = mappings[molno]; bond.a = attached_atom; bond.a_dummy = atom; bond.b = newmol->getAtomWithIdx(molno); for (auto nbrIdx : boost::make_iterator_range(newmol->getAtomNeighbors(bond.b))) { auto *nbr = (*newmol)[nbrIdx]; if (nbr->getAtomicNum() == 0 && nbr->getIsotope() == attached_atom->getIdx()) { bond.b_dummy = nbr; break; } } if (!bond.b_dummy) { BOOST_LOG(rdErrorLog) << "Cannot find atom to bond using FragmentOnBond labelling" << std::endl; return std::unique_ptr(); } mappings_by_atom[atom].push_back(&bond); deletions.push_back(atom); if (attachmentMapping) { if (int otherIndex, dummyIndex; atom->getPropIfPresent(indexPropName, dummyIndex) && bond.b->getPropIfPresent(indexPropName, otherIndex)) { (*attachmentMapping)[dummyIndex] = otherIndex; } } } } } else { // Non Fragment By Bonds attaching for (auto *atom : newmol->atoms()) { auto molno = get_label(atom, params); if (molno != NOLABEL) { auto attached_atom = get_other_atom(atom); auto attached_molno = attached_atom ? get_label(attached_atom, params) : NOLABEL; if (attached_molno != NOLABEL) { // we have a linker bond // [*:1][*:2].[*:1]C.[*:2]S links C and S and drops all dummies // Note: the linker bond MUST come first here // Get the min molno and use this to assign the bonds to link if (molno > attached_molno) { std::swap(molno, attached_molno); std::swap(atom, attached_atom); } auto link_bond = atom->getOwningMol().getBondBetweenAtoms( atom->getIdx(), attached_atom->getIdx()); CHECK_INVARIANT( link_bond, ("molzip: link bond with labels: " + std::to_string(molno) + "," + std::to_string(attached_molno) + " is missing")); auto bondType = link_bond->getBondType(); if (mappings.find(molno) == mappings.end()) { auto &bond = mappings[molno]; CHECK_INVARIANT( linkerBonds.find(attached_molno) == linkerBonds.end(), ("molzip: Linker attachment point with label: " + std::to_string(attached_molno) + " found before linker bond")); linkerBonds[attached_molno] = &bond; bond.isLinker = true; bond.linkerBondType = bondType; bond.a_link = atom; bond.b_link = attached_atom; deletions.push_back(atom); deletions.push_back(attached_atom); } else { // we'll find this bond twice, so let's make sure it is setup // correctly auto &bond = mappings[molno]; CHECK_INVARIANT( bondType = bond.linkerBondType, ("molzip: LINKER bond with labels: " + std::to_string(molno) + "," + std::to_string(attached_molno) + " has inconsistent bond types")); CHECK_INVARIANT( bond.isLinker, ("molzip: LINKER bond with labels: " + std::to_string(molno) + "," + std::to_string(attached_molno) + " found but not encountered first in the molecules to be zipped.")); CHECK_INVARIANT( (bond.a_link == atom && bond.b_link == attached_atom) || (bond.b_link == atom && bond.a_link == attached_atom), ("molzip: Linker Bond with labels " + std::to_string(molno) + "," + std::to_string(attached_molno) + " not setup correctly")); } } else if (mappings.find(molno) == mappings.end() && linkerBonds.find(molno) == linkerBonds.end()) { // Normal linkage C[*:1].S[*:1] links C and S // LinkBond [*:1][*:2].C[*:1].S[*:2] links C and S auto &bond = mappings[molno]; CHECK_INVARIANT( !bond.a, "molzip: bond info already setup for bgn atom with label:" + std::to_string(molno)); bond.a = attached_atom; bond.a_dummy = atom; } else { auto &bond = linkerBonds.find(molno) == linkerBonds.end() ? mappings[molno] : *linkerBonds[molno]; if (bond.isLinker) { CHECK_INVARIANT( !bond.a || !bond.b, "molzip: Linker bond has multiple attachments for label: " + std::to_string(molno)); if (bond.a) { bond.b = attached_atom; bond.b_dummy = atom; deletions.push_back(bond.b_dummy); } else { bond.a = attached_atom; bond.a_dummy = atom; deletions.push_back(bond.a_dummy); } } else { CHECK_INVARIANT( bond.a, "molzip: bond info not properly setup for bgn atom with label:" + std::to_string(molno)); CHECK_INVARIANT( !bond.b, "molzip: bond info already exists for end atom with label:" + std::to_string(molno)); bond.b = attached_atom; bond.b_dummy = atom; } mappings_by_atom[bond.a].push_back(&bond); if (attachmentMapping) { if (int otherIndex, dummyIndex; bond.a_dummy->getPropIfPresent(indexPropName, dummyIndex) && bond.b->getPropIfPresent(indexPropName, otherIndex)) { (*attachmentMapping)[dummyIndex] = otherIndex; } if (int otherIndex, dummyIndex; bond.b_dummy->getPropIfPresent(indexPropName, dummyIndex) && bond.a->getPropIfPresent(indexPropName, otherIndex)) { (*attachmentMapping)[dummyIndex] = otherIndex; } } } deletions.push_back(atom); } } } // Mark the existing chirality so we can try and restore it later for (auto &kv : mappings_by_atom) { for (auto &bond : kv.second) { bond->mark_chirality(); } } // Make all the bonds for (auto &kv : mappings) { kv.second.bond(*newmol, params); } newmol->beginBatchEdit(); // Remove the used atoms for (auto &atom : deletions) { if (atom->hasProp("__molzip_used")) { newmol->removeAtom(atom); } } newmol->commitBatchEdit(); // Try and restore the chirality now that we have new bonds std::set already_checked; for (auto &kv : mappings_by_atom) { for (auto &bond : kv.second) { bond->restore_chirality(already_checked); } } // remove all molzip tags for (auto *atom : newmol->atoms()) { auto propnames = atom->getPropList(); for (auto &prop : propnames) { if (prop.find("__molzip") == 0) { atom->clearProp(prop); } } } for (auto *bond : newmol->bonds()) { auto propnames = bond->getPropList(); for (auto &prop : propnames) { if (prop.find("__molzip") == 0) { bond->clearProp(prop); } } } newmol->updatePropertyCache(params.enforceValenceRules); newmol->setProp(common_properties::_StereochemDone, true); return newmol; } RDKIT_CHEMTRANSFORMS_EXPORT std::unique_ptr molzip( const ROMol &a, const ROMol &b, const MolzipParams ¶ms) { std::optional> opt(std::nullopt); return molzip(a, b, params, opt); } std::unique_ptr molzip(const ROMol &a, const MolzipParams ¶ms) { const static ROMol b; return molzip(a, b, params); } std::unique_ptr molzip(std::vector &decomposition, const MolzipParams ¶ms) { if (params.generateCoordinates) { int index = 0; for (const auto &mol : decomposition) { for (const auto atom : mol->atoms()) { atom->setProp(indexPropName, index++); } } } if (decomposition.empty()) { return nullptr; } // When the rgroup decomposition splits a ring, it puts it in both // rgroups, so remove these std::vector mols; if (params.label != MolzipLabel::FragmentOnBonds && decomposition.size() > 1) { std::vector existing_smiles; for (size_t idx = 1; idx < decomposition.size(); ++idx) { auto &mol = decomposition[idx]; auto smiles = MolToSmiles(*mol); if (std::find(existing_smiles.begin(), existing_smiles.end(), smiles) == existing_smiles.end()) { mols.push_back(mol); existing_smiles.push_back(smiles); } } } auto combinedMol = decomposition[0]; if (!mols.empty()) { combinedMol = std::accumulate( mols.begin(), mols.end(), decomposition[0], [](const auto &combined, const auto &mol) { return boost::shared_ptr(combineMols(*combined, *mol)); }); } const static ROMol b; std::optional attachmentMappingOption = std::map(); auto zippedMol = molzip(*combinedMol, b, params, attachmentMappingOption); if (params.generateCoordinates && zippedMol->getNumAtoms() > 0) { const auto confId = RDDepict::compute2DCoords(*zippedMol); const auto zippedConf = zippedMol->getConformer(confId); auto attachmentMapping = *attachmentMappingOption; for (auto &mol : decomposition) { const auto newConf = new Conformer(mol->getNumAtoms()); newConf->set3D(false); for (const auto atom : mol->atoms()) { int zippedIndex = atom->getProp(indexPropName); atom->clearProp(indexPropName); if (const auto attachment = attachmentMapping.find(zippedIndex); attachment != attachmentMapping.end()) { zippedIndex = (*attachment).second; } auto zipppedAtoms = zippedMol->atoms(); auto zippedAtom = std::find_if( zipppedAtoms.begin(), zipppedAtoms.end(), [zippedIndex](const Atom *zippedAtom) { const auto index = zippedAtom->getProp(indexPropName); return index == zippedIndex; }); newConf->setAtomPos(atom->getIdx(), zippedConf.getAtomPos((*zippedAtom)->getIdx())); } mol->addConformer(newConf, true); } for (const auto atom : zippedMol->atoms()) { atom->clearProp(indexPropName); } } return zippedMol; } std::unique_ptr molzip(const std::map &row, const MolzipParams ¶ms) { auto core = row.find("Core"); PRECONDITION(core != row.end(), "RGroup has no Core, cannot molzip"); std::vector mols; mols.push_back(core->second); for (auto it : row) { if (it.first != "Core") { mols.push_back(it.second); } } return molzip(mols, params); } } // end of namespace RDKit