Files
rdkit/Code/GraphMol/ChemTransforms/ChemTransforms.cpp
2022-11-01 04:14:26 +01:00

992 lines
35 KiB
C++

//
// Copyright (C) 2006-2021 Greg Landrum
//
// @@ All Rights Reserved @@
// This file is part of the RDKit.
// The contents are covered by the terms of the BSD license
// which is included in the file license.txt, found at the root
// of the RDKit source tree.
//
#include <RDGeneral/utils.h>
#include <RDGeneral/Invariant.h>
#include <RDGeneral/RDLog.h>
#include <GraphMol/RDKitQueries.h>
#include <RDGeneral/Exceptions.h>
#include <GraphMol/RDKitBase.h>
#include <GraphMol/Chirality.h>
#include <GraphMol/Substruct/SubstructMatch.h>
#include <RDGeneral/BoostStartInclude.h>
#include <boost/dynamic_bitset.hpp>
#include <boost/tokenizer.hpp>
#include <boost/algorithm/string/trim.hpp>
#include <boost/algorithm/string.hpp>
#include <RDGeneral/BoostEndInclude.h>
#include <vector>
#include <algorithm>
#include <iostream>
#include <fstream>
#include <sstream>
#include <GraphMol/SmilesParse/SmilesParse.h>
#include <RDGeneral/StreamOps.h>
#include <RDGeneral/FileParseException.h>
#include <RDGeneral/BadFileException.h>
#include <GraphMol/ChemTransforms/ChemTransforms.h>
namespace RDKit {
namespace {
void updateSubMolConfs(const ROMol &mol, RWMol &res,
boost::dynamic_bitset<> &removedAtoms) {
// update conformer information:
res.clearConformers();
for (auto citer = mol.beginConformers(); citer != mol.endConformers();
++citer) {
auto *newConf = new Conformer(res.getNumAtoms());
newConf->setId((*citer)->getId());
newConf->set3D((*citer)->is3D());
int aIdx = 0;
for (unsigned int i = 0; i < mol.getNumAtoms(); ++i) {
if (!removedAtoms[i]) {
newConf->setAtomPos(aIdx, (*citer)->getAtomPos(i));
++aIdx;
}
}
res.addConformer(newConf, false);
}
}
struct SideChainMapping {
int molIndex;
int coreIndex;
bool useMatch;
SideChainMapping(int molIndex)
: molIndex(molIndex), coreIndex(-1), useMatch(false) {}
SideChainMapping(int molIndex, int coreIndex, bool useMatch)
: molIndex(molIndex), coreIndex(coreIndex), useMatch(useMatch) {}
};
} // namespace
ROMol *deleteSubstructs(const ROMol &mol, const ROMol &query, bool onlyFrags,
bool useChirality) {
auto *res = new RWMol(mol, false);
std::vector<MatchVectType> fgpMatches;
std::vector<MatchVectType>::const_iterator mati;
VECT_INT_VECT
matches; // all matches on the molecule - list of list of atom ids
MatchVectType::const_iterator mi;
// do the substructure matching and get the atoms that match the query
const bool uniquify = true;
const bool recursionPossible = true;
SubstructMatch(*res, query, fgpMatches, uniquify, recursionPossible,
useChirality);
// if didn't find any matches nothing to be done here
// simply return a copy of the molecule
if (fgpMatches.size() == 0) {
return res;
}
for (mati = fgpMatches.begin(); mati != fgpMatches.end(); mati++) {
INT_VECT match; // each match onto the molecule - list of atoms ids
for (mi = mati->begin(); mi != mati->end(); mi++) {
match.push_back(mi->second);
}
matches.push_back(match);
}
// now loop over the list of matches and check if we can delete any of them
INT_VECT delList;
VECT_INT_VECT_I mxi, fi;
if (onlyFrags) {
VECT_INT_VECT frags;
MolOps::getMolFrags(*res, frags);
for (fi = frags.begin(); fi != frags.end(); fi++) {
std::sort(fi->begin(), fi->end());
for (mxi = matches.begin(); mxi != matches.end(); mxi++) {
std::sort(mxi->begin(), mxi->end());
if ((*fi) == (*mxi)) {
INT_VECT tmp;
Union((*mxi), delList, tmp);
delList = tmp;
break;
} // end of if we found a matching fragment
} // endof loop over matches
} // end of loop over fragments
} // end of if onlyFrags
else {
// in this case we want to delete any matches we find
// simply loop over the matches and collect the atoms that need to
// be removed
for (mxi = matches.begin(); mxi != matches.end(); mxi++) {
INT_VECT tmp;
Union((*mxi), delList, tmp);
delList = tmp;
}
}
// now loop over the union list and delete the atoms
res->beginBatchEdit();
boost::dynamic_bitset<> removedAtoms(mol.getNumAtoms());
for (auto idx : delList) {
removedAtoms.set(idx);
res->removeAtom(idx);
}
res->commitBatchEdit();
// if we removed any atoms, clear the computed properties:
if (delList.size()) {
updateSubMolConfs(mol, *res, removedAtoms);
res->clearComputedProps(true);
// update our properties, but allow unhappiness:
res->updatePropertyCache(false);
}
return res;
}
std::vector<ROMOL_SPTR> replaceSubstructs(
const ROMol &mol, const ROMol &query, const ROMol &replacement,
bool replaceAll, unsigned int replacementConnectionPoint,
bool useChirality) {
PRECONDITION(replacementConnectionPoint < replacement.getNumAtoms(),
"bad replacementConnectionPoint");
std::vector<ROMOL_SPTR> res;
std::vector<MatchVectType> fgpMatches;
boost::dynamic_bitset<> removedAtoms(mol.getNumAtoms());
// do the substructure matching and get the atoms that match the query
const bool uniquify = true;
const bool recursionPossible = true;
SubstructMatch(mol, query, fgpMatches, uniquify, recursionPossible,
useChirality);
// if we didn't find any matches, there's nothing to be done here
// simply return a list with a copy of the starting molecule
if (fgpMatches.size() == 0) {
res.push_back(ROMOL_SPTR(new ROMol(mol, false)));
res[0]->clearComputedProps(false);
return res;
}
INT_VECT delList;
// now loop over the list of matches and replace them:
for (std::vector<MatchVectType>::const_iterator mati = fgpMatches.begin();
mati != fgpMatches.end(); mati++) {
INT_VECT match; // each match onto the molecule - list of atoms ids
for (const auto &mi : *mati) {
match.push_back(mi.second);
}
INT_VECT sortMatch = match;
std::sort(sortMatch.begin(), sortMatch.end());
if (!replaceAll || !res.size()) {
res.push_back(ROMOL_SPTR(new ROMol(mol, false)));
}
RWMol *newMol = static_cast<RWMol *>(res.rbegin()->get());
// we need a tab to the orig number of atoms because the
// new molecule will start numbered above this:
int numOrigAtoms = newMol->getNumAtoms();
// Add the atoms and bonds from the replacement:
newMol->insertMol(replacement);
// loop over the central atom's (the first atom in match) bonds
// and duplicate any that connect to the remainder of the molecule:
Atom *origAtom = newMol->getAtomWithIdx(match[0]);
ROMol::ADJ_ITER nbrIdx, endNbrs;
boost::tie(nbrIdx, endNbrs) = newMol->getAtomNeighbors(origAtom);
while (nbrIdx != endNbrs) {
// we don't want to duplicate any "intra-match" bonds:
if (!std::binary_search(sortMatch.begin(), sortMatch.end(),
int(*nbrIdx))) {
Bond *oBond = newMol->getBondBetweenAtoms(match[0], *nbrIdx);
CHECK_INVARIANT(oBond, "required bond not found");
newMol->addBond(numOrigAtoms + replacementConnectionPoint, *nbrIdx,
oBond->getBondType());
}
nbrIdx++;
}
if (replaceAll) {
// we'll accumulate a list of atoms to be removed:
INT_VECT tmp;
Union(sortMatch, delList, tmp);
delList = tmp;
} else {
// just delete the atoms now:
newMol->beginBatchEdit();
for (auto idx : sortMatch) {
removedAtoms.set(idx);
newMol->removeAtom(idx);
}
newMol->commitBatchEdit();
}
}
if (delList.size()) {
if (replaceAll) {
// remove the atoms from the delList:
auto *newMol = static_cast<RWMol *>(res[0].get());
newMol->beginBatchEdit();
for (auto idx : delList) {
removedAtoms.set(idx);
newMol->removeAtom(idx);
}
newMol->commitBatchEdit();
}
// clear conformers and computed props and do basic updates
// on the resulting molecules, but allow unhappiness:
for (auto &re : res) {
updateSubMolConfs(mol, *(RWMol *)re.get(), removedAtoms);
re->clearComputedProps(true);
re->updatePropertyCache(false);
}
}
return res;
}
ROMol *replaceSidechains(const ROMol &mol, const ROMol &coreQuery,
bool useChirality) {
MatchVectType matchV;
// do the substructure matching and get the atoms that match the query
const bool recursionPossible = true;
bool matchFound =
SubstructMatch(mol, coreQuery, matchV, recursionPossible, useChirality);
// if we didn't find any matches, there's nothing to be done here
// simply return null to indicate the problem
if (!matchFound || matchV.size() == 0) {
return nullptr;
}
boost::dynamic_bitset<> matchingIndices(mol.getNumAtoms());
for (MatchVectType::const_iterator mvit = matchV.begin();
mvit != matchV.end(); ++mvit) {
matchingIndices[mvit->second] = 1;
}
auto *newMol = new RWMol(mol);
boost::dynamic_bitset<> keepSet(newMol->getNumAtoms());
std::vector<unsigned int> dummyIndices;
for (MatchVectType::const_iterator mvit = matchV.begin();
mvit != matchV.end(); ++mvit) {
keepSet.set(mvit->second);
// if the atom in the molecule has higher degree than the atom in the
// core, we have an attachment point:
if (newMol->getAtomWithIdx(mvit->second)->getDegree() >
coreQuery.getAtomWithIdx(mvit->first)->getDegree()) {
ROMol::ADJ_ITER nbrIdx, endNbrs;
boost::tie(nbrIdx, endNbrs) =
newMol->getAtomNeighbors(newMol->getAtomWithIdx(mvit->second));
while (nbrIdx != endNbrs) {
if (!matchingIndices[*nbrIdx]) {
// this neighbor isn't in the match, convert it to a dummy atom and
// save it
keepSet.set(*nbrIdx);
dummyIndices.push_back(*nbrIdx);
Atom *at = newMol->getAtomWithIdx(*nbrIdx);
Bond *b = newMol->getBondBetweenAtoms(mvit->second, *nbrIdx);
if (b) {
b->setIsAromatic(false);
b->setBondType(Bond::SINGLE);
}
at->setAtomicNum(0);
at->setNumExplicitHs(0);
at->setIsAromatic(false);
at->setIsotope(dummyIndices.size());
}
++nbrIdx;
}
}
}
boost::dynamic_bitset<> removedAtoms(mol.getNumAtoms());
newMol->beginBatchEdit();
for (const auto at : newMol->atoms()) {
if (!keepSet.test(at->getIdx())) {
newMol->removeAtom(at);
removedAtoms.set(at->getIdx());
}
}
// Remove bonds between newly added dummies (if any)
if (dummyIndices.size() > 1) {
for (size_t i = 0; i < dummyIndices.size() - 1; ++i) {
for (size_t j = i + 1; j < dummyIndices.size(); ++j) {
const auto b =
newMol->getBondBetweenAtoms(dummyIndices.at(i), dummyIndices.at(j));
if (b) {
newMol->removeBond(dummyIndices.at(i), dummyIndices.at(j));
}
}
}
}
newMol->commitBatchEdit();
updateSubMolConfs(mol, *newMol, removedAtoms);
// clear computed props and do basic updates on the
// the resulting molecule, but allow unhappiness:
newMol->clearComputedProps(true);
newMol->updatePropertyCache(false);
return static_cast<ROMol *>(newMol);
}
ROMol *replaceCore(const ROMol &mol, const ROMol &coreQuery,
bool replaceDummies, bool labelByIndex,
bool requireDummyMatch, bool useChirality) {
MatchVectType matchV;
// do the substructure matching and get the atoms that match the query
const bool recursionPossible = true;
bool matchFound =
SubstructMatch(mol, coreQuery, matchV, recursionPossible, useChirality);
// if we didn't find any matches, there's nothing to be done here
// simply return null to indicate the problem
if (!matchFound || matchV.size() == 0) {
return nullptr;
}
return replaceCore(mol, coreQuery, matchV, replaceDummies, labelByIndex,
requireDummyMatch);
}
namespace {
const std::string replaceCoreDummyBond = "_replaceCoreDummyBond";
int findNbrBond(RWMol &mol, Bond *bond, Atom *bondAtom, const INT_VECT &bring,
const boost::dynamic_bitset<> &removedAtoms) {
int res = -1;
for (const auto nbrBond : mol.atomBonds(bondAtom)) {
if (nbrBond != bond &&
(nbrBond->hasProp(replaceCoreDummyBond) ||
(!removedAtoms[nbrBond->getOtherAtomIdx(bondAtom->getIdx())] &&
std::find(bring.begin(), bring.end(), nbrBond->getIdx()) !=
bring.end()))) {
res = nbrBond->getOtherAtomIdx(bondAtom->getIdx());
break;
}
}
return res;
}
void setSubMolBrokenRingStereo(RWMol &mol,
const boost::dynamic_bitset<> &removedAtoms) {
PRECONDITION(mol.getRingInfo() && mol.getRingInfo()->isInitialized(),
"bad ringinfo");
for (const auto &bring : mol.getRingInfo()->bondRings()) {
// check whether or not this bond ring is affected by the removal
if (std::find_if(bring.begin(), bring.end(),
[&mol, &removedAtoms](auto idx) {
const auto bond = mol.getBondWithIdx(idx);
return removedAtoms[bond->getBeginAtomIdx()] ||
removedAtoms[bond->getEndAtomIdx()];
}) != bring.end()) {
for (auto bidx : bring) {
const auto bond = mol.getBondWithIdx(bidx);
// is this a bond we can reasonably set cis/trans for and where neither
// atom is being removed?
if ((bond->getIsAromatic() ||
bond->getBondType() == Bond::BondType::DOUBLE) &&
bond->getStereo() == Bond::BondStereo::STEREONONE &&
mol.getRingInfo()->minBondRingSize(bond->getIdx()) <
Chirality::minRingSizeForDoubleBondStereo &&
!removedAtoms[bond->getBeginAtomIdx()] &&
!removedAtoms[bond->getEndAtomIdx()]) {
// find the two neighboring bonds which are in the ring or to the
// newly added dummy atoms. Make sure they aren't affected by the atom
// removal
int beginAtomNbrIdx =
findNbrBond(mol, bond, bond->getBeginAtom(), bring, removedAtoms);
if (beginAtomNbrIdx >= 0) {
int endAtomNbrIdx =
findNbrBond(mol, bond, bond->getEndAtom(), bring, removedAtoms);
if (endAtomNbrIdx >= 0) {
// we can set stereo on the bond
bond->setStereoAtoms(beginAtomNbrIdx, endAtomNbrIdx);
bond->setStereo(Bond::BondStereo::STEREOCIS);
}
}
}
}
}
}
}
} // namespace
ROMol *replaceCore(const ROMol &mol, const ROMol &core,
const MatchVectType &matchV, bool replaceDummies,
bool labelByIndex, bool requireDummyMatch) {
unsigned int origNumAtoms = mol.getNumAtoms();
std::vector<std::pair<int, SideChainMapping>> matches;
matches.reserve(origNumAtoms);
std::vector<int> matchingIndices(origNumAtoms, -1);
std::vector<int> allIndices(origNumAtoms, -1);
boost::dynamic_bitset<> molAtomsMapped(origNumAtoms);
boost::dynamic_bitset<> multipleMappedMolAtoms(origNumAtoms);
for (const auto &mvit : matchV) {
if (mvit.first < 0 || mvit.first >= rdcast<int>(core.getNumAtoms())) {
throw ValueErrorException(
"Supplied MatchVect indices out of bounds of the core molecule");
}
if (mvit.second < 0 || mvit.second >= rdcast<int>(mol.getNumAtoms())) {
throw ValueErrorException(
"Supplied MatchVect indices out of bounds of the target molecule");
}
bool useMatch = false;
if (replaceDummies || core.getAtomWithIdx(mvit.first)->getAtomicNum() > 0) {
matchingIndices[mvit.second] = mvit.first;
useMatch = true;
}
allIndices[mvit.second] = mvit.first;
SideChainMapping mapping(mvit.second, mvit.first, useMatch);
matches.emplace_back(mvit.second, mapping);
if (molAtomsMapped[mvit.second]) {
multipleMappedMolAtoms.set(mvit.second);
}
molAtomsMapped.set(mvit.second);
}
boost::dynamic_bitset<> multipleOwnedBonds(mol.getNumBonds());
if (multipleMappedMolAtoms.any()) {
for (const auto &match : matches) {
const auto &mappingInfo = match.second;
if (multipleMappedMolAtoms[mappingInfo.molIndex]) {
auto coreAtom = core.getAtomWithIdx(mappingInfo.coreIndex);
CHECK_INVARIANT(
coreAtom->getDegree() == 1,
"Multiple core atoms match a mol atom, but one of the core "
"atoms has degree > 1 ");
auto coreNeighborIdx =
core[*core.getAtomNeighbors(coreAtom).first]->getIdx();
auto molNeighborIdx =
std::find_if(matchV.cbegin(), matchV.cend(),
[coreNeighborIdx](std::pair<int, int> p) {
return p.first == static_cast<int>(coreNeighborIdx);
})
->second;
if (molNeighborIdx > -1) {
auto connectingBond =
mol.getBondBetweenAtoms(mappingInfo.molIndex, molNeighborIdx);
CHECK_INVARIANT(connectingBond,
"expected bond in molecule not found");
multipleOwnedBonds.set(connectingBond->getIdx());
}
}
}
}
auto *newMol = new RWMol(mol);
std::vector<Atom *> keepList;
std::map<Atom *, int> dummyAtomMap;
// go through the matches in query order, not target molecule
// order
for (unsigned int i = 0; i < origNumAtoms; ++i) {
if (!molAtomsMapped[i]) {
SideChainMapping mapping(i);
matches.emplace_back(i, mapping);
}
}
std::sort(matches.begin(), matches.end(),
[](const std::pair<int, SideChainMapping> &p1,
const std::pair<int, SideChainMapping> &p2) {
if (p1.second.coreIndex == p2.second.coreIndex) {
return p1.first < p2.first;
}
return p1.second.coreIndex < p2.second.coreIndex;
});
std::vector<std::pair<int, Atom *>> dummies;
std::list<Bond *> allNewBonds;
for (const auto &match : matches) {
const auto &mappingInfo = match.second;
if (!mappingInfo.useMatch) {
Atom *sidechainAtom = newMol->getAtomWithIdx(mappingInfo.molIndex);
// we're keeping the sidechain atoms:
keepList.push_back(sidechainAtom);
// loop over our neighbors and see if any are in the match:
std::list<unsigned int> nbrList;
ROMol::ADJ_ITER nbrIter, endNbrs;
boost::tie(nbrIter, endNbrs) = newMol->getAtomNeighbors(sidechainAtom);
while (nbrIter != endNbrs && (*nbrIter) < origNumAtoms) {
// we need to add bonds and atoms to the molecule while looping
// over neighbors. This invalidates iterators, so collect a list
// of our neighbors now:
nbrList.push_back(*nbrIter);
++nbrIter;
}
unsigned int whichNbr = 0;
std::list<Bond *> newBonds;
for (std::list<unsigned int>::const_iterator lIter = nbrList.begin();
lIter != nbrList.end(); ++lIter) {
unsigned int nbrIdx = *lIter;
Bond *connectingBond =
newMol->getBondBetweenAtoms(mappingInfo.molIndex, nbrIdx);
bool bondToCore = matchingIndices[nbrIdx] > -1;
auto coreBond =
bondToCore && allIndices[nbrIdx] > -1 && mappingInfo.coreIndex > -1
? core.getBondBetweenAtoms(mappingInfo.coreIndex,
allIndices[nbrIdx])
: nullptr;
if (bondToCore && multipleMappedMolAtoms[mappingInfo.molIndex] &&
mappingInfo.coreIndex > -1) {
// The core has multiple atoms that map onto this mol atom - check we
// have matched correct core bond.
// Otherwise we can use this bond only if nobody else owns it.
if (coreBond == nullptr &&
multipleOwnedBonds[connectingBond->getIdx()]) {
bondToCore = false;
}
}
if (bondToCore) {
// we've matched an atom in the core.
if (requireDummyMatch &&
core.getAtomWithIdx(matchingIndices[nbrIdx])->getAtomicNum() !=
0) {
delete newMol;
return nullptr;
}
auto *newAt = new Atom(0);
// if we were not in the matching list, still keep
// the original indices (replaceDummies=False)
int mapping = mappingInfo.coreIndex;
// If we don't have a core bond, the label belongs to the neighbor
if (coreBond == nullptr) {
mapping = allIndices[nbrIdx];
}
// we want to order the dummies in the same orders as
// the mappings, if not labelling by Index they are in arbitrary
// order
// right now so save and sort later.
if (mapping != -1) {
dummies.emplace_back(mapping, newAt);
} else {
dummies.emplace_back(matchingIndices[nbrIdx], newAt);
}
newMol->addAtom(newAt, false, true);
dummyAtomMap[newAt] = nbrIdx;
keepList.push_back(newAt);
Bond *bnd = connectingBond->copy();
if (bnd->getBeginAtomIdx() ==
static_cast<size_t>(mappingInfo.molIndex)) {
bnd->setEndAtomIdx(newAt->getIdx());
} else {
bnd->setBeginAtomIdx(newAt->getIdx());
}
newBonds.push_back(bnd);
allNewBonds.push_back(bnd);
// we may be changing the bond ordering at the atom.
// e.g. replacing the N in C[C@](Cl)(N)F gives an atom ordering of
// C[C?](Cl)(F)[X]
// so we need the SMILES C[C@@](Cl)(F)[X] to maintain the appropriate
// chirality
// check for these cases and adjust our chirality flags as
// appropriate.
//
if (sidechainAtom->getChiralTag() == Atom::CHI_TETRAHEDRAL_CW ||
sidechainAtom->getChiralTag() == Atom::CHI_TETRAHEDRAL_CCW) {
bool switchIt = false;
switch (newMol->getAtomDegree(sidechainAtom)) {
case 4:
// start: ordering: swap?
// N[C@](F)(Cl)C -> F[C@@](Cl)(C)X yes
// F[C@](N)(Cl)C -> F[C@](Cl)(C)X no
// F[C@](Cl)(N)C -> F[C@@](Cl)(C)X yes
// F[C@](Cl)(C)N -> F[C@](Cl)(C)X no
if (!(whichNbr % 2)) {
switchIt = true;
}
break;
case 3:
// things are different in the degree three case because of the
// implicit H:
// start: ordering: swap?
// N[C@H](F)C -> [C@H](F)(C)X yes
// [C@H](N)(F)C -> [C@H](F)(C)X no
// F[C@H](N)C -> F[C@@H](C)X yes
// F[C@H](C)N -> F[C@H](C)X no
if (whichNbr == 1) {
switchIt = true;
}
break;
}
if (switchIt) {
sidechainAtom->invertChirality();
}
}
}
++whichNbr;
}
// add the bonds now, after we've finished the loop over neighbors:
for (auto &newBond : newBonds) {
newBond->setProp(replaceCoreDummyBond, 1);
newMol->addBond(newBond, true);
}
}
}
if (!labelByIndex) {
// sort the mapping indices, but label from 1..N
std::stable_sort(dummies.begin(), dummies.end());
for (size_t nDummy = 0; nDummy < dummies.size(); ++nDummy) {
dummies[nDummy].second->setIsotope(nDummy + 1);
}
} else {
// don't sort, just label by the index
for (auto &dummy : dummies) {
dummy.second->setIsotope(dummy.first);
}
}
std::vector<Atom *> delList;
boost::dynamic_bitset<> removedAtoms(mol.getNumAtoms());
bool removedRingAtom = false;
newMol->beginBatchEdit();
for (const auto at : newMol->atoms()) {
if (std::find(keepList.begin(), keepList.end(), at) == keepList.end()) {
newMol->removeAtom(at);
removedAtoms.set(at->getIdx());
if (!removedRingAtom && mol.getRingInfo() &&
mol.getRingInfo()->isInitialized() &&
mol.getRingInfo()->numAtomRings(at->getIdx())) {
removedRingAtom = true;
}
}
}
if (removedRingAtom) {
setSubMolBrokenRingStereo(*newMol, removedAtoms);
for (auto bond : newMol->bonds()) {
bond->clearProp(replaceCoreDummyBond);
}
}
newMol->commitBatchEdit();
updateSubMolConfs(mol, *newMol, removedAtoms);
// Update any terminal dummy atom coordinates after removing atoms not in the
// keeplist and calling updateSubMolConfs
for (auto &newBond : allNewBonds) {
auto beginAtom = newBond->getBeginAtom();
auto endAtom = newBond->getEndAtom();
CHECK_INVARIANT(beginAtom->getDegree() == 1 || endAtom->getDegree() == 1,
"neither atom has degree one");
if (newMol->getNumConformers()) {
if (endAtom->getAtomicNum() == 0 && endAtom->getDegree() == 1) {
MolOps::setTerminalAtomCoords(*newMol, endAtom->getIdx(),
beginAtom->getIdx());
} else {
MolOps::setTerminalAtomCoords(*newMol, beginAtom->getIdx(),
endAtom->getIdx());
}
}
// make a guess at the position of the dummy atoms showing the attachment
// point:
for (auto citer = mol.beginConformers(); citer != mol.endConformers();
++citer) {
Conformer &newConf = newMol->getConformer((*citer)->getId());
for (auto iter = dummyAtomMap.begin(); iter != dummyAtomMap.end();
++iter) {
newConf.setAtomPos(iter->first->getIdx(),
(*citer)->getAtomPos(iter->second));
}
}
// clear computed props and do basic updates on
// the resulting molecule, but allow unhappiness:
newMol->clearComputedProps(true);
newMol->updatePropertyCache(false);
}
return static_cast<ROMol *>(newMol);
}
ROMol *MurckoDecompose(const ROMol &mol) {
auto *res = new RWMol(mol);
unsigned int nAtoms = res->getNumAtoms();
if (!nAtoms) {
return res;
}
// start by getting the shortest paths matrix:
MolOps::getDistanceMat(mol, false, false, true);
boost::shared_array<int> pathMat;
mol.getProp(common_properties::DistanceMatrix_Paths, pathMat);
boost::dynamic_bitset<> keepAtoms(nAtoms);
const RingInfo *ringInfo = res->getRingInfo();
for (unsigned int i = 0; i < nAtoms; ++i) {
if (ringInfo->numAtomRings(i)) {
keepAtoms[i] = 1;
}
}
const VECT_INT_VECT &rings = ringInfo->atomRings();
// std::cerr<<" rings: "<<rings.size()<<std::endl;
// now find the shortest paths between each ring system and mark the atoms
// along each as being keepers:
for (auto ringsItI = rings.begin(); ringsItI != rings.end(); ++ringsItI) {
for (auto ringsItJ = ringsItI + 1; ringsItJ != rings.end(); ++ringsItJ) {
int atomI = (*ringsItI)[0];
int atomJ = (*ringsItJ)[0];
// std::cerr<<atomI<<" -> "<<atomJ<<": ";
while (atomI != atomJ) {
keepAtoms[atomI] = 1;
atomI = pathMat[atomJ * nAtoms + atomI];
// test for the disconnected case:
if (atomI < 0) {
break;
}
// std::cerr<<atomI<<" ";
}
// std::cerr<<std::endl;
}
}
boost::dynamic_bitset<> removedAtoms(nAtoms);
res->beginBatchEdit();
for (unsigned int i = 0; i < nAtoms; ++i) {
if (!keepAtoms[i]) {
Atom *atom = res->getAtomWithIdx(i);
bool removeIt = true;
// check if the atom has a neighboring keeper:
for (auto nbr : res->atomNeighbors(atom)) {
if (keepAtoms[nbr->getIdx()]) {
if (res->getBondBetweenAtoms(atom->getIdx(), nbr->getIdx())
->getBondType() == Bond::DOUBLE) {
removeIt = false;
break;
} else if (nbr->getIsAromatic() && nbr->getAtomicNum() != 6) {
// fix aromatic heteroatoms:
nbr->setNumExplicitHs(1);
} else if (nbr->getIsAromatic() && nbr->getAtomicNum() == 6 &&
nbr->getFormalCharge() == 1) {
// fix aromatic carbocations
nbr->setNumExplicitHs(1);
} else if (nbr->getNoImplicit() ||
nbr->getChiralTag() != Atom::CHI_UNSPECIFIED) {
nbr->setNoImplicit(false);
nbr->setNumExplicitHs(0);
nbr->setChiralTag(Atom::CHI_UNSPECIFIED);
}
}
}
if (removeIt) {
res->removeAtom(atom);
removedAtoms.set(atom->getIdx());
}
}
}
res->commitBatchEdit();
updateSubMolConfs(mol, *res, removedAtoms);
res->clearComputedProps();
return (ROMol *)res;
}
ROMol *combineMols(const ROMol &mol1, const ROMol &mol2,
RDGeom::Point3D offset) {
auto *res = new RWMol(mol1);
int nAtoms1 = res->getNumAtoms();
res->insertMol(mol2);
// copy over coordinates
if (mol1.getNumConformers() && mol2.getNumConformers()) {
if (mol1.getNumConformers() != mol2.getNumConformers()) {
BOOST_LOG(rdWarningLog)
<< "combineMols: molecules have unequal numbers of conformers"
<< std::endl;
}
for (auto conf1It = res->beginConformers(); conf1It != res->endConformers();
++conf1It) {
Conformer *conf1 = (*conf1It).get();
try {
const Conformer *conf2 = &mol2.getConformer(conf1->getId());
for (unsigned int i = 0; i < mol2.getNumAtoms(); ++i) {
conf1->setAtomPos(i + nAtoms1, conf2->getAtomPos(i) + offset);
}
} catch (ConformerException &) {
BOOST_LOG(rdWarningLog) << "combineMols: conformer id "
<< conf1->getId() << " not found in mol2";
}
}
}
res->clearComputedProps(true);
return (ROMol *)res;
}
void addRecursiveQueries(
ROMol &mol, const std::map<std::string, ROMOL_SPTR> &queries,
const std::string &propName,
std::vector<std::pair<unsigned int, std::string>> *reactantLabels) {
std::string delim = ",";
boost::char_separator<char> sep(delim.c_str());
if (reactantLabels != nullptr) {
(*reactantLabels).resize(0);
}
ROMol::VERTEX_ITER atBegin, atEnd;
boost::tie(atBegin, atEnd) = mol.getVertices();
while (atBegin != atEnd) {
Atom *at = mol[*atBegin];
++atBegin;
if (!at->hasProp(propName)) {
continue;
}
std::string pval;
at->getProp(propName, pval);
std::string maybeSmarts =
pval; // keep unmodified in case we are a smarts string
boost::algorithm::to_lower(pval);
if (reactantLabels != nullptr) {
std::pair<unsigned int, std::string> label(at->getIdx(), pval);
(*reactantLabels).push_back(label);
}
QueryAtom::QUERYATOM_QUERY *qToAdd = nullptr;
bool notFound = false;
if (pval.find(delim) != std::string::npos) {
boost::tokenizer<boost::char_separator<char>> tokens(pval, sep);
boost::tokenizer<boost::char_separator<char>>::iterator token;
qToAdd = new ATOM_OR_QUERY();
qToAdd->setDescription("AtomOr");
for (token = tokens.begin(); token != tokens.end(); ++token) {
auto iter = queries.find(*token);
if (iter == queries.end()) {
delete qToAdd;
notFound = true;
break;
}
auto *tqp = new RecursiveStructureQuery(new ROMol(*(iter->second)));
std::shared_ptr<RecursiveStructureQuery> nq(tqp);
qToAdd->addChild(nq);
}
} else {
auto iter = queries.find(pval);
if (iter == queries.end()) {
notFound = true;
} else {
qToAdd = new RecursiveStructureQuery(new ROMol(*(iter->second)));
}
}
if (notFound) {
// See if we are actually a smarts expression already
RWMol *m = nullptr;
try {
m = SmartsToMol(maybeSmarts);
if (!m) {
throw KeyErrorException(pval);
}
qToAdd = new RecursiveStructureQuery(m);
} catch (...) {
throw KeyErrorException(pval);
}
}
if (!at->hasQuery()) {
QueryAtom qAt(*at);
unsigned int idx = at->getIdx();
static_cast<RWMol &>(mol).replaceAtom(idx, &qAt);
at = mol.getAtomWithIdx(idx);
}
at->expandQuery(qToAdd, Queries::COMPOSITE_AND);
}
}
void parseQueryDefFile(std::istream *inStream,
std::map<std::string, ROMOL_SPTR> &queryDefs,
bool standardize, const std::string &delimiter,
const std::string &comment, unsigned int nameColumn,
unsigned int smartsColumn) {
PRECONDITION(inStream, "no stream");
queryDefs.clear();
boost::char_separator<char> sep(delimiter.c_str());
unsigned int line = 0;
std::string tempStr;
while (!inStream->eof() && !inStream->fail()) {
line++;
tempStr = getLine(inStream);
if (tempStr == "" || tempStr.find(comment) == 0) {
continue;
}
boost::tokenizer<boost::char_separator<char>> tokens(tempStr, sep);
unsigned int tpos;
boost::tokenizer<boost::char_separator<char>>::iterator token;
std::string qname = "";
std::string qval = "";
for (token = tokens.begin(), tpos = 0; token != tokens.end();
++token, ++tpos) {
if (tpos == nameColumn) {
qname = *token;
} else if (tpos == smartsColumn) {
qval = *token;
}
}
boost::trim_if(qname, boost::is_any_of(" \t"));
boost::trim_if(qval, boost::is_any_of(" \t"));
if (qname == "" || qval == "") {
continue;
}
RWMol *m = nullptr;
try {
m = SmartsToMol(qval);
} catch (...) {
m = nullptr;
}
if (!m) {
BOOST_LOG(rdWarningLog) << "cannot convert SMARTS " << qval
<< " to molecule at line " << line << std::endl;
continue;
}
ROMOL_SPTR msptr(m);
if (standardize) {
boost::algorithm::to_lower(qname);
}
queryDefs[qname] = msptr;
}
}
void parseQueryDefFile(const std::string &filename,
std::map<std::string, ROMOL_SPTR> &queryDefs,
bool standardize, const std::string &delimiter,
const std::string &comment, unsigned int nameColumn,
unsigned int smartsColumn) {
std::ifstream inStream(filename.c_str());
if (!inStream || (inStream.bad())) {
std::ostringstream errout;
errout << "Bad input file " << filename;
throw BadFileException(errout.str());
}
parseQueryDefFile(&inStream, queryDefs, standardize, delimiter, comment,
nameColumn, smartsColumn);
}
void parseQueryDefText(const std::string &queryDefText,
std::map<std::string, ROMOL_SPTR> &queryDefs,
bool standardize, const std::string &delimiter,
const std::string &comment, unsigned int nameColumn,
unsigned int smartsColumn) {
std::stringstream inStream(queryDefText);
parseQueryDefFile(&inStream, queryDefs, standardize, delimiter, comment,
nameColumn, smartsColumn);
}
} // end of namespace RDKit