mirror of
https://github.com/rdkit/rdkit.git
synced 2026-06-05 22:04:27 +08:00
o rdkit gains a RDKit::common_properties namespace that contains common string value properties o Dict.h and below gain getPropIfPresent that attempts to retrieve a property and returns true/false on success or failure. This is used to optimize access. o rdkit learns how to pass property keys by reference, not value. A new namespace has been added to RDKit, common_properties that contains the std::string values for commonly used properties. This helps to avoid typos in string values but also avoids a creation of std::strings from character values. All accessors (has/get/clear and getPropIfPresent) now pass the key by reference. Additionally, getPropIfPresent removes the double lookup of hasProp/getProp which can be a significant speedup in the smiles and smarts parsers (10-20%)
495 lines
19 KiB
C++
495 lines
19 KiB
C++
// $Id$
|
|
//
|
|
// Copyright (C) 2001-2010 Greg Landrum and Rational Discovery LLC
|
|
//
|
|
// @@ All Rights Reserved @@
|
|
// This file is part of the RDKit.
|
|
// The contents are covered by the terms of the BSD license
|
|
// which is included in the file license.txt, found at the root
|
|
// of the RDKit source tree.
|
|
//
|
|
#include <GraphMol/RDKitBase.h>
|
|
#include <GraphMol/RDKitQueries.h>
|
|
#include "SmilesParse.h"
|
|
#include "SmilesParseOps.h"
|
|
#include <list>
|
|
#include <algorithm>
|
|
#include <boost/dynamic_bitset.hpp>
|
|
#include <boost/foreach.hpp>
|
|
#include <RDGeneral/RDLog.h>
|
|
|
|
namespace SmilesParseOps{
|
|
using namespace RDKit;
|
|
|
|
void ReportParseError(const char *message,bool throwIt){
|
|
if(!throwIt) BOOST_LOG(rdErrorLog) <<"SMILES Parse Error: "<< message << std::endl;
|
|
else throw SmilesParseException(message);
|
|
}
|
|
|
|
// Releases the partial bonds left behind by a failed parse.
//
// Every Bond pointer reachable through the molecule's bond bookmarks is
// deleted. The bookmark entries themselves are not removed here.
void CleanupAfterParseError(RWMol *mol){
  PRECONDITION(mol,"no molecule");
  RWMol::BOND_BOOKMARK_MAP *bookmarks = mol->getBondBookmarks();
  for(RWMol::BOND_BOOKMARK_MAP::iterator entry=bookmarks->begin();
      entry!=bookmarks->end();++entry){
    // blow out any partial bonds stored under this bookmark:
    BOOST_FOREACH(Bond *partialBond,entry->second){
      delete partialBond;
    }
  }
}
|
|
|
|
//
|
|
// set bondOrder to Bond::IONIC to skip the formation of a bond
|
|
// between the fragment and the molecule
|
|
//
|
|
void AddFragToMol(RWMol *mol,RWMol *frag,Bond::BondType bondOrder,
|
|
Bond::BondDir bondDir,bool closeRings,bool doingQuery ){
|
|
PRECONDITION(mol,"no molecule");
|
|
PRECONDITION(frag,"no fragment");
|
|
PRECONDITION(mol->getActiveAtom(),"no active atom");
|
|
Atom *lastAt = mol->getActiveAtom();
|
|
int nOrigAtoms = mol->getNumAtoms();
|
|
int nOrigBonds = mol->getNumBonds();
|
|
|
|
//
|
|
// close any rings we can in the fragment
|
|
//
|
|
if(closeRings){
|
|
CloseMolRings(frag,true);
|
|
}
|
|
|
|
//
|
|
// Add the fragment's atoms and bonds to the molecule:
|
|
//
|
|
mol->insertMol(*frag);
|
|
|
|
//
|
|
// update ring-closure order information on the added atoms:
|
|
//
|
|
for(RWMol::AtomIterator atomIt=frag->beginAtoms();
|
|
atomIt!=frag->endAtoms();atomIt++){
|
|
INT_VECT tmpVect;
|
|
if((*atomIt)->getPropIfPresent(common_properties::_RingClosures, tmpVect)){
|
|
BOOST_FOREACH(int &v, tmpVect){
|
|
// if the ring closure is not already a bond, don't touch it:
|
|
if(v>=0) v += nOrigBonds;
|
|
}
|
|
Atom *newAtom = mol->getAtomWithIdx(nOrigAtoms+(*atomIt)->getIdx());
|
|
newAtom->setProp(common_properties::_RingClosures,tmpVect);
|
|
}
|
|
}
|
|
|
|
|
|
//
|
|
// ses up the bond between the mol and the branch
|
|
//
|
|
if( bondOrder != Bond::IONIC ){
|
|
// FIX: this is not so much with the elegance...
|
|
Atom *firstAt = mol->getAtomWithIdx(nOrigAtoms);
|
|
Bond::BondType bo;
|
|
int atomIdx1,atomIdx2;
|
|
atomIdx1 = firstAt->getIdx();
|
|
atomIdx2 = lastAt->getIdx();
|
|
if(frag->hasBondBookmark(ci_LEADING_BOND)){
|
|
//std::cout << "found it" << std::endl;
|
|
const ROMol::BOND_PTR_LIST &leadingBonds=frag->getAllBondsWithBookmark(ci_LEADING_BOND);
|
|
BOOST_FOREACH(Bond *leadingBond,leadingBonds){
|
|
// we've already got a bond, so just set its local info
|
|
// and then add it to the molecule intact (no sense doing
|
|
// any extra work).
|
|
leadingBond->setOwningMol(mol);
|
|
leadingBond->setEndAtomIdx(leadingBond->getBeginAtomIdx()+nOrigAtoms);
|
|
leadingBond->setBeginAtomIdx(atomIdx2);
|
|
mol->addBond(leadingBond,true);
|
|
}
|
|
mol->clearBondBookmark(ci_LEADING_BOND);
|
|
} else {
|
|
if(!doingQuery){
|
|
if(bondOrder == Bond::UNSPECIFIED){
|
|
// no bond order provided, figure it out ourselves
|
|
if(lastAt->getIsAromatic() && firstAt->getIsAromatic() ){
|
|
bo = Bond::AROMATIC;
|
|
}
|
|
else{
|
|
bo = Bond::SINGLE;
|
|
}
|
|
}
|
|
else{
|
|
bo = bondOrder;
|
|
}
|
|
if(bo==Bond::DATIVEL){
|
|
int tmp=atomIdx2;
|
|
atomIdx2 = atomIdx1;
|
|
atomIdx1 = tmp;
|
|
bo = Bond::DATIVE;
|
|
} else if(bo == Bond::DATIVER){
|
|
bo = Bond::DATIVE;
|
|
}
|
|
int idx = mol->addBond(atomIdx2,atomIdx1,bo)-1;
|
|
mol->getBondWithIdx(idx)->setBondDir(bondDir);
|
|
} else {
|
|
// semantics are different in SMARTS, unspecified bonds can be single or aromatic:
|
|
if(bondOrder == Bond::UNSPECIFIED){
|
|
QueryBond *newB=new QueryBond(Bond::SINGLE);
|
|
newB->expandQuery(makeBondOrderEqualsQuery(Bond::AROMATIC),
|
|
Queries::COMPOSITE_OR,
|
|
true);
|
|
newB->setOwningMol(mol);
|
|
newB->setBeginAtomIdx(atomIdx1);
|
|
newB->setEndAtomIdx(atomIdx2);
|
|
mol->addBond(newB);
|
|
delete newB;
|
|
}
|
|
else{
|
|
bo=bondOrder;
|
|
if(bo==Bond::DATIVEL){
|
|
int tmp=atomIdx2;
|
|
atomIdx2 = atomIdx1;
|
|
atomIdx1 = tmp;
|
|
bo = Bond::DATIVE;
|
|
} else if(bo == Bond::DATIVER){
|
|
bo = Bond::DATIVE;
|
|
}
|
|
int idx = mol->addBond(atomIdx2,atomIdx1,bo)-1;
|
|
mol->getBondWithIdx(idx)->setBondDir(bondDir);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
//
|
|
// okay, the next thing we have to worry about is the possibility
|
|
// that there might be ring opening/closing in the fragment we just
|
|
// dealt with e.g. for things like C1C(C1) and C1C.C1
|
|
// We deal with this by copying in the bookmarks and partial bonds
|
|
// that exist in the fragment
|
|
//
|
|
RWMol::ATOM_BOOKMARK_MAP::iterator atIt;
|
|
for(atIt=frag->getAtomBookmarks()->begin();
|
|
atIt!=frag->getAtomBookmarks()->end();
|
|
++atIt){
|
|
// don't bother even considering bookmarks outside
|
|
// the range used for loops
|
|
if(atIt->first < 100 && atIt->first > 0){
|
|
RWMol::ATOM_PTR_LIST::iterator otherAt;
|
|
for(otherAt=atIt->second.begin();
|
|
otherAt!=atIt->second.end();otherAt++){
|
|
Atom *at2 = *otherAt;
|
|
int newIdx = at2->getIdx()+nOrigAtoms;
|
|
mol->setAtomBookmark(mol->getAtomWithIdx(newIdx),atIt->first);
|
|
//frag->clearAtomBookmark(atIt->first,at2);
|
|
while( frag->hasBondBookmark(atIt->first) ){
|
|
Bond *b = frag->getBondWithBookmark(atIt->first);
|
|
int atomIdx1 = b->getBeginAtomIdx()+nOrigAtoms;
|
|
b->setOwningMol(mol);
|
|
b->setBeginAtomIdx(atomIdx1);
|
|
mol->setBondBookmark(b,atIt->first);
|
|
frag->clearBondBookmark(atIt->first,b);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
frag->clearAllAtomBookmarks();
|
|
frag->clearAllBondBookmarks();
|
|
};
|
|
|
|
|
|
// Flips the chirality of `atom` when it carries exactly one explicit H;
// any other explicit-H count leaves the atom untouched.
//
// The caller is expected to pass a ring atom with a ring closure bond;
// that is assumed, not checked.
void _invChiralRingAtomWithHs(Atom *atom) {
  PRECONDITION(atom,"bad atom");
  if (atom->getNumExplicitHs() != 1) {
    return;
  }
  atom->invertChirality();
}
|
|
// (atom index, bond index) pair used while ordering the neighbors of an atom.
typedef std::pair<int,int> INT_PAIR;

// Orders INT_PAIRs by their first member only; the second member is ignored,
// so two pairs with equal first members compare as equivalent.
// NOTE(review): code inside namespace std (e.g. std::list::sort()) looks up
// operator< through argument-dependent lookup and may therefore pick
// std::pair's own lexicographic operator< instead of this one - confirm
// before relying on it there.
bool operator<(const INT_PAIR &lhs,const INT_PAIR &rhs){
  const int lhsKey = lhs.first;
  const int rhsKey = rhs.first;
  return lhsKey < rhsKey;
}
|
|
|
|
// Reconciles the chiral tag of every tetrahedral (CW/CCW) atom in `mol`
// with the order in which its bonds ended up in the molecule.
//
// For each such atom the SMILES order of its bonds is reconstructed, the
// number of swaps separating that order from the atom's actual bond order is
// obtained from Atom::getPerturbationOrder(), and the chirality is inverted
// when that number is odd. Atoms with any other chiral tag are skipped.
void AdjustAtomChiralityFlags(RWMol *mol){
  PRECONDITION(mol,"no molecule");
  for(RWMol::AtomIterator atomIt=mol->beginAtoms();
      atomIt != mol->endAtoms();
      ++atomIt){
    Atom *atom = *atomIt;
    const Atom::ChiralType chiralType=atom->getChiralTag();
    if(chiralType!=Atom::CHI_TETRAHEDRAL_CW &&
       chiralType!=Atom::CHI_TETRAHEDRAL_CCW ){
      continue;
    }
    //
    // The atom is marked as chiral, set the SMILES-order of the
    // atom's bonds.  This is easy for non-ring-closure bonds,
    // because the SMILES order is determined solely by the atom
    // indices.  Things are trickier for ring-closure bonds, which we
    // need to insert into the list in a particular order
    //
    const int selfIdx = static_cast<int>(atom->getIdx());

    // stays empty when the atom has no _RingClosures property
    INT_VECT ringClosures;
    atom->getPropIfPresent(common_properties::_RingClosures, ringClosures);

#if 0
    std::cout << "CLOSURES: ";
    std::copy(ringClosures.begin(),ringClosures.end(),
              std::ostream_iterator<int>(std::cout," "));
    std::cout << std::endl;
#endif

    // (atom index, bond index) entries; the atom itself goes in with a
    // dummy bond index of -1 (we'll use this to find our place later):
    std::list<INT_PAIR> neighbors;
    neighbors.push_back(std::make_pair(atom->getIdx(),-1));

    RWMol::ADJ_ITER nbrIdx,endNbrs;
    boost::tie(nbrIdx,endNbrs) = mol->getAtomNeighbors(atom);
    for(;nbrIdx != endNbrs;++nbrIdx){
      Bond *nbrBond=mol->getBondBetweenAtoms(atom->getIdx(),*nbrIdx);
      const int nbrBondIdx = static_cast<int>(nbrBond->getIdx());
      // ring-closure bonds are handled separately below
      if(std::find(ringClosures.begin(),ringClosures.end(),nbrBondIdx)
         == ringClosures.end()){
        neighbors.push_back(std::make_pair(*nbrIdx,nbrBond->getIdx()));
      }
    }
    // sort the list of non-ring-closure bonds (plus the atom itself):
    neighbors.sort();

    // find the location of this atom.  it pretty much has to be
    // first in the list, e.g for smiles like [C@](F)(Cl)(Br)I, or
    // second (everything else).
    std::list<INT_PAIR>::iterator selfPos=neighbors.begin();
    if(selfPos->first != selfIdx){
      ++selfPos;
    }
    CHECK_INVARIANT(selfPos->first==selfIdx,"weird atom ordering");

    // copy over the bond ids:
    INT_LIST bondOrdering;
    for(std::list<INT_PAIR>::iterator neighborIt=neighbors.begin();
        neighborIt != neighbors.end(); ++neighborIt){
      if(neighborIt != selfPos){
        bondOrdering.push_back(neighborIt->second);
      } else {
        // we are not going to add the atom itself, but we will push on
        // ring closure bonds at this point (if required):
        bondOrdering.insert(bondOrdering.end(),
                            ringClosures.begin(),ringClosures.end());
      }
    }

    // ok, we now have the SMILES ordering of the bonds, figure out the
    // permutation order.
    //
    //  This whole thing is necessary because the ring-closure bonds
    //  in the SMILES come before the bonds to the other neighbors, but
    //  they come after the neighbors in the molecule we build.
    //  A crude example:
    //   in F[C@](Cl)(Br)I the C-Cl bond is index 1 in both SMILES
    //         and as built
    //   in F[C@]1(Br)I.Cl1 the C-Cl bond is index 1 in the SMILES
    //         and index 3 as built.
    //
    int nSwaps=atom->getPerturbationOrder(bondOrdering);
    // FIX: explain this one:
    if(atom->getDegree()==3 && atom->hasProp(common_properties::_SmilesStart)) ++nSwaps;
    if(nSwaps%2){
      atom->invertChirality();
    }
  }
}
|
|
|
|
// Chooses the bond type for a bond whose order was not given explicitly:
// Bond::AROMATIC when both atoms are flagged aromatic, Bond::SINGLE otherwise.
// `mol` is only checked for being non-null.
Bond::BondType GetUnspecifiedBondType(const RWMol *mol,const Atom *atom1,const Atom *atom2){
  PRECONDITION(mol,"no molecule");
  PRECONDITION(atom1,"no atom1");
  PRECONDITION(atom2,"no atom2");
  const bool bothAromatic = atom1->getIsAromatic() && atom2->getIsAromatic();
  return bothAromatic ? Bond::AROMATIC : Bond::SINGLE;
}
|
|
void CloseMolRings(RWMol *mol,bool toleratePartials){
|
|
// Here's what we want to do here:
|
|
// loop through the molecule's atom bookmarks
|
|
// for each bookmark:
|
|
// connect pairs of atoms sharing that bookmark
|
|
// left to right (in the order in which they were
|
|
// inserted into the molecule).
|
|
// whilst doing this, we have to be cognizant of the fact that
|
|
// there may well be partial bonds in the molecule which need
|
|
// to be tied in as well. WOO HOO! IT'S A BIG MESS!
|
|
PRECONDITION(mol,"no molecule");
|
|
RWMol::ATOM_BOOKMARK_MAP::iterator bookmarkIt;
|
|
bookmarkIt=mol->getAtomBookmarks()->begin();
|
|
while(bookmarkIt!=mol->getAtomBookmarks()->end()){
|
|
// don't bother even considering bookmarks outside
|
|
// the range used for loops
|
|
if(bookmarkIt->first < 100 && bookmarkIt->first >= 0){
|
|
RWMol::ATOM_PTR_LIST::iterator atomIt,atomsEnd;
|
|
RWMol::ATOM_PTR_LIST bookmarkedAtomsToRemove;
|
|
atomIt = bookmarkIt->second.begin();
|
|
atomsEnd = bookmarkIt->second.end();
|
|
while(atomIt != atomsEnd){
|
|
Atom *atom1 = *atomIt;
|
|
++atomIt;
|
|
if(!toleratePartials && atomIt==atomsEnd){
|
|
ReportParseError("unclosed ring");
|
|
} else if(atomIt!=atomsEnd && *atomIt==atom1){
|
|
// make sure we don't try to connect an atom to itself,
|
|
// this was bug 3145697:
|
|
++atomIt;
|
|
} else if(atomIt!=atomsEnd) {
|
|
// we actually found an atom, so connect it to the first
|
|
Atom *atom2 = *atomIt;
|
|
++atomIt;
|
|
|
|
int bondIdx=-1;
|
|
// We're guaranteed two partial bonds, one for each time
|
|
// the ring index was used. We give the first specification
|
|
// priority.
|
|
CHECK_INVARIANT(mol->hasBondBookmark(bookmarkIt->first),"Missing bond bookmark");
|
|
|
|
// now use the info from the partial bond:
|
|
// The partial bond itself will have a proper order and directionality
|
|
// (with a minor caveat documented below) and will have its beginning
|
|
// atom set already:
|
|
RWMol::BOND_PTR_LIST bonds=mol->getAllBondsWithBookmark(bookmarkIt->first);
|
|
RWMol::BOND_PTR_LIST::iterator bondIt=bonds.begin();
|
|
CHECK_INVARIANT(bonds.size()>=2,"Missing bond");
|
|
|
|
// get pointers to the two bonds:
|
|
Bond *bond1=*bondIt;
|
|
++bondIt;
|
|
Bond *bond2=*bondIt;
|
|
|
|
// remove those bonds from the bookmarks:
|
|
mol->clearBondBookmark(bookmarkIt->first,bond1);
|
|
mol->clearBondBookmark(bookmarkIt->first,bond2);
|
|
|
|
// Make sure the bonds have the correct starting atoms:
|
|
CHECK_INVARIANT(bond1->getBeginAtomIdx()==atom1->getIdx(),"bad begin atom");
|
|
CHECK_INVARIANT(bond2->getBeginAtomIdx()==atom2->getIdx(),"bad begin atom");
|
|
|
|
Bond *matchedBond;
|
|
|
|
// figure out which (if either) bond has a specified type, we'll
|
|
// keep that one. We also need to update the end atom index to match
|
|
// FIX: daylight barfs when you give it multiple specs for the closure
|
|
// bond, we'll just take the first one and ignore others
|
|
// NOTE: we used to do this the other way (take the last specification),
|
|
// but that turned out to be troublesome in odd cases like C1CC11CC1.
|
|
if(!bond1->hasProp(common_properties::_unspecifiedOrder)){
|
|
matchedBond = bond1;
|
|
matchedBond->setEndAtomIdx(atom2->getIdx());
|
|
delete bond2;
|
|
} else {
|
|
matchedBond = bond2;
|
|
matchedBond->setEndAtomIdx(atom1->getIdx());
|
|
delete bond1;
|
|
}
|
|
if(matchedBond->getBondType()==Bond::UNSPECIFIED){
|
|
Bond::BondType bondT=GetUnspecifiedBondType(mol,atom1,atom2);
|
|
matchedBond->setBondType(bondT);
|
|
}
|
|
matchedBond->setOwningMol(mol);
|
|
if(matchedBond->getBondType()==Bond::AROMATIC){
|
|
matchedBond->setIsAromatic(true);
|
|
}
|
|
|
|
#if 0
|
|
//
|
|
// In cases like this: Cl\C=C1.F/1, we need to
|
|
// reverse the directionality on the added bond
|
|
// (because the bond is added from C -> F, but the
|
|
// directionality is for F->C. We recognize these
|
|
// cases because the matched bond direction isn't
|
|
// the same as the added bond direction (i.e. atom1
|
|
// isn't the begin atom for the matched bond).
|
|
//
|
|
// This was Issue 175
|
|
if(atom1->getIdx()!=matchedBond->getBeginAtomIdx()){
|
|
switch(matchedBond->getBondDir()){
|
|
case Bond::ENDUPRIGHT:
|
|
matchedBond->setBondDir(Bond::ENDDOWNRIGHT);
|
|
break;
|
|
case Bond::ENDDOWNRIGHT:
|
|
matchedBond->setBondDir(Bond::ENDUPRIGHT);
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
}
|
|
#endif
|
|
// add the bond:
|
|
bondIdx=mol->addBond(matchedBond,true);
|
|
|
|
// we found a bond, so update the atom's _RingClosures
|
|
// property:
|
|
if(bondIdx>-1){
|
|
CHECK_INVARIANT(atom1->hasProp(common_properties::_RingClosures) &&
|
|
atom2->hasProp(common_properties::_RingClosures),
|
|
"somehow atom doesn't have _RingClosures property.");
|
|
INT_VECT closures;
|
|
atom1->getProp(common_properties::_RingClosures,closures);
|
|
INT_VECT::iterator closurePos= std::find(closures.begin(),
|
|
closures.end(),
|
|
-(bookmarkIt->first+1));
|
|
CHECK_INVARIANT(closurePos!=closures.end(),
|
|
"could not find bookmark in atom _RingClosures");
|
|
*closurePos = bondIdx-1;
|
|
atom1->setProp(common_properties::_RingClosures,closures);
|
|
|
|
atom2->getProp(common_properties::_RingClosures,closures);
|
|
closurePos= std::find(closures.begin(),
|
|
closures.end(),
|
|
-(bookmarkIt->first+1));
|
|
CHECK_INVARIANT(closurePos!=closures.end(),
|
|
"could not find bookmark in atom _RingClosures");
|
|
*closurePos = bondIdx-1;
|
|
atom2->setProp(common_properties::_RingClosures,closures);
|
|
}
|
|
bookmarkedAtomsToRemove.push_back(atom1);
|
|
bookmarkedAtomsToRemove.push_back(atom2);
|
|
}
|
|
}
|
|
//
|
|
// increment the bookmark before calling erase. Otherwise we
|
|
// get a seg fault under MSVC++
|
|
//
|
|
int mark=bookmarkIt->first;
|
|
bookmarkIt++;
|
|
RWMol::ATOM_PTR_LIST::const_iterator aplci;
|
|
BOOST_FOREACH(Atom *atom,bookmarkedAtomsToRemove){
|
|
mol->clearAtomBookmark(mark,atom);
|
|
}
|
|
} else {
|
|
++bookmarkIt;
|
|
}
|
|
}
|
|
};
|
|
|
|
// Strips the parser's bookkeeping properties from a finished molecule:
// _RingClosures and _SmilesStart from every atom, _unspecifiedOrder from
// every bond. A property is only cleared when it is actually present.
void CleanupAfterParsing(RWMol *mol){
  PRECONDITION(mol,"no molecule");
  for(RWMol::AtomIterator atomIt=mol->beginAtoms();
      atomIt!=mol->endAtoms();++atomIt){
    if((*atomIt)->hasProp(common_properties::_RingClosures)){
      (*atomIt)->clearProp(common_properties::_RingClosures);
    }
    if((*atomIt)->hasProp(common_properties::_SmilesStart)){
      (*atomIt)->clearProp(common_properties::_SmilesStart);
    }
  }
  for(RWMol::BondIterator bondIt=mol->beginBonds();
      bondIt!=mol->endBonds();++bondIt){
    if(!(*bondIt)->hasProp(common_properties::_unspecifiedOrder)){
      continue;
    }
    (*bondIt)->clearProp(common_properties::_unspecifiedOrder);
  }
}
|
|
|
|
|
|
} // end of namespace SmilesParseOps
|