mirror of
https://github.com/rdkit/rdkit.git
synced 2026-06-05 22:04:27 +08:00
937 lines
37 KiB
C++
Executable File
937 lines
37 KiB
C++
Executable File
// $Id: Mol2FileParser.cpp 1457 2009-04-03 09:05:17Z landrgr1 $
|
|
//
|
|
// Copyright (c) 2008, Novartis Institutes for BioMedical Research Inc.
|
|
// All rights reserved.
|
|
//
|
|
// Redistribution and use in source and binary forms, with or without
|
|
// modification, are permitted provided that the following conditions are
|
|
// met:
|
|
//
|
|
// * Redistributions of source code must retain the above copyright
|
|
// notice, this list of conditions and the following disclaimer.
|
|
// * Redistributions in binary form must reproduce the above
|
|
// copyright notice, this list of conditions and the following
|
|
// disclaimer in the documentation and/or other materials provided
|
|
// with the distribution.
|
|
// * Neither the name of Novartis Institutes for BioMedical Research Inc.
|
|
// nor the names of its contributors may be used to endorse or promote
|
|
// products derived from this software without specific prior
|
|
// written permission.
|
|
//
|
|
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
|
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
|
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
|
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
|
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
|
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
|
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
|
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
|
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
//
|
|
// created by Nik Stiefl May 2008
|
|
// this file is heavily based on glandrum's MolFileParser
|
|
// currently no stereochemistry is detected and read in from
|
|
// string is not supported
|
|
//
|
|
//
|
|
|
|
#include "FileParsers.h"
|
|
#include "MolFileStereochem.h"
|
|
#include <RDGeneral/Invariant.h>
|
|
#include <GraphMol/RDKitQueries.h>
|
|
#include <RDGeneral/StreamOps.h>
|
|
#include <RDGeneral/RDLog.h>
|
|
//
|
|
#include <fstream>
|
|
#include <boost/tokenizer.hpp>
|
|
#include <boost/lexical_cast.hpp>
|
|
#include <boost/algorithm/string/trim.hpp>
|
|
#include <boost/dynamic_bitset.hpp>
|
|
#include <RDGeneral/FileParseException.h>
|
|
#include <RDGeneral/BadFileException.h>
|
|
|
|
|
|
namespace RDKit{
|
|
|
|
namespace {
|
|
// Repairs a nitro group that was written with two N=O double bonds.
// The neighbors of atom atIdx are scanned; when exactly two of them are
// oxygens attached through a double bond, the bond to the last such oxygen
// visited becomes a single bond, atom atIdx gets a formal charge of +1 and
// that oxygen a formal charge of -1. In every other case nothing is changed.
void fixNitroSubstructureAndCharge(RWMol *res, unsigned int atIdx){
  unsigned int dblBondedOxygens=0;
  unsigned int lastOxygenIdx=0;
  ROMol::ADJ_ITER nbrIt,nbrEnd;
  for(boost::tie(nbrIt,nbrEnd)=res->getAtomNeighbors(res->getAtomWithIdx(atIdx));
      nbrIt!=nbrEnd;++nbrIt){
    Bond *nbrBond=res->getBondBetweenAtoms(atIdx,*nbrIt);
    const bool isOxygen=(res->getAtomWithIdx(*nbrIt)->getAtomicNum()==8);
    if(isOxygen && nbrBond->getBondType()==Bond::DOUBLE){
      ++dblBondedOxygens;
      lastOxygenIdx=*nbrIt;
    }
  }

  //only the N(=O)(=O) pattern is rewritten
  if(dblBondedOxygens!=2){
    return;
  }
  res->getBondBetweenAtoms(atIdx,lastOxygenIdx)->setBondType(Bond::SINGLE);
  res->getAtomWithIdx(atIdx)->setFormalCharge(1);
  res->getAtomWithIdx(lastOxygenIdx)->setFormalCharge(-1);
}
|
|
|
|
void readFormalChargesFromAttr(std::istream *inStream, RWMol *res){
|
|
PRECONDITION(inStream,"inStream not valid");
|
|
PRECONDITION(!inStream->eof(),"inStream is at eof");
|
|
PRECONDITION(res,"RWMol not valid");
|
|
typedef boost::tokenizer<boost::char_separator<char> > tokenizer;
|
|
boost::char_separator<char> sep(" \t\n");
|
|
bool readNextAtomAttribs=true;
|
|
unsigned int atomIdx=0, noAtomAttr=0;
|
|
|
|
//std::streampos stPos = inStream->tellg();
|
|
std::string tempStr = getLine(inStream);
|
|
//there needs to be at least one entry
|
|
if (inStream->eof()){
|
|
throw FileParseException("premature EOF in readFormalCharges");
|
|
}
|
|
while (readNextAtomAttribs){
|
|
tokenizer tokens(tempStr,sep);
|
|
tokenizer::const_iterator itemIt = tokens.begin();
|
|
try {
|
|
atomIdx=boost::lexical_cast<unsigned int>(*itemIt);
|
|
++itemIt;
|
|
noAtomAttr=boost::lexical_cast<unsigned int>(*itemIt);
|
|
}catch (boost::bad_lexical_cast &) {
|
|
throw FileParseException("Cannot process mol2 UnityAtomAttr.");
|
|
}
|
|
for(unsigned int i=0;i<noAtomAttr;++i){
|
|
std::string tempStr = getLine(inStream);
|
|
if (inStream->eof()){
|
|
throw FileParseException("premature EOF in readFormalCharges");
|
|
}
|
|
tokenizer atmTokens(tempStr,sep);
|
|
itemIt = atmTokens.begin();
|
|
if (*itemIt == "AtomExpr"){
|
|
int formCharge=0;
|
|
++itemIt;
|
|
//FIX:what if an atom has multiple properties? Seems like they might be separated
|
|
//with ";" ... need to look at that in more detail!
|
|
if ((*itemIt).find("=")==std::string::npos){
|
|
try {
|
|
formCharge = boost::lexical_cast<int>(*itemIt);
|
|
}catch (boost::bad_lexical_cast &) {
|
|
readNextAtomAttribs=false;
|
|
throw FileParseException("Cannot process mol2 formal charge.");
|
|
}
|
|
//assign the charge
|
|
res->getAtomWithIdx(atomIdx-1)->setFormalCharge(formCharge);
|
|
}
|
|
}
|
|
}//endfor
|
|
//check if we have finished reading UNITY_ATOM_ATTR
|
|
if (inStream->eof()){
|
|
readNextAtomAttribs=false;
|
|
}else{
|
|
tempStr = getLine(inStream);
|
|
if (tempStr == "" || tempStr[0] == '@' || tempStr[0] == '#'){
|
|
readNextAtomAttribs=false;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Guesses formal charges from explicit valences, because the mol2 ATOM/BOND
// records carry no formal charge information.
//
// Only atoms that are currently uncharged, are not carbon and are not query
// atoms are considered. For each of them the explicit valence is summed from
// the bond valence contributions and compared with the element's list of
// allowed valences; the difference becomes the formal charge.
//
// Ring perception (MolOps::findSSSR) is run at most once, on the first
// candidate atom: nothing in this function adds or removes atoms or bonds.
void guessFormalCharges(RWMol *res){
  //FIX: this whole thing has problems with positively charged pyridines et al.
  bool ringsPerceived=false;
  for(RWMol::AtomIterator atomIt=res->beginAtoms();atomIt!=res->endAtoms();++atomIt) {
    Atom *at = (*atomIt);
    //assign only if no formal charge set on that atom and atom is not carbon (the latter
    //might needs changing later on - let's see) and not for query atoms (dummy etc.)
    //since this happens only during cleanup of bad substructures
    //FIX: check nitro compounds et al.
    if(at->getFormalCharge()!=0 || at->getSymbol()=="C" || at->hasQuery()){
      continue;
    }

    //have to calculate the explicit valence without call to calcExplicitValence since
    //this will barf when I have e.g. an uncharged 4 valent nitrogen ...
    int noAromBonds=0;
    double accum=0; //FIX: could this give non int values ?
    ROMol::OEDGE_ITER beg,end;
    boost::tie(beg,end) = res->getAtomBonds(at);
    while(beg!=end){
      accum += (*res)[*beg]->getValenceContrib(at);
      if ((*res)[*beg]->getBondType() == Bond::AROMATIC){
        ++noAromBonds;
      }
      ++beg;
    }
    //NOTE: an earlier special case skipped carbon atoms with more than two aromatic
    //bonds (aromatic bridge atoms, e.g. naphthalenes). Carbon never reaches this
    //point because of the filter above, so that test was removed.
    //FIX: a structure such as c12cccc[n+]2cccn1 will not work if we just continue
    //for noAromBonds>2

    //dbtranslate problems - for mols with no UNITY_ATOM_ATTR set - we won't be able to guess
    //if this mol needs to be guessed or not ... hence, we don't guess on atoms with
    //ar specification and not atom type X.ar and in 5-membered ring (there is stuff like
    //c1cc[o+]cc1 that should be charged
    //e.g. 5ring with N.pl3 as NH atom or other atoms without ar specification in aromatic ring
    //FIX: do we need make sure this only happens for atoms in ring?
    std::string tATT;
    at->getProp("_TriposAtomType",tATT);
    if(!ringsPerceived){
      MolOps::findSSSR(*res);
      ringsPerceived=true;
    }
    if (tATT.find("ar")==std::string::npos && at->getIsAromatic() &&
        res->getRingInfo()->isAtomInRingOfSize(at->getIdx(),5)){
      continue;
    }

    //for dbtranslate a problem will also occur for N.ar with 3 aromatic bonds and no charge
    //assigned for these we don't assign charges (corina will create kekulized input for this
    //(at least in most cases) - anyway, throw a warning!
    if (noAromBonds==3 && tATT=="N.ar"){
      std::string nm;
      res->getProp("_Name",nm);
      BOOST_LOG(rdWarningLog)<<nm<<": warning - aromatic N with 3 aromatic bonds - "
        "skipping charge guess for this atom"<<std::endl;
      continue;
    }

    //sometimes things like benzimidazoles can have only one bond of the
    //imidazole ring as aromatic and the other one as a single bond ...
    //catch that this way - see also the trick from GL
    int expVal = static_cast<int>(round(accum+0.1));
    const INT_VECT &valens = PeriodicTable::getTable()->getValenceList(at->getAtomicNum());
    INT_VECT_CI vi;

    //check default valence and compare to expVal - chg
    //the hypothesis is that we prefer positively charged atoms over negatively charged ones
    //for multi default valence atoms (e.g. CS(O)(O) should end up being C[S+]([O-])[O-] rather
    //than C[S-][O-][O-] but that might change based no different examples
    int assignChg=expVal - (*valens.begin());
    if (assignChg>0){
      for (vi = valens.begin(); vi != valens.end(); ++vi) {
        //Since we do this only for nocharged atoms we can get away without including the
        //charge into this
        //apart from that we do not assign charges higher than +/- 1 for atoms with multiple valence states
        //otherwise the early break would have to go away which in turn would result in things like [S+4] for
        //sulfonamides
        assignChg = expVal - (*vi);
        if ((*vi)<=expVal && abs(assignChg)<2){
          break;
        }
      }
    }
    if (assignChg){
      //no aromatic atom will get aa abs(charge) > 1
      if(at->getIsAromatic() && abs(assignChg)>1){
        at->setFormalCharge((assignChg>0)-(assignChg<0));//this results in -1 or +1
      }else{
        at->setFormalCharge(assignChg);
      }
      //corina will create strange nitro groups which look like N(=O)(=O) which in turn will result in
      //[N+2](=O)(=O) this needs to be fixed at this stage since otherwise we would have to check on all
      //N.pl3 during the cleanup substructures step
      if(assignChg==2 && expVal==5 && at->getSymbol()=="N"){
        fixNitroSubstructureAndCharge(res,at->getIdx());
      }
      //what if expVal != at->calcExplicitValence();?
      //cannot imagine a case where that will happen now.
    }
  }
}
|
|
|
|
// Counts the hydrogen neighbors of the atom referenced by atIdxIt and returns
// that count. As a side effect, when one of the neighbors is an oxygen of
// degree 1 (an N-oxide like arrangement) toModIdx is set to the index of the
// inspected atom; otherwise toModIdx is left untouched.
unsigned int chkNoHNeighbNOx(RWMol *res, ROMol::ADJ_ITER atIdxIt, int &toModIdx){
  Atom *center = res->getAtomWithIdx(*atIdxIt);
  unsigned int hydrogenCount=0;
  ROMol::ADJ_ITER nbrIt,nbrEnd;
  for(boost::tie(nbrIt,nbrEnd) = res->getAtomNeighbors(center);
      nbrIt!=nbrEnd;++nbrIt){
    Atom *nbr = res->getAtomWithIdx(*nbrIt);
    if(nbr->getAtomicNum()==1){
      ++hydrogenCount;
      continue;
    }
    //terminal oxygen: the inspected atom is part of an N-oxide constellation
    //we can rely on the degree since mol2 have explicit hydrogens
    if(nbr->getAtomicNum()==8 && res->getAtomDegree(nbr)==1){
      toModIdx=*atIdxIt;
    }
  }
  return hydrogenCount;
}
|
|
|
|
// Rewrites Tripos-specific charged substructures into explicit formal charges
// and localized bonds, driven by the "_TriposAtomType" atom property:
//   N.4   - formal charge +1
//   O.co2 - the first O.co2 seen on a C.2 / S.o2 neighbor becomes a single
//           bonded O with charge -1, later ones become double bonded, uncharged
//   C.cat - amidinium (2 N neighbors) and guanidinium (3 N neighbors): all C-N
//           bonds are made single, then one N is selected to carry the double
//           bond and a +1 charge
//
// Returns false (after logging a warning) when a substructure cannot be
// handled: O.co2 with degree != 1 or with an unexpected neighbor type, C.cat
// with fewer than 2 or more than 3 N neighbors, or a 3-N C.cat whose
// neighbors were all fixed already so that no atom can take the charge.
bool cleanUpMol2Substructures(RWMol *res){
  //NOTE: check the nitro fix in guess formal charges!
  boost::dynamic_bitset<> isFixed(res->getNumAtoms());
  for(ROMol::AtomIterator atIt=res->beginAtoms();atIt!=res->endAtoms();++atIt){
    std::string tAT;
    Atom *at = *atIt;
    unsigned int idx=at->getIdx();
    at->getProp("_TriposAtomType",tAT);

    if (tAT=="N.4"){
      at->setFormalCharge(1);
    } else if(tAT=="O.co2"){
      //negatively charged carboxylates with O.co2
      //according to Tripos, those should only appear in carboxylates and phosphates,
      //FIX: do it also for phsopahtes and sulphates ...
      if (at->getDegree()!=1){
        BOOST_LOG(rdWarningLog)<<"Warning - O.co2 with degree >1."<<std::endl;
        return false;
      }
      ROMol::ADJ_ITER nbrIdxIt,endNbrsIdxIt;
      //getAtomNeighbours returns 2 adjacency iterators (index iterators)
      boost::tie(nbrIdxIt,endNbrsIdxIt) = res->getAtomNeighbors(at);
      //this should return only the C.2
      Atom *nbr=res->getAtomWithIdx(*nbrIdxIt);
      std::string tATT;
      nbr->getProp("_TriposAtomType",tATT);
      //carboxylates
      if (tATT=="C.2" || tATT=="S.o2" ){
        //this should return only the bond between C.2 and O.co2
        Bond *b = res->getBondBetweenAtoms(idx,*nbrIdxIt);
        if(!isFixed[*nbrIdxIt]){
          //the first occurence is negatively charged and has a single bond
          b->setBondType(Bond::SINGLE);
          b->setIsAromatic(false);
          at->setFormalCharge(-1);
          at->setIsAromatic(false);
          nbr->setIsAromatic(false);
          isFixed[idx]=1;
          isFixed[*nbrIdxIt]=1;
        } else{
          //the other occurences are not charged and have a double bond
          b->setBondType(Bond::DOUBLE);
          b->setIsAromatic(false);
          at->setIsAromatic(false);
          isFixed[idx]=1;
        }
      }else{
        std::string nm;
        res->getProp("_Name",nm);
        BOOST_LOG(rdWarningLog)<<nm<<": warning - O.co2 with non C.2 or S.o2 neighbor."<<std::endl;
        return false;
      }
    } else if(tAT=="C.cat"){
      //positively charged guanidinium groups with C.cat
      //according to Tripos these should only appear in guanidinium groups
      //for the structural fix - the last nitrogen with the least number of
      //heavy atoms will get the double bond and the positive charge.
      //remember : this is not canonical!
      //first - set the C.cat as fixed
      isFixed[idx]=1;
      ROMol::ADJ_ITER nbrIdxIt,endNbrsIdxIt,tmpIdxIt;
      unsigned int lowestDeg=100;
      boost::tie(nbrIdxIt,endNbrsIdxIt) = res->getAtomNeighbors(at);
      //one problem of programs like Corina is, that they will create also C.cat
      //for groups that are not guanidinium. We cannot fix all, but the charged amidine
      //in a ring is taken care of too.
      tmpIdxIt=nbrIdxIt;
      //index of the N that will carry the charge; -1 means "not chosen yet"
      int toModIdx=-1;
      unsigned int noNNeighbors=0;
      while (tmpIdxIt != endNbrsIdxIt){
        if(res->getAtomWithIdx(*tmpIdxIt)->getSymbol()=="N"){
          ++noNNeighbors;
        }
        ++tmpIdxIt;
      }
      if(noNNeighbors<2 || noNNeighbors>3){
        std::string nm;
        res->getProp("_Name",nm);
        BOOST_LOG(rdWarningLog)<<nm<<": Error - C.Cat with bad number of N neighbors."<<std::endl;
        return false;
      } else if(noNNeighbors == 2){
        //the idea is that we assign the positive charge according to the following precedence:
        //1. is part of N-oxide
        //2. atom with highest number of hydrogen atoms
        //3. atom in ring
        //4. random
        //first we identify the N atoms
        ROMol::ADJ_ITER idxIt1=nbrIdxIt, idxIt2=nbrIdxIt;
        bool firstIdent=false;
        while (nbrIdxIt != endNbrsIdxIt){
          if (res->getAtomWithIdx(*nbrIdxIt)->getSymbol()=="N"){
            //fix the bond to one - only the modified N will have a double bond to C.cat
            res->getBondBetweenAtoms(idx,*nbrIdxIt)->setBondType(Bond::SINGLE);
            res->getBondBetweenAtoms(idx,*nbrIdxIt)->setIsAromatic(false);
            res->getAtomWithIdx(*nbrIdxIt)->setIsAromatic(false);
            //FIX: what is happening if we hit an atom that was fixed before - propably nothing.
            //since I cannot think of a case where this is a problem - throw a warning
            if(isFixed[*nbrIdxIt]){
              std::string nm;
              res->getProp("_Name",nm);
              BOOST_LOG(rdWarningLog)<<nm<<": warning - charged amidine and isFixed atom."<<std::endl;
            }
            isFixed[*nbrIdxIt]=1;
            if (firstIdent){
              idxIt2=nbrIdxIt;
            }else{
              idxIt1=nbrIdxIt;
              firstIdent=true;
            }
          }
          ++nbrIdxIt;
        }
        //now that we know which are the relevant atoms we check the above features
        //is part of N-oxide?
        //number of hydrogens on each neighbour
        unsigned int noHNbrs1=chkNoHNeighbNOx(res, idxIt1, toModIdx);
        unsigned int noHNbrs2=chkNoHNeighbNOx(res, idxIt2, toModIdx);
        if (toModIdx<0){
          //no N-oxide
          if (noHNbrs1 != noHNbrs2){
            //the N with more hydrogens gets the charge
            if (noHNbrs1>noHNbrs2){
              toModIdx=*idxIt1;
            }else{
              toModIdx=*idxIt2;
            }
          }else{
            //same number of hydrogens: perceive the rings
            MolOps::findSSSR(*res);
            //then prefer the N that is in more rings; on a tie the second N is taken
            unsigned int rIdx1=res->getRingInfo()->numAtomRings((*idxIt1));
            unsigned int rIdx2=res->getRingInfo()->numAtomRings((*idxIt2));
            if(rIdx1>rIdx2){
              toModIdx=*idxIt1;
            }else{
              toModIdx=*idxIt2;
            }
          }
        }
        res->getBondBetweenAtoms(idx,toModIdx)->setBondType(Bond::DOUBLE);
        res->getBondBetweenAtoms(idx,toModIdx)->setIsAromatic(false);
        res->getAtomWithIdx(toModIdx)->setFormalCharge(1);
        at->setIsAromatic(false);
      } else{
        while (nbrIdxIt != endNbrsIdxIt){
          if(!isFixed[*nbrIdxIt]){
            //we get in here if this N.pl3 was not seen / fixed before
            Atom *nbr=res->getAtomWithIdx(*nbrIdxIt);
            //get the number of heavy atoms connected to this atom
            ROMol::ADJ_ITER nbrNbrIdxIt,nbrEndNbrsIdxIt;
            unsigned int hvyAtDeg=0;
            boost::tie(nbrNbrIdxIt,nbrEndNbrsIdxIt) = res->getAtomNeighbors(nbr);
            while (nbrNbrIdxIt!=nbrEndNbrsIdxIt){
              if(res->getAtomWithIdx(*nbrNbrIdxIt)->getAtomicNum()>1){
                std::string nbrAT;
                res->getAtomWithIdx(*nbrNbrIdxIt)->getProp("_TriposAtomType",nbrAT);
                if (nbrAT=="C.cat"){
                  hvyAtDeg+=2;//that way we reduce the risk of ionising the N attached to another C.cat ...
                } else{
                  ++hvyAtDeg;
                }
              }
              ++nbrNbrIdxIt;
            }
            //now check for lowest heavy atom degree
            if(hvyAtDeg<lowestDeg){
              toModIdx=*nbrIdxIt;
              lowestDeg=hvyAtDeg;
            }
            //modify the bond between C.Cat and the N.pl3
            Bond *b = res->getBondBetweenAtoms(idx,*nbrIdxIt);
            b->setBondType(Bond::SINGLE);
            b->setIsAromatic(false);
            nbr->setIsAromatic(false);
            //set N.pl3 as fixed
            isFixed[*nbrIdxIt]=1;
          } else{
            //the N is allready fixed - since we don't touch this atom make the bond to single
            //FIX: check on 3-way symmetric guanidinium mol -
            //     this could produce a only single bonded C.cat for bad H mols
            res->getBondBetweenAtoms(idx,*nbrIdxIt)->setBondType(Bond::SINGLE);
            res->getBondBetweenAtoms(idx,*nbrIdxIt)->setIsAromatic(false);
          }
          ++nbrIdxIt;
        }
        //every neighbor was fixed before: there is no atom left to take the
        //charge and toModIdx is still -1, which must not be used as an index
        if(toModIdx<0){
          std::string nm;
          res->getProp("_Name",nm);
          BOOST_LOG(rdWarningLog)<<nm<<": Error - C.cat without an unfixed N neighbor."<<std::endl;
          return false;
        }
        //now modify the respective N and the C.cat
        Bond *b = res->getBondBetweenAtoms(idx,toModIdx);
        b->setBondType(Bond::DOUBLE);
        b->setIsAromatic(false);
        res->getAtomWithIdx(toModIdx)->setFormalCharge(1);
        at->setIsAromatic(false);
      }
    }
  }
  return true;
}
|
|
|
|
// Parses one line of a mol2 ATOM block.
//
// Expected fields: atom_id atom_name x y z atom_type [subst_id [subst_name
// [charge [status_bit]]]]. The coordinates are written to pos. The element (or
// query) is derived from the part of atom_type before the first '.'.
//
// Returns a newly allocated Atom (or QueryAtom for ANY, Du, HEV, HET, HAL)
// owned by the caller, or NULL when the line describes a lone pair (LP).
// The atom carries the properties "_TriposAtomName", "_TriposAtomType" and,
// when present in the line, "_TriposPartialCharge".
//
// Throws FileParseException when mandatory fields are missing or the
// coordinates cannot be converted. The atom is only allocated once all
// mandatory fields have been read and the element symbol has been resolved,
// so nothing is leaked on those error paths.
Atom *ParseMol2FileAtomLine(const std::string atomLine, RDGeom::Point3D &pos) {
  typedef boost::tokenizer<boost::char_separator<char> > tokenizer;
  boost::char_separator<char> sep(" \t\n");
  std::string tAN,tAT;
  tokenizer tokens(atomLine,sep);
  tokenizer::const_iterator itemIt = tokens.begin();
  if(itemIt==tokens.end()){
    throw FileParseException("no info in mol2 atom line");
  }

  //skip TriposAtomId
  ++itemIt;
  if(itemIt==tokens.end()){
    throw FileParseException("premature end of mol2 atom line");
  }
  //the sybyl atom name does not necessarily make sense - into atom property
  tAN=*itemIt;
  ++itemIt;
  if(itemIt==tokens.end()){
    throw FileParseException("premature end of mol2 atom line");
  }
  try{
    pos.x = boost::lexical_cast<double>(*itemIt);
    ++itemIt;
    if(itemIt==tokens.end()){
      throw FileParseException("premature end of mol2 atom line");
    }
    pos.y = boost::lexical_cast<double>(*itemIt);
    ++itemIt;
    if(itemIt==tokens.end()){
      throw FileParseException("premature end of mol2 atom line");
    }
    pos.z = boost::lexical_cast<double>(*itemIt);
    ++itemIt;
    if(itemIt==tokens.end()){
      throw FileParseException("premature end of mol2 atom line");
    }
  } catch (boost::bad_lexical_cast &) {
    throw FileParseException("Cannot process mol2 coordinates.");
  }
  //now it becomes interesting - this is the SYBYL atom type. I put this into an atom property and deduce the
  //symbol from that (everything before the dot)
  tAT=*itemIt;
  std::string symb = (*itemIt).substr(0,(*itemIt).find('.'));

  Atom *res = NULL;
  //bad symbols:
  //LP is not an atom so remove it ...
  if(symb=="LP"){
    return NULL;
  } else if(symb=="ANY" || symb=="Du"){
    //queryAtoms
    // according to the SYBYL spec, these match anything
    QueryAtom *query=new QueryAtom(0);
    query->setQuery(makeAtomNullQuery());
    res=query;
  } else if (symb=="HEV"){
    //built as a negated query on atomic number 1
    QueryAtom *query=new QueryAtom(1);
    query->getQuery()->setNegation(true);
    res=query;
  } else if (symb=="HET"){
    //Tripos: N,O,P,S
    QueryAtom *query=new QueryAtom(7);
    query->expandQuery(makeAtomNumEqualsQuery(8),Queries::COMPOSITE_OR,true);
    query->expandQuery(makeAtomNumEqualsQuery(15),Queries::COMPOSITE_OR,true);
    query->expandQuery(makeAtomNumEqualsQuery(16),Queries::COMPOSITE_OR,true);
    res=query;
  }else if (symb=="HAL"){
    //Tripos: F,Cl,Br,I
    QueryAtom *query=new QueryAtom(9);
    query->expandQuery(makeAtomNumEqualsQuery(17),Queries::COMPOSITE_OR,true);
    query->expandQuery(makeAtomNumEqualsQuery(35),Queries::COMPOSITE_OR,true);
    query->expandQuery(makeAtomNumEqualsQuery(53),Queries::COMPOSITE_OR,true);
    res=query;
  } else{
    //resolve the symbol first: if the lookup fails no atom has been allocated yet
    int atomicNum = PeriodicTable::getTable()->getAtomicNumber(symb);
    res = new Atom();
    res->setAtomicNum(atomicNum);
    res->setMass(PeriodicTable::getTable()->getAtomicWeight(res->getAtomicNum()));
  }

  //now assign the properties
  res->setProp("_TriposAtomName",tAN); //maybe remove that since it's useless?
  res->setProp("_TriposAtomType",tAT);
  //no implicit hydrogens for mol2 files
  res->setNoImplicit(true);

  //up to here the fields must be written - check if we have more
  //next comes the subst_id, subst_name - skip those
  ++itemIt;
  if(itemIt==tokens.end()){return res;}
  ++itemIt;
  if(itemIt==tokens.end()){return res;}
  ++itemIt;
  //the Partial charge in the file
  if(itemIt!=tokens.end()){
    res->setProp("_TriposPartialCharge",*itemIt);
  }
  //we skip the status bit ...

  return res;
}
|
|
|
|
// Parses one line of a mol2 BOND block: bond_id origin_atom_id target_atom_id
// bond_type.
//
// idxCorresp maps (Tripos atom id - 1) to the atom index in the molecule; a
// negative entry marks an atom that was not added to the molecule.
//
// Returns a newly allocated Bond owned by the caller, or NULL when one of the
// two atoms is not in the molecule or when the bond type is not supported (a
// warning is logged in the latter case).
//
// Throws FileParseException when fields are missing, when the atom ids cannot
// be converted, or when either atom id lies outside idxCorresp.
Bond *ParseMol2FileBondLine(const std::string bondLine, const INT_VECT &idxCorresp) {
  unsigned int idx1,idx2;

  typedef boost::tokenizer<boost::char_separator<char> > tokenizer;
  boost::char_separator<char> sep(" \t\n");

  tokenizer tokens(bondLine,sep);
  tokenizer::const_iterator itemIt = tokens.begin();
  if(itemIt==tokens.end()){
    throw FileParseException("no info in mol2 bond line");
  }

  try{
    //tripos bond id skip
    ++itemIt;
    if(itemIt==tokens.end()){
      throw FileParseException("no info in mol2 bond line");
    }
    idx1 = boost::lexical_cast<unsigned int>(*itemIt);
    ++itemIt;
    if(itemIt==tokens.end()){
      throw FileParseException("no info in mol2 bond line");
    }
    idx2 = boost::lexical_cast<unsigned int>(*itemIt);
    ++itemIt;
    if(itemIt==tokens.end()){
      throw FileParseException("no info in mol2 bond line");
    }
  } catch (boost::bad_lexical_cast &) {
    throw FileParseException("Cannot process mol2 bonds.");
  }

  // adjust the numbering (an id of 0 wraps around and is rejected below)
  idx1--;idx2--;

  //BOTH ends have to be valid positions in idxCorresp before it is indexed
  if (!(idx1<idxCorresp.size() && idx2<idxCorresp.size())){
    throw FileParseException("index mismatch");
  }

  //if either of both ends of the bond is not an atom in the mol - return NULL
  if (idxCorresp[idx1]<0 || idxCorresp[idx2]<0){
    return NULL;
  }

  //lexical casts of bond types is not really useful in this case since we could also have strings as values ...
  //use if/else if/end statements for now - maybe change to mapping later - no idea if it's worth it ...?
  Bond::BondType type;
  std::string tBType=*itemIt;
  if(tBType=="1" || tBType=="am"){
    type=Bond::SINGLE;
  } else if(tBType=="2"){
    type=Bond::DOUBLE;
  } else if(tBType=="3"){
    type=Bond::TRIPLE;
  } else if(tBType=="ar"){
    type=Bond::AROMATIC;
  } else if(tBType=="du" || tBType=="un"){
    type=Bond::UNSPECIFIED;//have to come back here - see if there is any comment in th documentation ...
  } else {
    //this happens only if some weird thing is in the file or if we encounter a "nc" - not connected ...
    //but why would anyone specify a bond which is not connected?
    BOOST_LOG(rdWarningLog) << "Warning - unsupported bond type: "<< tBType << " ignored!" << std::endl;
    return NULL;
  }
  Bond *res = new Bond(type);
  res->setBeginAtomIdx(idxCorresp[idx1]);
  res->setEndAtomIdx(idxCorresp[idx2]);
  return res;
}
|
|
|
|
void ParseMol2AtomBlock(std::istream *inStream, RWMol *res, unsigned int nAtoms, INT_VECT &idxCorresp){
|
|
PRECONDITION(inStream,"inStream not valid");
|
|
PRECONDITION(!inStream->eof(),"inStream is at eof");
|
|
PRECONDITION(res,"RWMol not valid");
|
|
PRECONDITION(idxCorresp.size()==nAtoms,"vector size mismatch");
|
|
unsigned int nLP=0;
|
|
bool hasHAtoms=false;
|
|
|
|
std::vector<RDGeom::Point3D> threeDPs;
|
|
threeDPs.reserve(nAtoms);
|
|
|
|
for(unsigned int i=0;i<nAtoms;++i){
|
|
std::string tempStr = getLine(inStream);
|
|
if (inStream->eof()){
|
|
throw FileParseException("premature EOF");
|
|
}
|
|
RDGeom::Point3D pos;
|
|
Atom *atom = ParseMol2FileAtomLine(tempStr, pos);
|
|
|
|
//if atom is NULL then we hit LP
|
|
if (atom){
|
|
int aid = res->addAtom(atom,false,true);
|
|
idxCorresp[i]=aid;
|
|
threeDPs.push_back(pos);
|
|
if(atom->getSymbol()=="H"){
|
|
hasHAtoms=true;
|
|
}
|
|
} else{
|
|
++nLP;
|
|
}
|
|
}
|
|
//mol2 files need to have hydrogen atoms otherwise formal charge estimation will be problematic
|
|
if(!hasHAtoms){
|
|
std::string nm;
|
|
res->getProp("_Name",nm);
|
|
BOOST_LOG(rdWarningLog) << nm<<": Warning - no explicit hydrogens in mol2 file but needed for formal charge estimation." << std::endl;
|
|
}
|
|
//create conformer based on 3DPoints and add to RWMol
|
|
Conformer *conf = new Conformer(nAtoms-nLP);
|
|
std::vector<RDGeom::Point3D>::const_iterator threeDPsIt=threeDPs.begin();
|
|
for (unsigned int i=0;i<threeDPs.size();++i){
|
|
conf->setAtomPos(i, *threeDPsIt);
|
|
++threeDPsIt;
|
|
}
|
|
res->addConformer(conf, true);
|
|
|
|
POSTCONDITION(res->getNumAtoms()==(nAtoms-nLP),"Wrong number of atoms in molecule");
|
|
}
|
|
|
|
void ParseMol2BondBlock(std::istream *inStream, RWMol *res, unsigned int nBonds, const INT_VECT &idxCorresp){
|
|
PRECONDITION(inStream,"inStream not valid");
|
|
PRECONDITION(!inStream->eof(),"inStream is at eof");
|
|
PRECONDITION(res,"RWMol not valid");
|
|
unsigned int nBadBonds=0;
|
|
|
|
for(unsigned int i=0;i<nBonds;++i){
|
|
std::string tempStr = getLine(inStream);
|
|
if (inStream->eof()){
|
|
throw FileParseException("premature EOF");
|
|
}
|
|
Bond *bond = ParseMol2FileBondLine(tempStr,idxCorresp);
|
|
//if something weird happened there will be no bond for that line
|
|
if(bond){
|
|
// if we got an aromatic bond set the flag on the bond and the connected atoms
|
|
if(bond->getBondType()==Bond::AROMATIC){
|
|
bond->setIsAromatic(true);
|
|
res->getAtomWithIdx(bond->getBeginAtomIdx())->setIsAromatic(true);
|
|
res->getAtomWithIdx(bond->getEndAtomIdx())->setIsAromatic(true);
|
|
}
|
|
res->addBond(bond,true);
|
|
} else{
|
|
nBadBonds++;
|
|
}
|
|
}
|
|
POSTCONDITION(res->getNumBonds()==(nBonds-nBadBonds),"Wrong number of atoms in molecule");
|
|
}
|
|
|
|
}; // end of anonymous namespace
|
|
|
|
//------------------------------------------------
|
|
//
|
|
// Read a molecule from a stream
|
|
//
|
|
//------------------------------------------------
|
|
// Reads the next molecule from a mol2 stream.
//
// The stream is scanned from its current position for the section tags
// MOLECULE, ATOM, BOND and UNITY_ATOM_ATTR until a second MOLECULE tag or EOF
// is found; the recorded offsets are then used to seek back and parse each
// section. The stream therefore has to support tellg/seekg.
//
// When a UNITY_ATOM_ATTR section is present the formal charges are read from
// it; otherwise the Tripos substructures are cleaned up and the formal charges
// are guessed. Chiral tags are assigned from the 3D coordinates before
// sanitization.
//
//  sanitize - when true the molecule is sanitized (or, with removeHs, passed
//             through MolOps::removeHs)
//  removeHs - only has an effect when sanitize is true
//  variant  - currently not used by this function
//
// Returns a new molecule owned by the caller, or NULL when the substructure
// cleanup fails. Throws FileParseException for malformed input and rethrows
// MolSanitizeException from sanitization; the partially built molecule is
// deleted in both cases.
RWMol *Mol2DataStreamToMol(std::istream *inStream, bool sanitize, bool removeHs,
                           Mol2Type variant){
  PRECONDITION(inStream,"no stream");
  std::string tempStr;
  typedef boost::tokenizer<boost::char_separator<char> > tokenizer;
  boost::char_separator<char> sep(" \t\n");

  // all molecules start with a @<TRIPOS>MOLECULE! There is no other way to define an end of
  // molecule than to find a new one or an eof. Hence I have to read until I find one of the two ...
  std::streampos molStart=0,atomStart=0,bondStart=0,chargeStart=0;
  while(!inStream->eof()){
    tempStr = getLine(inStream);
    if(inStream->eof()){
      break;
    }

    if(tempStr!="" && tempStr[0]=='@'){
      tokenizer tokens(tempStr, sep);
      std::string firstToken = *tokens.begin();
      if (firstToken=="@<TRIPOS>MOLECULE"){
        if (!molStart){
          //we reach that point in a multimol2 when we have not seen a @<TRIPOS>MOLECULE
          //and set all important flags
          molStart = inStream->tellg();
        } else{
          //second MOLECULE tag: the current molecule ends here
          break;
        }
      } else if(firstToken=="@<TRIPOS>ATOM"){
        atomStart = inStream->tellg();
      } else if(firstToken=="@<TRIPOS>BOND"){
        bondStart = inStream->tellg();
      } else if (firstToken=="@<TRIPOS>UNITY_ATOM_ATTR"){
        //tripos dbtranslate will write not Tripos conform atom types for various aromatic atoms
        //this results in problems with the formal charge guesser. Hence, if we find the
        //UNITY_ATOM_ATTR that contains formal charges we will read those and skip the charger and
        //the substructure cleanup
        chargeStart = inStream->tellg();
      }
    }//end if @
  }//end while

  //we should reach this point with at least the molStart and atomStart set
  if(!molStart){
    throw FileParseException("No MOLECULE block found in Mol2 data");
  }
  if(!atomStart){
    throw FileParseException("No ATOM block found in Mol2 data");
  }

  //seekg needs a stream without the eof state
  if(inStream->eof()){
    inStream->clear();
  }

  inStream->seekg(molStart,std::ios::beg);
  tempStr = getLine(inStream);
  RWMol *res = new RWMol();
  boost::trim_right(tempStr);
  res->setProp("_Name",tempStr);

  tempStr = getLine(inStream);
  tokenizer tokens(tempStr,sep);
  if(tokens.begin()==tokens.end()){
    delete res;
    throw FileParseException("Empty counts line");
  }

  unsigned int nAtoms=0,nBonds=0;
  tokenizer::const_iterator itemIt = tokens.begin();
  // counts line, this is where we really get started
  try{
    nAtoms = boost::lexical_cast<unsigned int>(*itemIt);

    ++itemIt;
    if(itemIt!=tokens.end()){
      nBonds = boost::lexical_cast<unsigned int>(*itemIt);
    }
  } catch(boost::bad_lexical_cast &){
    delete res;
    std::ostringstream errout;
    errout << "Cannot convert " << *itemIt << " to unsigned int";
    throw FileParseException(errout.str()) ;
  }

  if(nAtoms==0){
    delete res;
    throw FileParseException("molecule has no atoms");
  }
  tempStr = getLine(inStream);//mol_type - ignore
  tempStr = getLine(inStream);
  boost::trim(tempStr);
  res->setProp("_TriposChargeType",tempStr);
  //stop here since we don't support anything else from the MOLECULE block
  INT_VECT idxCorresp(nAtoms,-1);
  inStream->seekg(atomStart,std::ios::beg);
  try{
    ParseMol2AtomBlock(inStream,res,nAtoms,idxCorresp);
  } catch(const FileParseException &){
    delete res;
    throw; //rethrow the original exception object
  }
  if(nBonds){
    //the counts line announces bonds, so a BOND section is required; without
    //this check the parser would seek to offset 0 and read unrelated lines
    if(!bondStart){
      delete res;
      throw FileParseException("No BOND block found in Mol2 data");
    }
    inStream->seekg(bondStart,std::ios::beg);
    try{
      ParseMol2BondBlock(inStream,res,nBonds,idxCorresp);
    } catch(const FileParseException &){
      delete res;
      throw;
    }
  }

  if (!chargeStart){
    bool molFixed = cleanUpMol2Substructures(res);

    if(!molFixed){
      delete res;
      return NULL;
    }

    //mol2 format does not support formal charge information, hence we need to guess it
    //based on default and explicit valences
    guessFormalCharges(res);
  }else{
    inStream->seekg(chargeStart,std::ios::beg);
    try {
      readFormalChargesFromAttr(inStream,res);
    }catch(const FileParseException &){
      delete res;
      throw;
    }
  }

  //set chirality prior to sanitisation since it happens from 3D and it's not
  //possible anymore once the hydrogens are removed
  //FIX: for now this is only for the first conformer - need to be changed once we
  //use multiconformer files
  MolOps::assignChiralTypesFrom3D(*res);

  if (sanitize) {
    try {
      if(removeHs){
        //NOTE(review): the result of removeHs is declared as ROMol* and is cast
        //down to RWMol* here - confirm that it really is an RWMol
        ROMol *tmp=MolOps::removeHs(*res,false,false);
        delete res;
        res = static_cast<RWMol *>(tmp);
      } else {
        MolOps::sanitizeMol(*res);
      }
    }
    catch (MolSanitizeException &){
      BOOST_LOG(rdWarningLog)<<"sanitise ";
      std::string molName;
      res->getProp("_Name",molName);
      BOOST_LOG(rdWarningLog)<<molName<<": ";
      delete res;
      throw;
    }
  }

  return res;
}
|
|
|
|
|
|
// Reference overload: forwards to the pointer based Mol2DataStreamToMol.
// All arguments, including variant, are passed on unchanged.
RWMol *Mol2DataStreamToMol(std::istream &inStream, bool sanitize, bool removeHs,
                           Mol2Type variant){
  return Mol2DataStreamToMol(&inStream,sanitize,removeHs,variant);
}
|
|
//------------------------------------------------
|
|
//
|
|
// Read a molecule from a string
|
|
//
|
|
//------------------------------------------------
|
|
// Builds a molecule from a string holding mol2 data by wrapping the string in
// an input stream and forwarding all arguments, including variant, to
// Mol2DataStreamToMol. The returned molecule is owned by the caller.
RWMol *Mol2BlockToMol(const std::string &molBlock, bool sanitize, bool removeHs,
                      Mol2Type variant){
  std::istringstream inStream(molBlock);
  return Mol2DataStreamToMol(inStream, sanitize, removeHs, variant);
}
|
|
|
|
|
|
//------------------------------------------------
|
|
//
|
|
// Read a molecule from a file
|
|
//
|
|
//------------------------------------------------
|
|
// Reads the first molecule of the mol2 file fName.
//
// Throws BadFileException when the file cannot be opened. Returns NULL when
// the stream is already at EOF after opening; otherwise returns whatever
// Mol2DataStreamToMol returns (the molecule is owned by the caller). All
// arguments, including variant, are forwarded.
RWMol *Mol2FileToMol(std::string fName, bool sanitize, bool removeHs,
                     Mol2Type variant){
  // FIX: this binary mode of opening file is here because of a bug in VC++ 6.0
  // the function "tellg" does not work correctly if we do not open it this way
  //   Jan 2009: Confirmed that this is still the case in visual studio 2008
  std::ifstream inStream(fName.c_str(),std::ios_base::binary);
  if(!inStream || (inStream.bad())){
    std::ostringstream errout;
    errout << "Bad input file " << fName;
    throw BadFileException(errout.str());
  }
  RWMol *res=NULL;
  if(!inStream.eof()){
    res=Mol2DataStreamToMol(inStream, sanitize, removeHs, variant);
  }
  return res;
}
|
|
}
|
|
|