// $Id: Mol2FileParser.cpp 1457 2009-04-03 09:05:17Z landrgr1 $ // // Copyright (c) 2008, Novartis Institutes for BioMedical Research Inc. // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: // // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above // copyright notice, this list of conditions and the following // disclaimer in the documentation and/or other materials provided // with the distribution. // * Neither the name of Novartis Institutes for BioMedical Research Inc. // nor the names of its contributors may be used to endorse or promote // products derived from this software without specific prior // written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // // created by Nik Stiefl May 2008 // this file is heavily based on glandrum's MolFileParser // #include "FileParsers.h" #include "MolFileStereochem.h" #include #include #include #include // #include #include #include #include #include #include #include #include namespace RDKit{ namespace { void fixNitroSubstructureAndCharge(RWMol *res, unsigned int atIdx){ unsigned int noODblNeighbors=0; ROMol::ADJ_ITER nbrIdxIt,nbrEndIdxIt; unsigned int toModIdx=0; boost::tie(nbrIdxIt,nbrEndIdxIt)=res->getAtomNeighbors(res->getAtomWithIdx(atIdx)); while (nbrIdxIt!=nbrEndIdxIt){ Bond *curBond = res->getBondBetweenAtoms(atIdx,*nbrIdxIt); if(res->getAtomWithIdx(*nbrIdxIt)->getAtomicNum()==8 && curBond->getBondType() == Bond::DOUBLE){ ++noODblNeighbors; toModIdx=*nbrIdxIt; } ++nbrIdxIt; } if(noODblNeighbors==2){ res->getBondBetweenAtoms(atIdx,toModIdx)->setBondType(Bond::SINGLE); res->getAtomWithIdx(atIdx)->setFormalCharge(1); res->getAtomWithIdx(toModIdx)->setFormalCharge(-1); } } void readFormalChargesFromAttr(std::istream *inStream, RWMol *res){ PRECONDITION(inStream,"inStream not valid"); PRECONDITION(!inStream->eof(),"inStream is at eof"); PRECONDITION(res,"RWMol not valid"); typedef boost::tokenizer > tokenizer; boost::char_separator sep(" \t\n"); bool readNextAtomAttribs=true; unsigned int atomIdx=0, noAtomAttr=0; //std::streampos stPos = inStream->tellg(); std::string tempStr = getLine(inStream); //there needs to be at least one entry if (inStream->eof()){ throw FileParseException("premature EOF in readFormalCharges"); } while (readNextAtomAttribs){ tokenizer tokens(tempStr,sep); tokenizer::const_iterator itemIt = tokens.begin(); try { atomIdx=boost::lexical_cast(*itemIt); ++itemIt; noAtomAttr=boost::lexical_cast(*itemIt); }catch (boost::bad_lexical_cast &) { throw FileParseException("Cannot process mol2 UnityAtomAttr."); } for(unsigned int i=0;ieof()){ throw FileParseException("premature EOF in readFormalCharges"); } tokenizer atmTokens(tempStr,sep); itemIt = atmTokens.begin(); if (*itemIt == "AtomExpr"){ int formCharge=0; ++itemIt; //FIX:what if an atom has multiple properties? Seems like they might be separated //with ";" ... need to look at that in more detail! if ((*itemIt).find("=")==std::string::npos){ try { formCharge = boost::lexical_cast(*itemIt); }catch (boost::bad_lexical_cast &) { readNextAtomAttribs=false; throw FileParseException("Cannot process mol2 formal charge."); } //assign the charge res->getAtomWithIdx(atomIdx-1)->setFormalCharge(formCharge); } } }//endfor //check if we have finished reading UNITY_ATOM_ATTR if (inStream->eof()){ readNextAtomAttribs=false; }else{ tempStr = getLine(inStream); if (tempStr == "" || tempStr[0] == '@' || tempStr[0] == '#'){ readNextAtomAttribs=false; } } } } void guessFormalCharges(RWMol *res){ //FIX: this whole thing has problems with positively charged pyridines et al. for(RWMol::AtomIterator atomIt=res->beginAtoms();atomIt!=res->endAtoms();++atomIt) { Atom *at = (*atomIt); //assign only if no formal charge set on that atom and atom is not carbon (the latter //might needs changing later on - let's see) and not for query atoms (dummy etc.) //since this happens only during cleanup of bad substructures //FIX: check nitro compounds et al. if(at->getFormalCharge()==0 && at->getSymbol()!="C" && !(at->hasQuery())){ //have to calculate the explicit valence without call to calcExplicitValence since //this will barf when I have e.g. an uncharged 4 valent nitrogen ... int noAromBonds=0; double accum=0; //FIX: could this give non int values ? ROMol::OEDGE_ITER beg,end; boost::tie(beg,end) = res->getAtomBonds(at); while(beg!=end){ accum += (*res)[*beg]->getValenceContrib(at); if ((*res)[*beg]->getBondType() == Bond::AROMATIC){ ++noAromBonds; } ++beg; } //Assumption: if there is an aromatic bridge atom the accum will be 4.5 //(three aromatic bonds), e.g. naphthalenes. However those are not charged so we //can stop here //FIX: a structure such as c12cccc[n+]2cccn1 will not work if we just continue //for noAromBonds>2 //special case checking - if atom c then go on if (noAromBonds>2 && at->getSymbol()=="C"){ continue; } //dbtranslate problems - for mols with no UNITY_ATOM_ATTR set - we won't be able to guess //if this mol needs to be guessed or not ... hence, we don't guess on atoms with //ar specification and not atom type X.ar and in 5-membered ring (there is stuff like //c1cc[o+]cc1 that should be charged //e.g. 5ring with N.pl3 as NH atom or other atoms without ar specification in aromatic ring //FIX: do we need make sure this only happens for atoms in ring? std::string tATT; at->getProp(common_properties::_TriposAtomType,tATT); MolOps::findSSSR(*res); if (tATT.find("ar")==std::string::npos && at->getIsAromatic() && res->getRingInfo()->isAtomInRingOfSize(at->getIdx(),5)){ continue; } //for dbtranslate a problem will also occur for N.ar with 3 aromatic bonds and no charge //assigned for these we don't assign charges (corina will create kekulized input for this //(at least in most cases) - anyway, throw a warning! if (noAromBonds==3 && tATT=="N.ar"){ std::string nm; res->getProp(common_properties::_Name,nm); BOOST_LOG(rdWarningLog)<(round(accum+0.1)); const INT_VECT &valens = PeriodicTable::getTable()->getValenceList(at->getAtomicNum()); INT_VECT_CI vi; //check default valence and compare to expVal - chg //the hypothesis is that we prefer positively charged atoms over negatively charged ones //for multi default valence atoms (e.g. CS(O)(O) should end up being C[S+]([O-])[O-] rather //than C[S-][O-][O-] but that might change based no different examples int nElectrons=PeriodicTable::getTable()->getNouterElecs(at->getAtomicNum()); int assignChg; if(nElectrons>=4) assignChg = expVal - (*valens.begin()); else assignChg = (*valens.begin()) - expVal; if (assignChg>0 && nElectrons>=4){ for (vi = valens.begin(); vi != valens.end(); ++vi) { //Since we do this only for nocharged atoms we can get away without including the //charge into this //apart from that we do not assign charges higher than +/- 1 for atoms with multiple valence states //otherwise the early break would have to go away which in turn would result in things like [S+4] for //sulfonamides assignChg = expVal - (*vi); if ((*vi)<=expVal && abs(assignChg)<2){ break; } } } if (assignChg){ //no aromatic atom will get aa abs(charge) > 1 if(at->getIsAromatic() && abs(assignChg)>1){ at->setFormalCharge((assignChg>0)-(assignChg<0));//this results in -1 or +1 }else{ at->setFormalCharge(assignChg); } //corina will create strange nitro groups which look like N(=O)(=O) which in turn will result in //[N+2](=O)(=O) this needs to be fixed at this stage since otherwise we would have to check on all //N.pl3 during the cleanup substructures step if(assignChg==2 && expVal==5 && at->getSymbol()=="N"){ fixNitroSubstructureAndCharge(res,at->getIdx()); } //what if expVal != at->calcExplicitValence();? //cannot imagine a case where that will happen now. } } } } unsigned int chkNoHNeighbNOx(RWMol *res, ROMol::ADJ_ITER atIdxIt, int &toModIdx){ Atom *at = res->getAtomWithIdx(*atIdxIt); unsigned int noHNbrs=0; ROMol::ADJ_ITER nbrIdxIt,nbrEndIdxIt; boost::tie(nbrIdxIt,nbrEndIdxIt) = res->getAtomNeighbors(at); while (nbrIdxIt!=nbrEndIdxIt){ if(res->getAtomWithIdx(*nbrIdxIt)->getAtomicNum()==1){ ++noHNbrs; }else if(res->getAtomWithIdx(*nbrIdxIt)->getAtomicNum()==8 && res->getAtomDegree(res->getAtomWithIdx(*nbrIdxIt))==1){ //this is a N in an N-oxide constellation //we can do the above if clause since mol2 have explicit hydrogens toModIdx=*atIdxIt; } ++nbrIdxIt; } return noHNbrs; } bool cleanUpMol2Substructures(RWMol *res){ //NOTE: check the nitro fix in guess formal charges! boost::dynamic_bitset<> isFixed(res->getNumAtoms()); for(ROMol::AtomIterator atIt=res->beginAtoms();atIt!=res->endAtoms();++atIt){ std::string tAT; Atom *at = *atIt; unsigned int idx=at->getIdx(); at->getProp(common_properties::_TriposAtomType,tAT); if (tAT=="N.4"){ at->setFormalCharge(1); } else if(tAT=="O.co2"){ //negatively charged carboxylates with O.co2 //according to Tripos, those should only appear in carboxylates and phosphates, //FIX: do it also for phsopahtes and sulphates ... if (at->getDegree()!=1){ BOOST_LOG(rdWarningLog)<<"Warning - O.co2 with degree >1."<getAtomNeighbors(at); //this should return only the C.2 Atom *nbr=res->getAtomWithIdx(*nbrIdxIt); std::string tATT; nbr->getProp(common_properties::_TriposAtomType,tATT); //carboxylates if (tATT=="C.2" || tATT=="S.o2" ){ //this should return only the bond between C.2 and O.co2 Bond *b = res->getBondBetweenAtoms(idx,*nbrIdxIt); if(!isFixed[*nbrIdxIt]){ //the first occurence is negatively charged and has a single bond b->setBondType(Bond::SINGLE); b->setIsAromatic(false); at->setFormalCharge(-1); at->setIsAromatic(false); nbr->setIsAromatic(false); isFixed[idx]=1; isFixed[*nbrIdxIt]=1; } else{ //the other occurences are not charged and have a double bond b->setBondType(Bond::DOUBLE); b->setIsAromatic(false); at->setIsAromatic(false); isFixed[idx]=1; } }else{ std::string nm; res->getProp(common_properties::_Name,nm); BOOST_LOG(rdWarningLog)<getAtomNeighbors(at); //one problem of programs like Corina is, that they will create also C.cat //for groups that are not guanidinium. We cannot fix all, but the charged amidine //in a ring is taken care of too. tmpIdxIt=nbrIdxIt; //declare and initialise toModIdx int toModIdx=-1; unsigned int noNNeighbors=0; while (tmpIdxIt != endNbrsIdxIt){ if(res->getAtomWithIdx(*tmpIdxIt)->getSymbol()=="N"){ ++noNNeighbors; } ++tmpIdxIt; } if(noNNeighbors<2 || noNNeighbors>3){ std::string nm; res->getProp(common_properties::_Name,nm); BOOST_LOG(rdWarningLog)<getAtomWithIdx(*nbrIdxIt)->getSymbol()=="N"){ //fix the bond to one - only the modified N will have a double bond to C.cat res->getBondBetweenAtoms(idx,*nbrIdxIt)->setBondType(Bond::SINGLE); res->getBondBetweenAtoms(idx,*nbrIdxIt)->setIsAromatic(false); res->getAtomWithIdx(*nbrIdxIt)->setIsAromatic(false); //FIX: what is happening if we hit an atom that was fixed before - propably nothing. //since I cannot think of a case where this is a problem - throw a warning if(isFixed[*nbrIdxIt]){ std::string nm; res->getProp(common_properties::_Name,nm); BOOST_LOG(rdWarningLog)<noHNbrs2){ toModIdx=*idxIt1; }else{ toModIdx=*idxIt2; //this is random if both have the same number of atoms } }else{ //perceive the rings MolOps::findSSSR(*res); //then we check if both atoms are in a ring unsigned int rIdx1=res->getRingInfo()->numAtomRings((*idxIt1)); unsigned int rIdx2=res->getRingInfo()->numAtomRings((*idxIt2)); if(rIdx1>rIdx2){ toModIdx=*idxIt1; }else{ toModIdx=*idxIt2; } } } res->getBondBetweenAtoms(idx,toModIdx)->setBondType(Bond::DOUBLE); res->getBondBetweenAtoms(idx,toModIdx)->setIsAromatic(false); res->getAtomWithIdx(toModIdx)->setFormalCharge(1); at->setIsAromatic(false); } else{ while (nbrIdxIt != endNbrsIdxIt){ if(!isFixed[*nbrIdxIt]){ //we get in here if this N.pl3 was not seen / fixed before Atom *nbr=res->getAtomWithIdx(*nbrIdxIt); //get the number of heavy atoms connected to this atom ROMol::ADJ_ITER nbrNbrIdxIt,nbrEndNbrsIdxIt; unsigned int hvyAtDeg=0; boost::tie(nbrNbrIdxIt,nbrEndNbrsIdxIt) = res->getAtomNeighbors(nbr); while (nbrNbrIdxIt!=nbrEndNbrsIdxIt){ if(res->getAtomWithIdx(*nbrNbrIdxIt)->getAtomicNum()>1){ std::string nbrAT; res->getAtomWithIdx(*nbrNbrIdxIt)->getProp(common_properties::_TriposAtomType,nbrAT); if (nbrAT=="C.cat"){ hvyAtDeg+=2;//that way we reduce the risk of ionising the N attached to another C.cat ... } else{ ++hvyAtDeg; } } ++nbrNbrIdxIt; } //now check for lowest heavy atom degree if(hvyAtDeggetBondBetweenAtoms(idx,*nbrIdxIt); b->setBondType(Bond::SINGLE); b->setIsAromatic(false); nbr->setIsAromatic(false); //set N.pl3 as fixed isFixed[*nbrIdxIt]=1; } else{ //the N is allready fixed - since we don't touch this atom make the bond to single //FIX: check on 3-way symmetric guanidinium mol - // this could produce a only single bonded C.cat for bad H mols res->getBondBetweenAtoms(idx,*nbrIdxIt)->setBondType(Bond::SINGLE); res->getBondBetweenAtoms(idx,*nbrIdxIt)->setIsAromatic(false); } ++nbrIdxIt; } //now modify the respective N and the C.cat Bond *b = res->getBondBetweenAtoms(idx,toModIdx); b->setBondType(Bond::DOUBLE); b->setIsAromatic(false); res->getAtomWithIdx(toModIdx)->setFormalCharge(1); at->setIsAromatic(false); } } idx++; } return true; } Atom *ParseMol2FileAtomLine(const std::string atomLine, RDGeom::Point3D &pos) { typedef boost::tokenizer > tokenizer; boost::char_separator sep(" \t\n"); std::string tAN,tAT; tokenizer tokens(atomLine,sep); tokenizer::const_iterator itemIt = tokens.begin(); if(itemIt==tokens.end()){ throw FileParseException("no info in mol2 atom line"); } Atom *res = new Atom(); //skip TriposAtomId ++itemIt; if(itemIt==tokens.end()){ throw FileParseException("premature end of mol2 atom line"); } //the sybyl atom name does not necessarily make sense - into atom property tAN=*itemIt; ++itemIt; if(itemIt==tokens.end()){ throw FileParseException("premature end of mol2 atom line"); } try{ pos.x = boost::lexical_cast(*itemIt); ++itemIt; if(itemIt==tokens.end()){ throw FileParseException("premature end of mol2 atom line"); } pos.y = boost::lexical_cast(*itemIt); ++itemIt; if(itemIt==tokens.end()){ throw FileParseException("premature end of mol2 atom line"); } pos.z = boost::lexical_cast(*itemIt); ++itemIt; if(itemIt==tokens.end()){ throw FileParseException("premature end of mol2 atom line"); } } catch (boost::bad_lexical_cast &) { throw FileParseException("Cannot process mol2 coordinates."); } //now it becomes interesting - this is the SYBYL atom type. I put this into an atom property and deduce the //symbol from that (everything before the dot) tAT=*itemIt; std::string symb = (*itemIt).substr(0,(*itemIt).find('.')); //bad symbols: //LP is not an atom so remove it ... if(symb=="LP"){ delete res; return NULL; } else if(symb=="ANY" || symb=="Du"){ //queryAtoms // according to the SYBYL spec, these match anything QueryAtom *query=new QueryAtom(0); query->setQuery(makeAtomNullQuery()); delete res; res=query; } else if (symb=="HEV"){ QueryAtom *query=new QueryAtom(1); query->getQuery()->setNegation(true); delete res; res=query; } else if (symb=="HET"){ //Tripos: N,O,P,S QueryAtom *query=new QueryAtom(7); query->expandQuery(makeAtomNumQuery(8),Queries::COMPOSITE_OR,true); query->expandQuery(makeAtomNumQuery(15),Queries::COMPOSITE_OR,true); query->expandQuery(makeAtomNumQuery(16),Queries::COMPOSITE_OR,true); delete res; res=query; }else if (symb=="HAL"){ //Tripos: F,Cl,Br,I QueryAtom *query=new QueryAtom(9); query->expandQuery(makeAtomNumQuery(17),Queries::COMPOSITE_OR,true); query->expandQuery(makeAtomNumQuery(35),Queries::COMPOSITE_OR,true); query->expandQuery(makeAtomNumQuery(53),Queries::COMPOSITE_OR,true); delete res; res=query; } else{ res->setAtomicNum(PeriodicTable::getTable()->getAtomicNumber(symb)); } //now assign the properties res->setProp("_TriposAtomName",tAN); //maybe remove that since it's useless? res->setProp(common_properties::_TriposAtomType,tAT); //no implicit hydrogens for mol2 files res->setNoImplicit(true); //up to here the fields must be written - check if we have more //next comes the subst_id, subst_name - skip those ++itemIt; if(itemIt==tokens.end()){return res;} ++itemIt; if(itemIt==tokens.end()){return res;} ++itemIt; //the Partial charge in the file if(itemIt!=tokens.end()){ res->setProp("_TriposPartialCharge",*itemIt); } //we skip the status bit ... return res; } Bond *ParseMol2FileBondLine(const std::string bondLine, const INT_VECT &idxCorresp) { unsigned int idx1,idx2; typedef boost::tokenizer > tokenizer; boost::char_separator sep(" \t\n"); tokenizer tokens(bondLine,sep); tokenizer::const_iterator itemIt = tokens.begin(); if(itemIt==tokens.end()){ throw FileParseException("no info in mol2 bond line"); } try{ //tripos bond id skip ++itemIt; if(itemIt==tokens.end()){ throw FileParseException("no info in mol2 bond line"); } idx1 = boost::lexical_cast(*itemIt); ++itemIt; if(itemIt==tokens.end()){ throw FileParseException("no info in mol2 bond line"); } idx2 = boost::lexical_cast(*itemIt); ++itemIt; if(itemIt==tokens.end()){ throw FileParseException("no info in mol2 bond line"); } } catch (boost::bad_lexical_cast &) { throw FileParseException("Cannot process mol2 bonds."); } // adjust the numbering idx1--;idx2--; //if either of both ends of the bond is not an atom in the mol - return NULL if (!(idx1setBeginAtomIdx(idxCorresp[idx1]); res->setEndAtomIdx(idxCorresp[idx2]); return res; } void ParseMol2AtomBlock(std::istream *inStream, RWMol *res, unsigned int nAtoms, INT_VECT &idxCorresp){ PRECONDITION(inStream,"inStream not valid"); PRECONDITION(!inStream->eof(),"inStream is at eof"); PRECONDITION(res,"RWMol not valid"); PRECONDITION(idxCorresp.size()==nAtoms,"vector size mismatch"); unsigned int nLP=0; bool hasHAtoms=false; std::vector threeDPs; threeDPs.reserve(nAtoms); for(unsigned int i=0;ieof()){ throw FileParseException("premature EOF"); } RDGeom::Point3D pos; Atom *atom = ParseMol2FileAtomLine(tempStr, pos); //if atom is NULL then we hit LP if (atom){ int aid = res->addAtom(atom,false,true); idxCorresp[i]=aid; threeDPs.push_back(pos); if(atom->getSymbol()=="H"){ hasHAtoms=true; } } else{ ++nLP; } } //mol2 files need to have hydrogen atoms otherwise formal charge estimation will be problematic if(!hasHAtoms){ std::string nm; res->getProp(common_properties::_Name,nm); BOOST_LOG(rdWarningLog) << nm<<": Warning - no explicit hydrogens in mol2 file but needed for formal charge estimation." << std::endl; } //create conformer based on 3DPoints and add to RWMol Conformer *conf = new Conformer(nAtoms-nLP); std::vector::const_iterator threeDPsIt=threeDPs.begin(); for (unsigned int i=0;isetAtomPos(i, *threeDPsIt); ++threeDPsIt; } res->addConformer(conf, true); POSTCONDITION(res->getNumAtoms()==(nAtoms-nLP),"Wrong number of atoms in molecule"); } void ParseMol2BondBlock(std::istream *inStream, RWMol *res, unsigned int nBonds, const INT_VECT &idxCorresp){ PRECONDITION(inStream,"inStream not valid"); PRECONDITION(!inStream->eof(),"inStream is at eof"); PRECONDITION(res,"RWMol not valid"); unsigned int nBadBonds=0; for(unsigned int i=0;ieof()){ throw FileParseException("premature EOF"); } Bond *bond = ParseMol2FileBondLine(tempStr,idxCorresp); //if something weird happened there will be no bond for that line if(bond){ // if we got an aromatic bond set the flag on the bond and the connected atoms if(bond->getBondType()==Bond::AROMATIC){ bond->setIsAromatic(true); res->getAtomWithIdx(bond->getBeginAtomIdx())->setIsAromatic(true); res->getAtomWithIdx(bond->getEndAtomIdx())->setIsAromatic(true); } res->addBond(bond,true); } else{ nBadBonds++; } } POSTCONDITION(res->getNumBonds()==(nBonds-nBadBonds),"Wrong number of atoms in molecule"); } }; // end of anonymous namespace //------------------------------------------------ // // Read a molecule from a stream // //------------------------------------------------ RWMol *Mol2DataStreamToMol(std::istream *inStream, bool sanitize, bool removeHs, Mol2Type variant){ RDUNUSED_PARAM(variant); PRECONDITION(inStream,"no stream"); std::string tempStr,lineBeg; typedef boost::tokenizer > tokenizer; boost::char_separator sep(" \t\n"); Utils::LocaleSwitcher ls; // all molecules start with a @MOLECULE! There is no other way to define an end of // molecule than to find a new one or an eof. Hence I have to read until I find one of the two ... std::streampos molStart=0,atomStart=0,bondStart=0,chargeStart=0; while(!inStream->eof()){ tempStr = getLine(inStream); if(inStream->eof()){ break; } if(tempStr!="" && tempStr[0]=='@'){ tokenizer tokens(tempStr, sep); std::string firstToken = *tokens.begin(); if (firstToken=="@MOLECULE"){ if (!molStart){ //we reach that point in a multimol2 when we have not seen a @MOLECULE //and set all important flags molStart = inStream->tellg(); } else{ break; } } else if(firstToken=="@ATOM"){ atomStart = inStream->tellg(); } else if(firstToken=="@BOND"){ bondStart = inStream->tellg(); } else if (firstToken=="@UNITY_ATOM_ATTR"){ //tripos dbtranslate will write not Tripos conform atom types for various aromatic atoms //this results in problems with the formal charge guesser. Hence, if we find the //UNITY_ATOM_ATTR that contains formal charges we will read those and skip the charger and //the substructure cleanup chargeStart = inStream->tellg(); }//end if seenMolBefore }//end if @ }//end while //we should reach this point with at least the molStart and atomStart set if(!molStart){ throw FileParseException("No MOLECULE block found in Mol2 data"); } if(!atomStart){ throw FileParseException("No ATOM block found in Mol2 data"); } if(inStream->eof()){ inStream->clear(); } inStream->seekg(molStart,std::ios::beg); tempStr = getLine(inStream); RWMol *res = new RWMol(); boost::trim_right(tempStr); res->setProp(common_properties::_Name,tempStr); tempStr = getLine(inStream); tokenizer tokens(tempStr,sep); if(tokens.begin()==tokens.end()){ if(res){ delete res; } throw FileParseException("Empty counts line"); } unsigned int nAtoms=0,nBonds=0; tokenizer::const_iterator itemIt = tokens.begin(); // counts line, this is where we really get started try{ nAtoms = boost::lexical_cast(*itemIt); ++itemIt; if(itemIt!=tokens.end()){ nBonds = boost::lexical_cast(*itemIt); } } catch(boost::bad_lexical_cast &){ if(res){ delete res; } std::ostringstream errout; errout << "Cannot convert " << *itemIt << " to unsigned int"; throw FileParseException(errout.str()) ; } if(nAtoms==0){ if(res){ delete res; } throw FileParseException("molecule has no atoms"); } tempStr = getLine(inStream);//mol_type - ignore tempStr = getLine(inStream); boost::trim(tempStr); res->setProp("_TriposChargeType",tempStr); //stop here since we don't support anything else from the MOLECULE block INT_VECT idxCorresp(nAtoms,-1); inStream->seekg(atomStart,std::ios::beg); try{ ParseMol2AtomBlock(inStream,res,nAtoms,idxCorresp); } catch(const FileParseException &e){ if(res){ delete res; } throw e; } if(nBonds){ //stop here since we don't support anything else from the MOLECULE block inStream->seekg(bondStart,std::ios::beg); try{ ParseMol2BondBlock(inStream,res,nBonds,idxCorresp); } catch(const FileParseException &e){ if (res){ delete res; } throw e; } } if (!chargeStart){ bool molFixed = cleanUpMol2Substructures(res); if(!molFixed){ delete res; return NULL; } //mol2 format does not support formal charge information, hence we need to guess it //based on default and explicit valences guessFormalCharges(res); }else{ inStream->seekg(chargeStart,std::ios::beg); try { readFormalChargesFromAttr(inStream,res); }catch(const FileParseException &e){ if (res){ delete res; } throw e; } } //set chirality prior to sanitisation since it happens from 3D and it's not //possible anymore once the hydrogens are removed //FIX: for now this is only for the first conformer - need to be changed once we //use multiconformer files MolOps::assignChiralTypesFrom3D(*res); if (res && sanitize ) { MolOps::cleanUp(*res); try { if(removeHs){ MolOps::removeHs(*res,false,false); } else { MolOps::sanitizeMol(*res); } // call DetectBondStereoChemistry after sanitization "because we need // the ring information". Also this will set the E/Z labels on the bond. // Similar in spirit to what happens in MolFileParser const Conformer &conf = res->getConformer(); DetectBondStereoChemistry(*res, &conf); } catch (MolSanitizeException &se){ BOOST_LOG(rdWarningLog)<<"sanitise "; std::string molName; res->getProp(common_properties::_Name,molName); BOOST_LOG(rdWarningLog)<updatePropertyCache(false); MolOps::assignStereochemistry(*res,true,true); } return res; }; RWMol *Mol2DataStreamToMol(std::istream &inStream, bool sanitize, bool removeHs, Mol2Type variant){ RDUNUSED_PARAM(variant); return Mol2DataStreamToMol(&inStream,sanitize,removeHs); }; //------------------------------------------------ // // Read a molecule from a string // //------------------------------------------------ RWMol *Mol2BlockToMol(const std::string &molBlock, bool sanitize, bool removeHs, Mol2Type variant){ RDUNUSED_PARAM(variant); std::istringstream inStream(molBlock); return Mol2DataStreamToMol(inStream, sanitize, removeHs); } //------------------------------------------------ // // Read a molecule from a file // //------------------------------------------------ RWMol *Mol2FileToMol(const std::string &fName, bool sanitize, bool removeHs, Mol2Type variant){ // FIX: this binary mode of opening file is here because of a bug in VC++ 6.0 // the function "tellg" does not work correctly if we do not open it this way // Jan 2009: Confirmed that this is still the case in visual studio 2008 RDUNUSED_PARAM(variant); std::ifstream inStream(fName.c_str(),std::ios_base::binary); if(!inStream || (inStream.bad())){ std::ostringstream errout; errout << "Bad input file " << fName; throw BadFileException(errout.str()); } RWMol *res=NULL; if(!inStream.eof()){ res=Mol2DataStreamToMol(inStream, sanitize, removeHs); } return res; } }