Files
rdkit/Code/GraphMol/FileParsers/Mol2FileParser.cpp
2015-11-14 14:58:11 +01:00

1025 lines
37 KiB
C++

// $Id: Mol2FileParser.cpp 1457 2009-04-03 09:05:17Z landrgr1 $
//
// Copyright (c) 2008, Novartis Institutes for BioMedical Research Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following
// disclaimer in the documentation and/or other materials provided
// with the distribution.
// * Neither the name of Novartis Institutes for BioMedical Research Inc.
// nor the names of its contributors may be used to endorse or promote
// products derived from this software without specific prior
// written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// created by Nik Stiefl May 2008
// this file is heavily based on glandrum's MolFileParser
//
#include "FileParsers.h"
#include "MolFileStereochem.h"
#include <RDGeneral/Invariant.h>
#include <GraphMol/RDKitQueries.h>
#include <RDGeneral/StreamOps.h>
#include <RDGeneral/RDLog.h>
//
#include <fstream>
#include <boost/tokenizer.hpp>
#include <boost/lexical_cast.hpp>
#include <boost/algorithm/string/trim.hpp>
#include <boost/dynamic_bitset.hpp>
#include <RDGeneral/FileParseException.h>
#include <RDGeneral/BadFileException.h>
#include <RDGeneral/LocaleSwitcher.h>
namespace RDKit {
namespace {
void fixNitroSubstructureAndCharge(RWMol *res, unsigned int atIdx) {
unsigned int noODblNeighbors = 0;
ROMol::ADJ_ITER nbrIdxIt, nbrEndIdxIt;
unsigned int toModIdx = 0;
boost::tie(nbrIdxIt, nbrEndIdxIt) =
res->getAtomNeighbors(res->getAtomWithIdx(atIdx));
while (nbrIdxIt != nbrEndIdxIt) {
Bond *curBond = res->getBondBetweenAtoms(atIdx, *nbrIdxIt);
if (res->getAtomWithIdx(*nbrIdxIt)->getAtomicNum() == 8 &&
curBond->getBondType() == Bond::DOUBLE) {
++noODblNeighbors;
toModIdx = *nbrIdxIt;
}
++nbrIdxIt;
}
if (noODblNeighbors == 2) {
res->getBondBetweenAtoms(atIdx, toModIdx)->setBondType(Bond::SINGLE);
res->getAtomWithIdx(atIdx)->setFormalCharge(1);
res->getAtomWithIdx(toModIdx)->setFormalCharge(-1);
}
}
void readFormalChargesFromAttr(std::istream *inStream, RWMol *res) {
PRECONDITION(inStream, "inStream not valid");
PRECONDITION(!inStream->eof(), "inStream is at eof");
PRECONDITION(res, "RWMol not valid");
typedef boost::tokenizer<boost::char_separator<char> > tokenizer;
boost::char_separator<char> sep(" \t\n");
bool readNextAtomAttribs = true;
unsigned int atomIdx = 0, noAtomAttr = 0;
// std::streampos stPos = inStream->tellg();
std::string tempStr = getLine(inStream);
// there needs to be at least one entry
if (inStream->eof()) {
throw FileParseException("premature EOF in readFormalCharges");
}
while (readNextAtomAttribs) {
tokenizer tokens(tempStr, sep);
tokenizer::const_iterator itemIt = tokens.begin();
try {
atomIdx = boost::lexical_cast<unsigned int>(*itemIt);
++itemIt;
noAtomAttr = boost::lexical_cast<unsigned int>(*itemIt);
} catch (boost::bad_lexical_cast &) {
throw FileParseException("Cannot process mol2 UnityAtomAttr.");
}
for (unsigned int i = 0; i < noAtomAttr; ++i) {
std::string tempStr = getLine(inStream);
if (inStream->eof()) {
throw FileParseException("premature EOF in readFormalCharges");
}
tokenizer atmTokens(tempStr, sep);
itemIt = atmTokens.begin();
if (*itemIt == "AtomExpr") {
int formCharge = 0;
++itemIt;
// FIX:what if an atom has multiple properties? Seems like they might be
// separated
// with ";" ... need to look at that in more detail!
if ((*itemIt).find("=") == std::string::npos) {
try {
formCharge = boost::lexical_cast<int>(*itemIt);
} catch (boost::bad_lexical_cast &) {
readNextAtomAttribs = false;
throw FileParseException("Cannot process mol2 formal charge.");
}
// assign the charge
res->getAtomWithIdx(atomIdx - 1)->setFormalCharge(formCharge);
}
}
} // endfor
// check if we have finished reading UNITY_ATOM_ATTR
if (inStream->eof()) {
readNextAtomAttribs = false;
} else {
tempStr = getLine(inStream);
if (tempStr == "" || tempStr[0] == '@' || tempStr[0] == '#') {
readNextAtomAttribs = false;
}
}
}
}
void guessFormalCharges(RWMol *res) {
// FIX: this whole thing has problems with positively charged pyridines et al.
for (RWMol::AtomIterator atomIt = res->beginAtoms();
atomIt != res->endAtoms(); ++atomIt) {
Atom *at = (*atomIt);
// assign only if no formal charge set on that atom and atom is not carbon
// (the latter
// might needs changing later on - let's see) and not for query atoms (dummy
// etc.)
// since this happens only during cleanup of bad substructures
// FIX: check nitro compounds et al.
if (at->getFormalCharge() == 0 && at->getSymbol() != "C" &&
!(at->hasQuery())) {
// have to calculate the explicit valence without call to
// calcExplicitValence since
// this will barf when I have e.g. an uncharged 4 valent nitrogen ...
int noAromBonds = 0;
double accum = 0; // FIX: could this give non int values ?
ROMol::OEDGE_ITER beg, end;
boost::tie(beg, end) = res->getAtomBonds(at);
while (beg != end) {
accum += (*res)[*beg]->getValenceContrib(at);
if ((*res)[*beg]->getBondType() == Bond::AROMATIC) {
++noAromBonds;
}
++beg;
}
// Assumption: if there is an aromatic bridge atom the accum will be 4.5
//(three aromatic bonds), e.g. naphthalenes. However those are not charged
// so we
// can stop here
// FIX: a structure such as c12cccc[n+]2cccn1 will not work if we just
// continue
// for noAromBonds>2
// special case checking - if atom c then go on
if (noAromBonds > 2 && at->getSymbol() == "C") {
continue;
}
// dbtranslate problems - for mols with no UNITY_ATOM_ATTR set - we won't
// be able to guess
// if this mol needs to be guessed or not ... hence, we don't guess on
// atoms with
// ar specification and not atom type X.ar and in 5-membered ring (there
// is stuff like
// c1cc[o+]cc1 that should be charged
// e.g. 5ring with N.pl3 as NH atom or other atoms without ar
// specification in aromatic ring
// FIX: do we need make sure this only happens for atoms in ring?
std::string tATT;
at->getProp(common_properties::_TriposAtomType, tATT);
MolOps::findSSSR(*res);
if (tATT.find("ar") == std::string::npos && at->getIsAromatic() &&
res->getRingInfo()->isAtomInRingOfSize(at->getIdx(), 5)) {
continue;
}
// for dbtranslate a problem will also occur for N.ar with 3 aromatic
// bonds and no charge
// assigned for these we don't assign charges (corina will create
// kekulized input for this
//(at least in most cases) - anyway, throw a warning!
if (noAromBonds == 3 && tATT == "N.ar") {
std::string nm;
res->getProp(common_properties::_Name, nm);
BOOST_LOG(rdWarningLog)
<< nm << ": warning - aromatic N with 3 aromatic bonds - "
"skipping charge guess for this atom" << std::endl;
continue;
}
// sometimes things like benzimidazoles can have only one bond of the
// imidazole ring as aromatic and the other one as a single bond ...
// catch that this way - see also the trick from GL
int expVal = static_cast<int>(round(accum + 0.1));
const INT_VECT &valens =
PeriodicTable::getTable()->getValenceList(at->getAtomicNum());
INT_VECT_CI vi;
// check default valence and compare to expVal - chg
// the hypothesis is that we prefer positively charged atoms over
// negatively charged ones
// for multi default valence atoms (e.g. CS(O)(O) should end up being
// C[S+]([O-])[O-] rather
// than C[S-][O-][O-] but that might change based no different examples
int nElectrons =
PeriodicTable::getTable()->getNouterElecs(at->getAtomicNum());
int assignChg;
if (nElectrons >= 4)
assignChg = expVal - (*valens.begin());
else
assignChg = (*valens.begin()) - expVal;
if (assignChg > 0 && nElectrons >= 4) {
for (vi = valens.begin(); vi != valens.end(); ++vi) {
// Since we do this only for nocharged atoms we can get away without
// including the
// charge into this
// apart from that we do not assign charges higher than +/- 1 for
// atoms with multiple valence states
// otherwise the early break would have to go away which in turn would
// result in things like [S+4] for
// sulfonamides
assignChg = expVal - (*vi);
if ((*vi) <= expVal && abs(assignChg) < 2) {
break;
}
}
}
if (assignChg) {
// no aromatic atom will get aa abs(charge) > 1
if (at->getIsAromatic() && abs(assignChg) > 1) {
at->setFormalCharge((assignChg > 0) -
(assignChg < 0)); // this results in -1 or +1
} else {
at->setFormalCharge(assignChg);
}
// corina will create strange nitro groups which look like N(=O)(=O)
// which in turn will result in
//[N+2](=O)(=O) this needs to be fixed at this stage since otherwise we
// would have to check on all
// N.pl3 during the cleanup substructures step
if (assignChg == 2 && expVal == 5 && at->getSymbol() == "N") {
fixNitroSubstructureAndCharge(res, at->getIdx());
}
// what if expVal != at->calcExplicitValence();?
// cannot imagine a case where that will happen now.
}
}
}
}
unsigned int chkNoHNeighbNOx(RWMol *res, ROMol::ADJ_ITER atIdxIt,
int &toModIdx) {
Atom *at = res->getAtomWithIdx(*atIdxIt);
unsigned int noHNbrs = 0;
ROMol::ADJ_ITER nbrIdxIt, nbrEndIdxIt;
boost::tie(nbrIdxIt, nbrEndIdxIt) = res->getAtomNeighbors(at);
while (nbrIdxIt != nbrEndIdxIt) {
if (res->getAtomWithIdx(*nbrIdxIt)->getAtomicNum() == 1) {
++noHNbrs;
} else if (res->getAtomWithIdx(*nbrIdxIt)->getAtomicNum() == 8 &&
res->getAtomDegree(res->getAtomWithIdx(*nbrIdxIt)) == 1) {
// this is a N in an N-oxide constellation
// we can do the above if clause since mol2 have explicit hydrogens
toModIdx = *atIdxIt;
}
++nbrIdxIt;
}
return noHNbrs;
}
bool cleanUpMol2Substructures(RWMol *res) {
// NOTE: check the nitro fix in guess formal charges!
boost::dynamic_bitset<> isFixed(res->getNumAtoms());
for (ROMol::AtomIterator atIt = res->beginAtoms(); atIt != res->endAtoms();
++atIt) {
std::string tAT;
Atom *at = *atIt;
unsigned int idx = at->getIdx();
at->getProp(common_properties::_TriposAtomType, tAT);
if (tAT == "N.4") {
at->setFormalCharge(1);
} else if (tAT == "O.co2") {
// negatively charged carboxylates with O.co2
// according to Tripos, those should only appear in carboxylates and
// phosphates,
// FIX: do it also for phsopahtes and sulphates ...
if (at->getDegree() != 1) {
BOOST_LOG(rdWarningLog) << "Warning - O.co2 with degree >1."
<< std::endl;
return false;
}
ROMol::ADJ_ITER nbrIdxIt, endNbrsIdxIt;
// getAtomNeighbours returns 2 adjacency iterators (index iterators)
boost::tie(nbrIdxIt, endNbrsIdxIt) = res->getAtomNeighbors(at);
// this should return only the C.2
Atom *nbr = res->getAtomWithIdx(*nbrIdxIt);
std::string tATT;
nbr->getProp(common_properties::_TriposAtomType, tATT);
// carboxylates
if (tATT == "C.2" || tATT == "S.o2") {
// this should return only the bond between C.2 and O.co2
Bond *b = res->getBondBetweenAtoms(idx, *nbrIdxIt);
if (!isFixed[*nbrIdxIt]) {
// the first occurence is negatively charged and has a single bond
b->setBondType(Bond::SINGLE);
b->setIsAromatic(false);
at->setFormalCharge(-1);
at->setIsAromatic(false);
nbr->setIsAromatic(false);
isFixed[idx] = 1;
isFixed[*nbrIdxIt] = 1;
} else {
// the other occurences are not charged and have a double bond
b->setBondType(Bond::DOUBLE);
b->setIsAromatic(false);
at->setIsAromatic(false);
isFixed[idx] = 1;
}
} else {
std::string nm;
res->getProp(common_properties::_Name, nm);
BOOST_LOG(rdWarningLog)
<< nm << ": warning - O.co2 with non C.2 or S.o2 neighbor."
<< std::endl;
return false;
}
} else if (tAT == "C.cat") {
// positively charged guanidinium groups with C.cat
// according to Tripos these should only appear in guanidinium groups
// for the structural fix - the last nitrogen with the least number of
// heavy atoms will get the double bond and the positive charge.
// remember : this is not canonical!
// first - set the C.cat as fixed
isFixed[idx] = 1;
ROMol::ADJ_ITER nbrIdxIt, endNbrsIdxIt, tmpIdxIt;
unsigned int lowestDeg = 100;
boost::tie(nbrIdxIt, endNbrsIdxIt) = res->getAtomNeighbors(at);
// one problem of programs like Corina is, that they will create also
// C.cat
// for groups that are not guanidinium. We cannot fix all, but the charged
// amidine
// in a ring is taken care of too.
tmpIdxIt = nbrIdxIt;
// declare and initialise toModIdx
int toModIdx = -1;
unsigned int noNNeighbors = 0;
while (tmpIdxIt != endNbrsIdxIt) {
if (res->getAtomWithIdx(*tmpIdxIt)->getSymbol() == "N") {
++noNNeighbors;
}
++tmpIdxIt;
}
if (noNNeighbors < 2 || noNNeighbors > 3) {
std::string nm;
res->getProp(common_properties::_Name, nm);
BOOST_LOG(rdWarningLog)
<< nm << ": Error - C.Cat with bad number of N neighbors."
<< std::endl;
return false;
} else if (noNNeighbors == 2) {
// the idea is that we assign the positive charge according to the
// following precedence:
// 1. is part of N-oxide
// 2. atom with highest number of hydrogen atoms
// 3. atom in ring
// 4. random
// first we identify the N atoms
ROMol::ADJ_ITER idxIt1 = nbrIdxIt, idxIt2 = nbrIdxIt;
bool firstIdent = false;
while (nbrIdxIt != endNbrsIdxIt) {
if (res->getAtomWithIdx(*nbrIdxIt)->getSymbol() == "N") {
// fix the bond to one - only the modified N will have a double bond
// to C.cat
res->getBondBetweenAtoms(idx, *nbrIdxIt)->setBondType(Bond::SINGLE);
res->getBondBetweenAtoms(idx, *nbrIdxIt)->setIsAromatic(false);
res->getAtomWithIdx(*nbrIdxIt)->setIsAromatic(false);
// FIX: what is happening if we hit an atom that was fixed before -
// propably nothing.
// since I cannot think of a case where this is a problem - throw a
// warning
if (isFixed[*nbrIdxIt]) {
std::string nm;
res->getProp(common_properties::_Name, nm);
BOOST_LOG(rdWarningLog)
<< nm << ": warning - charged amidine and isFixed atom."
<< std::endl;
}
isFixed[*nbrIdxIt] = 1;
if (firstIdent) {
idxIt2 = nbrIdxIt;
} else {
idxIt1 = nbrIdxIt;
firstIdent = true;
}
}
++nbrIdxIt;
}
// now that we know which are the relevant atoms we check the above
// features
// is part of N-oxide?
// number of hydrogens on each neighbour
unsigned int noHNbrs1 = chkNoHNeighbNOx(res, idxIt1, toModIdx);
unsigned int noHNbrs2 = chkNoHNeighbNOx(res, idxIt2, toModIdx);
if (toModIdx < 0) {
// no N-oxide
if (noHNbrs1 != noHNbrs2) {
if (noHNbrs1 > noHNbrs2) {
toModIdx = *idxIt1;
} else {
toModIdx = *idxIt2; // this is random if both have the same
// number of atoms
}
} else {
// perceive the rings
MolOps::findSSSR(*res);
// then we check if both atoms are in a ring
unsigned int rIdx1 = res->getRingInfo()->numAtomRings((*idxIt1));
unsigned int rIdx2 = res->getRingInfo()->numAtomRings((*idxIt2));
if (rIdx1 > rIdx2) {
toModIdx = *idxIt1;
} else {
toModIdx = *idxIt2;
}
}
}
res->getBondBetweenAtoms(idx, toModIdx)->setBondType(Bond::DOUBLE);
res->getBondBetweenAtoms(idx, toModIdx)->setIsAromatic(false);
res->getAtomWithIdx(toModIdx)->setFormalCharge(1);
at->setIsAromatic(false);
} else {
while (nbrIdxIt != endNbrsIdxIt) {
if (!isFixed[*nbrIdxIt]) {
// we get in here if this N.pl3 was not seen / fixed before
Atom *nbr = res->getAtomWithIdx(*nbrIdxIt);
// get the number of heavy atoms connected to this atom
ROMol::ADJ_ITER nbrNbrIdxIt, nbrEndNbrsIdxIt;
unsigned int hvyAtDeg = 0;
boost::tie(nbrNbrIdxIt, nbrEndNbrsIdxIt) =
res->getAtomNeighbors(nbr);
while (nbrNbrIdxIt != nbrEndNbrsIdxIt) {
if (res->getAtomWithIdx(*nbrNbrIdxIt)->getAtomicNum() > 1) {
std::string nbrAT;
res->getAtomWithIdx(*nbrNbrIdxIt)
->getProp(common_properties::_TriposAtomType, nbrAT);
if (nbrAT == "C.cat") {
hvyAtDeg += 2; // that way we reduce the risk of ionising the
// N attached to another C.cat ...
} else {
++hvyAtDeg;
}
}
++nbrNbrIdxIt;
}
// now check for lowest heavy atom degree
if (hvyAtDeg < lowestDeg) {
toModIdx = *nbrIdxIt;
lowestDeg = hvyAtDeg;
}
// modify the bond between C.Cat and the N.pl3
Bond *b = res->getBondBetweenAtoms(idx, *nbrIdxIt);
b->setBondType(Bond::SINGLE);
b->setIsAromatic(false);
nbr->setIsAromatic(false);
// set N.pl3 as fixed
isFixed[*nbrIdxIt] = 1;
} else {
// the N is allready fixed - since we don't touch this atom make the
// bond to single
// FIX: check on 3-way symmetric guanidinium mol -
// this could produce a only single bonded C.cat for bad H mols
res->getBondBetweenAtoms(idx, *nbrIdxIt)->setBondType(Bond::SINGLE);
res->getBondBetweenAtoms(idx, *nbrIdxIt)->setIsAromatic(false);
}
++nbrIdxIt;
}
// now modify the respective N and the C.cat
Bond *b = res->getBondBetweenAtoms(idx, toModIdx);
b->setBondType(Bond::DOUBLE);
b->setIsAromatic(false);
res->getAtomWithIdx(toModIdx)->setFormalCharge(1);
at->setIsAromatic(false);
}
}
idx++;
}
return true;
}
Atom *ParseMol2FileAtomLine(const std::string atomLine, RDGeom::Point3D &pos) {
typedef boost::tokenizer<boost::char_separator<char> > tokenizer;
boost::char_separator<char> sep(" \t\n");
std::string tAN, tAT;
tokenizer tokens(atomLine, sep);
tokenizer::const_iterator itemIt = tokens.begin();
if (itemIt == tokens.end()) {
throw FileParseException("no info in mol2 atom line");
}
Atom *res = new Atom();
// skip TriposAtomId
++itemIt;
if (itemIt == tokens.end()) {
throw FileParseException("premature end of mol2 atom line");
}
// the sybyl atom name does not necessarily make sense - into atom property
tAN = *itemIt;
++itemIt;
if (itemIt == tokens.end()) {
throw FileParseException("premature end of mol2 atom line");
}
try {
pos.x = boost::lexical_cast<double>(*itemIt);
++itemIt;
if (itemIt == tokens.end()) {
throw FileParseException("premature end of mol2 atom line");
}
pos.y = boost::lexical_cast<double>(*itemIt);
++itemIt;
if (itemIt == tokens.end()) {
throw FileParseException("premature end of mol2 atom line");
}
pos.z = boost::lexical_cast<double>(*itemIt);
++itemIt;
if (itemIt == tokens.end()) {
throw FileParseException("premature end of mol2 atom line");
}
} catch (boost::bad_lexical_cast &) {
throw FileParseException("Cannot process mol2 coordinates.");
}
// now it becomes interesting - this is the SYBYL atom type. I put this into
// an atom property and deduce the
// symbol from that (everything before the dot)
tAT = *itemIt;
std::string symb = (*itemIt).substr(0, (*itemIt).find('.'));
// bad symbols:
// LP is not an atom so remove it ...
if (symb == "LP") {
delete res;
return NULL;
} else if (symb == "ANY" || symb == "Du") {
// queryAtoms
// according to the SYBYL spec, these match anything
QueryAtom *query = new QueryAtom(0);
query->setQuery(makeAtomNullQuery());
delete res;
res = query;
} else if (symb == "HEV") {
QueryAtom *query = new QueryAtom(1);
query->getQuery()->setNegation(true);
delete res;
res = query;
} else if (symb == "HET") {
// Tripos: N,O,P,S
QueryAtom *query = new QueryAtom(7);
query->expandQuery(makeAtomNumQuery(8), Queries::COMPOSITE_OR, true);
query->expandQuery(makeAtomNumQuery(15), Queries::COMPOSITE_OR, true);
query->expandQuery(makeAtomNumQuery(16), Queries::COMPOSITE_OR, true);
delete res;
res = query;
} else if (symb == "HAL") {
// Tripos: F,Cl,Br,I
QueryAtom *query = new QueryAtom(9);
query->expandQuery(makeAtomNumQuery(17), Queries::COMPOSITE_OR, true);
query->expandQuery(makeAtomNumQuery(35), Queries::COMPOSITE_OR, true);
query->expandQuery(makeAtomNumQuery(53), Queries::COMPOSITE_OR, true);
delete res;
res = query;
} else {
res->setAtomicNum(PeriodicTable::getTable()->getAtomicNumber(symb));
}
// now assign the properties
res->setProp("_TriposAtomName", tAN); // maybe remove that since it's
// useless?
res->setProp(common_properties::_TriposAtomType, tAT);
// no implicit hydrogens for mol2 files
res->setNoImplicit(true);
// up to here the fields must be written - check if we have more
// next comes the subst_id, subst_name - skip those
++itemIt;
if (itemIt == tokens.end()) {
return res;
}
++itemIt;
if (itemIt == tokens.end()) {
return res;
}
++itemIt;
// the Partial charge in the file
if (itemIt != tokens.end()) {
res->setProp("_TriposPartialCharge", *itemIt);
}
// we skip the status bit ...
return res;
}
Bond *ParseMol2FileBondLine(const std::string bondLine,
const INT_VECT &idxCorresp) {
unsigned int idx1, idx2;
typedef boost::tokenizer<boost::char_separator<char> > tokenizer;
boost::char_separator<char> sep(" \t\n");
tokenizer tokens(bondLine, sep);
tokenizer::const_iterator itemIt = tokens.begin();
if (itemIt == tokens.end()) {
throw FileParseException("no info in mol2 bond line");
}
try {
// tripos bond id skip
++itemIt;
if (itemIt == tokens.end()) {
throw FileParseException("no info in mol2 bond line");
}
idx1 = boost::lexical_cast<unsigned int>(*itemIt);
++itemIt;
if (itemIt == tokens.end()) {
throw FileParseException("no info in mol2 bond line");
}
idx2 = boost::lexical_cast<unsigned int>(*itemIt);
++itemIt;
if (itemIt == tokens.end()) {
throw FileParseException("no info in mol2 bond line");
}
} catch (boost::bad_lexical_cast &) {
throw FileParseException("Cannot process mol2 bonds.");
}
// adjust the numbering
idx1--;
idx2--;
// if either of both ends of the bond is not an atom in the mol - return NULL
if (!(idx1 < idxCorresp.size() || idx2 < idxCorresp.size())) {
throw FileParseException("index mismatch");
}
if (idxCorresp[idx1] < 0 || idxCorresp[idx2] < 0) {
return NULL;
}
// lexical casts of bond types is not really useful in this case since we
// could also have strings as values ...
// use if/else if/end statements for now - maybe change to mapping later - no
// idea if it's worth it ...?
Bond::BondType type;
std::string tBType = *itemIt;
if (tBType == "1" || tBType == "am") {
type = Bond::SINGLE;
} else if (tBType == "2") {
type = Bond::DOUBLE;
} else if (tBType == "3") {
type = Bond::TRIPLE;
} else if (tBType == "ar") {
type = Bond::AROMATIC;
} else if (tBType == "du" || tBType == "un") {
type = Bond::UNSPECIFIED; // have to come back here - see if there is any
// comment in th documentation ...
} else {
// this happens only if some weird thing is in the file or if we encounter a
// "nc" - not connected ...
// but why would anyone specify a bond which is not connected?
BOOST_LOG(rdWarningLog) << "Warning - unsupported bond type: " << tBType
<< " ignored!" << std::endl;
return NULL;
}
Bond *res = new Bond(type);
res->setBeginAtomIdx(idxCorresp[idx1]);
res->setEndAtomIdx(idxCorresp[idx2]);
return res;
}
void ParseMol2AtomBlock(std::istream *inStream, RWMol *res, unsigned int nAtoms,
INT_VECT &idxCorresp) {
PRECONDITION(inStream, "inStream not valid");
PRECONDITION(!inStream->eof(), "inStream is at eof");
PRECONDITION(res, "RWMol not valid");
PRECONDITION(idxCorresp.size() == nAtoms, "vector size mismatch");
unsigned int nLP = 0;
bool hasHAtoms = false;
std::vector<RDGeom::Point3D> threeDPs;
threeDPs.reserve(nAtoms);
for (unsigned int i = 0; i < nAtoms; ++i) {
std::string tempStr = getLine(inStream);
if (inStream->eof()) {
throw FileParseException("premature EOF");
}
RDGeom::Point3D pos;
Atom *atom = ParseMol2FileAtomLine(tempStr, pos);
// if atom is NULL then we hit LP
if (atom) {
int aid = res->addAtom(atom, false, true);
idxCorresp[i] = aid;
threeDPs.push_back(pos);
if (atom->getSymbol() == "H") {
hasHAtoms = true;
}
} else {
++nLP;
}
}
// mol2 files need to have hydrogen atoms otherwise formal charge estimation
// will be problematic
if (!hasHAtoms) {
std::string nm;
res->getProp(common_properties::_Name, nm);
BOOST_LOG(rdWarningLog) << nm << ": Warning - no explicit hydrogens in "
"mol2 file but needed for formal charge "
"estimation." << std::endl;
}
// create conformer based on 3DPoints and add to RWMol
Conformer *conf = new Conformer(nAtoms - nLP);
std::vector<RDGeom::Point3D>::const_iterator threeDPsIt = threeDPs.begin();
for (unsigned int i = 0; i < threeDPs.size(); ++i) {
conf->setAtomPos(i, *threeDPsIt);
++threeDPsIt;
}
res->addConformer(conf, true);
POSTCONDITION(res->getNumAtoms() == (nAtoms - nLP),
"Wrong number of atoms in molecule");
}
void ParseMol2BondBlock(std::istream *inStream, RWMol *res, unsigned int nBonds,
const INT_VECT &idxCorresp) {
PRECONDITION(inStream, "inStream not valid");
PRECONDITION(!inStream->eof(), "inStream is at eof");
PRECONDITION(res, "RWMol not valid");
unsigned int nBadBonds = 0;
for (unsigned int i = 0; i < nBonds; ++i) {
std::string tempStr = getLine(inStream);
if (inStream->eof()) {
throw FileParseException("premature EOF");
}
Bond *bond = ParseMol2FileBondLine(tempStr, idxCorresp);
// if something weird happened there will be no bond for that line
if (bond) {
// if we got an aromatic bond set the flag on the bond and the connected
// atoms
if (bond->getBondType() == Bond::AROMATIC) {
bond->setIsAromatic(true);
res->getAtomWithIdx(bond->getBeginAtomIdx())->setIsAromatic(true);
res->getAtomWithIdx(bond->getEndAtomIdx())->setIsAromatic(true);
}
res->addBond(bond, true);
} else {
nBadBonds++;
}
}
POSTCONDITION(res->getNumBonds() == (nBonds - nBadBonds),
"Wrong number of atoms in molecule");
}
}; // end of anonymous namespace
//------------------------------------------------
//
// Read a molecule from a stream
//
//------------------------------------------------
RWMol *Mol2DataStreamToMol(std::istream *inStream, bool sanitize, bool removeHs,
Mol2Type variant) {
RDUNUSED_PARAM(variant);
PRECONDITION(inStream, "no stream");
std::string tempStr, lineBeg;
typedef boost::tokenizer<boost::char_separator<char> > tokenizer;
boost::char_separator<char> sep(" \t\n");
Utils::LocaleSwitcher ls;
// all molecules start with a @<TRIPOS>MOLECULE! There is no other way to
// define an end of
// molecule than to find a new one or an eof. Hence I have to read until I
// find one of the two ...
std::streampos molStart = 0, atomStart = 0, bondStart = 0, chargeStart = 0;
while (!inStream->eof()) {
tempStr = getLine(inStream);
if (inStream->eof()) {
break;
}
if (tempStr != "" && tempStr[0] == '@') {
tokenizer tokens(tempStr, sep);
std::string firstToken = *tokens.begin();
if (firstToken == "@<TRIPOS>MOLECULE") {
if (!molStart) {
// we reach that point in a multimol2 when we have not seen a
// @<TRIPOS>MOLECULE
// and set all important flags
molStart = inStream->tellg();
} else {
break;
}
} else if (firstToken == "@<TRIPOS>ATOM") {
atomStart = inStream->tellg();
} else if (firstToken == "@<TRIPOS>BOND") {
bondStart = inStream->tellg();
} else if (firstToken == "@<TRIPOS>UNITY_ATOM_ATTR") {
// tripos dbtranslate will write not Tripos conform atom types for
// various aromatic atoms
// this results in problems with the formal charge guesser. Hence, if we
// find the
// UNITY_ATOM_ATTR that contains formal charges we will read those and
// skip the charger and
// the substructure cleanup
chargeStart = inStream->tellg();
} // end if seenMolBefore
} // end if @
} // end while
// we should reach this point with at least the molStart and atomStart set
if (!molStart) {
throw FileParseException("No MOLECULE block found in Mol2 data");
}
if (!atomStart) {
throw FileParseException("No ATOM block found in Mol2 data");
}
if (inStream->eof()) {
inStream->clear();
}
inStream->seekg(molStart, std::ios::beg);
tempStr = getLine(inStream);
RWMol *res = new RWMol();
boost::trim_right(tempStr);
res->setProp(common_properties::_Name, tempStr);
tempStr = getLine(inStream);
tokenizer tokens(tempStr, sep);
if (tokens.begin() == tokens.end()) {
if (res) {
delete res;
}
throw FileParseException("Empty counts line");
}
unsigned int nAtoms = 0, nBonds = 0;
tokenizer::const_iterator itemIt = tokens.begin();
// counts line, this is where we really get started
try {
nAtoms = boost::lexical_cast<unsigned int>(*itemIt);
++itemIt;
if (itemIt != tokens.end()) {
nBonds = boost::lexical_cast<unsigned int>(*itemIt);
}
} catch (boost::bad_lexical_cast &) {
if (res) {
delete res;
}
std::ostringstream errout;
errout << "Cannot convert " << *itemIt << " to unsigned int";
throw FileParseException(errout.str());
}
if (nAtoms == 0) {
if (res) {
delete res;
}
throw FileParseException("molecule has no atoms");
}
tempStr = getLine(inStream); // mol_type - ignore
tempStr = getLine(inStream);
boost::trim(tempStr);
res->setProp("_TriposChargeType", tempStr);
// stop here since we don't support anything else from the MOLECULE block
INT_VECT idxCorresp(nAtoms, -1);
inStream->seekg(atomStart, std::ios::beg);
try {
ParseMol2AtomBlock(inStream, res, nAtoms, idxCorresp);
} catch (const FileParseException &e) {
if (res) {
delete res;
}
throw e;
}
if (nBonds) {
// stop here since we don't support anything else from the MOLECULE block
inStream->seekg(bondStart, std::ios::beg);
try {
ParseMol2BondBlock(inStream, res, nBonds, idxCorresp);
} catch (const FileParseException &e) {
if (res) {
delete res;
}
throw e;
}
}
if (!chargeStart) {
bool molFixed = cleanUpMol2Substructures(res);
if (!molFixed) {
delete res;
return NULL;
}
// mol2 format does not support formal charge information, hence we need to
// guess it
// based on default and explicit valences
guessFormalCharges(res);
} else {
inStream->seekg(chargeStart, std::ios::beg);
try {
readFormalChargesFromAttr(inStream, res);
} catch (const FileParseException &e) {
if (res) {
delete res;
}
throw e;
}
}
// set chirality prior to sanitisation since it happens from 3D and it's not
// possible anymore once the hydrogens are removed
// FIX: for now this is only for the first conformer - need to be changed once
// we
// use multiconformer files
MolOps::assignChiralTypesFrom3D(*res);
if (res && sanitize) {
MolOps::cleanUp(*res);
try {
if (removeHs) {
MolOps::removeHs(*res, false, false);
} else {
MolOps::sanitizeMol(*res);
}
// call DetectBondStereoChemistry after sanitization "because we need
// the ring information". Also this will set the E/Z labels on the bond.
// Similar in spirit to what happens in MolFileParser
const Conformer &conf = res->getConformer();
DetectBondStereoChemistry(*res, &conf);
} catch (MolSanitizeException &se) {
BOOST_LOG(rdWarningLog) << "sanitise ";
std::string molName;
res->getProp(common_properties::_Name, molName);
BOOST_LOG(rdWarningLog) << molName << ": ";
delete res;
throw se;
}
res->updatePropertyCache(false);
MolOps::assignStereochemistry(*res, true, true);
}
return res;
};
RWMol *Mol2DataStreamToMol(std::istream &inStream, bool sanitize, bool removeHs,
Mol2Type variant) {
RDUNUSED_PARAM(variant);
return Mol2DataStreamToMol(&inStream, sanitize, removeHs);
};
//------------------------------------------------
//
// Read a molecule from a string
//
//------------------------------------------------
RWMol *Mol2BlockToMol(const std::string &molBlock, bool sanitize, bool removeHs,
Mol2Type variant) {
RDUNUSED_PARAM(variant);
std::istringstream inStream(molBlock);
return Mol2DataStreamToMol(inStream, sanitize, removeHs);
}
//------------------------------------------------
//
// Read a molecule from a file
//
//------------------------------------------------
RWMol *Mol2FileToMol(const std::string &fName, bool sanitize, bool removeHs,
Mol2Type variant) {
// FIX: this binary mode of opening file is here because of a bug in VC++ 6.0
// the function "tellg" does not work correctly if we do not open it this way
// Jan 2009: Confirmed that this is still the case in visual studio 2008
RDUNUSED_PARAM(variant);
std::ifstream inStream(fName.c_str(), std::ios_base::binary);
if (!inStream || (inStream.bad())) {
std::ostringstream errout;
errout << "Bad input file " << fName;
throw BadFileException(errout.str());
}
RWMol *res = NULL;
if (!inStream.eof()) {
res = Mol2DataStreamToMol(inStream, sanitize, removeHs);
}
return res;
}
}