mirror of
https://github.com/rdkit/rdkit.git
synced 2026-06-04 21:54:27 +08:00
1025 lines
37 KiB
C++
1025 lines
37 KiB
C++
// $Id: Mol2FileParser.cpp 1457 2009-04-03 09:05:17Z landrgr1 $
|
|
//
|
|
// Copyright (c) 2008, Novartis Institutes for BioMedical Research Inc.
|
|
// All rights reserved.
|
|
//
|
|
// Redistribution and use in source and binary forms, with or without
|
|
// modification, are permitted provided that the following conditions are
|
|
// met:
|
|
//
|
|
// * Redistributions of source code must retain the above copyright
|
|
// notice, this list of conditions and the following disclaimer.
|
|
// * Redistributions in binary form must reproduce the above
|
|
// copyright notice, this list of conditions and the following
|
|
// disclaimer in the documentation and/or other materials provided
|
|
// with the distribution.
|
|
// * Neither the name of Novartis Institutes for BioMedical Research Inc.
|
|
// nor the names of its contributors may be used to endorse or promote
|
|
// products derived from this software without specific prior
|
|
// written permission.
|
|
//
|
|
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
|
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
|
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
|
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
|
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
|
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
|
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
|
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
|
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
//
|
|
// created by Nik Stiefl May 2008
|
|
// this file is heavily based on glandrum's MolFileParser
|
|
//
|
|
|
|
#include "FileParsers.h"
|
|
#include "MolFileStereochem.h"
|
|
#include <RDGeneral/Invariant.h>
|
|
#include <GraphMol/RDKitQueries.h>
|
|
#include <RDGeneral/StreamOps.h>
|
|
#include <RDGeneral/RDLog.h>
|
|
//
|
|
#include <fstream>
|
|
#include <boost/tokenizer.hpp>
|
|
#include <boost/lexical_cast.hpp>
|
|
#include <boost/algorithm/string/trim.hpp>
|
|
#include <boost/dynamic_bitset.hpp>
|
|
#include <RDGeneral/FileParseException.h>
|
|
#include <RDGeneral/BadFileException.h>
|
|
#include <RDGeneral/LocaleSwitcher.h>
|
|
|
|
namespace RDKit {
|
|
|
|
namespace {
|
|
void fixNitroSubstructureAndCharge(RWMol *res, unsigned int atIdx) {
|
|
unsigned int noODblNeighbors = 0;
|
|
ROMol::ADJ_ITER nbrIdxIt, nbrEndIdxIt;
|
|
unsigned int toModIdx = 0;
|
|
boost::tie(nbrIdxIt, nbrEndIdxIt) =
|
|
res->getAtomNeighbors(res->getAtomWithIdx(atIdx));
|
|
while (nbrIdxIt != nbrEndIdxIt) {
|
|
Bond *curBond = res->getBondBetweenAtoms(atIdx, *nbrIdxIt);
|
|
if (res->getAtomWithIdx(*nbrIdxIt)->getAtomicNum() == 8 &&
|
|
curBond->getBondType() == Bond::DOUBLE) {
|
|
++noODblNeighbors;
|
|
toModIdx = *nbrIdxIt;
|
|
}
|
|
++nbrIdxIt;
|
|
}
|
|
if (noODblNeighbors == 2) {
|
|
res->getBondBetweenAtoms(atIdx, toModIdx)->setBondType(Bond::SINGLE);
|
|
res->getAtomWithIdx(atIdx)->setFormalCharge(1);
|
|
res->getAtomWithIdx(toModIdx)->setFormalCharge(-1);
|
|
}
|
|
}
|
|
|
|
void readFormalChargesFromAttr(std::istream *inStream, RWMol *res) {
|
|
PRECONDITION(inStream, "inStream not valid");
|
|
PRECONDITION(!inStream->eof(), "inStream is at eof");
|
|
PRECONDITION(res, "RWMol not valid");
|
|
typedef boost::tokenizer<boost::char_separator<char> > tokenizer;
|
|
boost::char_separator<char> sep(" \t\n");
|
|
bool readNextAtomAttribs = true;
|
|
unsigned int atomIdx = 0, noAtomAttr = 0;
|
|
|
|
// std::streampos stPos = inStream->tellg();
|
|
std::string tempStr = getLine(inStream);
|
|
// there needs to be at least one entry
|
|
if (inStream->eof()) {
|
|
throw FileParseException("premature EOF in readFormalCharges");
|
|
}
|
|
while (readNextAtomAttribs) {
|
|
tokenizer tokens(tempStr, sep);
|
|
tokenizer::const_iterator itemIt = tokens.begin();
|
|
try {
|
|
atomIdx = boost::lexical_cast<unsigned int>(*itemIt);
|
|
++itemIt;
|
|
noAtomAttr = boost::lexical_cast<unsigned int>(*itemIt);
|
|
} catch (boost::bad_lexical_cast &) {
|
|
throw FileParseException("Cannot process mol2 UnityAtomAttr.");
|
|
}
|
|
for (unsigned int i = 0; i < noAtomAttr; ++i) {
|
|
std::string tempStr = getLine(inStream);
|
|
if (inStream->eof()) {
|
|
throw FileParseException("premature EOF in readFormalCharges");
|
|
}
|
|
tokenizer atmTokens(tempStr, sep);
|
|
itemIt = atmTokens.begin();
|
|
if (*itemIt == "AtomExpr") {
|
|
int formCharge = 0;
|
|
++itemIt;
|
|
// FIX:what if an atom has multiple properties? Seems like they might be
|
|
// separated
|
|
// with ";" ... need to look at that in more detail!
|
|
if ((*itemIt).find("=") == std::string::npos) {
|
|
try {
|
|
formCharge = boost::lexical_cast<int>(*itemIt);
|
|
} catch (boost::bad_lexical_cast &) {
|
|
readNextAtomAttribs = false;
|
|
throw FileParseException("Cannot process mol2 formal charge.");
|
|
}
|
|
// assign the charge
|
|
res->getAtomWithIdx(atomIdx - 1)->setFormalCharge(formCharge);
|
|
}
|
|
}
|
|
} // endfor
|
|
// check if we have finished reading UNITY_ATOM_ATTR
|
|
if (inStream->eof()) {
|
|
readNextAtomAttribs = false;
|
|
} else {
|
|
tempStr = getLine(inStream);
|
|
if (tempStr == "" || tempStr[0] == '@' || tempStr[0] == '#') {
|
|
readNextAtomAttribs = false;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
void guessFormalCharges(RWMol *res) {
|
|
// FIX: this whole thing has problems with positively charged pyridines et al.
|
|
for (RWMol::AtomIterator atomIt = res->beginAtoms();
|
|
atomIt != res->endAtoms(); ++atomIt) {
|
|
Atom *at = (*atomIt);
|
|
// assign only if no formal charge set on that atom and atom is not carbon
|
|
// (the latter
|
|
// might needs changing later on - let's see) and not for query atoms (dummy
|
|
// etc.)
|
|
// since this happens only during cleanup of bad substructures
|
|
// FIX: check nitro compounds et al.
|
|
|
|
if (at->getFormalCharge() == 0 && at->getSymbol() != "C" &&
|
|
!(at->hasQuery())) {
|
|
// have to calculate the explicit valence without call to
|
|
// calcExplicitValence since
|
|
// this will barf when I have e.g. an uncharged 4 valent nitrogen ...
|
|
int noAromBonds = 0;
|
|
double accum = 0; // FIX: could this give non int values ?
|
|
ROMol::OEDGE_ITER beg, end;
|
|
boost::tie(beg, end) = res->getAtomBonds(at);
|
|
while (beg != end) {
|
|
accum += (*res)[*beg]->getValenceContrib(at);
|
|
if ((*res)[*beg]->getBondType() == Bond::AROMATIC) {
|
|
++noAromBonds;
|
|
}
|
|
++beg;
|
|
}
|
|
// Assumption: if there is an aromatic bridge atom the accum will be 4.5
|
|
//(three aromatic bonds), e.g. naphthalenes. However those are not charged
|
|
// so we
|
|
// can stop here
|
|
// FIX: a structure such as c12cccc[n+]2cccn1 will not work if we just
|
|
// continue
|
|
// for noAromBonds>2
|
|
// special case checking - if atom c then go on
|
|
if (noAromBonds > 2 && at->getSymbol() == "C") {
|
|
continue;
|
|
}
|
|
|
|
// dbtranslate problems - for mols with no UNITY_ATOM_ATTR set - we won't
|
|
// be able to guess
|
|
// if this mol needs to be guessed or not ... hence, we don't guess on
|
|
// atoms with
|
|
// ar specification and not atom type X.ar and in 5-membered ring (there
|
|
// is stuff like
|
|
// c1cc[o+]cc1 that should be charged
|
|
// e.g. 5ring with N.pl3 as NH atom or other atoms without ar
|
|
// specification in aromatic ring
|
|
// FIX: do we need make sure this only happens for atoms in ring?
|
|
std::string tATT;
|
|
at->getProp(common_properties::_TriposAtomType, tATT);
|
|
MolOps::findSSSR(*res);
|
|
if (tATT.find("ar") == std::string::npos && at->getIsAromatic() &&
|
|
res->getRingInfo()->isAtomInRingOfSize(at->getIdx(), 5)) {
|
|
continue;
|
|
}
|
|
|
|
// for dbtranslate a problem will also occur for N.ar with 3 aromatic
|
|
// bonds and no charge
|
|
// assigned for these we don't assign charges (corina will create
|
|
// kekulized input for this
|
|
//(at least in most cases) - anyway, throw a warning!
|
|
if (noAromBonds == 3 && tATT == "N.ar") {
|
|
std::string nm;
|
|
res->getProp(common_properties::_Name, nm);
|
|
BOOST_LOG(rdWarningLog)
|
|
<< nm << ": warning - aromatic N with 3 aromatic bonds - "
|
|
"skipping charge guess for this atom" << std::endl;
|
|
continue;
|
|
}
|
|
|
|
// sometimes things like benzimidazoles can have only one bond of the
|
|
// imidazole ring as aromatic and the other one as a single bond ...
|
|
// catch that this way - see also the trick from GL
|
|
int expVal = static_cast<int>(round(accum + 0.1));
|
|
const INT_VECT &valens =
|
|
PeriodicTable::getTable()->getValenceList(at->getAtomicNum());
|
|
INT_VECT_CI vi;
|
|
|
|
// check default valence and compare to expVal - chg
|
|
// the hypothesis is that we prefer positively charged atoms over
|
|
// negatively charged ones
|
|
// for multi default valence atoms (e.g. CS(O)(O) should end up being
|
|
// C[S+]([O-])[O-] rather
|
|
// than C[S-][O-][O-] but that might change based no different examples
|
|
int nElectrons =
|
|
PeriodicTable::getTable()->getNouterElecs(at->getAtomicNum());
|
|
int assignChg;
|
|
if (nElectrons >= 4)
|
|
assignChg = expVal - (*valens.begin());
|
|
else
|
|
assignChg = (*valens.begin()) - expVal;
|
|
if (assignChg > 0 && nElectrons >= 4) {
|
|
for (vi = valens.begin(); vi != valens.end(); ++vi) {
|
|
// Since we do this only for nocharged atoms we can get away without
|
|
// including the
|
|
// charge into this
|
|
// apart from that we do not assign charges higher than +/- 1 for
|
|
// atoms with multiple valence states
|
|
// otherwise the early break would have to go away which in turn would
|
|
// result in things like [S+4] for
|
|
// sulfonamides
|
|
assignChg = expVal - (*vi);
|
|
if ((*vi) <= expVal && abs(assignChg) < 2) {
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
if (assignChg) {
|
|
// no aromatic atom will get aa abs(charge) > 1
|
|
if (at->getIsAromatic() && abs(assignChg) > 1) {
|
|
at->setFormalCharge((assignChg > 0) -
|
|
(assignChg < 0)); // this results in -1 or +1
|
|
} else {
|
|
at->setFormalCharge(assignChg);
|
|
}
|
|
// corina will create strange nitro groups which look like N(=O)(=O)
|
|
// which in turn will result in
|
|
//[N+2](=O)(=O) this needs to be fixed at this stage since otherwise we
|
|
// would have to check on all
|
|
// N.pl3 during the cleanup substructures step
|
|
if (assignChg == 2 && expVal == 5 && at->getSymbol() == "N") {
|
|
fixNitroSubstructureAndCharge(res, at->getIdx());
|
|
}
|
|
// what if expVal != at->calcExplicitValence();?
|
|
// cannot imagine a case where that will happen now.
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
unsigned int chkNoHNeighbNOx(RWMol *res, ROMol::ADJ_ITER atIdxIt,
|
|
int &toModIdx) {
|
|
Atom *at = res->getAtomWithIdx(*atIdxIt);
|
|
unsigned int noHNbrs = 0;
|
|
ROMol::ADJ_ITER nbrIdxIt, nbrEndIdxIt;
|
|
boost::tie(nbrIdxIt, nbrEndIdxIt) = res->getAtomNeighbors(at);
|
|
while (nbrIdxIt != nbrEndIdxIt) {
|
|
if (res->getAtomWithIdx(*nbrIdxIt)->getAtomicNum() == 1) {
|
|
++noHNbrs;
|
|
} else if (res->getAtomWithIdx(*nbrIdxIt)->getAtomicNum() == 8 &&
|
|
res->getAtomDegree(res->getAtomWithIdx(*nbrIdxIt)) == 1) {
|
|
// this is a N in an N-oxide constellation
|
|
// we can do the above if clause since mol2 have explicit hydrogens
|
|
toModIdx = *atIdxIt;
|
|
}
|
|
++nbrIdxIt;
|
|
}
|
|
return noHNbrs;
|
|
}
|
|
|
|
bool cleanUpMol2Substructures(RWMol *res) {
|
|
// NOTE: check the nitro fix in guess formal charges!
|
|
boost::dynamic_bitset<> isFixed(res->getNumAtoms());
|
|
for (ROMol::AtomIterator atIt = res->beginAtoms(); atIt != res->endAtoms();
|
|
++atIt) {
|
|
std::string tAT;
|
|
Atom *at = *atIt;
|
|
unsigned int idx = at->getIdx();
|
|
at->getProp(common_properties::_TriposAtomType, tAT);
|
|
|
|
if (tAT == "N.4") {
|
|
at->setFormalCharge(1);
|
|
} else if (tAT == "O.co2") {
|
|
// negatively charged carboxylates with O.co2
|
|
// according to Tripos, those should only appear in carboxylates and
|
|
// phosphates,
|
|
// FIX: do it also for phsopahtes and sulphates ...
|
|
if (at->getDegree() != 1) {
|
|
BOOST_LOG(rdWarningLog) << "Warning - O.co2 with degree >1."
|
|
<< std::endl;
|
|
return false;
|
|
}
|
|
ROMol::ADJ_ITER nbrIdxIt, endNbrsIdxIt;
|
|
// getAtomNeighbours returns 2 adjacency iterators (index iterators)
|
|
boost::tie(nbrIdxIt, endNbrsIdxIt) = res->getAtomNeighbors(at);
|
|
// this should return only the C.2
|
|
Atom *nbr = res->getAtomWithIdx(*nbrIdxIt);
|
|
std::string tATT;
|
|
nbr->getProp(common_properties::_TriposAtomType, tATT);
|
|
// carboxylates
|
|
if (tATT == "C.2" || tATT == "S.o2") {
|
|
// this should return only the bond between C.2 and O.co2
|
|
Bond *b = res->getBondBetweenAtoms(idx, *nbrIdxIt);
|
|
if (!isFixed[*nbrIdxIt]) {
|
|
// the first occurence is negatively charged and has a single bond
|
|
b->setBondType(Bond::SINGLE);
|
|
b->setIsAromatic(false);
|
|
at->setFormalCharge(-1);
|
|
at->setIsAromatic(false);
|
|
nbr->setIsAromatic(false);
|
|
isFixed[idx] = 1;
|
|
isFixed[*nbrIdxIt] = 1;
|
|
} else {
|
|
// the other occurences are not charged and have a double bond
|
|
b->setBondType(Bond::DOUBLE);
|
|
b->setIsAromatic(false);
|
|
at->setIsAromatic(false);
|
|
isFixed[idx] = 1;
|
|
}
|
|
} else {
|
|
std::string nm;
|
|
res->getProp(common_properties::_Name, nm);
|
|
BOOST_LOG(rdWarningLog)
|
|
<< nm << ": warning - O.co2 with non C.2 or S.o2 neighbor."
|
|
<< std::endl;
|
|
return false;
|
|
}
|
|
} else if (tAT == "C.cat") {
|
|
// positively charged guanidinium groups with C.cat
|
|
// according to Tripos these should only appear in guanidinium groups
|
|
// for the structural fix - the last nitrogen with the least number of
|
|
// heavy atoms will get the double bond and the positive charge.
|
|
// remember : this is not canonical!
|
|
// first - set the C.cat as fixed
|
|
isFixed[idx] = 1;
|
|
ROMol::ADJ_ITER nbrIdxIt, endNbrsIdxIt, tmpIdxIt;
|
|
unsigned int lowestDeg = 100;
|
|
boost::tie(nbrIdxIt, endNbrsIdxIt) = res->getAtomNeighbors(at);
|
|
// one problem of programs like Corina is, that they will create also
|
|
// C.cat
|
|
// for groups that are not guanidinium. We cannot fix all, but the charged
|
|
// amidine
|
|
// in a ring is taken care of too.
|
|
tmpIdxIt = nbrIdxIt;
|
|
// declare and initialise toModIdx
|
|
int toModIdx = -1;
|
|
unsigned int noNNeighbors = 0;
|
|
while (tmpIdxIt != endNbrsIdxIt) {
|
|
if (res->getAtomWithIdx(*tmpIdxIt)->getSymbol() == "N") {
|
|
++noNNeighbors;
|
|
}
|
|
++tmpIdxIt;
|
|
}
|
|
if (noNNeighbors < 2 || noNNeighbors > 3) {
|
|
std::string nm;
|
|
res->getProp(common_properties::_Name, nm);
|
|
BOOST_LOG(rdWarningLog)
|
|
<< nm << ": Error - C.Cat with bad number of N neighbors."
|
|
<< std::endl;
|
|
return false;
|
|
} else if (noNNeighbors == 2) {
|
|
// the idea is that we assign the positive charge according to the
|
|
// following precedence:
|
|
// 1. is part of N-oxide
|
|
// 2. atom with highest number of hydrogen atoms
|
|
// 3. atom in ring
|
|
// 4. random
|
|
// first we identify the N atoms
|
|
ROMol::ADJ_ITER idxIt1 = nbrIdxIt, idxIt2 = nbrIdxIt;
|
|
bool firstIdent = false;
|
|
while (nbrIdxIt != endNbrsIdxIt) {
|
|
if (res->getAtomWithIdx(*nbrIdxIt)->getSymbol() == "N") {
|
|
// fix the bond to one - only the modified N will have a double bond
|
|
// to C.cat
|
|
res->getBondBetweenAtoms(idx, *nbrIdxIt)->setBondType(Bond::SINGLE);
|
|
res->getBondBetweenAtoms(idx, *nbrIdxIt)->setIsAromatic(false);
|
|
res->getAtomWithIdx(*nbrIdxIt)->setIsAromatic(false);
|
|
// FIX: what is happening if we hit an atom that was fixed before -
|
|
// propably nothing.
|
|
// since I cannot think of a case where this is a problem - throw a
|
|
// warning
|
|
if (isFixed[*nbrIdxIt]) {
|
|
std::string nm;
|
|
res->getProp(common_properties::_Name, nm);
|
|
BOOST_LOG(rdWarningLog)
|
|
<< nm << ": warning - charged amidine and isFixed atom."
|
|
<< std::endl;
|
|
}
|
|
isFixed[*nbrIdxIt] = 1;
|
|
if (firstIdent) {
|
|
idxIt2 = nbrIdxIt;
|
|
} else {
|
|
idxIt1 = nbrIdxIt;
|
|
firstIdent = true;
|
|
}
|
|
}
|
|
++nbrIdxIt;
|
|
}
|
|
// now that we know which are the relevant atoms we check the above
|
|
// features
|
|
// is part of N-oxide?
|
|
// number of hydrogens on each neighbour
|
|
unsigned int noHNbrs1 = chkNoHNeighbNOx(res, idxIt1, toModIdx);
|
|
unsigned int noHNbrs2 = chkNoHNeighbNOx(res, idxIt2, toModIdx);
|
|
if (toModIdx < 0) {
|
|
// no N-oxide
|
|
if (noHNbrs1 != noHNbrs2) {
|
|
if (noHNbrs1 > noHNbrs2) {
|
|
toModIdx = *idxIt1;
|
|
} else {
|
|
toModIdx = *idxIt2; // this is random if both have the same
|
|
// number of atoms
|
|
}
|
|
} else {
|
|
// perceive the rings
|
|
MolOps::findSSSR(*res);
|
|
// then we check if both atoms are in a ring
|
|
unsigned int rIdx1 = res->getRingInfo()->numAtomRings((*idxIt1));
|
|
unsigned int rIdx2 = res->getRingInfo()->numAtomRings((*idxIt2));
|
|
if (rIdx1 > rIdx2) {
|
|
toModIdx = *idxIt1;
|
|
} else {
|
|
toModIdx = *idxIt2;
|
|
}
|
|
}
|
|
}
|
|
res->getBondBetweenAtoms(idx, toModIdx)->setBondType(Bond::DOUBLE);
|
|
res->getBondBetweenAtoms(idx, toModIdx)->setIsAromatic(false);
|
|
res->getAtomWithIdx(toModIdx)->setFormalCharge(1);
|
|
at->setIsAromatic(false);
|
|
} else {
|
|
while (nbrIdxIt != endNbrsIdxIt) {
|
|
if (!isFixed[*nbrIdxIt]) {
|
|
// we get in here if this N.pl3 was not seen / fixed before
|
|
Atom *nbr = res->getAtomWithIdx(*nbrIdxIt);
|
|
// get the number of heavy atoms connected to this atom
|
|
ROMol::ADJ_ITER nbrNbrIdxIt, nbrEndNbrsIdxIt;
|
|
unsigned int hvyAtDeg = 0;
|
|
boost::tie(nbrNbrIdxIt, nbrEndNbrsIdxIt) =
|
|
res->getAtomNeighbors(nbr);
|
|
while (nbrNbrIdxIt != nbrEndNbrsIdxIt) {
|
|
if (res->getAtomWithIdx(*nbrNbrIdxIt)->getAtomicNum() > 1) {
|
|
std::string nbrAT;
|
|
res->getAtomWithIdx(*nbrNbrIdxIt)
|
|
->getProp(common_properties::_TriposAtomType, nbrAT);
|
|
if (nbrAT == "C.cat") {
|
|
hvyAtDeg += 2; // that way we reduce the risk of ionising the
|
|
// N attached to another C.cat ...
|
|
} else {
|
|
++hvyAtDeg;
|
|
}
|
|
}
|
|
++nbrNbrIdxIt;
|
|
}
|
|
// now check for lowest heavy atom degree
|
|
if (hvyAtDeg < lowestDeg) {
|
|
toModIdx = *nbrIdxIt;
|
|
lowestDeg = hvyAtDeg;
|
|
}
|
|
// modify the bond between C.Cat and the N.pl3
|
|
Bond *b = res->getBondBetweenAtoms(idx, *nbrIdxIt);
|
|
b->setBondType(Bond::SINGLE);
|
|
b->setIsAromatic(false);
|
|
nbr->setIsAromatic(false);
|
|
// set N.pl3 as fixed
|
|
isFixed[*nbrIdxIt] = 1;
|
|
} else {
|
|
// the N is allready fixed - since we don't touch this atom make the
|
|
// bond to single
|
|
// FIX: check on 3-way symmetric guanidinium mol -
|
|
// this could produce a only single bonded C.cat for bad H mols
|
|
res->getBondBetweenAtoms(idx, *nbrIdxIt)->setBondType(Bond::SINGLE);
|
|
res->getBondBetweenAtoms(idx, *nbrIdxIt)->setIsAromatic(false);
|
|
}
|
|
++nbrIdxIt;
|
|
}
|
|
// now modify the respective N and the C.cat
|
|
Bond *b = res->getBondBetweenAtoms(idx, toModIdx);
|
|
b->setBondType(Bond::DOUBLE);
|
|
b->setIsAromatic(false);
|
|
res->getAtomWithIdx(toModIdx)->setFormalCharge(1);
|
|
at->setIsAromatic(false);
|
|
}
|
|
}
|
|
idx++;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
Atom *ParseMol2FileAtomLine(const std::string atomLine, RDGeom::Point3D &pos) {
|
|
typedef boost::tokenizer<boost::char_separator<char> > tokenizer;
|
|
boost::char_separator<char> sep(" \t\n");
|
|
std::string tAN, tAT;
|
|
tokenizer tokens(atomLine, sep);
|
|
tokenizer::const_iterator itemIt = tokens.begin();
|
|
if (itemIt == tokens.end()) {
|
|
throw FileParseException("no info in mol2 atom line");
|
|
}
|
|
|
|
Atom *res = new Atom();
|
|
|
|
// skip TriposAtomId
|
|
++itemIt;
|
|
if (itemIt == tokens.end()) {
|
|
throw FileParseException("premature end of mol2 atom line");
|
|
}
|
|
// the sybyl atom name does not necessarily make sense - into atom property
|
|
tAN = *itemIt;
|
|
++itemIt;
|
|
if (itemIt == tokens.end()) {
|
|
throw FileParseException("premature end of mol2 atom line");
|
|
}
|
|
try {
|
|
pos.x = boost::lexical_cast<double>(*itemIt);
|
|
++itemIt;
|
|
if (itemIt == tokens.end()) {
|
|
throw FileParseException("premature end of mol2 atom line");
|
|
}
|
|
pos.y = boost::lexical_cast<double>(*itemIt);
|
|
++itemIt;
|
|
if (itemIt == tokens.end()) {
|
|
throw FileParseException("premature end of mol2 atom line");
|
|
}
|
|
pos.z = boost::lexical_cast<double>(*itemIt);
|
|
++itemIt;
|
|
if (itemIt == tokens.end()) {
|
|
throw FileParseException("premature end of mol2 atom line");
|
|
}
|
|
} catch (boost::bad_lexical_cast &) {
|
|
throw FileParseException("Cannot process mol2 coordinates.");
|
|
}
|
|
// now it becomes interesting - this is the SYBYL atom type. I put this into
|
|
// an atom property and deduce the
|
|
// symbol from that (everything before the dot)
|
|
tAT = *itemIt;
|
|
std::string symb = (*itemIt).substr(0, (*itemIt).find('.'));
|
|
|
|
// bad symbols:
|
|
// LP is not an atom so remove it ...
|
|
if (symb == "LP") {
|
|
delete res;
|
|
return NULL;
|
|
} else if (symb == "ANY" || symb == "Du") {
|
|
// queryAtoms
|
|
// according to the SYBYL spec, these match anything
|
|
QueryAtom *query = new QueryAtom(0);
|
|
query->setQuery(makeAtomNullQuery());
|
|
delete res;
|
|
res = query;
|
|
} else if (symb == "HEV") {
|
|
QueryAtom *query = new QueryAtom(1);
|
|
query->getQuery()->setNegation(true);
|
|
delete res;
|
|
res = query;
|
|
} else if (symb == "HET") {
|
|
// Tripos: N,O,P,S
|
|
QueryAtom *query = new QueryAtom(7);
|
|
query->expandQuery(makeAtomNumQuery(8), Queries::COMPOSITE_OR, true);
|
|
query->expandQuery(makeAtomNumQuery(15), Queries::COMPOSITE_OR, true);
|
|
query->expandQuery(makeAtomNumQuery(16), Queries::COMPOSITE_OR, true);
|
|
delete res;
|
|
res = query;
|
|
} else if (symb == "HAL") {
|
|
// Tripos: F,Cl,Br,I
|
|
QueryAtom *query = new QueryAtom(9);
|
|
query->expandQuery(makeAtomNumQuery(17), Queries::COMPOSITE_OR, true);
|
|
query->expandQuery(makeAtomNumQuery(35), Queries::COMPOSITE_OR, true);
|
|
query->expandQuery(makeAtomNumQuery(53), Queries::COMPOSITE_OR, true);
|
|
delete res;
|
|
res = query;
|
|
} else {
|
|
res->setAtomicNum(PeriodicTable::getTable()->getAtomicNumber(symb));
|
|
}
|
|
|
|
// now assign the properties
|
|
res->setProp("_TriposAtomName", tAN); // maybe remove that since it's
|
|
// useless?
|
|
res->setProp(common_properties::_TriposAtomType, tAT);
|
|
// no implicit hydrogens for mol2 files
|
|
res->setNoImplicit(true);
|
|
|
|
// up to here the fields must be written - check if we have more
|
|
// next comes the subst_id, subst_name - skip those
|
|
++itemIt;
|
|
if (itemIt == tokens.end()) {
|
|
return res;
|
|
}
|
|
++itemIt;
|
|
if (itemIt == tokens.end()) {
|
|
return res;
|
|
}
|
|
++itemIt;
|
|
// the Partial charge in the file
|
|
if (itemIt != tokens.end()) {
|
|
res->setProp("_TriposPartialCharge", *itemIt);
|
|
}
|
|
// we skip the status bit ...
|
|
|
|
return res;
|
|
}
|
|
|
|
Bond *ParseMol2FileBondLine(const std::string bondLine,
|
|
const INT_VECT &idxCorresp) {
|
|
unsigned int idx1, idx2;
|
|
|
|
typedef boost::tokenizer<boost::char_separator<char> > tokenizer;
|
|
boost::char_separator<char> sep(" \t\n");
|
|
|
|
tokenizer tokens(bondLine, sep);
|
|
tokenizer::const_iterator itemIt = tokens.begin();
|
|
if (itemIt == tokens.end()) {
|
|
throw FileParseException("no info in mol2 bond line");
|
|
}
|
|
|
|
try {
|
|
// tripos bond id skip
|
|
++itemIt;
|
|
if (itemIt == tokens.end()) {
|
|
throw FileParseException("no info in mol2 bond line");
|
|
}
|
|
idx1 = boost::lexical_cast<unsigned int>(*itemIt);
|
|
++itemIt;
|
|
if (itemIt == tokens.end()) {
|
|
throw FileParseException("no info in mol2 bond line");
|
|
}
|
|
idx2 = boost::lexical_cast<unsigned int>(*itemIt);
|
|
++itemIt;
|
|
if (itemIt == tokens.end()) {
|
|
throw FileParseException("no info in mol2 bond line");
|
|
}
|
|
} catch (boost::bad_lexical_cast &) {
|
|
throw FileParseException("Cannot process mol2 bonds.");
|
|
}
|
|
|
|
// adjust the numbering
|
|
idx1--;
|
|
idx2--;
|
|
|
|
// if either of both ends of the bond is not an atom in the mol - return NULL
|
|
if (!(idx1 < idxCorresp.size() || idx2 < idxCorresp.size())) {
|
|
throw FileParseException("index mismatch");
|
|
}
|
|
|
|
if (idxCorresp[idx1] < 0 || idxCorresp[idx2] < 0) {
|
|
return NULL;
|
|
}
|
|
|
|
// lexical casts of bond types is not really useful in this case since we
|
|
// could also have strings as values ...
|
|
// use if/else if/end statements for now - maybe change to mapping later - no
|
|
// idea if it's worth it ...?
|
|
Bond::BondType type;
|
|
std::string tBType = *itemIt;
|
|
if (tBType == "1" || tBType == "am") {
|
|
type = Bond::SINGLE;
|
|
} else if (tBType == "2") {
|
|
type = Bond::DOUBLE;
|
|
} else if (tBType == "3") {
|
|
type = Bond::TRIPLE;
|
|
} else if (tBType == "ar") {
|
|
type = Bond::AROMATIC;
|
|
} else if (tBType == "du" || tBType == "un") {
|
|
type = Bond::UNSPECIFIED; // have to come back here - see if there is any
|
|
// comment in th documentation ...
|
|
} else {
|
|
// this happens only if some weird thing is in the file or if we encounter a
|
|
// "nc" - not connected ...
|
|
// but why would anyone specify a bond which is not connected?
|
|
BOOST_LOG(rdWarningLog) << "Warning - unsupported bond type: " << tBType
|
|
<< " ignored!" << std::endl;
|
|
return NULL;
|
|
}
|
|
Bond *res = new Bond(type);
|
|
res->setBeginAtomIdx(idxCorresp[idx1]);
|
|
res->setEndAtomIdx(idxCorresp[idx2]);
|
|
return res;
|
|
}
|
|
|
|
void ParseMol2AtomBlock(std::istream *inStream, RWMol *res, unsigned int nAtoms,
|
|
INT_VECT &idxCorresp) {
|
|
PRECONDITION(inStream, "inStream not valid");
|
|
PRECONDITION(!inStream->eof(), "inStream is at eof");
|
|
PRECONDITION(res, "RWMol not valid");
|
|
PRECONDITION(idxCorresp.size() == nAtoms, "vector size mismatch");
|
|
unsigned int nLP = 0;
|
|
bool hasHAtoms = false;
|
|
|
|
std::vector<RDGeom::Point3D> threeDPs;
|
|
threeDPs.reserve(nAtoms);
|
|
|
|
for (unsigned int i = 0; i < nAtoms; ++i) {
|
|
std::string tempStr = getLine(inStream);
|
|
if (inStream->eof()) {
|
|
throw FileParseException("premature EOF");
|
|
}
|
|
RDGeom::Point3D pos;
|
|
Atom *atom = ParseMol2FileAtomLine(tempStr, pos);
|
|
|
|
// if atom is NULL then we hit LP
|
|
if (atom) {
|
|
int aid = res->addAtom(atom, false, true);
|
|
idxCorresp[i] = aid;
|
|
threeDPs.push_back(pos);
|
|
if (atom->getSymbol() == "H") {
|
|
hasHAtoms = true;
|
|
}
|
|
} else {
|
|
++nLP;
|
|
}
|
|
}
|
|
// mol2 files need to have hydrogen atoms otherwise formal charge estimation
|
|
// will be problematic
|
|
if (!hasHAtoms) {
|
|
std::string nm;
|
|
res->getProp(common_properties::_Name, nm);
|
|
BOOST_LOG(rdWarningLog) << nm << ": Warning - no explicit hydrogens in "
|
|
"mol2 file but needed for formal charge "
|
|
"estimation." << std::endl;
|
|
}
|
|
// create conformer based on 3DPoints and add to RWMol
|
|
Conformer *conf = new Conformer(nAtoms - nLP);
|
|
std::vector<RDGeom::Point3D>::const_iterator threeDPsIt = threeDPs.begin();
|
|
for (unsigned int i = 0; i < threeDPs.size(); ++i) {
|
|
conf->setAtomPos(i, *threeDPsIt);
|
|
++threeDPsIt;
|
|
}
|
|
res->addConformer(conf, true);
|
|
|
|
POSTCONDITION(res->getNumAtoms() == (nAtoms - nLP),
|
|
"Wrong number of atoms in molecule");
|
|
}
|
|
|
|
void ParseMol2BondBlock(std::istream *inStream, RWMol *res, unsigned int nBonds,
|
|
const INT_VECT &idxCorresp) {
|
|
PRECONDITION(inStream, "inStream not valid");
|
|
PRECONDITION(!inStream->eof(), "inStream is at eof");
|
|
PRECONDITION(res, "RWMol not valid");
|
|
unsigned int nBadBonds = 0;
|
|
|
|
for (unsigned int i = 0; i < nBonds; ++i) {
|
|
std::string tempStr = getLine(inStream);
|
|
if (inStream->eof()) {
|
|
throw FileParseException("premature EOF");
|
|
}
|
|
Bond *bond = ParseMol2FileBondLine(tempStr, idxCorresp);
|
|
// if something weird happened there will be no bond for that line
|
|
if (bond) {
|
|
// if we got an aromatic bond set the flag on the bond and the connected
|
|
// atoms
|
|
if (bond->getBondType() == Bond::AROMATIC) {
|
|
bond->setIsAromatic(true);
|
|
res->getAtomWithIdx(bond->getBeginAtomIdx())->setIsAromatic(true);
|
|
res->getAtomWithIdx(bond->getEndAtomIdx())->setIsAromatic(true);
|
|
}
|
|
res->addBond(bond, true);
|
|
} else {
|
|
nBadBonds++;
|
|
}
|
|
}
|
|
POSTCONDITION(res->getNumBonds() == (nBonds - nBadBonds),
|
|
"Wrong number of atoms in molecule");
|
|
}
|
|
|
|
}; // end of anonymous namespace
|
|
|
|
//------------------------------------------------
|
|
//
|
|
// Read a molecule from a stream
|
|
//
|
|
//------------------------------------------------
|
|
RWMol *Mol2DataStreamToMol(std::istream *inStream, bool sanitize, bool removeHs,
|
|
Mol2Type variant) {
|
|
RDUNUSED_PARAM(variant);
|
|
PRECONDITION(inStream, "no stream");
|
|
std::string tempStr, lineBeg;
|
|
typedef boost::tokenizer<boost::char_separator<char> > tokenizer;
|
|
boost::char_separator<char> sep(" \t\n");
|
|
Utils::LocaleSwitcher ls;
|
|
|
|
// all molecules start with a @<TRIPOS>MOLECULE! There is no other way to
|
|
// define an end of
|
|
// molecule than to find a new one or an eof. Hence I have to read until I
|
|
// find one of the two ...
|
|
std::streampos molStart = 0, atomStart = 0, bondStart = 0, chargeStart = 0;
|
|
while (!inStream->eof()) {
|
|
tempStr = getLine(inStream);
|
|
if (inStream->eof()) {
|
|
break;
|
|
}
|
|
|
|
if (tempStr != "" && tempStr[0] == '@') {
|
|
tokenizer tokens(tempStr, sep);
|
|
std::string firstToken = *tokens.begin();
|
|
if (firstToken == "@<TRIPOS>MOLECULE") {
|
|
if (!molStart) {
|
|
// we reach that point in a multimol2 when we have not seen a
|
|
// @<TRIPOS>MOLECULE
|
|
// and set all important flags
|
|
molStart = inStream->tellg();
|
|
} else {
|
|
break;
|
|
}
|
|
} else if (firstToken == "@<TRIPOS>ATOM") {
|
|
atomStart = inStream->tellg();
|
|
} else if (firstToken == "@<TRIPOS>BOND") {
|
|
bondStart = inStream->tellg();
|
|
} else if (firstToken == "@<TRIPOS>UNITY_ATOM_ATTR") {
|
|
// tripos dbtranslate will write not Tripos conform atom types for
|
|
// various aromatic atoms
|
|
// this results in problems with the formal charge guesser. Hence, if we
|
|
// find the
|
|
// UNITY_ATOM_ATTR that contains formal charges we will read those and
|
|
// skip the charger and
|
|
// the substructure cleanup
|
|
chargeStart = inStream->tellg();
|
|
} // end if seenMolBefore
|
|
} // end if @
|
|
} // end while
|
|
|
|
// we should reach this point with at least the molStart and atomStart set
|
|
if (!molStart) {
|
|
throw FileParseException("No MOLECULE block found in Mol2 data");
|
|
}
|
|
if (!atomStart) {
|
|
throw FileParseException("No ATOM block found in Mol2 data");
|
|
}
|
|
|
|
if (inStream->eof()) {
|
|
inStream->clear();
|
|
}
|
|
|
|
inStream->seekg(molStart, std::ios::beg);
|
|
tempStr = getLine(inStream);
|
|
RWMol *res = new RWMol();
|
|
boost::trim_right(tempStr);
|
|
res->setProp(common_properties::_Name, tempStr);
|
|
|
|
tempStr = getLine(inStream);
|
|
tokenizer tokens(tempStr, sep);
|
|
if (tokens.begin() == tokens.end()) {
|
|
if (res) {
|
|
delete res;
|
|
}
|
|
throw FileParseException("Empty counts line");
|
|
}
|
|
|
|
unsigned int nAtoms = 0, nBonds = 0;
|
|
tokenizer::const_iterator itemIt = tokens.begin();
|
|
// counts line, this is where we really get started
|
|
try {
|
|
nAtoms = boost::lexical_cast<unsigned int>(*itemIt);
|
|
|
|
++itemIt;
|
|
if (itemIt != tokens.end()) {
|
|
nBonds = boost::lexical_cast<unsigned int>(*itemIt);
|
|
}
|
|
} catch (boost::bad_lexical_cast &) {
|
|
if (res) {
|
|
delete res;
|
|
}
|
|
std::ostringstream errout;
|
|
errout << "Cannot convert " << *itemIt << " to unsigned int";
|
|
throw FileParseException(errout.str());
|
|
}
|
|
|
|
if (nAtoms == 0) {
|
|
if (res) {
|
|
delete res;
|
|
}
|
|
throw FileParseException("molecule has no atoms");
|
|
}
|
|
tempStr = getLine(inStream); // mol_type - ignore
|
|
tempStr = getLine(inStream);
|
|
boost::trim(tempStr);
|
|
res->setProp("_TriposChargeType", tempStr);
|
|
// stop here since we don't support anything else from the MOLECULE block
|
|
INT_VECT idxCorresp(nAtoms, -1);
|
|
inStream->seekg(atomStart, std::ios::beg);
|
|
try {
|
|
ParseMol2AtomBlock(inStream, res, nAtoms, idxCorresp);
|
|
} catch (const FileParseException &e) {
|
|
if (res) {
|
|
delete res;
|
|
}
|
|
throw e;
|
|
}
|
|
if (nBonds) {
|
|
// stop here since we don't support anything else from the MOLECULE block
|
|
inStream->seekg(bondStart, std::ios::beg);
|
|
try {
|
|
ParseMol2BondBlock(inStream, res, nBonds, idxCorresp);
|
|
} catch (const FileParseException &e) {
|
|
if (res) {
|
|
delete res;
|
|
}
|
|
throw e;
|
|
}
|
|
}
|
|
|
|
if (!chargeStart) {
|
|
bool molFixed = cleanUpMol2Substructures(res);
|
|
|
|
if (!molFixed) {
|
|
delete res;
|
|
return NULL;
|
|
}
|
|
|
|
// mol2 format does not support formal charge information, hence we need to
|
|
// guess it
|
|
// based on default and explicit valences
|
|
guessFormalCharges(res);
|
|
} else {
|
|
inStream->seekg(chargeStart, std::ios::beg);
|
|
try {
|
|
readFormalChargesFromAttr(inStream, res);
|
|
} catch (const FileParseException &e) {
|
|
if (res) {
|
|
delete res;
|
|
}
|
|
throw e;
|
|
}
|
|
}
|
|
|
|
// set chirality prior to sanitisation since it happens from 3D and it's not
|
|
// possible anymore once the hydrogens are removed
|
|
// FIX: for now this is only for the first conformer - need to be changed once
|
|
// we
|
|
// use multiconformer files
|
|
MolOps::assignChiralTypesFrom3D(*res);
|
|
|
|
if (res && sanitize) {
|
|
MolOps::cleanUp(*res);
|
|
|
|
try {
|
|
if (removeHs) {
|
|
MolOps::removeHs(*res, false, false);
|
|
} else {
|
|
MolOps::sanitizeMol(*res);
|
|
}
|
|
|
|
// call DetectBondStereoChemistry after sanitization "because we need
|
|
// the ring information". Also this will set the E/Z labels on the bond.
|
|
// Similar in spirit to what happens in MolFileParser
|
|
const Conformer &conf = res->getConformer();
|
|
DetectBondStereoChemistry(*res, &conf);
|
|
|
|
} catch (MolSanitizeException &se) {
|
|
BOOST_LOG(rdWarningLog) << "sanitise ";
|
|
std::string molName;
|
|
res->getProp(common_properties::_Name, molName);
|
|
BOOST_LOG(rdWarningLog) << molName << ": ";
|
|
delete res;
|
|
throw se;
|
|
}
|
|
|
|
res->updatePropertyCache(false);
|
|
MolOps::assignStereochemistry(*res, true, true);
|
|
}
|
|
|
|
return res;
|
|
};
|
|
|
|
RWMol *Mol2DataStreamToMol(std::istream &inStream, bool sanitize, bool removeHs,
|
|
Mol2Type variant) {
|
|
RDUNUSED_PARAM(variant);
|
|
return Mol2DataStreamToMol(&inStream, sanitize, removeHs);
|
|
};
|
|
//------------------------------------------------
|
|
//
|
|
// Read a molecule from a string
|
|
//
|
|
//------------------------------------------------
|
|
RWMol *Mol2BlockToMol(const std::string &molBlock, bool sanitize, bool removeHs,
|
|
Mol2Type variant) {
|
|
RDUNUSED_PARAM(variant);
|
|
std::istringstream inStream(molBlock);
|
|
return Mol2DataStreamToMol(inStream, sanitize, removeHs);
|
|
}
|
|
|
|
//------------------------------------------------
|
|
//
|
|
// Read a molecule from a file
|
|
//
|
|
//------------------------------------------------
|
|
RWMol *Mol2FileToMol(const std::string &fName, bool sanitize, bool removeHs,
|
|
Mol2Type variant) {
|
|
// FIX: this binary mode of opening file is here because of a bug in VC++ 6.0
|
|
// the function "tellg" does not work correctly if we do not open it this way
|
|
// Jan 2009: Confirmed that this is still the case in visual studio 2008
|
|
RDUNUSED_PARAM(variant);
|
|
std::ifstream inStream(fName.c_str(), std::ios_base::binary);
|
|
if (!inStream || (inStream.bad())) {
|
|
std::ostringstream errout;
|
|
errout << "Bad input file " << fName;
|
|
throw BadFileException(errout.str());
|
|
}
|
|
RWMol *res = NULL;
|
|
if (!inStream.eof()) {
|
|
res = Mol2DataStreamToMol(inStream, sanitize, removeHs);
|
|
}
|
|
return res;
|
|
}
|
|
}
|