mirror of
https://github.com/rdkit/rdkit.git
synced 2026-06-04 21:54:27 +08:00
* add id members to StereoGroup class * add optional read id argument to StereoGroup constructors * add functions forward Stereo Group Ids and assign the missing ones * update ops updating stereogroups to forward read id * update CX Smiles to parse/write stereogroup ids * Add test cases for stereo group id forwarding/canonicalization * update mol block (V3K only) to parse/write stereogroup ids * update pickling to parse/write stereogroup ids * update cdxml parser to store stereogroup ids * update mol interchange to parse/write stereogroup ids * update draw code with new stere group ids * update test * add some tests * Update Code/GraphMol/Wrap/rdmolfiles.cpp Co-authored-by: Greg Landrum <greg.landrum@gmail.com> * Update Code/GraphMol/Wrap/rdmolfiles.cpp Co-authored-by: Greg Landrum <greg.landrum@gmail.com> * Update Code/GraphMol/Canon.cpp Co-authored-by: Greg Landrum <greg.landrum@gmail.com> * Update Code/GraphMol/SmilesParse/CXSmilesOps.cpp Co-authored-by: Greg Landrum <greg.landrum@gmail.com> * review --------- Co-authored-by: Greg Landrum <greg.landrum@gmail.com>
3561 lines
117 KiB
C++
3561 lines
117 KiB
C++
//
|
|
// Copyright (C) 2002-2021 Greg Landrum and other RDKit contributors
|
|
//
|
|
// @@ All Rights Reserved @@
|
|
// This file is part of the RDKit.
|
|
// The contents are covered by the terms of the BSD license
|
|
// which is included in the file license.txt, found at the root
|
|
// of the RDKit source tree.
|
|
//
|
|
#include <RDGeneral/BoostStartInclude.h>
|
|
#include <boost/lexical_cast.hpp>
|
|
#include <boost/algorithm/string.hpp>
|
|
#include <boost/tokenizer.hpp>
|
|
#include <boost/algorithm/string/trim.hpp>
|
|
#include <boost/format.hpp>
|
|
#include <RDGeneral/BoostEndInclude.h>
|
|
|
|
#include "FileParsers.h"
|
|
#include "FileParserUtils.h"
|
|
#include "MolSGroupParsing.h"
|
|
#include "MolFileStereochem.h"
|
|
|
|
#include <GraphMol/SmilesParse/SmilesParse.h>
|
|
#include <GraphMol/RDKitQueries.h>
|
|
#include <GraphMol/StereoGroup.h>
|
|
#include <GraphMol/SubstanceGroup.h>
|
|
#include <RDGeneral/StreamOps.h>
|
|
#include <RDGeneral/RDLog.h>
|
|
|
|
#include <fstream>
|
|
#include <RDGeneral/FileParseException.h>
|
|
#include <RDGeneral/BadFileException.h>
|
|
#include <RDGeneral/LocaleSwitcher.h>
|
|
#include <typeinfo>
|
|
#include <exception>
|
|
#include <charconv>
|
|
|
|
#ifdef RDKIT_USE_BOOST_REGEX
|
|
#include <boost/regex.hpp>
|
|
using boost::regex;
|
|
using boost::regex_match;
|
|
using boost::smatch;
|
|
#else
|
|
#include <regex>
|
|
using std::regex;
|
|
using std::regex_match;
|
|
using std::smatch;
|
|
#endif
|
|
#include <sstream>
|
|
#include <locale>
|
|
#include <cstdlib>
|
|
#include <cstdio>
|
|
#include <string_view>
|
|
|
|
using namespace RDKit::SGroupParsing;
|
|
|
|
namespace RDKit {
|
|
|
|
namespace FileParserUtils {
|
|
|
|
int toInt(const std::string_view input, bool acceptSpaces) {
|
|
// don't need to worry about locale stuff here because
|
|
// we're not going to have delimiters
|
|
|
|
// sanity check on the input since strtol doesn't do it for us:
|
|
const char *txt = input.data();
|
|
for (size_t i = 0u; i < input.size() && *txt != '\x00'; ++i) {
|
|
if ((*txt >= '0' && *txt <= '9') || (acceptSpaces && *txt == ' ') ||
|
|
*txt == '+' || *txt == '-') {
|
|
++txt;
|
|
} else {
|
|
throw boost::bad_lexical_cast();
|
|
}
|
|
}
|
|
// remove leading spaces
|
|
txt = input.data();
|
|
unsigned int sz = input.size();
|
|
if (acceptSpaces) {
|
|
while (*txt == ' ') {
|
|
++txt;
|
|
--sz;
|
|
// have we run off the end of the view?
|
|
if (sz < 1U) {
|
|
return 0;
|
|
}
|
|
}
|
|
}
|
|
int res = 0;
|
|
std::from_chars(txt, txt + sz, res);
|
|
|
|
return res;
|
|
}
|
|
int toInt(const std::string &input, bool acceptSpaces) {
|
|
return toInt(std::string_view(input.c_str()), acceptSpaces);
|
|
}
|
|
unsigned int toUnsigned(const std::string_view input, bool acceptSpaces) {
|
|
// don't need to worry about locale stuff here because
|
|
// we're not going to have delimiters
|
|
|
|
// sanity check on the input since strtol doesn't do it for us:
|
|
const char *txt = input.data();
|
|
for (size_t i = 0u; i < input.size() && *txt != '\x00'; ++i) {
|
|
if ((*txt >= '0' && *txt <= '9') || (acceptSpaces && *txt == ' ') ||
|
|
*txt == '+') {
|
|
++txt;
|
|
} else {
|
|
throw boost::bad_lexical_cast();
|
|
}
|
|
}
|
|
// remove leading spaces
|
|
txt = input.data();
|
|
unsigned int sz = input.size();
|
|
if (acceptSpaces) {
|
|
while (*txt == ' ') {
|
|
++txt;
|
|
--sz;
|
|
// have we run off the end of the view?
|
|
if (sz < 1U) {
|
|
return 0;
|
|
}
|
|
}
|
|
}
|
|
unsigned int res = 0;
|
|
std::from_chars(txt, txt + sz, res);
|
|
return res;
|
|
}
|
|
unsigned int toUnsigned(const std::string &input, bool acceptSpaces) {
|
|
return toUnsigned(std::string_view(input.c_str()), acceptSpaces);
|
|
}
|
|
double toDouble(const std::string_view input, bool acceptSpaces) {
|
|
// sanity check on the input since strtol doesn't do it for us:
|
|
const char *txt = input.data();
|
|
for (size_t i = 0u; i < input.size() && *txt != '\x00'; ++i) {
|
|
// check for ',' and '.' because locale
|
|
if ((*txt >= '0' && *txt <= '9') || (acceptSpaces && *txt == ' ') ||
|
|
*txt == '+' || *txt == '-' || *txt == ',' || *txt == '.') {
|
|
++txt;
|
|
} else {
|
|
throw boost::bad_lexical_cast();
|
|
}
|
|
}
|
|
// unfortunately from_chars() with doubles didn't work on g++ until v11.1
|
|
// and the status with clang is hard to figure out... we remain old-school
|
|
// remove leading spaces
|
|
double res = atof(input.data());
|
|
return res;
|
|
}
|
|
double toDouble(const std::string &input, bool acceptSpaces) {
|
|
return toDouble(std::string_view(input.c_str()), acceptSpaces);
|
|
}
|
|
std::string getV3000Line(std::istream *inStream, unsigned int &line) {
|
|
// FIX: technically V3K blocks are case-insensitive. We should really be
|
|
// up-casing everything here.
|
|
PRECONDITION(inStream, "bad stream");
|
|
std::string res;
|
|
++line;
|
|
auto inl = getLine(inStream);
|
|
std::string_view tempStr = inl;
|
|
if (tempStr.size() < 7 || tempStr.substr(0, 7) != "M V30 ") {
|
|
std::ostringstream errout;
|
|
errout << "Line " << line << " does not start with 'M V30 '" << std::endl;
|
|
throw FileParseException(errout.str());
|
|
}
|
|
// FIX: do we need to handle trailing whitespace after a -?
|
|
while (tempStr.back() == '-') {
|
|
// continuation character, append what we read:
|
|
res += tempStr.substr(7, tempStr.length() - 8);
|
|
// and then read another line:
|
|
++line;
|
|
inl = getLine(inStream);
|
|
tempStr = inl;
|
|
if (tempStr.size() < 7 || tempStr.substr(0, 7) != "M V30 ") {
|
|
std::ostringstream errout;
|
|
errout << "Line " << line << " does not start with 'M V30 '"
|
|
<< std::endl;
|
|
throw FileParseException(errout.str());
|
|
}
|
|
}
|
|
res += tempStr.substr(7, tempStr.length() - 7);
|
|
|
|
return res;
|
|
}
|
|
|
|
// Thin forwarder to QueryOps::replaceAtomWithQueryAtom(); returns whatever
// that call returns.
Atom *replaceAtomWithQueryAtom(RWMol *mol, Atom *atom) {
  Atom *replacement = QueryOps::replaceAtomWithQueryAtom(mol, atom);
  return replacement;
}
|
|
} // namespace FileParserUtils
|
|
using RDKit::FileParserUtils::getV3000Line;
|
|
|
|
namespace {
|
|
|
|
// Returns true iff the first `size` characters of `haystack` equal the
// first `size` characters of `needle`. A haystack shorter than `size`
// never matches.
bool startsWith(const std::string &haystack, const char *needle, size_t size) {
  if (haystack.size() < size) {
    return false;
  }
  return std::char_traits<char>::compare(haystack.data(), needle, size) == 0;
}
|
|
|
|
//! parse a collection block to find enhanced stereo groups
|
|
std::string parseEnhancedStereo(std::istream *inStream, unsigned int &line,
|
|
RWMol *mol) {
|
|
// Lines like (absolute, relative, racemic):
|
|
// M V30 MDLV30/STEABS ATOMS=(2 2 3)
|
|
// M V30 MDLV30/STEREL1 ATOMS=(1 12)
|
|
// M V30 MDLV30/STERAC1 ATOMS=(1 12)
|
|
const regex stereo_label(
|
|
R"regex(MDLV30/STE(...)([0-9]*) +ATOMS=\(([0-9]+) +(.*)\) *)regex");
|
|
|
|
smatch match;
|
|
std::vector<StereoGroup> groups;
|
|
|
|
// Read the collection until the end
|
|
auto tempStr = getV3000Line(inStream, line);
|
|
boost::to_upper(tempStr);
|
|
while (!startsWith(tempStr, "END", 3)) {
|
|
// If this line in the collection is part of a stereo group
|
|
if (regex_match(tempStr, match, stereo_label)) {
|
|
StereoGroupType grouptype = RDKit::StereoGroupType::STEREO_ABSOLUTE;
|
|
unsigned groupid = 0;
|
|
|
|
if (match[1] == "ABS") {
|
|
grouptype = RDKit::StereoGroupType::STEREO_ABSOLUTE;
|
|
} else if (match[1] == "REL") {
|
|
grouptype = RDKit::StereoGroupType::STEREO_OR;
|
|
groupid = FileParserUtils::toUnsigned(match[2], true);
|
|
} else if (match[1] == "RAC") {
|
|
grouptype = RDKit::StereoGroupType::STEREO_AND;
|
|
groupid = FileParserUtils::toUnsigned(match[2], true);
|
|
} else {
|
|
std::ostringstream errout;
|
|
errout << "Unrecognized stereogroup type : '" << tempStr << "' on line"
|
|
<< line;
|
|
throw FileParseException(errout.str());
|
|
}
|
|
|
|
const unsigned int count = FileParserUtils::toUnsigned(match[3], true);
|
|
std::vector<Atom *> atoms;
|
|
std::stringstream ss(match[4]);
|
|
unsigned int index;
|
|
for (size_t i = 0; i < count; ++i) {
|
|
ss >> index;
|
|
// atoms are 1 indexed in molfiles
|
|
atoms.push_back(mol->getAtomWithIdx(index - 1));
|
|
}
|
|
groups.emplace_back(grouptype, std::move(atoms), groupid);
|
|
} else {
|
|
// skip collection types we don't know how to read. Only one documented
|
|
// is MDLV30/HILITE
|
|
BOOST_LOG(rdWarningLog) << "Skipping unrecognized collection type at "
|
|
"line "
|
|
<< line << ": " << tempStr << std::endl;
|
|
}
|
|
tempStr = getV3000Line(inStream, line);
|
|
}
|
|
|
|
if (!groups.empty()) {
|
|
mol->setStereoGroups(std::move(groups));
|
|
}
|
|
tempStr = getV3000Line(inStream, line);
|
|
return tempStr;
|
|
}
|
|
|
|
//*************************************
|
|
//
|
|
// Every effort has been made to adhere to MDL's standard
|
|
// for mol files
|
|
//
|
|
//*************************************
|
|
|
|
// Parses an old-style (atom-block era) atom list line and replaces the
// referenced atom with a query atom that ORs together the listed elements.
//
// Fields as read by the code: 1-based atom index in columns 0-2, a 'T'/'F'
// negation flag in column 4, the number of entries in column 9 and the
// atomic numbers in 3-character fields starting at column 11, 4 columns
// apart.
//
// \param mol   molecule to modify (must not be null)
// \param text  the atom list line
// \param line  current line number, used in error messages
//
// \throws FileParseException if a field cannot be converted, the negation
//         flag is not 'T' or 'F', or the line is too short to hold the flag
void ParseOldAtomList(RWMol *mol, const std::string_view &text,
                      unsigned int line) {
  PRECONDITION(mol, "bad mol");
  unsigned int idx;
  try {
    idx = FileParserUtils::stripSpacesAndCast<unsigned int>(text.substr(0, 3)) -
          1;
  } catch (boost::bad_lexical_cast &) {
    std::ostringstream errout;
    errout << "Cannot convert '" << text.substr(0, 3) << "' to int on line "
           << line;
    throw FileParseException(errout.str());
  }

  URANGE_CHECK(idx, mol->getNumAtoms());
  QueryAtom a(*(mol->getAtomWithIdx(idx)));

  // Everything that can fail is parsed before the query object is
  // allocated, so no exit path below has to remember to free it.
  if (text.size() < 5) {
    std::ostringstream errout;
    errout << "Atom-list line is too short on line " << line;
    throw FileParseException(errout.str());
  }
  bool negate = false;
  switch (text[4]) {
    case 'T':
      negate = true;
      break;
    case 'F':
      negate = false;
      break;
    default: {
      std::ostringstream errout;
      errout << "Unrecognized atom-list query modifier: '" << text[4]
             << "' on line " << line;
      throw FileParseException(errout.str());
    }
  }

  int nQueries;
  try {
    nQueries = FileParserUtils::toInt(text.substr(9, 1));
  } catch (boost::bad_lexical_cast &) {
    std::ostringstream errout;
    errout << "Cannot convert '" << text.substr(9, 1) << "' to int on line "
           << line;
    throw FileParseException(errout.str());
  }

  RANGE_CHECK(0, nQueries, 5);
  std::vector<int> atNums;
  atNums.reserve(nQueries);
  for (int i = 0; i < nQueries; i++) {
    const int pos = 11 + i * 4;
    int atNum;
    try {
      atNum = FileParserUtils::toInt(text.substr(pos, 3));
    } catch (boost::bad_lexical_cast &) {
      std::ostringstream errout;
      errout << "Cannot convert '" << text.substr(pos, 3) << "' to int on line "
             << line;
      throw FileParseException(errout.str());
    }
    RANGE_CHECK(0, atNum, 200);  // goofy!
    atNums.push_back(atNum);
  }

  auto *q = new ATOM_OR_QUERY;
  q->setDescription("AtomOr");
  q->setNegation(negate);
  for (const int atNum : atNums) {
    q->addChild(
        QueryAtom::QUERYATOM_QUERY::CHILD_TYPE(makeAtomNumQuery(atNum)));
  }
  if (!atNums.empty()) {
    // the atom itself takes the first element of the list
    a.setAtomicNum(atNums.front());
  }

  a.setQuery(q);
  a.setProp(common_properties::_MolFileAtomQuery, 1);

  mol->replaceAtom(idx, &a);
}
|
|
|
|
// Applies the formal charges listed on an "M  CHG" property line.
//
// Fields as read by the code: entry count in columns 6-8, then one
// (1-based atom index, charge) pair per entry, each value in a 4-character
// field, starting at column 9.
//
// \param mol        molecule to modify (must not be null)
// \param text       the property line; must start with "M  CHG"
// \param firstCall  if true, the formal charge of every atom is reset to 0
//                   before the listed charges are applied
// \param line       current line number, used in error messages
//
// \throws FileParseException if a field cannot be converted to an integer
void ParseChargeLine(RWMol *mol, const std::string &text, bool firstCall,
                     unsigned int line) {
  PRECONDITION(mol, "bad mol");
  // the tag is six characters: 'M', two spaces, "CHG"
  PRECONDITION(text.substr(0, 6) == std::string("M  CHG"), "bad charge line");

  // if this line is specified all the atom other than those specified
  // here should carry a charge of 0; but we should only do this once:
  if (firstCall) {
    for (ROMol::AtomIterator ai = mol->beginAtoms(); ai != mol->endAtoms();
         ++ai) {
      (*ai)->setFormalCharge(0);
    }
  }

  int nent;
  try {
    nent = FileParserUtils::toInt(text.substr(6, 3));
  } catch (boost::bad_lexical_cast &) {
    std::ostringstream errout;
    errout << "Cannot convert '" << text.substr(6, 3) << "' to int on line "
           << line;
    throw FileParseException(errout.str());
  }
  int spos = 9;
  for (int ie = 0; ie < nent; ie++) {
    try {
      // spos is only advanced after a successful conversion, so in the
      // handler below it still points at the field that failed
      const int aid = FileParserUtils::toInt(text.substr(spos, 4));
      spos += 4;
      const int chg = FileParserUtils::toInt(text.substr(spos, 4));
      spos += 4;
      mol->getAtomWithIdx(aid - 1)->setFormalCharge(chg);
    } catch (boost::bad_lexical_cast &) {
      std::ostringstream errout;
      errout << "Cannot convert '" << text.substr(spos, 4)
             << "' to int on line " << line;
      throw FileParseException(errout.str());
    }
  }
}
|
|
|
|
// Applies the radical values listed on an "M  RAD" property line.
//
// Fields as read by the code: entry count in columns 6-8, then one
// (1-based atom index, radical code) pair per entry, each value in a
// 4-character field, starting at column 9. Radical codes map to radical
// electron counts as 1 -> 2, 2 -> 1, 3 -> 2.
//
// \param mol        molecule to modify (must not be null)
// \param text       the property line; must start with "M  RAD"
// \param firstCall  if true, the formal charge of every atom is reset to 0
//                   before the entries are applied
// \param line       current line number, used in error messages
//
// \throws FileParseException if a field cannot be converted to an integer
//         or a radical code is not 1, 2 or 3
void ParseRadicalLine(RWMol *mol, const std::string &text, bool firstCall,
                      unsigned int line) {
  PRECONDITION(mol, "bad mol");
  // the tag is six characters: 'M', two spaces, "RAD"
  PRECONDITION(text.substr(0, 6) == std::string("M  RAD"), "bad radical line");

  // if this line is specified all the atom other than those specified
  // here should carry a charge of 0; but we should only do this once:
  // NOTE(review): only formal charges are reset here, radical counts are
  // left alone - confirm that this is intended.
  if (firstCall) {
    for (ROMol::AtomIterator ai = mol->beginAtoms(); ai != mol->endAtoms();
         ++ai) {
      (*ai)->setFormalCharge(0);
    }
  }

  int nent;
  try {
    nent = FileParserUtils::toInt(text.substr(6, 3));
  } catch (boost::bad_lexical_cast &) {
    std::ostringstream errout;
    errout << "Cannot convert '" << text.substr(6, 3) << "' to int on line "
           << line;
    throw FileParseException(errout.str());
  }
  int spos = 9;
  for (int ie = 0; ie < nent; ie++) {
    try {
      const int aid = FileParserUtils::toInt(text.substr(spos, 4));
      spos += 4;
      const int rad = FileParserUtils::toInt(text.substr(spos, 4));
      spos += 4;

      switch (rad) {
        case 1:
          mol->getAtomWithIdx(aid - 1)->setNumRadicalElectrons(2);
          break;
        case 2:
          mol->getAtomWithIdx(aid - 1)->setNumRadicalElectrons(1);
          break;
        case 3:
          mol->getAtomWithIdx(aid - 1)->setNumRadicalElectrons(2);
          break;
        default: {
          std::ostringstream errout;
          errout << "Unrecognized radical value " << rad << " for atom "
                 << aid - 1 << " on line " << line << std::endl;
          throw FileParseException(errout.str());
        }
      }
    } catch (boost::bad_lexical_cast &) {
      std::ostringstream errout;
      errout << "Cannot convert '" << text.substr(spos, 4)
             << "' to int on line " << line;
      throw FileParseException(errout.str());
    }
  }
}
|
|
|
|
// Stores the payload of an "M  PXA" property line on the atom it names.
//
// The 1-based atom index is read from the 3 characters starting at column
// 7; the remainder of the line is stored verbatim in the atom property
// "_MolFile_PXA".
//
// \param mol   molecule to modify (must not be null)
// \param text  the property line; must start with "M  PXA"
// \param line  current line number, used in error messages
//
// \throws FileParseException if the atom index cannot be converted
void ParsePXALine(RWMol *mol, const std::string &text, unsigned int line) {
  PRECONDITION(mol, "bad mol");
  // the tag is six characters: 'M', two spaces, "PXA"
  PRECONDITION(text.substr(0, 6) == "M  PXA", "bad PXA line");
  unsigned int pos = 7;
  try {
    auto atIdx =
        FileParserUtils::stripSpacesAndCast<unsigned int>(text.substr(pos, 3));
    pos += 3;
    mol->getAtomWithIdx(atIdx - 1)->setProp(
        "_MolFile_PXA", text.substr(pos, text.length() - pos));
  } catch (boost::bad_lexical_cast &) {
    // pos has not been advanced yet when the cast fails, so this reports
    // the offending index field
    std::ostringstream errout;
    errout << "Cannot convert '" << text.substr(pos, 3) << "' to int on line "
           << line;
    throw FileParseException(errout.str());
  }
}
|
|
|
|
// Applies the isotope values listed on an "M  ISO" property line.
//
// Fields as read by the code: entry count in columns 6-8, then one
// (1-based atom index, isotope) pair per entry, each value in a
// 4-character field, starting at column 9. A missing or blank isotope
// field leaves the atom unchanged; a negative value is logged as an error
// and ignored.
//
// \param mol   molecule to modify (must not be null)
// \param text  the property line; must start with "M  ISO"
// \param line  current line number, used in error messages
//
// \throws FileParseException if a field cannot be converted to an integer
void ParseIsotopeLine(RWMol *mol, const std::string &text, unsigned int line) {
  PRECONDITION(mol, "bad mol");
  // the tag is six characters: 'M', two spaces, "ISO"
  PRECONDITION(text.substr(0, 6) == std::string("M  ISO"), "bad isotope line");

  unsigned int nent;
  try {
    nent = FileParserUtils::stripSpacesAndCast<unsigned int>(text.substr(6, 3));
  } catch (boost::bad_lexical_cast &) {
    std::ostringstream errout;
    errout << "Cannot convert '" << text.substr(6, 3) << "' to int on line "
           << line;
    throw FileParseException(errout.str());
  }
  unsigned int spos = 9;
  for (unsigned int ie = 0; ie < nent; ie++) {
    unsigned int aid;
    try {
      aid = FileParserUtils::stripSpacesAndCast<unsigned int>(
          text.substr(spos, 4));
      spos += 4;
      Atom *atom = mol->getAtomWithIdx(aid - 1);
      // a blank field is exactly four spaces
      if (text.size() >= spos + 4 && text.substr(spos, 4) != "    ") {
        int isotope = FileParserUtils::toInt(text.substr(spos, 4));
        if (isotope < 0) {
          BOOST_LOG(rdErrorLog)
              << " atom " << aid
              << " has a negative isotope value. line:  " << line << std::endl;
        } else {
          atom->setIsotope(isotope);
        }
      }
      spos += 4;
    } catch (boost::bad_lexical_cast &) {
      std::ostringstream errout;
      errout << "Cannot convert '" << text.substr(spos, 4)
             << "' to int on line " << line;
      throw FileParseException(errout.str());
    }
  }
}
|
|
|
|
// Turns the entries of an "M  SUB" property line into explicit-degree
// queries on the listed atoms.
//
// Fields as read by the code: entry count in columns 6-8, then one
// (1-based atom index, count) pair per entry, each value in a 4-character
// field, starting at column 9. Count handling:
//    0 (or blank/missing) : entry is skipped
//   -1                    : degree must be 0
//   -2                    : degree must equal the atom's current degree
//    1..6                 : degree must equal the value (6 logs a warning)
//
// \param mol   molecule to modify (must not be null)
// \param text  the property line; must start with "M  SUB"
// \param line  current line number, used in error messages
//
// \throws FileParseException if a field cannot be converted or the count
//         is not one of the values above
void ParseSubstitutionCountLine(RWMol *mol, const std::string &text,
                                unsigned int line) {
  PRECONDITION(mol, "bad mol");
  // the tag is six characters: 'M', two spaces, "SUB"
  PRECONDITION(text.substr(0, 6) == std::string("M  SUB"), "bad SUB line");

  unsigned int nent;
  try {
    nent = FileParserUtils::stripSpacesAndCast<unsigned int>(text.substr(6, 3));
  } catch (boost::bad_lexical_cast &) {
    std::ostringstream errout;
    errout << "Cannot convert '" << text.substr(6, 3) << "' to int on line "
           << line;
    throw FileParseException(errout.str());
  }
  unsigned int spos = 9;
  for (unsigned int ie = 0; ie < nent; ie++) {
    unsigned int aid;
    int count = 0;
    try {
      aid = FileParserUtils::stripSpacesAndCast<unsigned int>(
          text.substr(spos, 4));
      spos += 4;
      Atom *atom = mol->getAtomWithIdx(aid - 1);
      // a blank field is exactly four spaces
      if (text.size() >= spos + 4 && text.substr(spos, 4) != "    ") {
        count = FileParserUtils::toInt(text.substr(spos, 4));
      }
      spos += 4;
      if (count == 0) {
        continue;
      }
      ATOM_EQUALS_QUERY *q = makeAtomExplicitDegreeQuery(0);
      switch (count) {
        case -1:
          q->setVal(0);
          break;
        case -2:
          q->setVal(atom->getDegree());
          break;
        case 1:
        case 2:
        case 3:
        case 4:
        case 5:
          q->setVal(count);
          break;
        case 6:
          BOOST_LOG(rdWarningLog) << " atom degree query with value 6 found. "
                                     "This will not match degree >6. The MDL "
                                     "spec says it should.  line: "
                                  << line << std::endl;
          q->setVal(6);
          break;
        default: {
          // the query has not been handed to an atom yet: free it
          delete q;
          std::ostringstream errout;
          errout << "Value " << count
                 << " is not supported as a degree query. line: " << line;
          throw FileParseException(errout.str());
        }
      }
      if (!atom->hasQuery()) {
        atom = QueryOps::replaceAtomWithQueryAtom(mol, atom);
      }
      atom->expandQuery(q, Queries::COMPOSITE_AND);
    } catch (boost::bad_lexical_cast &) {
      std::ostringstream errout;
      errout << "Cannot convert '" << text.substr(spos, 4)
             << "' to int on line " << line;
      throw FileParseException(errout.str());
    }
  }
}
|
|
|
|
// Turns the entries of an "M  UNS" property line into unsaturation queries
// on the listed atoms.
//
// Fields as read by the code: entry count in columns 6-8, then one
// (1-based atom index, flag) pair per entry, each value in a 4-character
// field, starting at column 9. Flag 0 (or blank/missing) skips the entry,
// flag 1 adds the query, anything else is an error.
//
// \param mol   molecule to modify (must not be null)
// \param text  the property line; must start with "M  UNS"
// \param line  current line number, used in error messages
//
// \throws FileParseException if a field cannot be converted or the flag is
//         neither 0 nor 1
void ParseUnsaturationLine(RWMol *mol, const std::string &text,
                           unsigned int line) {
  PRECONDITION(mol, "bad mol");
  // the tag is six characters: 'M', two spaces, "UNS"
  PRECONDITION(text.substr(0, 6) == std::string("M  UNS"), "bad UNS line");

  unsigned int nent;
  try {
    nent = FileParserUtils::stripSpacesAndCast<unsigned int>(text.substr(6, 3));
  } catch (boost::bad_lexical_cast &) {
    std::ostringstream errout;
    errout << "Cannot convert '" << text.substr(6, 3) << "' to int on line "
           << line;
    throw FileParseException(errout.str());
  }
  unsigned int spos = 9;
  for (unsigned int ie = 0; ie < nent; ie++) {
    unsigned int aid;
    int count = 0;
    try {
      aid = FileParserUtils::stripSpacesAndCast<unsigned int>(
          text.substr(spos, 4));
      spos += 4;
      Atom *atom = mol->getAtomWithIdx(aid - 1);
      // a blank field is exactly four spaces
      if (text.size() >= spos + 4 && text.substr(spos, 4) != "    ") {
        count = FileParserUtils::toInt(text.substr(spos, 4));
      }
      spos += 4;
      if (count == 0) {
        continue;
      } else if (count == 1) {
        ATOM_EQUALS_QUERY *q = makeAtomUnsaturatedQuery();
        if (!atom->hasQuery()) {
          atom = QueryOps::replaceAtomWithQueryAtom(mol, atom);
        }
        atom->expandQuery(q, Queries::COMPOSITE_AND);
      } else {
        std::ostringstream errout;
        errout << "Value " << count
               << " is not supported as an unsaturation "
                  "query (only 0 and 1 are allowed). "
                  "line: "
               << line;
        throw FileParseException(errout.str());
      }
    } catch (boost::bad_lexical_cast &) {
      std::ostringstream errout;
      errout << "Cannot convert '" << text.substr(spos, 4)
             << "' to int on line " << line;
      throw FileParseException(errout.str());
    }
  }
}
|
|
|
|
// Turns the entries of an "M  RBC" property line into ring-bond-count
// queries on the listed atoms.
//
// Fields as read by the code: entry count in columns 6-8, then one
// (1-based atom index, count) pair per entry, each value in a 4-character
// field, starting at column 9. Count handling:
//    0 (or blank/missing) : entry is skipped
//   -1                    : ring bond count must be 0
//   -2                    : placeholder value 0xDEADBEEF is stored and the
//                           molecule is flagged with _NeedsQueryScan
//    1..3                 : ring bond count must equal the value
//    4                    : a less-or-equal query with value 4 is used
//
// \param mol   molecule to modify (must not be null)
// \param text  the property line; must start with "M  RBC"
// \param line  current line number, used in error messages
//
// \throws FileParseException if a field cannot be converted or the count
//         is not one of the values above
void ParseRingBondCountLine(RWMol *mol, const std::string &text,
                            unsigned int line) {
  PRECONDITION(mol, "bad mol");
  // the tag is six characters: 'M', two spaces, "RBC"
  PRECONDITION(text.substr(0, 6) == std::string("M  RBC"), "bad RBC line");

  unsigned int nent;
  try {
    nent = FileParserUtils::stripSpacesAndCast<unsigned int>(text.substr(6, 3));
  } catch (boost::bad_lexical_cast &) {
    std::ostringstream errout;
    errout << "Cannot convert '" << text.substr(6, 3) << "' to int on line "
           << line;
    throw FileParseException(errout.str());
  }
  unsigned int spos = 9;
  for (unsigned int ie = 0; ie < nent; ie++) {
    unsigned int aid;
    int count = 0;
    try {
      aid = FileParserUtils::stripSpacesAndCast<unsigned int>(
          text.substr(spos, 4));
      spos += 4;
      Atom *atom = mol->getAtomWithIdx(aid - 1);
      // a blank field is exactly four spaces
      if (text.size() >= spos + 4 && text.substr(spos, 4) != "    ") {
        count = FileParserUtils::toInt(text.substr(spos, 4));
      }
      spos += 4;
      if (count == 0) {
        continue;
      }
      ATOM_EQUALS_QUERY *q = makeAtomRingBondCountQuery(0);
      switch (count) {
        case -1:
          q->setVal(0);
          break;
        case -2:
          q->setVal(0xDEADBEEF);
          mol->setProp(common_properties::_NeedsQueryScan, 1);
          break;
        case 1:
        case 2:
        case 3:
          q->setVal(count);
          break;
        case 4:
          // swap the equality query for a less-or-equal one
          delete q;
          q = static_cast<ATOM_EQUALS_QUERY *>(new ATOM_LESSEQUAL_QUERY);
          q->setVal(4);
          q->setDescription("AtomRingBondCount");
          q->setDataFunc(queryAtomRingBondCount);
          break;
        default: {
          // the query has not been handed to an atom yet: free it
          delete q;
          std::ostringstream errout;
          errout << "Value " << count
                 << " is not supported as a ring-bond count query. line: "
                 << line;
          throw FileParseException(errout.str());
        }
      }
      if (!atom->hasQuery()) {
        atom = QueryOps::replaceAtomWithQueryAtom(mol, atom);
      }
      atom->expandQuery(q, Queries::COMPOSITE_AND);
    } catch (boost::bad_lexical_cast &) {
      std::ostringstream errout;
      errout << "Cannot convert '" << text.substr(spos, 4)
             << "' to int on line " << line;
      throw FileParseException(errout.str());
    }
  }
}
|
|
|
|
// Applies the formal charges listed on an "M  ZCH" property line.
//
// Fields as read by the code: entry count in columns 6-8, then one
// (1-based atom index, charge) pair per entry, each value in a 4-character
// field, starting at column 9. A blank or missing charge field means 0.
//
// \param mol   molecule to modify (must not be null)
// \param text  the property line; must start with "M  ZCH"
// \param line  current line number, used in error messages
//
// \throws FileParseException if a field cannot be converted or the atom
//         index is 0 or larger than the number of atoms
void ParseZCHLine(RWMol *mol, const std::string &text, unsigned int line) {
  // part of Alex Clark's ZBO proposal
  // from JCIM 51:3149-57 (2011)
  PRECONDITION(mol, "bad mol");
  // the tag is six characters: 'M', two spaces, "ZCH"
  PRECONDITION(text.substr(0, 6) == std::string("M  ZCH"), "bad ZCH line");

  unsigned int nent;
  try {
    nent = FileParserUtils::stripSpacesAndCast<unsigned int>(text.substr(6, 3));
  } catch (boost::bad_lexical_cast &) {
    std::ostringstream errout;
    errout << "Cannot convert '" << text.substr(6, 3) << "' to int on line "
           << line;
    throw FileParseException(errout.str());
  }
  unsigned int spos = 9;
  for (unsigned int ie = 0; ie < nent; ie++) {
    unsigned int aid = 0;
    int val = 0;
    try {
      aid = FileParserUtils::stripSpacesAndCast<unsigned int>(
          text.substr(spos, 4));
      spos += 4;
      // a blank field is exactly four spaces
      if (text.size() >= spos + 4 && text.substr(spos, 4) != "    ") {
        val = FileParserUtils::stripSpacesAndCast<int>(text.substr(spos, 4));
      }
      if (!aid || aid > mol->getNumAtoms()) {
        std::ostringstream errout;
        errout << "Bad ZCH specification on line " << line;
        throw FileParseException(errout.str());
      }
      spos += 4;
      --aid;
      Atom *atom = mol->getAtomWithIdx(aid);
      if (!atom) {
        std::ostringstream errout;
        errout << "Atom " << aid << " from ZCH specification on line " << line
               << " not found";
        throw FileParseException(errout.str());
      } else {
        atom->setFormalCharge(val);
      }
    } catch (boost::bad_lexical_cast &) {
      std::ostringstream errout;
      errout << "Cannot convert '" << text.substr(spos, 4)
             << "' to int on line " << line;
      throw FileParseException(errout.str());
    }
  }
}
|
|
|
|
// Applies the explicit hydrogen counts listed on an "M  HYD" property line.
//
// Fields as read by the code: entry count in columns 6-8, then one
// (1-based atom index, H count) pair per entry, each value in a
// 4-character field, starting at column 9. For a non-negative count the
// atom gets the "_ZBO_H" property and that number of explicit Hs; a blank,
// missing or negative count leaves the atom unchanged.
//
// \param mol   molecule to modify (must not be null)
// \param text  the property line; must start with "M  HYD"
// \param line  current line number, used in error messages
//
// \throws FileParseException if a field cannot be converted or the atom
//         index is 0 or larger than the number of atoms
void ParseHYDLine(RWMol *mol, const std::string &text, unsigned int line) {
  // part of Alex Clark's ZBO proposal
  // from JCIM 51:3149-57 (2011)
  PRECONDITION(mol, "bad mol");
  // the tag is six characters: 'M', two spaces, "HYD"
  PRECONDITION(text.substr(0, 6) == std::string("M  HYD"), "bad HYD line");

  unsigned int nent;
  try {
    nent = FileParserUtils::stripSpacesAndCast<unsigned int>(text.substr(6, 3));
  } catch (boost::bad_lexical_cast &) {
    std::ostringstream errout;
    errout << "Cannot convert '" << text.substr(6, 3) << "' to int on line "
           << line;
    throw FileParseException(errout.str());
  }
  unsigned int spos = 9;
  for (unsigned int ie = 0; ie < nent; ie++) {
    unsigned int aid = 0;
    int val = -1;
    try {
      aid = FileParserUtils::stripSpacesAndCast<unsigned int>(
          text.substr(spos, 4));
      spos += 4;
      // a blank field is exactly four spaces
      if (text.size() >= spos + 4 && text.substr(spos, 4) != "    ") {
        val = FileParserUtils::stripSpacesAndCast<int>(text.substr(spos, 4));
      }
      if (!aid || aid > mol->getNumAtoms()) {
        std::ostringstream errout;
        errout << "Bad HYD specification on line " << line;
        throw FileParseException(errout.str());
      }
      spos += 4;
      --aid;
      Atom *atom = mol->getAtomWithIdx(aid);
      if (!atom) {
        std::ostringstream errout;
        errout << "Atom " << aid << " from HYD specification on line " << line
               << " not found";
        throw FileParseException(errout.str());
      } else {
        if (val >= 0) {
          atom->setProp("_ZBO_H", true);
          atom->setNumExplicitHs(val);
        }
      }
    } catch (boost::bad_lexical_cast &) {
      std::ostringstream errout;
      errout << "Cannot convert '" << text.substr(spos, 4)
             << "' to int on line " << line;
      throw FileParseException(errout.str());
    }
  }
}
|
|
|
|
// Applies the bond orders listed on an "M  ZBO" property line.
//
// Fields as read by the code: entry count in columns 6-8, then one
// (1-based bond index, order) pair per entry, each value in a 4-character
// field, starting at column 9. Order 0 (also the value used for a blank or
// missing field) sets Bond::ZERO; any other value is cast directly to
// Bond::BondType.
//
// \param mol   molecule to modify (must not be null)
// \param text  the property line; must start with "M  ZBO"
// \param line  current line number, used in error messages
//
// \throws FileParseException if a field cannot be converted or the bond
//         index is 0 or larger than the number of bonds
void ParseZBOLine(RWMol *mol, const std::string &text, unsigned int line) {
  // part of Alex Clark's ZBO proposal
  // from JCIM 51:3149-57 (2011)
  PRECONDITION(mol, "bad mol");
  // the tag is six characters: 'M', two spaces, "ZBO"
  PRECONDITION(text.substr(0, 6) == std::string("M  ZBO"), "bad ZBO line");

  unsigned int nent;
  try {
    nent = FileParserUtils::stripSpacesAndCast<unsigned int>(text.substr(6, 3));
  } catch (boost::bad_lexical_cast &) {
    std::ostringstream errout;
    errout << "Cannot convert '" << text.substr(6, 3) << "' to int on line "
           << line;
    throw FileParseException(errout.str());
  }
  unsigned int spos = 9;
  for (unsigned int ie = 0; ie < nent; ie++) {
    unsigned int bid = 0;
    unsigned int order = 0;
    try {
      bid = FileParserUtils::stripSpacesAndCast<unsigned int>(
          text.substr(spos, 4));
      spos += 4;
      // a blank field is exactly four spaces
      if (text.size() >= spos + 4 && text.substr(spos, 4) != "    ") {
        order = FileParserUtils::stripSpacesAndCast<unsigned int>(
            text.substr(spos, 4));
      }
      if (!bid || bid > mol->getNumBonds()) {
        std::ostringstream errout;
        errout << "Bad ZBO specification on line " << line;
        throw FileParseException(errout.str());
      }
      spos += 4;
      --bid;
      Bond *bnd = mol->getBondWithIdx(bid);
      if (!bnd) {
        std::ostringstream errout;
        errout << "Bond " << bid << " from ZBO specification on line " << line
               << " not found";
        throw FileParseException(errout.str());
      } else {
        if (order == 0) {
          bnd->setBondType(Bond::ZERO);
        } else {
          bnd->setBondType(static_cast<Bond::BondType>(order));
        }
      }
    } catch (boost::bad_lexical_cast &) {
      std::ostringstream errout;
      errout << "Cannot convert '" << text.substr(spos, 4)
             << "' to int on line " << line;
      throw FileParseException(errout.str());
    }
  }
}
|
|
|
|
// Handles a Marvin "M  MRV SMA" line: the SMARTS it carries is stored on
// the referenced atom (property MRV_SMA) and added to that atom's query as
// a recursive structure query.
//
// Lines that do not start with "M  MRV SMA" are ignored. The 1-based atom
// index is read from columns 10-14 and the SMARTS from column 15 onwards.
//
// \param mol   molecule to modify
// \param text  the property line
// \param line  current line number, used in error messages
//
// \throws FileParseException if the atom index cannot be converted or the
//         SMARTS cannot be parsed
void ParseMarvinSmartsLine(RWMol *mol, const std::string &text,
                           unsigned int line) {
  const unsigned int atomNumStart = 10;
  const unsigned int smartsStart = 15;
  // M  MRV SMA   1 [*;A]
  // 01234567890123456789
  //           1111111111
  // the tag is ten characters: 'M', two spaces, "MRV SMA"
  if (text.substr(0, 10) != "M  MRV SMA") {
    return;
  }

  unsigned int idx;
  std::string idxTxt = text.substr(atomNumStart, smartsStart - atomNumStart);
  try {
    idx = FileParserUtils::stripSpacesAndCast<unsigned int>(idxTxt) - 1;
  } catch (boost::bad_lexical_cast &) {
    std::ostringstream errout;
    errout << "Cannot convert '" << idxTxt << "' to an atom index on line "
           << line;
    throw FileParseException(errout.str());
  }

  URANGE_CHECK(idx, mol->getNumAtoms());
  // Should we check the validity of the marvin line here?  Should we
  // automatically
  // Add these as recursive smarts?  I tend to think so...
  std::string sma = text.substr(smartsStart);
  Atom *at = mol->getAtomWithIdx(idx);
  at->setProp(common_properties::MRV_SMA, sma);
  RWMol *m = nullptr;
  try {
    m = SmartsToMol(sma);
  } catch (...) {
    // Is this ever used?
    // Either way a failure leaves m null, which is reported below.
  }

  if (m) {
    QueryAtom::QUERYATOM_QUERY *query = new RecursiveStructureQuery(m);
    if (!at->hasQuery()) {
      // swap in a query atom; replaceAtom() copies, so re-fetch the pointer
      QueryAtom qAt(*at);
      int oidx = at->getIdx();
      mol->replaceAtom(oidx, &qAt);
      at = mol->getAtomWithIdx(oidx);
    }
    at->expandQuery(query, Queries::COMPOSITE_AND);
    at->setProp(common_properties::_MolFileAtomQuery, 1);
  } else {
    std::ostringstream errout;
    errout << "Cannot parse smarts: '" << sma << "' on line " << line;
    throw FileParseException(errout.str());
  }
}
|
|
|
|
// Applies the attachment point values listed on an "M  APO" property line.
//
// Fields as read by the code: entry count in columns 6-8, then one
// (1-based atom index, value) pair per entry, each value in a 4-character
// field, starting at column 9. Value 0 (or blank/missing) is ignored,
// 1 and 2 are stored as-is in the molAttachPoint atom property and 3 is
// stored as -1.
//
// \param mol            molecule to modify (must not be null)
// \param text           the property line; must start with "M  APO"
// \param line           current line number, used in error messages
// \param strictParsing  if true, a second attachment point value for the
//                       same atom is an error; otherwise it is logged as a
//                       warning and the first value is kept
//
// \throws FileParseException if a field cannot be converted, the atom index
//         is 0 or larger than the number of atoms, the value is outside
//         0..3, or (strict mode) an atom is given more than one value
void ParseAttachPointLine(RWMol *mol, const std::string &text,
                          unsigned int line, bool strictParsing) {
  PRECONDITION(mol, "bad mol");
  // the tag is six characters: 'M', two spaces, "APO"
  PRECONDITION(text.substr(0, 6) == std::string("M  APO"), "bad APO line");

  unsigned int nent;
  try {
    nent = FileParserUtils::stripSpacesAndCast<unsigned int>(text.substr(6, 3));
  } catch (boost::bad_lexical_cast &) {
    std::ostringstream errout;
    errout << "Cannot convert '" << text.substr(6, 3) << "' to int on line "
           << line;
    throw FileParseException(errout.str());
  }
  unsigned int spos = 9;
  for (unsigned int ie = 0; ie < nent; ie++) {
    unsigned int aid = 0;
    int val = 0;
    try {
      aid = FileParserUtils::stripSpacesAndCast<unsigned int>(
          text.substr(spos, 4));
      spos += 4;
      // a blank field is exactly four spaces
      if (text.size() >= spos + 4 && text.substr(spos, 4) != "    ") {
        val = FileParserUtils::stripSpacesAndCast<int>(text.substr(spos, 4));
      }
      if (!aid || aid > mol->getNumAtoms()) {
        std::ostringstream errout;
        errout << "Bad APO specification on line " << line;
        throw FileParseException(errout.str());
      }
      spos += 4;
      --aid;
      Atom *atom = mol->getAtomWithIdx(aid);
      if (!atom) {
        std::ostringstream errout;
        errout << "Atom " << aid << " from APO specification on line " << line
               << " not found";
        throw FileParseException(errout.str());
      } else {
        if (val < 0 || val > 3) {
          std::ostringstream errout;
          errout << "Value " << val << " from APO specification on line "
                 << line << " is invalid";
          throw FileParseException(errout.str());
        } else if (val) {
          if (val == 3) {
            // this is -1 in v3k mol blocks, so use that:
            val = -1;
          }
          if (atom->hasProp(common_properties::molAttachPoint)) {
            std::ostringstream errout;
            errout << "Multiple ATTCHPT values for atom " << atom->getIdx() + 1
                   << " on line " << line;
            if (strictParsing) {
              throw FileParseException(errout.str());
            } else {
              BOOST_LOG(rdWarningLog) << errout.str() << std::endl;
            }
          } else {
            atom->setProp(common_properties::molAttachPoint, val);
          }
        }
      }
    } catch (boost::bad_lexical_cast &) {
      std::ostringstream errout;
      errout << "Cannot convert '" << text.substr(spos, 4)
             << "' to int on line " << line;
      throw FileParseException(errout.str());
    }
  }
}
|
|
|
|
// the format differs between V2000 and V3000, so we have to do a bit of
|
|
// translation here
|
|
void ParseLinkNodeLine(RWMol *mol, const std::string &text, unsigned int line) {
|
|
PRECONDITION(mol, "bad mol");
|
|
PRECONDITION(text.substr(0, 6) == std::string("M LIN"), "bad LIN line");
|
|
|
|
unsigned int nent;
|
|
try {
|
|
nent = FileParserUtils::stripSpacesAndCast<unsigned int>(text.substr(6, 3));
|
|
} catch (boost::bad_lexical_cast &) {
|
|
std::ostringstream errout;
|
|
errout << "Cannot convert '" << text.substr(6, 3) << "' to int on line "
|
|
<< line;
|
|
throw FileParseException(errout.str());
|
|
}
|
|
std::string propVal = "";
|
|
unsigned int spos = 9;
|
|
for (unsigned int ie = 0; ie < nent; ie++) {
|
|
try {
|
|
auto aid = FileParserUtils::stripSpacesAndCast<unsigned int>(
|
|
text.substr(spos, 4));
|
|
if (!aid || aid > mol->getNumAtoms()) {
|
|
std::ostringstream errout;
|
|
errout << "LIN specification has bad atom idx on line " << line;
|
|
throw FileParseException(errout.str());
|
|
}
|
|
spos += 4;
|
|
|
|
if (text.size() < spos + 4 || text.substr(spos, 4) == " ") {
|
|
std::ostringstream errout;
|
|
errout << "LIN specification missing repeat count on line " << line;
|
|
throw FileParseException(errout.str());
|
|
}
|
|
auto repeatCount = FileParserUtils::stripSpacesAndCast<unsigned int>(
|
|
text.substr(spos, 4));
|
|
spos += 4;
|
|
if (repeatCount < 2) {
|
|
std::ostringstream errout;
|
|
errout << "LIN specification: repeat count must be >=2 on line "
|
|
<< line;
|
|
throw FileParseException(errout.str());
|
|
}
|
|
unsigned int substB = 0;
|
|
unsigned int substC = 0;
|
|
if (text.size() >= spos + 4 && text.substr(spos, 4) != " ") {
|
|
substB = FileParserUtils::stripSpacesAndCast<unsigned int>(
|
|
text.substr(spos, 4));
|
|
}
|
|
spos += 4;
|
|
if (text.size() >= spos + 4 && text.substr(spos, 4) != " ") {
|
|
substC = FileParserUtils::stripSpacesAndCast<unsigned int>(
|
|
text.substr(spos, 4));
|
|
}
|
|
spos += 4;
|
|
|
|
if (!substB || substB > mol->getNumAtoms() ||
|
|
substC > mol->getNumAtoms()) {
|
|
std::ostringstream errout;
|
|
errout << "LIN specification has bad substituent idx on line " << line;
|
|
throw FileParseException(errout.str());
|
|
}
|
|
|
|
boost::format formatter;
|
|
if (substC) {
|
|
formatter = boost::format("1 %1% 2 %2% %3% %2% %4%") % repeatCount %
|
|
aid % substB % substC;
|
|
} else {
|
|
formatter = boost::format("1 %1% 1 %2% %3%") % repeatCount % aid %
|
|
substB % substC;
|
|
}
|
|
if (!propVal.empty()) {
|
|
propVal += "|";
|
|
}
|
|
propVal += formatter.str();
|
|
} catch (boost::bad_lexical_cast &) {
|
|
std::ostringstream errout;
|
|
errout << "Cannot convert '" << text.substr(spos, 4)
|
|
<< "' to int on line " << line;
|
|
throw FileParseException(errout.str());
|
|
}
|
|
mol->setProp(common_properties::molFileLinkNodes, propVal);
|
|
}
|
|
}
|
|
|
|
// Recursively populates queryVect with COMPOSITE_AND queries
|
|
// present in the input query. If the logic of the input query
|
|
// is more complex, it returns nullptr and empty set.
|
|
// The returned ptr should only be checked for not being null
|
|
// and not used for any other purposes, as the actual result is
|
|
// the queryVect
|
|
const QueryAtom::QUERYATOM_QUERY *getAndQueries(
|
|
const QueryAtom::QUERYATOM_QUERY *q,
|
|
std::vector<const QueryAtom::QUERYATOM_QUERY *> &queryVect) {
|
|
if (q) {
|
|
auto qOrig = q;
|
|
for (auto cq = qOrig->beginChildren(); cq != qOrig->endChildren(); ++cq) {
|
|
if (q == qOrig && q->getDescription() != "AtomAnd") {
|
|
q = nullptr;
|
|
break;
|
|
}
|
|
q = getAndQueries(cq->get(), queryVect);
|
|
}
|
|
if (q == qOrig) {
|
|
queryVect.push_back(q);
|
|
}
|
|
}
|
|
if (!q) {
|
|
queryVect.clear();
|
|
}
|
|
return q;
|
|
}
|
|
|
|
// Parses a V2000 "M  ALS" (atom list) property line and replaces the
// referenced atom with a QueryAtom matching any of the listed elements.
//
// Layout read by the code: atom index (1-based) in columns 7-9, number of
// entries in columns 10-12, the 'T'/'F' modifier in column 14 ('T' negates the
// list) and the element symbols in 4-character fields starting at column 16.
// Queries that were AND-ed on the original atom (other than atomic number
// queries) are carried over to the new atom.
//
// \param mol   molecule to modify
// \param text  the complete "M  ALS" line
// \param line  line number in the input, only used for error messages
//
// \throws FileParseException if the line is too short, a number cannot be
//         converted, the list length is negative or the modifier is neither
//         'T' nor 'F'. An unknown element symbol is reported by
//         PeriodicTable::getAtomicNumber(). An empty list only triggers a
//         warning.
void ParseNewAtomList(RWMol *mol, const std::string &text, unsigned int line) {
  if (text.size() < 15) {
    std::ostringstream errout;
    errout << "Atom list line too short: '" << text << "'";
    throw FileParseException(errout.str());
  }
  PRECONDITION(mol, "bad mol");
  PRECONDITION(text.substr(0, 6) == std::string("M  ALS"),
               "bad atom list line");

  unsigned int idx;
  try {
    idx = FileParserUtils::stripSpacesAndCast<unsigned int>(text.substr(7, 3)) -
          1;
  } catch (boost::bad_lexical_cast &) {
    std::ostringstream errout;
    errout << "Cannot convert '" << text.substr(7, 3) << "' to int on line "
           << line;
    throw FileParseException(errout.str());
  }
  URANGE_CHECK(idx, mol->getNumAtoms());

  int nQueries;
  try {
    nQueries = FileParserUtils::toInt(text.substr(10, 3));
  } catch (boost::bad_lexical_cast &) {
    std::ostringstream errout;
    errout << "Cannot convert '" << text.substr(10, 3) << "' to int on line "
           << line;
    throw FileParseException(errout.str());
  }

  if (!nQueries) {
    BOOST_LOG(rdWarningLog) << "Empty atom list: '" << text << "' on line "
                            << line << "." << std::endl;
    return;
  }

  if (nQueries < 0) {
    std::ostringstream errout;
    errout << "negative length atom list: '" << text << "' on line " << line
           << "." << std::endl;
    throw FileParseException(errout.str());
  }

  // Everything that can fail on malformed input is read and validated before
  // any atom is allocated, so that an exception cannot leak the new atom.
  std::vector<int> atNums;
  atNums.reserve(static_cast<std::size_t>(nQueries));
  for (unsigned int i = 0; i < static_cast<unsigned int>(nQueries); i++) {
    unsigned int pos = 16 + i * 4;
    if (text.size() < pos + 4) {
      std::ostringstream errout;
      errout << "Atom list line too short: '" << text << "' on line " << line;
      throw FileParseException(errout.str());
    }

    std::string atSymb = text.substr(pos, 4);
    // keep what precedes the first blank; a field without any blank is used
    // as is (erasing from npos would throw std::out_of_range)
    const auto blankPos = atSymb.find(' ');
    if (blankPos != std::string::npos) {
      atSymb.erase(blankPos);
    }
    atNums.push_back(PeriodicTable::getTable()->getAtomicNumber(atSymb));
  }

  const char modifier = text[14];
  if (modifier != 'T' && modifier != 'F') {
    std::ostringstream errout;
    errout << "Unrecognized atom-list query modifier: '" << modifier
           << "' on line " << line;
    throw FileParseException(errout.str());
  }

  // pick up the query of the original atom, if it has one
  Atom *aOrig = mol->getAtomWithIdx(idx);
  QueryAtom::QUERYATOM_QUERY *qOrig = nullptr;
  if (aOrig->hasQuery()) {
    auto *qaOrig = dynamic_cast<QueryAtom *>(aOrig);
    if (qaOrig) {
      qOrig = qaOrig->getQuery();
    }
  }

  auto *a = new QueryAtom(*aOrig);
  a->setAtomicNum(atNums.front());
  // when the original atom provides no query, work on a private copy of the
  // query the new atom was created with; that copy has to be freed here
  bool ownsQOrig = false;
  if (!qOrig) {
    qOrig = a->getQuery()->copy();
    ownsQOrig = true;
  }

  // the list itself: first entry, OR-ed with all the other ones
  a->setQuery(makeAtomNumQuery(atNums.front()));
  for (std::size_t i = 1; i < atNums.size(); ++i) {
    a->expandQuery(makeAtomNumQuery(atNums[i]), Queries::COMPOSITE_OR, true);
  }

  if (qOrig) {
    // re-apply the AND-ed parts of the original query, leaving out atomic
    // number queries since the list replaces them
    std::vector<const QueryAtom::QUERYATOM_QUERY *> queryVect;
    if (getAndQueries(qOrig, queryVect)) {
      for (const auto &q : queryVect) {
        if (q->getDescription() != "AtomAtomicNum") {
          a->expandQuery(q->copy(), Queries::COMPOSITE_AND, true);
        }
      }
    }
    if (ownsQOrig) {
      delete qOrig;
    }
  }

  a->setProp(common_properties::_MolFileAtomQuery, 1);
  a->getQuery()->setNegation(modifier == 'T');

  // replaceAtom() is handed a pointer that is deleted right afterwards, as in
  // the original code
  mol->replaceAtom(idx, a);
  delete a;
}
|
|
|
|
// Parses the value of a V3000 RGROUPS atom property, "(n r1 r2 ... rn)", and
// turns the atom into an R-group query atom.
//
// For every label the atom is replaced by a query atom (the `atom` reference
// is updated to point at the replacement), the label is stored as
// common_properties::_MolFileRLabel, "R<label>" as
// common_properties::dummyLabel, the isotope is set to the label and the query
// is set to a null query. When several labels are listed they are applied one
// after the other to the same atom.
//
// \param mol   molecule owning the atom
// \param atom  atom to convert; updated in place
// \param text  the RGROUPS value including the surrounding parentheses
// \param line  line number in the input, only used for error messages
//
// \throws FileParseException if the parentheses are missing, a value cannot
//         be converted or fewer labels than announced are present.
void ParseV3000RGroups(RWMol *mol, Atom *&atom, std::string_view text,
                       unsigned int line) {
  PRECONDITION(mol, "bad mol");
  PRECONDITION(atom, "bad atom");
  // the emptiness test comes first: front()/back() must not be called on an
  // empty string_view
  if (text.empty() || text.front() != '(' || text.back() != ')') {
    std::ostringstream errout;
    errout << "Bad RGROUPS specification '" << text << "' on line " << line
           << ". Missing parens.";
    throw FileParseException(errout.str());
  }
  std::vector<std::string> splitToken;
  std::string resid = std::string(text.substr(1, text.size() - 2));
  boost::split(splitToken, resid, boost::is_any_of(std::string(" ")));
  if (splitToken.empty()) {
    std::ostringstream errout;
    errout << "Bad RGROUPS specification '" << text << "' on line " << line
           << ". Missing values.";
    throw FileParseException(errout.str());
  }
  unsigned int nRs;
  try {
    nRs = FileParserUtils::stripSpacesAndCast<unsigned int>(splitToken[0]);
  } catch (boost::bad_lexical_cast &) {
    std::ostringstream errout;
    errout << "Cannot convert '" << splitToken[0] << "' to int on line "
           << line;
    throw FileParseException(errout.str());
  }
  // compare against size() - 1 rather than computing nRs + 1, which would
  // wrap around to 0 for the largest unsigned value
  if (splitToken.size() - 1 < nRs) {
    std::ostringstream errout;
    errout << "Bad RGROUPS specification '" << text << "' on line " << line
           << ". Not enough values.";
    throw FileParseException(errout.str());
  }
  for (unsigned int i = 0; i < nRs; ++i) {
    unsigned int rLabel;
    try {
      rLabel =
          FileParserUtils::stripSpacesAndCast<unsigned int>(splitToken[i + 1]);
    } catch (boost::bad_lexical_cast &) {
      std::ostringstream errout;
      errout << "Cannot convert '" << splitToken[i + 1] << "' to int on line "
             << line;
      throw FileParseException(errout.str());
    }
    atom = QueryOps::replaceAtomWithQueryAtom(mol, atom);
    atom->setProp(common_properties::_MolFileRLabel, rLabel);
    std::string dLabel = "R" + std::to_string(rLabel);
    atom->setProp(common_properties::dummyLabel, dLabel);
    atom->setIsotope(rLabel);
    atom->setQuery(makeAtomNullQuery());
  }
}
|
|
|
|
// Parses a V2000 "M  RGP" (R-group label) property line and replaces every
// referenced atom with an R-group query atom.
//
// Layout read by the code: number of entries in columns 6-8, then 8-character
// entries starting at column 10 holding the 1-based atom index (3 characters)
// and, four columns further, the R label (3 characters).
//
// Each replacement atom carries common_properties::_MolFileRLabel, a
// common_properties::dummyLabel of "R<label>" and a null query.
//
// \param mol   molecule to modify
// \param text  the complete "M  RGP" line
// \param line  line number in the input, only used for error messages
//
// \throws FileParseException if the line is too short, a number cannot be
//         converted or an atom index does not refer to an existing atom.
void ParseRGroupLabels(RWMol *mol, const std::string &text, unsigned int line) {
  PRECONDITION(mol, "bad mol");
  PRECONDITION(text.substr(0, 6) == std::string("M  RGP"),
               "bad R group label line");

  int nLabels;
  try {
    nLabels = FileParserUtils::toInt(text.substr(6, 3));
  } catch (boost::bad_lexical_cast &) {
    std::ostringstream errout;
    errout << "Cannot convert '" << text.substr(6, 3) << "' to int on line "
           << line;
    throw FileParseException(errout.str());
  }

  for (int i = 0; i < nLabels; i++) {
    int pos = 10 + i * 8;
    // both substr() calls below need their start offset to lie inside the
    // line; report a short line as a parse error instead of letting
    // std::out_of_range escape
    if (text.size() < static_cast<std::size_t>(pos) + 4) {
      std::ostringstream errout;
      errout << "R group label line too short: '" << text << "' on line "
             << line;
      throw FileParseException(errout.str());
    }
    unsigned int atIdx;
    try {
      atIdx = FileParserUtils::stripSpacesAndCast<unsigned int>(
          text.substr(pos, 3));
    } catch (boost::bad_lexical_cast &) {
      std::ostringstream errout;
      errout << "Cannot convert '" << text.substr(pos, 3) << "' to int on line "
             << line;
      throw FileParseException(errout.str());
    }
    unsigned int rLabel;
    try {
      rLabel = FileParserUtils::stripSpacesAndCast<unsigned int>(
          text.substr(pos + 4, 3));
    } catch (boost::bad_lexical_cast &) {
      std::ostringstream errout;
      errout << "Cannot convert '" << text.substr(pos + 4, 3)
             << "' to int on line " << line;
      throw FileParseException(errout.str());
    }
    // switch to 0-based indexing; an index of 0 in the file wraps around to a
    // huge value and is rejected by the check below. Valid indices are
    // 0 .. getNumAtoms() - 1, hence ">=".
    atIdx -= 1;
    if (atIdx >= mol->getNumAtoms()) {
      std::ostringstream errout;
      errout << "Attempt to set R group label on nonexistent atom " << atIdx
             << " on line " << line;
      throw FileParseException(errout.str());
    }
    QueryAtom qatom(*(mol->getAtomWithIdx(atIdx)));
    qatom.setProp(common_properties::_MolFileRLabel, rLabel);

    // set the dummy label so that this is shown correctly
    // in other pieces of the code :
    // (this was sf.net issue 3316600)
    std::string dLabel = "R" + std::to_string(rLabel);
    qatom.setProp(common_properties::dummyLabel, dLabel);

    // the CTFile spec (June 2005 version) technically only allows
    // R labels up to 32. Since there are three digits, larger positive
    // labels are accepted as well and mirrored into the isotope.
    // NOTE(review): 999 itself is excluded by this test - confirm whether
    // that is intended.
    if (rLabel > 0 && rLabel < 999) {
      qatom.setIsotope(rLabel);
    }
    qatom.setQuery(makeAtomNullQuery());
    mol->replaceAtom(atIdx, &qatom);
  }
}
|
|
|
|
// Handles an atom alias entry: `text` is the "A  aaa" line naming the atom
// (1-based index in columns 3-5) and `nextLine` is the following line, which
// is stored verbatim on that atom as common_properties::molFileAlias.
//
// \param mol       molecule owning the atom
// \param text      the alias header line
// \param nextLine  the alias text
// \param line      line number in the input, only used for error messages
//
// \throws FileParseException if the atom index cannot be converted; an index
//         outside of the molecule is reported by URANGE_CHECK.
void ParseAtomAlias(RWMol *mol, std::string text, const std::string &nextLine,
                    unsigned int line) {
  PRECONDITION(mol, "bad mol");
  PRECONDITION(text.substr(0, 2) == std::string("A "), "bad atom alias line");

  const std::string idxField = text.substr(3, 3);
  unsigned int oneBasedIdx = 0;
  try {
    oneBasedIdx = FileParserUtils::stripSpacesAndCast<unsigned int>(idxField);
  } catch (boost::bad_lexical_cast &) {
    std::ostringstream errout;
    errout << "Cannot convert '" << idxField << "' to int on line " << line;
    throw FileParseException(errout.str());
  }

  // the file counts atoms from 1; an index of 0 wraps around here and is
  // caught by the range check
  unsigned int idx = oneBasedIdx - 1;
  URANGE_CHECK(idx, mol->getNumAtoms());
  mol->getAtomWithIdx(idx)->setProp(common_properties::molFileAlias, nextLine);
}
|
|
|
|
// Handles a "V  aaa vvv..." atom value line: the text from column 7 to the end
// of the line is stored on atom aaa (1-based index in columns 3-5) as
// common_properties::molFileValue.
//
// \param mol   molecule owning the atom
// \param text  the complete atom value line
// \param line  line number in the input, only used for error messages
//
// \throws FileParseException if the line is too short to hold a value or the
//         atom index cannot be converted; an index outside of the molecule is
//         reported by URANGE_CHECK.
void ParseAtomValue(RWMol *mol, std::string text, unsigned int line) {
  PRECONDITION(mol, "bad mol");
  PRECONDITION(text.substr(0, 2) == std::string("V "), "bad atom value line");

  // the value starts at column 7: shorter lines would make the substr() calls
  // below throw std::out_of_range, so report them as parse errors instead
  if (text.size() < 7) {
    std::ostringstream errout;
    errout << "Atom value line too short: '" << text << "' on line " << line;
    throw FileParseException(errout.str());
  }

  unsigned int idx;
  try {
    idx = FileParserUtils::stripSpacesAndCast<unsigned int>(text.substr(3, 3)) -
          1;
  } catch (boost::bad_lexical_cast &) {
    std::ostringstream errout;
    errout << "Cannot convert '" << text.substr(3, 3) << "' to int on line "
           << line;
    throw FileParseException(errout.str());
  }
  URANGE_CHECK(idx, mol->getNumAtoms());
  Atom *at = mol->getAtomWithIdx(idx);
  at->setProp(common_properties::molFileValue,
              text.substr(7, text.length() - 7));
}
|
|
|
|
// We support the same special atom queries that we can read from
|
|
// CXSMILES
|
|
const std::vector<std::string> complexQueries = {"A", "AH", "Q", "QH",
|
|
"X", "XH", "M", "MH"};
|
|
void convertComplexNameToQuery(Atom *query, std::string_view symb) {
|
|
if (symb == "Q") {
|
|
query->setQuery(makeQAtomQuery());
|
|
} else if (symb == "QH") {
|
|
query->setQuery(makeQHAtomQuery());
|
|
} else if (symb == "A") {
|
|
query->setQuery(makeAAtomQuery());
|
|
} else if (symb == "AH") {
|
|
query->setQuery(makeAHAtomQuery());
|
|
} else if (symb == "X") {
|
|
query->setQuery(makeXAtomQuery());
|
|
} else if (symb == "XH") {
|
|
query->setQuery(makeXHAtomQuery());
|
|
} else if (symb == "M") {
|
|
query->setQuery(makeMAtomQuery());
|
|
} else if (symb == "MH") {
|
|
query->setQuery(makeMHAtomQuery());
|
|
} else {
|
|
// we control what this function gets called with, so we should never land
|
|
// here
|
|
ASSERT_INVARIANT(0, "bad complex query symbol");
|
|
}
|
|
}
|
|
|
|
namespace {
|
|
void setRGPProps(const std::string_view symb, Atom *res) {
|
|
PRECONDITION(res, "bad atom pointer");
|
|
// set the dummy label so that this is shown correctly
|
|
// in other pieces of the code :
|
|
std::string symbc(symb);
|
|
res->setProp(common_properties::dummyLabel, symbc);
|
|
}
|
|
|
|
void lookupAtomicNumber(Atom *res, const std::string &symb,
|
|
bool strictParsing) {
|
|
try {
|
|
res->setAtomicNum(PeriodicTable::getTable()->getAtomicNumber(symb));
|
|
} catch (const Invar::Invariant &e) {
|
|
if (strictParsing || symb.empty()) {
|
|
delete res;
|
|
throw FileParseException(e.what());
|
|
} else {
|
|
res->setAtomicNum(0);
|
|
res->setProp(common_properties::dummyLabel, symb);
|
|
}
|
|
}
|
|
}
|
|
|
|
} // namespace
|
|
|
|
Atom *ParseMolFileAtomLine(const std::string_view text, RDGeom::Point3D &pos,
|
|
unsigned int line, bool strictParsing) {
|
|
std::string symb;
|
|
int massDiff, chg, hCount;
|
|
|
|
if ((strictParsing && text.size() < 34) || text.size() < 32) {
|
|
std::ostringstream errout;
|
|
errout << "Atom line too short: '" << text << "' on line " << line;
|
|
throw FileParseException(errout.str());
|
|
}
|
|
|
|
try {
|
|
pos.x = FileParserUtils::toDouble(text.substr(0, 10));
|
|
pos.y = FileParserUtils::toDouble(text.substr(10, 10));
|
|
pos.z = FileParserUtils::toDouble(text.substr(20, 10));
|
|
} catch (boost::bad_lexical_cast &) {
|
|
std::ostringstream errout;
|
|
errout << "Cannot process coordinates on line " << line;
|
|
throw FileParseException(errout.str());
|
|
}
|
|
symb = text.substr(31, 3);
|
|
boost::trim(symb);
|
|
|
|
// REVIEW: should we handle missing fields at the end of the line?
|
|
massDiff = 0;
|
|
if (text.size() >= 36 && text.substr(34, 2) != " 0") {
|
|
try {
|
|
massDiff = FileParserUtils::toInt(text.substr(34, 2), true);
|
|
} catch (boost::bad_lexical_cast &) {
|
|
std::ostringstream errout;
|
|
errout << "Cannot convert '" << text.substr(34, 2) << "' to int on line "
|
|
<< line;
|
|
throw FileParseException(errout.str());
|
|
}
|
|
}
|
|
chg = 0;
|
|
if (text.size() >= 39 && text.substr(36, 3) != " 0") {
|
|
try {
|
|
chg = FileParserUtils::toInt(text.substr(36, 3), true);
|
|
} catch (boost::bad_lexical_cast &) {
|
|
std::ostringstream errout;
|
|
errout << "Cannot convert '" << text.substr(36, 3) << "' to int on line "
|
|
<< line;
|
|
throw FileParseException(errout.str());
|
|
}
|
|
}
|
|
hCount = 0;
|
|
if (text.size() >= 45 && text.substr(42, 3) != " 0") {
|
|
try {
|
|
hCount = FileParserUtils::toInt(text.substr(42, 3), true);
|
|
} catch (boost::bad_lexical_cast &) {
|
|
std::ostringstream errout;
|
|
errout << "Cannot convert '" << text.substr(42, 3) << "' to int on line "
|
|
<< line;
|
|
throw FileParseException(errout.str());
|
|
}
|
|
}
|
|
auto *res = new Atom;
|
|
bool isComplexQueryName =
|
|
std::find(complexQueries.begin(), complexQueries.end(), symb) !=
|
|
complexQueries.end();
|
|
if (isComplexQueryName || symb == "L" || symb == "*" || symb == "LP" ||
|
|
symb == "R" || symb == "R#" ||
|
|
(symb[0] == 'R' && symb >= "R0" && symb <= "R99")) {
|
|
if (isComplexQueryName || symb == "*" || symb == "R") {
|
|
auto *query = new QueryAtom(0);
|
|
if (symb == "*" || symb == "R") {
|
|
// according to the MDL spec, these match anything
|
|
query->setQuery(makeAtomNullQuery());
|
|
} else if (isComplexQueryName) {
|
|
convertComplexNameToQuery(query, symb);
|
|
}
|
|
delete res;
|
|
res = query;
|
|
// queries have no implicit Hs:
|
|
res->setNoImplicit(true);
|
|
} else {
|
|
res->setAtomicNum(0);
|
|
}
|
|
if (massDiff == 0 && symb[0] == 'R') {
|
|
if (symb.length() > 1 && symb >= "R0" && symb <= "R99") {
|
|
std::string rlabel = "";
|
|
rlabel = symb.substr(1, symb.length() - 1);
|
|
int rnumber;
|
|
try {
|
|
rnumber = boost::lexical_cast<int>(rlabel);
|
|
} catch (boost::bad_lexical_cast &) {
|
|
rnumber = -1;
|
|
}
|
|
if (rnumber >= 0) {
|
|
res->setIsotope(rnumber);
|
|
}
|
|
}
|
|
}
|
|
if (symb[0] == 'R') {
|
|
// we used to skip R# here because that really should be handled by an
|
|
// RGP spec, but that turned out to not be permissive enough... <sigh>
|
|
setRGPProps(symb, res);
|
|
}
|
|
} else if (symb == "D") { // mol blocks support "D" and "T" as shorthand...
|
|
// handle that.
|
|
res->setAtomicNum(1);
|
|
res->setIsotope(2);
|
|
} else if (symb == "T") { // mol blocks support "D" and "T" as shorthand...
|
|
// handle that.
|
|
res->setAtomicNum(1);
|
|
res->setIsotope(3);
|
|
} else if (symb == "Pol" || symb == "Mod") {
|
|
res->setAtomicNum(0);
|
|
res->setProp(common_properties::dummyLabel, symb);
|
|
} else {
|
|
if (symb.size() == 2 && symb[1] >= 'A' && symb[1] <= 'Z') {
|
|
symb[1] = static_cast<char>(tolower(symb[1]));
|
|
}
|
|
lookupAtomicNumber(res, symb, strictParsing);
|
|
}
|
|
|
|
// res->setPos(pX,pY,pZ);
|
|
if (chg != 0) {
|
|
res->setFormalCharge(4 - chg);
|
|
}
|
|
|
|
if (hCount >= 1) {
|
|
if (!res->hasQuery()) {
|
|
auto qatom = new QueryAtom(*res);
|
|
delete res;
|
|
res = qatom;
|
|
}
|
|
res->setNoImplicit(true);
|
|
if (hCount > 1) {
|
|
ATOM_EQUALS_QUERY *oq = makeAtomImplicitHCountQuery(hCount - 1);
|
|
auto nq = makeAtomSimpleQuery<ATOM_LESSEQUAL_QUERY>(
|
|
hCount - 1, oq->getDataFunc(),
|
|
std::string("less_") + oq->getDescription());
|
|
res->expandQuery(nq);
|
|
delete oq;
|
|
} else {
|
|
res->expandQuery(makeAtomImplicitHCountQuery(0));
|
|
}
|
|
}
|
|
|
|
if (massDiff != 0) {
|
|
int defIso =
|
|
PeriodicTable::getTable()->getMostCommonIsotope(res->getAtomicNum());
|
|
int dIso = defIso + massDiff;
|
|
if (dIso < 0) {
|
|
BOOST_LOG(rdWarningLog)
|
|
<< " atom " << res->getIdx()
|
|
<< " has a negative isotope offset. line: " << line << std::endl;
|
|
}
|
|
res->setIsotope(dIso);
|
|
res->setProp(common_properties::_hasMassQuery, true);
|
|
}
|
|
|
|
if (text.size() >= 42 && text.substr(39, 3) != " 0") {
|
|
int parity = 0;
|
|
try {
|
|
parity = FileParserUtils::toInt(text.substr(39, 3), true);
|
|
} catch (boost::bad_lexical_cast &) {
|
|
std::ostringstream errout;
|
|
errout << "Cannot convert '" << text.substr(39, 3) << "' to int on line "
|
|
<< line;
|
|
delete res;
|
|
throw FileParseException(errout.str());
|
|
}
|
|
res->setProp(common_properties::molParity, parity);
|
|
}
|
|
|
|
if (text.size() >= 48 && text.substr(45, 3) != " 0") {
|
|
int stereoCare = 0;
|
|
try {
|
|
stereoCare = FileParserUtils::toInt(text.substr(45, 3), true);
|
|
} catch (boost::bad_lexical_cast &) {
|
|
std::ostringstream errout;
|
|
errout << "Cannot convert '" << text.substr(45, 3) << "' to int on line "
|
|
<< line;
|
|
delete res;
|
|
throw FileParseException(errout.str());
|
|
}
|
|
res->setProp(common_properties::molStereoCare, stereoCare);
|
|
}
|
|
if (text.size() >= 51 && text.substr(48, 3) != " 0") {
|
|
int totValence = 0;
|
|
try {
|
|
totValence = FileParserUtils::toInt(text.substr(48, 3), true);
|
|
} catch (boost::bad_lexical_cast &) {
|
|
std::ostringstream errout;
|
|
errout << "Cannot convert '" << text.substr(48, 3) << "' to int on line "
|
|
<< line;
|
|
delete res;
|
|
throw FileParseException(errout.str());
|
|
}
|
|
if (totValence != 0) {
|
|
// only set if it's a non-default value
|
|
res->setProp(common_properties::molTotValence, totValence);
|
|
}
|
|
}
|
|
if (text.size() >= 57 && text.substr(54, 3) != " 0") {
|
|
int rxnRole = 0;
|
|
try {
|
|
rxnRole = FileParserUtils::toInt(text.substr(54, 3), true);
|
|
} catch (boost::bad_lexical_cast &) {
|
|
std::ostringstream errout;
|
|
errout << "Cannot convert '" << text.substr(54, 3) << "' to int on line "
|
|
<< line;
|
|
delete res;
|
|
throw FileParseException(errout.str());
|
|
}
|
|
if (rxnRole != 0) {
|
|
// only set if it's a non-default value
|
|
res->setProp(common_properties::molRxnRole, rxnRole);
|
|
}
|
|
}
|
|
if (text.size() >= 60 && text.substr(57, 3) != " 0") {
|
|
int rxnComponent = 0;
|
|
try {
|
|
rxnComponent = FileParserUtils::toInt(text.substr(57, 3), true);
|
|
} catch (boost::bad_lexical_cast &) {
|
|
std::ostringstream errout;
|
|
errout << "Cannot convert '" << text.substr(57, 3) << "' to int on line "
|
|
<< line;
|
|
delete res;
|
|
throw FileParseException(errout.str());
|
|
}
|
|
if (rxnComponent != 0) {
|
|
// only set if it's a non-default value
|
|
res->setProp(common_properties::molRxnComponent, rxnComponent);
|
|
}
|
|
}
|
|
if (text.size() >= 63 && text.substr(60, 3) != " 0") {
|
|
int atomMapNumber = 0;
|
|
try {
|
|
atomMapNumber = FileParserUtils::toInt(text.substr(60, 3), true);
|
|
} catch (boost::bad_lexical_cast &) {
|
|
std::ostringstream errout;
|
|
errout << "Cannot convert '" << text.substr(60, 3) << "' to int on line "
|
|
<< line;
|
|
delete res;
|
|
throw FileParseException(errout.str());
|
|
}
|
|
res->setProp(common_properties::molAtomMapNumber, atomMapNumber);
|
|
}
|
|
if (text.size() >= 66 && text.substr(63, 3) != " 0") {
|
|
int inversionFlag = 0;
|
|
try {
|
|
inversionFlag = FileParserUtils::toInt(text.substr(63, 3), true);
|
|
} catch (boost::bad_lexical_cast &) {
|
|
std::ostringstream errout;
|
|
errout << "Cannot convert '" << text.substr(63, 3) << "' to int on line "
|
|
<< line;
|
|
delete res;
|
|
throw FileParseException(errout.str());
|
|
}
|
|
res->setProp(common_properties::molInversionFlag, inversionFlag);
|
|
}
|
|
if (text.size() >= 69 && text.substr(66, 3) != " 0") {
|
|
int exactChangeFlag = 0;
|
|
try {
|
|
exactChangeFlag = FileParserUtils::toInt(text.substr(66, 3), true);
|
|
} catch (boost::bad_lexical_cast &) {
|
|
std::ostringstream errout;
|
|
errout << "Cannot convert '" << text.substr(66, 3) << "' to int on line "
|
|
<< line;
|
|
delete res;
|
|
throw FileParseException(errout.str());
|
|
}
|
|
res->setProp("molExactChangeFlag", exactChangeFlag);
|
|
}
|
|
return res;
|
|
}
|
|
|
|
// Parses one line of the bond block of a V2000 mol file.
//
// Columns read by the code (0-based): begin atom in 0-2, end atom in 3-5 (both
// 1-based in the file, converted to 0-based here), bond type in 6-8, stereo
// in 9-11, topology in 15-17 and reacting center status in 18-20. Fields
// beyond the end of a short line are treated as absent.
//
// Bond types 1-4 and 9 give single, double, triple, aromatic and dative
// bonds; type 0 gives an unspecified bond (with a warning); every other type
// gives a QueryBond. The raw type is stored as
// common_properties::_MolFileBondType.
//
// \param text  the bond line
// \param line  line number in the input, used in messages
//
// \return a newly allocated Bond or QueryBond; the caller takes ownership.
//
// \throws FileParseException if the line is shorter than 9 characters, one of
//         the first three fields cannot be converted or the topology
//         specifier is not recognized. Conversion problems in the optional
//         fields are ignored.
Bond *ParseMolFileBondLine(const std::string_view text, unsigned int line) {
  if (text.size() < 9) {
    std::ostringstream errout;
    errout << "Bond line too short: '" << text << "' on line " << line;
    throw FileParseException(errout.str());
  }

  unsigned int idx1, idx2, bType;
  // spos tracks the field being converted so that the error message can quote
  // it
  int spos = 0;
  try {
    idx1 = FileParserUtils::toUnsigned(text.substr(spos, 3));
    spos += 3;
    idx2 = FileParserUtils::toUnsigned(text.substr(spos, 3));
    spos += 3;
    bType = FileParserUtils::toUnsigned(text.substr(spos, 3));
  } catch (boost::bad_lexical_cast &) {
    std::ostringstream errout;
    errout << "Cannot convert '" << text.substr(spos, 3) << "' to int on line "
           << line;
    throw FileParseException(errout.str());
  }

  // adjust the numbering
  idx1--;
  idx2--;

  Bond::BondType type = Bond::UNSPECIFIED;
  Bond *res = nullptr;
  switch (bType) {
    case 1:
      type = Bond::SINGLE;
      res = new Bond;
      break;
    case 2:
      type = Bond::DOUBLE;
      res = new Bond;
      break;
    case 3:
      type = Bond::TRIPLE;
      res = new Bond;
      break;
    case 4:
      type = Bond::AROMATIC;
      res = new Bond;
      break;
    case 9:
      type = Bond::DATIVE;
      res = new Bond;
      break;
    case 0:
      res = new Bond;
      BOOST_LOG(rdWarningLog)
          << "bond with order 0 found on line " << line
          << ". This is not part of the MDL specification." << std::endl;
      break;
    default:
      // it's a query bond of some type
      res = new QueryBond;
      if (bType == 8) {
        res->setQuery(makeBondNullQuery());
      } else if (bType == 5) {
        res->setQuery(makeSingleOrDoubleBondQuery());
        res->setProp(common_properties::_MolFileBondQuery, 1);
      } else if (bType == 6) {
        res->setQuery(makeSingleOrAromaticBondQuery());
        res->setProp(common_properties::_MolFileBondQuery, 1);
      } else if (bType == 7) {
        res->setQuery(makeDoubleOrAromaticBondQuery());
        res->setProp(common_properties::_MolFileBondQuery, 1);
      } else {
        res->setQuery(makeBondNullQuery());
        BOOST_LOG(rdWarningLog)
            << "unrecognized query bond type, " << bType << ", found on line "
            << line << ". Using an \"any\" query." << std::endl;
      }
      break;
  }
  res->setBeginAtomIdx(idx1);
  res->setEndAtomIdx(idx2);
  res->setBondType(type);
  res->setProp(common_properties::_MolFileBondType, bType);

  if (text.size() >= 12 && text.substr(9, 3) != "  0") {
    try {
      unsigned int stereo = FileParserUtils::toUnsigned(text.substr(9, 3));
      switch (stereo) {
        case 0:
          res->setBondDir(Bond::NONE);
          break;
        case 1:
          res->setBondDir(Bond::BEGINWEDGE);
          break;
        case 6:
          res->setBondDir(Bond::BEGINDASH);
          break;
        case 3:  // "either" double bond
          res->setBondDir(Bond::EITHERDOUBLE);
          res->setStereo(Bond::STEREOANY);
          break;
        case 4:  // "either" single bond
          res->setBondDir(Bond::UNKNOWN);
          break;
      }
      res->setProp(common_properties::_MolFileBondStereo, stereo);
    } catch (boost::bad_lexical_cast &) {
      // deliberately ignored: an unreadable stereo field leaves the bond
      // without stereo information
    }
  }
  if (text.size() >= 18 && text.substr(15, 3) != "  0") {
    try {
      int topology = FileParserUtils::toInt(text.substr(15, 3));
      if (topology) {
        // validate before allocating anything, and release the bond built so
        // far, so that nothing is leaked by the exception
        if (topology != 1 && topology != 2) {
          std::ostringstream errout;
          errout << "Unrecognized bond topology specifier: " << topology
                 << " on line " << line;
          delete res;
          throw FileParseException(errout.str());
        }
        if (!res->hasQuery()) {
          auto *qBond = new QueryBond(*res);
          delete res;
          res = qBond;
        }
        // 1: the ring query as is, 2: its negation
        BOND_EQUALS_QUERY *q = makeBondIsInRingQuery();
        if (topology == 2) {
          q->setNegation(true);
        }
        res->expandQuery(q);
      }
    } catch (boost::bad_lexical_cast &) {
      // deliberately ignored: an unreadable topology field adds no query
    }
  }
  if (text.size() >= 21 && text.substr(18, 3) != "  0") {
    try {
      int reactStatus = FileParserUtils::toInt(text.substr(18, 3));
      res->setProp("molReactStatus", reactStatus);
    } catch (boost::bad_lexical_cast &) {
      // deliberately ignored: an unreadable field leaves the property unset
    }
  }
  return res;
}
|
|
|
|
void ParseMolBlockAtoms(std::istream *inStream, unsigned int &line,
|
|
unsigned int nAtoms, RWMol *mol, Conformer *conf,
|
|
bool strictParsing) {
|
|
PRECONDITION(inStream, "bad stream");
|
|
PRECONDITION(mol, "bad molecule");
|
|
PRECONDITION(conf, "bad conformer");
|
|
for (unsigned int i = 1; i <= nAtoms; ++i) {
|
|
++line;
|
|
std::string tempStr = getLine(inStream);
|
|
if (inStream->eof()) {
|
|
throw FileParseException("EOF hit while reading atoms");
|
|
}
|
|
RDGeom::Point3D pos;
|
|
Atom *atom = ParseMolFileAtomLine(tempStr, pos, line, strictParsing);
|
|
unsigned int aid = mol->addAtom(atom, false, true);
|
|
conf->setAtomPos(aid, pos);
|
|
mol->setAtomBookmark(atom, i);
|
|
}
|
|
}
|
|
|
|
void ParseMolBlockBonds(std::istream *inStream, unsigned int &line,
|
|
unsigned int nBonds, RWMol *mol,
|
|
bool &chiralityPossible) {
|
|
PRECONDITION(inStream, "bad stream");
|
|
PRECONDITION(mol, "bad molecule");
|
|
for (unsigned int i = 1; i <= nBonds; ++i) {
|
|
++line;
|
|
std::string tempStr = getLine(inStream);
|
|
if (inStream->eof()) {
|
|
throw FileParseException("EOF hit while reading bonds");
|
|
}
|
|
Bond *bond = ParseMolFileBondLine(tempStr, line);
|
|
// if we got an aromatic bond set the flag on the bond and the connected
|
|
// atoms
|
|
if (bond->getBondType() == Bond::AROMATIC) {
|
|
bond->setIsAromatic(true);
|
|
}
|
|
// if the bond might have chirality info associated with it, set a flag:
|
|
if (bond->getBondDir() != Bond::NONE &&
|
|
bond->getBondDir() != Bond::UNKNOWN) {
|
|
chiralityPossible = true;
|
|
}
|
|
// v2k has no way to set stereoCare on bonds, so set the property if both
|
|
// the beginning and end atoms have it set:
|
|
int care1 = 0;
|
|
int care2 = 0;
|
|
if (!bond->hasProp(common_properties::molStereoCare) &&
|
|
mol->getAtomWithIdx(bond->getBeginAtomIdx())
|
|
->getPropIfPresent(common_properties::molStereoCare, care1) &&
|
|
mol->getAtomWithIdx(bond->getEndAtomIdx())
|
|
->getPropIfPresent(common_properties::molStereoCare, care2)) {
|
|
if (care1 && care2) {
|
|
bond->setProp(common_properties::molStereoCare, 1);
|
|
}
|
|
}
|
|
mol->addBond(bond, true);
|
|
mol->setBondBookmark(bond, i);
|
|
}
|
|
}
|
|
|
|
bool checkAttachmentPointsAreValid(
|
|
const RWMol *mol, std::pair<const int, SubstanceGroup> &sgroup) {
|
|
bool res = true;
|
|
int nAtoms = static_cast<int>(mol->getNumAtoms());
|
|
std::vector<SubstanceGroup::AttachPoint> &attachPoints =
|
|
sgroup.second.getAttachPoints();
|
|
for (auto &attachPoint : attachPoints) {
|
|
if (attachPoint.lvIdx == nAtoms) {
|
|
const std::vector<unsigned int> &bonds = sgroup.second.getBonds();
|
|
if (bonds.size() == 1) {
|
|
const auto bond = mol->getBondWithIdx(bonds.front());
|
|
if (bond->getBeginAtomIdx() == attachPoint.aIdx ||
|
|
bond->getEndAtomIdx() == attachPoint.aIdx) {
|
|
attachPoint.lvIdx = bond->getOtherAtomIdx(attachPoint.aIdx);
|
|
}
|
|
}
|
|
}
|
|
if (attachPoint.lvIdx == nAtoms) {
|
|
BOOST_LOG(rdWarningLog)
|
|
<< "Could not infer missing lvIdx on malformed SAP line for SGroup "
|
|
<< sgroup.first << std::endl;
|
|
res = false;
|
|
}
|
|
}
|
|
return res;
|
|
}
|
|
|
|
bool ParseMolBlockProperties(std::istream *inStream, unsigned int &line,
|
|
RWMol *mol, bool strictParsing) {
|
|
PRECONDITION(inStream, "bad stream");
|
|
PRECONDITION(mol, "bad molecule");
|
|
// older mol files can have an atom list block here
|
|
std::string tempStr = getLine(inStream);
|
|
++line;
|
|
// there is apparently some software out there that puts a
|
|
// blank line in mol blocks before the "M END". If we aren't
|
|
// doing strict parsing, deal with that here.
|
|
if (!tempStr.size()) {
|
|
if (!strictParsing) {
|
|
tempStr = getLine(inStream);
|
|
++line;
|
|
} else {
|
|
std::ostringstream errout;
|
|
errout << "Problems encountered parsing Mol data, unexpected blank line "
|
|
"found at line "
|
|
<< line;
|
|
throw FileParseException(errout.str());
|
|
}
|
|
} else {
|
|
if (tempStr[0] != 'M' && tempStr[0] != 'A' && tempStr[0] != 'V' &&
|
|
tempStr[0] != 'G' && tempStr[0] != 'S') {
|
|
ParseOldAtomList(mol, std::string_view(tempStr.c_str()), line);
|
|
}
|
|
}
|
|
|
|
IDX_TO_SGROUP_MAP sGroupMap;
|
|
IDX_TO_STR_VECT_MAP dataFieldsMap;
|
|
bool fileComplete = false;
|
|
bool firstChargeLine = true;
|
|
unsigned int SCDcounter = 0;
|
|
unsigned int lastDataSGroup = 0;
|
|
std::ostringstream currentDataField;
|
|
std::string lineBeg = tempStr.substr(0, 6);
|
|
while (!inStream->eof() && !inStream->fail() && lineBeg != "M END" &&
|
|
tempStr.substr(0, 4) != "$$$$") {
|
|
if (tempStr[0] == 'A') {
|
|
line++;
|
|
std::string nextLine = getLine(inStream);
|
|
if (lineBeg != "M END") {
|
|
ParseAtomAlias(mol, tempStr, nextLine, line);
|
|
}
|
|
} else if (tempStr[0] == 'G') {
|
|
BOOST_LOG(rdWarningLog)
|
|
<< " deprecated group abbreviation ignored on line " << line
|
|
<< std::endl;
|
|
// we need to skip the next line, which holds the abbreviation:
|
|
line++;
|
|
tempStr = getLine(inStream);
|
|
} else if (tempStr[0] == 'V') {
|
|
ParseAtomValue(mol, tempStr, line);
|
|
} else if (lineBeg == "S SKP") {
|
|
int nToSkip = FileParserUtils::toInt(tempStr.substr(6, 3));
|
|
if (nToSkip < 0) {
|
|
std::ostringstream errout;
|
|
errout << "negative skip value " << nToSkip << " on line " << line;
|
|
throw FileParseException(errout.str());
|
|
}
|
|
for (unsigned int i = 0; i < static_cast<unsigned int>(nToSkip); ++i) {
|
|
++line;
|
|
tempStr = getLine(inStream);
|
|
}
|
|
} else if (lineBeg == "M ALS") {
|
|
ParseNewAtomList(mol, tempStr, line);
|
|
} else if (lineBeg == "M ISO") {
|
|
ParseIsotopeLine(mol, tempStr, line);
|
|
} else if (lineBeg == "M RGP") {
|
|
ParseRGroupLabels(mol, tempStr, line);
|
|
} else if (lineBeg == "M RBC") {
|
|
ParseRingBondCountLine(mol, tempStr, line);
|
|
} else if (lineBeg == "M SUB") {
|
|
ParseSubstitutionCountLine(mol, tempStr, line);
|
|
} else if (lineBeg == "M UNS") {
|
|
ParseUnsaturationLine(mol, tempStr, line);
|
|
} else if (lineBeg == "M CHG") {
|
|
ParseChargeLine(mol, tempStr, firstChargeLine, line);
|
|
firstChargeLine = false;
|
|
} else if (lineBeg == "M RAD") {
|
|
ParseRadicalLine(mol, tempStr, firstChargeLine, line);
|
|
firstChargeLine = false;
|
|
} else if (lineBeg == "M PXA") {
|
|
ParsePXALine(mol, tempStr, line);
|
|
|
|
/* SGroup parsing start */
|
|
} else if (lineBeg == "M STY") {
|
|
ParseSGroupV2000STYLine(sGroupMap, mol, tempStr, line, strictParsing);
|
|
} else if (lineBeg == "M SST") {
|
|
ParseSGroupV2000SSTLine(sGroupMap, mol, tempStr, line, strictParsing);
|
|
} else if (lineBeg == "M SLB") {
|
|
ParseSGroupV2000SLBLine(sGroupMap, mol, tempStr, line, strictParsing);
|
|
} else if (lineBeg == "M SCN") {
|
|
ParseSGroupV2000SCNLine(sGroupMap, mol, tempStr, line, strictParsing);
|
|
} else if (lineBeg == "M SDS") {
|
|
ParseSGroupV2000SDSLine(sGroupMap, mol, tempStr, line, strictParsing);
|
|
} else if (lineBeg == "M SAL" || lineBeg == "M SBL" ||
|
|
lineBeg == "M SPA") {
|
|
ParseSGroupV2000VectorDataLine(sGroupMap, mol, tempStr, line,
|
|
strictParsing);
|
|
} else if (lineBeg == "M SMT") {
|
|
ParseSGroupV2000SMTLine(sGroupMap, mol, tempStr, line, strictParsing);
|
|
} else if (lineBeg == "M SDI") {
|
|
ParseSGroupV2000SDILine(sGroupMap, mol, tempStr, line, strictParsing);
|
|
} else if (lineBeg == "M CRS") {
|
|
std::ostringstream errout;
|
|
errout << "Unsupported SGroup subtype '" << lineBeg << "' on line "
|
|
<< line;
|
|
throw FileParseException(errout.str());
|
|
} else if (lineBeg == "M SBV") {
|
|
ParseSGroupV2000SBVLine(sGroupMap, mol, tempStr, line, strictParsing);
|
|
} else if (lineBeg == "M SDT") {
|
|
ParseSGroupV2000SDTLine(sGroupMap, mol, tempStr, line, strictParsing);
|
|
} else if (lineBeg == "M SDD") {
|
|
ParseSGroupV2000SDDLine(sGroupMap, mol, tempStr, line, strictParsing);
|
|
} else if (lineBeg == "M SCD" || lineBeg == "M SED") {
|
|
ParseSGroupV2000SCDSEDLine(sGroupMap, dataFieldsMap, mol, tempStr, line,
|
|
strictParsing, SCDcounter, lastDataSGroup,
|
|
currentDataField);
|
|
} else if (lineBeg == "M SPL") {
|
|
ParseSGroupV2000SPLLine(sGroupMap, mol, tempStr, line, strictParsing);
|
|
} else if (lineBeg == "M SNC") {
|
|
ParseSGroupV2000SNCLine(sGroupMap, mol, tempStr, line, strictParsing);
|
|
} else if (lineBeg == "M SAP") {
|
|
ParseSGroupV2000SAPLine(sGroupMap, mol, tempStr, line, strictParsing);
|
|
} else if (lineBeg == "M SCL") {
|
|
ParseSGroupV2000SCLLine(sGroupMap, mol, tempStr, line, strictParsing);
|
|
} else if (lineBeg == "M SBT") {
|
|
ParseSGroupV2000SBTLine(sGroupMap, mol, tempStr, line, strictParsing);
|
|
|
|
/* SGroup parsing end */
|
|
} else if (lineBeg == "M ZBO") {
|
|
ParseZBOLine(mol, tempStr, line);
|
|
} else if (lineBeg == "M ZCH") {
|
|
ParseZCHLine(mol, tempStr, line);
|
|
} else if (lineBeg == "M HYD") {
|
|
ParseHYDLine(mol, tempStr, line);
|
|
} else if (lineBeg == "M MRV") {
|
|
ParseMarvinSmartsLine(mol, tempStr, line);
|
|
} else if (lineBeg == "M APO") {
|
|
ParseAttachPointLine(mol, tempStr, line, strictParsing);
|
|
} else if (lineBeg == "M LIN") {
|
|
ParseLinkNodeLine(mol, tempStr, line);
|
|
}
|
|
line++;
|
|
tempStr = getLine(inStream);
|
|
lineBeg = tempStr.substr(0, 6);
|
|
}
|
|
if (tempStr[0] == 'M' && tempStr.substr(0, 6) == "M END") {
|
|
// All went well, make final updates to SGroups, and add them to Mol
|
|
for (auto &sgroup : sGroupMap) {
|
|
if (sgroup.second.getIsValid()) {
|
|
sgroup.second.setProp("DATAFIELDS", dataFieldsMap[sgroup.first]);
|
|
sgroup.second.setIsValid(checkAttachmentPointsAreValid(mol, sgroup));
|
|
}
|
|
if (sgroup.second.getIsValid()) {
|
|
addSubstanceGroup(*mol, sgroup.second);
|
|
} else {
|
|
std::ostringstream errout;
|
|
errout << "SGroup " << sgroup.first << " is invalid";
|
|
if (strictParsing) {
|
|
throw FileParseException(errout.str());
|
|
} else {
|
|
BOOST_LOG(rdWarningLog)
|
|
<< errout.str() << " and will be ignored" << std::endl;
|
|
}
|
|
}
|
|
}
|
|
|
|
fileComplete = true;
|
|
}
|
|
return fileComplete;
|
|
}
|
|
|
|
// Builds an atom from the symbol field of a V3000 atom line.
//
//  - token: the symbol text; may be a plain element symbol, an atom list
//    ("[C,N,O]", optionally preceded by "NOT"), a query name, "R"/"R#"/"R<n>",
//    "*", "D", "T", "Pol" or "Mod"
//  - line: current line number, used only in error messages
//  - strictParsing: forwarded to lookupAtomicNumber() for plain symbols
//
// Returns a newly allocated Atom (or QueryAtom); the caller owns it.
// Throws FileParseException for an empty token, an unterminated or empty atom
// list, or a "NOT" prefix that is not followed by an atom list.
Atom *ParseV3000AtomSymbol(std::string_view token, unsigned int &line,
                           bool strictParsing) {
  bool negate = false;
  token = FileParserUtils::strip(token);
  if (token.size() > 3 && (token[0] == 'N' || token[0] == 'n') &&
      (token[1] == 'O' || token[1] == 'o') &&
      (token[2] == 'T' || token[2] == 't')) {
    negate = true;
    token = token.substr(3, token.size() - 3);
    token = FileParserUtils::strip(token);
  }

  // everything below indexes token[0]; reject empty input up front
  if (token.empty()) {
    std::ostringstream errout;
    errout << "Empty atom token on line: " << line;
    throw FileParseException(errout.str());
  }

  // held in a unique_ptr so the atom is freed if any helper below throws
  std::unique_ptr<Atom> res;
  if (token[0] == '[') {
    // atom list:
    if (token.back() != ']') {
      std::ostringstream errout;
      errout << "Bad atom token '" << token << "' on line: " << line;
      throw FileParseException(errout.str());
    }
    token = token.substr(1, token.size() - 2);

    std::vector<std::string> splitToken;
    boost::split(splitToken, token, boost::is_any_of(","));

    for (const auto &entry : splitToken) {
      std::string_view stoken = entry;
      std::string atSymb(FileParserUtils::strip(stoken));
      if (atSymb.empty()) {
        continue;
      }
      // normalize two-letter symbols written in upper case, e.g. "CL" -> "Cl"
      if (atSymb.size() == 2 && atSymb[1] >= 'A' && atSymb[1] <= 'Z') {
        atSymb[1] = static_cast<char>(tolower(atSymb[1]));
      }

      int atNum = PeriodicTable::getTable()->getAtomicNumber(atSymb);
      if (!res) {
        res.reset(new QueryAtom(atNum));
      } else {
        res->expandQuery(makeAtomNumQuery(atNum), Queries::COMPOSITE_OR, true);
      }
    }
    // a list with no usable entries ("[]", "[ , ]") leaves nothing to negate
    if (!res) {
      std::ostringstream errout;
      errout << "Empty atom list on line: " << line;
      throw FileParseException(errout.str());
    }
    res->getQuery()->setNegation(negate);
  } else {
    if (negate) {
      std::ostringstream errout;
      errout << "NOT tokens only supported for atom lists. line " << line;
      throw FileParseException(errout.str());
    }
    // it's a normal CTAB atom symbol:
    // NOTE: "R" and "R0"-"R99" are not in the v3K CTAB spec, but we're going to
    // support them anyway
    bool isComplexQueryName =
        std::find(complexQueries.begin(), complexQueries.end(), token) !=
        complexQueries.end();
    if (isComplexQueryName || token == "R" ||
        (token[0] == 'R' && token >= "R0" && token <= "R99") || token == "R#" ||
        token == "*") {
      if (isComplexQueryName || token == "*") {
        res.reset(new QueryAtom(0));
        if (token == "*") {
          // according to the MDL spec, these match anything
          res->setQuery(makeAtomNullQuery());
        } else if (isComplexQueryName) {
          convertComplexNameToQuery(res.get(), token);
        }
        // queries have no implicit Hs:
        res->setNoImplicit(true);
      } else {
        res.reset(new Atom(1));
        res->setAtomicNum(0);
      }
      if (token[0] == 'R' && token >= "R0" && token <= "R99") {
        // the range test above is lexical, so the suffix may still be
        // non-numeric; in that case no isotope label is set
        auto rlabel = token.substr(1, token.length() - 1);
        int rnumber;
        try {
          rnumber = boost::lexical_cast<int>(rlabel);
        } catch (boost::bad_lexical_cast &) {
          rnumber = -1;
        }
        if (rnumber >= 0) {
          res->setIsotope(rnumber);
        }
      }
      if (token[0] == 'R') {
        // we used to skip R# here because that really should be handled by an
        // RGP spec, but that turned out to not be permissive enough... <sigh>
        setRGPProps(token, res.get());
      }
    } else if (token == "D") {  // mol blocks support "D" and "T" as
                                // shorthand... handle that.
      res.reset(new Atom(1));
      res->setIsotope(2);
    } else if (token == "T") {  // mol blocks support "D" and "T" as
                                // shorthand... handle that.
      res.reset(new Atom(1));
      res->setIsotope(3);
    } else if (token == "Pol" || token == "Mod") {
      res.reset(new Atom(0));
      res->setProp(common_properties::dummyLabel, std::string(token));
    } else {
      std::string tcopy(token);
      if (token.size() == 2 && token[1] >= 'A' && token[1] <= 'Z') {
        tcopy[1] = static_cast<char>(tolower(token[1]));
      }
      res.reset(new Atom(0));
      lookupAtomicNumber(res.get(), tcopy, strictParsing);
    }
  }

  // hand ownership to the caller
  Atom *built = res.release();
  POSTCONDITION(built, "no atom built");
  return built;
}
|
|
|
|
// Splits a "NAME=value" token at its single '=' character.
//
// On success, prop receives the upper-cased text before the '=' and val
// becomes a view (into token's storage) of the text after it. Returns false,
// leaving both outputs untouched, when the token contains no '=' or more than
// one.
bool splitAssignToken(std::string_view token, std::string &prop,
                      std::string_view &val) {
  const auto firstEq = token.find("=");
  const bool hasExactlyOne =
      firstEq != token.npos && firstEq == token.rfind("=");
  if (!hasExactlyOne) {
    return false;
  }
  prop = token.substr(0, firstEq);
  boost::to_upper(prop);
  val = token.substr(firstEq + 1);
  return true;
}
|
|
|
|
template <class T>
|
|
void ParseV3000AtomProps(RWMol *mol, Atom *&atom, typename T::iterator &token,
|
|
const T &tokens, unsigned int &line,
|
|
bool strictParsing) {
|
|
PRECONDITION(mol, "bad molecule");
|
|
PRECONDITION(atom, "bad atom");
|
|
std::ostringstream errout;
|
|
while (token != tokens.end()) {
|
|
std::string prop;
|
|
std::string_view val;
|
|
if (!splitAssignToken(*token, prop, val)) {
|
|
errout << "Invalid atom property: '" << *token << "' for atom "
|
|
<< atom->getIdx() + 1 << " on line " << line << std::endl;
|
|
throw FileParseException(errout.str());
|
|
}
|
|
|
|
if (prop == "CHG") {
|
|
auto charge = FileParserUtils::toInt(val);
|
|
if (!atom->hasQuery()) {
|
|
atom->setFormalCharge(charge);
|
|
} else {
|
|
atom->expandQuery(makeAtomFormalChargeQuery(charge));
|
|
}
|
|
} else if (prop == "RAD") {
|
|
// FIX handle queries here
|
|
switch (FileParserUtils::toInt(val)) {
|
|
case 0:
|
|
break;
|
|
case 1:
|
|
atom->setNumRadicalElectrons(2);
|
|
break;
|
|
case 2:
|
|
atom->setNumRadicalElectrons(1);
|
|
break;
|
|
case 3:
|
|
atom->setNumRadicalElectrons(2);
|
|
break;
|
|
default:
|
|
errout << "Unrecognized RAD value " << val << " for atom "
|
|
<< atom->getIdx() + 1 << " on line " << line << std::endl;
|
|
throw FileParseException(errout.str());
|
|
}
|
|
} else if (prop == "MASS") {
|
|
// the documentation for V3000 CTABs says that this should contain the
|
|
// "absolute atomic weight" (whatever that means).
|
|
// Online examples seem to have integer (isotope) values and Marvin won't
|
|
// even read something that has a float.
|
|
// We'll go with the int
|
|
int v;
|
|
double dv;
|
|
try {
|
|
v = FileParserUtils::toInt(val);
|
|
} catch (boost::bad_lexical_cast &) {
|
|
try {
|
|
dv = FileParserUtils::toDouble(val);
|
|
v = static_cast<int>(floor(dv));
|
|
} catch (boost::bad_lexical_cast &) {
|
|
v = -1;
|
|
}
|
|
}
|
|
if (v < 0) {
|
|
errout << "Bad value for MASS :" << val << " for atom "
|
|
<< atom->getIdx() + 1 << " on line " << line << std::endl;
|
|
throw FileParseException(errout.str());
|
|
} else {
|
|
if (!atom->hasQuery()) {
|
|
atom->setIsotope(v);
|
|
} else {
|
|
atom->expandQuery(makeAtomIsotopeQuery(v));
|
|
}
|
|
}
|
|
} else if (prop == "CFG") {
|
|
auto cfg = FileParserUtils::toInt(val);
|
|
switch (cfg) {
|
|
case 0:
|
|
break;
|
|
case 1:
|
|
case 2:
|
|
case 3:
|
|
atom->setProp(common_properties::molParity, cfg);
|
|
break;
|
|
default:
|
|
errout << "Unrecognized CFG value : " << val << " for atom "
|
|
<< atom->getIdx() + 1 << " on line " << line << std::endl;
|
|
throw FileParseException(errout.str());
|
|
}
|
|
} else if (prop == "HCOUNT") {
|
|
if (val != "0") {
|
|
auto hcount = FileParserUtils::toInt(val);
|
|
if (!atom->hasQuery()) {
|
|
atom = QueryOps::replaceAtomWithQueryAtom(mol, atom);
|
|
}
|
|
if (hcount == -1) {
|
|
hcount = 0;
|
|
}
|
|
if (hcount > 0) {
|
|
ATOM_EQUALS_QUERY *oq = makeAtomImplicitHCountQuery(hcount);
|
|
auto nq = makeAtomSimpleQuery<ATOM_LESSEQUAL_QUERY>(
|
|
hcount, oq->getDataFunc(),
|
|
std::string("less_") + oq->getDescription());
|
|
atom->expandQuery(nq);
|
|
delete oq;
|
|
} else {
|
|
atom->expandQuery(makeAtomImplicitHCountQuery(0));
|
|
}
|
|
}
|
|
} else if (prop == "UNSAT") {
|
|
if (val == "1") {
|
|
if (!atom->hasQuery()) {
|
|
atom = QueryOps::replaceAtomWithQueryAtom(mol, atom);
|
|
}
|
|
atom->expandQuery(makeAtomUnsaturatedQuery());
|
|
}
|
|
} else if (prop == "RBCNT") {
|
|
if (val != "0") {
|
|
auto rbcount = FileParserUtils::toInt(val);
|
|
if (!atom->hasQuery()) {
|
|
atom = QueryOps::replaceAtomWithQueryAtom(mol, atom);
|
|
}
|
|
if (rbcount == -1) {
|
|
rbcount = 0;
|
|
}
|
|
atom->expandQuery(makeAtomRingBondCountQuery(rbcount));
|
|
}
|
|
} else if (prop == "VAL") {
|
|
if (val != "0") {
|
|
auto totval = FileParserUtils::toInt(val);
|
|
atom->setProp(common_properties::molTotValence, totval);
|
|
}
|
|
} else if (prop == "RGROUPS") {
|
|
ParseV3000RGroups(mol, atom, val, line);
|
|
// FIX
|
|
} else if (prop == "STBOX") {
|
|
if (val != "0") {
|
|
auto ival = FileParserUtils::toInt(val);
|
|
atom->setProp(common_properties::molStereoCare, ival);
|
|
}
|
|
} else if (prop == "SUBST") {
|
|
if (val != "0") {
|
|
auto ival = FileParserUtils::toInt(val);
|
|
atom->setProp(common_properties::molSubstCount, ival);
|
|
}
|
|
} else if (prop == "EXACHG") {
|
|
if (val != "0") {
|
|
auto ival = FileParserUtils::toInt(val);
|
|
atom->setProp(common_properties::molRxnExactChange, ival);
|
|
}
|
|
} else if (prop == "INVRET") {
|
|
if (val != "0") {
|
|
auto ival = FileParserUtils::toInt(val);
|
|
atom->setProp(common_properties::molInversionFlag, ival);
|
|
}
|
|
} else if (prop == "ATTCHPT") {
|
|
if (val != "0") {
|
|
auto ival = FileParserUtils::toInt(val);
|
|
if (atom->hasProp(common_properties::molAttachPoint)) {
|
|
errout << "Multiple ATTCHPT values for atom " << atom->getIdx() + 1
|
|
<< " on line " << line;
|
|
if (strictParsing) {
|
|
throw FileParseException(errout.str());
|
|
} else {
|
|
BOOST_LOG(rdWarningLog) << errout.str() << std::endl;
|
|
errout.str(std::string());
|
|
}
|
|
} else {
|
|
atom->setProp(common_properties::molAttachPoint, ival);
|
|
}
|
|
}
|
|
} else if (prop == "ATTCHORD") {
|
|
if (val != "0") {
|
|
auto ival = FileParserUtils::toInt(val);
|
|
atom->setProp(common_properties::molAttachOrder, ival);
|
|
}
|
|
} else if (prop == "CLASS") {
|
|
atom->setProp(common_properties::molAtomClass, std::string(val));
|
|
} else if (prop == "SEQID") {
|
|
if (val != "0") {
|
|
auto ival = FileParserUtils::toInt(val);
|
|
atom->setProp(common_properties::molAtomSeqId, ival);
|
|
}
|
|
}
|
|
++token;
|
|
}
|
|
}
|
|
|
|
// Splits a V3000 line into whitespace-separated tokens.
//
// Spaces and tabs separate tokens, except inside a double-quoted string or a
// parenthesized group. A token is emitted as soon as a closing ')' or a
// closing quote is seen; for a quoted token the text is taken from one
// character past the token start up to (not including) the closing quote.
// Two consecutive quote characters are stepped over without changing the
// quoting state. The resulting views point into the storage behind `line`.
// `tokens` is cleared before anything is added.
void tokenizeV3000Line(std::string_view line,
                       std::vector<std::string_view> &tokens) {
  using size_type = std::string_view::size_type;

  tokens.clear();
  bool quoted = false;
  bool parenthesized = false;
  size_type tokBegin = 0;
  size_type cur = 0;
  const size_type len = line.size();

  while (cur < len) {
    const char ch = line[cur];
    const bool isBlank = (ch == ' ' || ch == '\t');

    if (isBlank) {
      if (tokBegin == cur) {
        // whitespace before any token character: slide the start forward
        tokBegin = cur + 1;
      } else if (!quoted && !parenthesized) {
        // whitespace terminates the current token
        tokens.push_back(line.substr(tokBegin, cur - tokBegin));
        tokBegin = cur + 1;
      }
      // otherwise the whitespace belongs to a quoted/parenthesized token
      ++cur;
      continue;
    }

    if (ch == ')' && parenthesized) {
      // the closing parenthesis is part of the token and ends it
      tokens.push_back(line.substr(tokBegin, cur - tokBegin + 1));
      parenthesized = false;
      ++cur;
      tokBegin = cur;
      continue;
    }

    if (ch == '(' && !quoted) {
      parenthesized = true;
      ++cur;
      continue;
    }

    if (ch == '"' && !parenthesized) {
      const bool doubled = (cur + 1 < len) && line[cur + 1] == '"';
      if (doubled) {
        cur += 2;
      } else if (quoted) {
        // don't push on the quotes themselves
        tokens.push_back(line.substr(tokBegin + 1, cur - tokBegin - 1));
        ++cur;
        tokBegin = cur;
        quoted = false;
      } else {
        ++cur;
        quoted = true;
      }
      continue;
    }

    // ordinary character
    ++cur;
  }

  // whatever is left after the last separator is the final token
  if (tokBegin != cur) {
    tokens.push_back(line.substr(tokBegin, len - tokBegin));
  }
}
|
|
|
|
void ParseV3000AtomBlock(std::istream *inStream, unsigned int &line,
|
|
unsigned int nAtoms, RWMol *mol, Conformer *conf,
|
|
bool strictParsing) {
|
|
PRECONDITION(inStream, "bad stream");
|
|
PRECONDITION(nAtoms > 0, "bad atom count");
|
|
PRECONDITION(mol, "bad molecule");
|
|
PRECONDITION(conf, "bad conformer");
|
|
std::vector<std::string> splitLine;
|
|
|
|
auto inl = getV3000Line(inStream, line);
|
|
std::string_view tempStr = inl;
|
|
if (tempStr.length() < 10 || tempStr.substr(0, 10) != "BEGIN ATOM") {
|
|
std::ostringstream errout;
|
|
errout << "BEGIN ATOM line not found on line " << line;
|
|
throw FileParseException(errout.str());
|
|
}
|
|
for (unsigned int i = 0; i < nAtoms; ++i) {
|
|
inl = getV3000Line(inStream, line);
|
|
tempStr = inl;
|
|
auto trimmed = FileParserUtils::strip(tempStr);
|
|
|
|
std::vector<std::string_view> tokens;
|
|
std::vector<std::string_view>::iterator token;
|
|
|
|
tokenizeV3000Line(trimmed, tokens);
|
|
token = tokens.begin();
|
|
|
|
if (token == tokens.end()) {
|
|
std::ostringstream errout;
|
|
errout << "Bad atom line : '" << tempStr << "' on line" << line;
|
|
throw FileParseException(errout.str());
|
|
}
|
|
unsigned int molIdx = 0;
|
|
std::from_chars(token->data(), token->data() + token->size(), molIdx);
|
|
|
|
// start with the symbol:
|
|
++token;
|
|
if (token == tokens.end()) {
|
|
std::ostringstream errout;
|
|
errout << "Bad atom line : '" << tempStr << "' on line " << line;
|
|
throw FileParseException(errout.str());
|
|
}
|
|
Atom *atom = ParseV3000AtomSymbol(*token, line, strictParsing);
|
|
|
|
// now the position;
|
|
RDGeom::Point3D pos;
|
|
++token;
|
|
if (token == tokens.end()) {
|
|
delete atom;
|
|
std::ostringstream errout;
|
|
errout << "Bad atom line : '" << tempStr << "' on line " << line;
|
|
throw FileParseException(errout.str());
|
|
}
|
|
|
|
pos.x = atof(std::string(*token).c_str());
|
|
++token;
|
|
if (token == tokens.end()) {
|
|
delete atom;
|
|
std::ostringstream errout;
|
|
errout << "Bad atom line : '" << tempStr << "' on line " << line;
|
|
throw FileParseException(errout.str());
|
|
}
|
|
pos.y = atof(std::string(*token).c_str());
|
|
++token;
|
|
if (token == tokens.end()) {
|
|
delete atom;
|
|
std::ostringstream errout;
|
|
errout << "Bad atom line : '" << tempStr << "' on line " << line;
|
|
throw FileParseException(errout.str());
|
|
}
|
|
pos.z = atof(std::string(*token).c_str());
|
|
// the map number:
|
|
++token;
|
|
if (token == tokens.end()) {
|
|
delete atom;
|
|
std::ostringstream errout;
|
|
errout << "Bad atom line : '" << tempStr << "' on line " << line;
|
|
throw FileParseException(errout.str());
|
|
}
|
|
int mapNum = atoi(std::string(*token).c_str());
|
|
if (mapNum > 0) {
|
|
atom->setProp(common_properties::molAtomMapNumber, mapNum);
|
|
}
|
|
++token;
|
|
|
|
unsigned int aid = mol->addAtom(atom, false, true);
|
|
|
|
// additional properties this may change the atom,
|
|
// so be careful with it:
|
|
ParseV3000AtomProps(mol, atom, token, tokens, line, strictParsing);
|
|
|
|
mol->setAtomBookmark(atom, molIdx);
|
|
conf->setAtomPos(aid, pos);
|
|
}
|
|
inl = getV3000Line(inStream, line);
|
|
tempStr = inl;
|
|
if (tempStr.length() < 8 || tempStr.substr(0, 8) != "END ATOM") {
|
|
std::ostringstream errout;
|
|
errout << "END ATOM line not found on line " << line;
|
|
throw FileParseException(errout.str());
|
|
}
|
|
|
|
bool nonzeroZ = hasNonZeroZCoords(*conf);
|
|
if (mol->hasProp(common_properties::_3DConf)) {
|
|
conf->set3D(true);
|
|
mol->clearProp(common_properties::_3DConf);
|
|
if (!nonzeroZ) {
|
|
BOOST_LOG(rdWarningLog)
|
|
<< "Warning: molecule is tagged as 3D, but all Z coords are zero"
|
|
<< std::endl;
|
|
}
|
|
} else {
|
|
conf->set3D(nonzeroZ);
|
|
}
|
|
}
|
|
void ParseV3000BondBlock(std::istream *inStream, unsigned int &line,
|
|
unsigned int nBonds, RWMol *mol,
|
|
bool &chiralityPossible) {
|
|
PRECONDITION(inStream, "bad stream");
|
|
PRECONDITION(nBonds > 0, "bad bond count");
|
|
PRECONDITION(mol, "bad molecule");
|
|
|
|
auto inl = getV3000Line(inStream, line);
|
|
std::string_view tempStr = inl;
|
|
if (tempStr.length() < 10 || tempStr.substr(0, 10) != "BEGIN BOND") {
|
|
throw FileParseException("BEGIN BOND line not found");
|
|
}
|
|
for (unsigned int i = 0; i < nBonds; ++i) {
|
|
inl = getV3000Line(inStream, line);
|
|
tempStr = inl;
|
|
tempStr = FileParserUtils::strip(tempStr);
|
|
std::vector<std::string_view> splitLine;
|
|
tokenizeV3000Line(tempStr, splitLine);
|
|
if (splitLine.size() < 4) {
|
|
std::ostringstream errout;
|
|
errout << "bond line " << line << " is too short";
|
|
throw FileParseException(errout.str());
|
|
}
|
|
Bond *bond;
|
|
unsigned int bondIdx = 0;
|
|
std::from_chars(splitLine[0].data(),
|
|
splitLine[0].data() + splitLine[0].size(), bondIdx);
|
|
unsigned int bType = 0;
|
|
std::from_chars(splitLine[1].data(),
|
|
splitLine[1].data() + splitLine[1].size(), bType);
|
|
unsigned int a1Idx = 0;
|
|
std::from_chars(splitLine[2].data(),
|
|
splitLine[2].data() + splitLine[2].size(), a1Idx);
|
|
unsigned int a2Idx = 0;
|
|
std::from_chars(splitLine[3].data(),
|
|
splitLine[3].data() + splitLine[3].size(), a2Idx);
|
|
|
|
switch (bType) {
|
|
case 1:
|
|
bond = new Bond(Bond::SINGLE);
|
|
break;
|
|
case 2:
|
|
bond = new Bond(Bond::DOUBLE);
|
|
break;
|
|
case 3:
|
|
bond = new Bond(Bond::TRIPLE);
|
|
break;
|
|
case 4:
|
|
bond = new Bond(Bond::AROMATIC);
|
|
bond->setIsAromatic(true);
|
|
break;
|
|
case 9:
|
|
bond = new Bond(Bond::DATIVE);
|
|
break;
|
|
case 10:
|
|
bond = new Bond(Bond::HYDROGEN);
|
|
break;
|
|
case 0:
|
|
bond = new Bond(Bond::UNSPECIFIED);
|
|
BOOST_LOG(rdWarningLog)
|
|
<< "bond with order 0 found on line " << line
|
|
<< ". This is not part of the MDL specification." << std::endl;
|
|
break;
|
|
default:
|
|
// it's a query bond of some type
|
|
bond = new QueryBond;
|
|
if (bType == 8) {
|
|
BOND_NULL_QUERY *q;
|
|
q = makeBondNullQuery();
|
|
bond->setQuery(q);
|
|
} else if (bType == 5) {
|
|
bond->setQuery(makeSingleOrDoubleBondQuery());
|
|
bond->setProp(common_properties::_MolFileBondQuery, 1);
|
|
} else if (bType == 6) {
|
|
bond->setQuery(makeSingleOrAromaticBondQuery());
|
|
bond->setProp(common_properties::_MolFileBondQuery, 1);
|
|
} else if (bType == 7) {
|
|
bond->setQuery(makeDoubleOrAromaticBondQuery());
|
|
bond->setProp(common_properties::_MolFileBondQuery, 1);
|
|
} else {
|
|
BOND_NULL_QUERY *q;
|
|
q = makeBondNullQuery();
|
|
bond->setQuery(q);
|
|
BOOST_LOG(rdWarningLog)
|
|
<< "unrecognized query bond type, " << bType << ", found on line "
|
|
<< line << ". Using an \"any\" query." << std::endl;
|
|
}
|
|
break;
|
|
}
|
|
bond->setProp(common_properties::_MolFileBondType, bType);
|
|
|
|
// additional bond properties:
|
|
unsigned int lPos = 4;
|
|
std::ostringstream errout;
|
|
while (lPos < splitLine.size()) {
|
|
std::string prop;
|
|
std::string_view val;
|
|
if (!splitAssignToken(splitLine[lPos], prop, val)) {
|
|
errout << "bad bond property '" << splitLine[lPos] << "' on line "
|
|
<< line;
|
|
throw FileParseException(errout.str());
|
|
}
|
|
if (prop == "CFG") {
|
|
unsigned int cfg = 0;
|
|
std::from_chars(val.data(), val.data() + val.size(), cfg);
|
|
switch (cfg) {
|
|
case 0:
|
|
break;
|
|
case 1:
|
|
bond->setBondDir(Bond::BEGINWEDGE);
|
|
chiralityPossible = true;
|
|
break;
|
|
case 2:
|
|
if (bType == 1) {
|
|
bond->setBondDir(Bond::UNKNOWN);
|
|
} else if (bType == 2) {
|
|
bond->setBondDir(Bond::EITHERDOUBLE);
|
|
bond->setStereo(Bond::STEREOANY);
|
|
}
|
|
break;
|
|
case 3:
|
|
bond->setBondDir(Bond::BEGINDASH);
|
|
chiralityPossible = true;
|
|
break;
|
|
default:
|
|
errout << "bad bond CFG " << val << "' on line " << line;
|
|
throw FileParseException(errout.str());
|
|
}
|
|
bond->setProp(common_properties::_MolFileBondCfg, cfg);
|
|
} else if (prop == "TOPO") {
|
|
if (val != "0") {
|
|
if (!bond->hasQuery()) {
|
|
auto *qBond = new QueryBond(*bond);
|
|
delete bond;
|
|
bond = qBond;
|
|
}
|
|
BOND_EQUALS_QUERY *q = makeBondIsInRingQuery();
|
|
if (val == "1") {
|
|
// nothing
|
|
} else if (val == "2") {
|
|
q->setNegation(true);
|
|
} else {
|
|
errout << "bad bond TOPO " << val << "' on line " << line;
|
|
throw FileParseException(errout.str());
|
|
}
|
|
bond->expandQuery(q);
|
|
}
|
|
} else if (prop == "RXCTR") {
|
|
int reactStatus = FileParserUtils::toInt(val);
|
|
bond->setProp(common_properties::molReactStatus, reactStatus);
|
|
} else if (prop == "STBOX") {
|
|
bond->setProp(common_properties::molStereoCare, std::string(val));
|
|
} else if (prop == "ENDPTS") {
|
|
bond->setProp(common_properties::_MolFileBondEndPts, std::string(val));
|
|
} else if (prop == "ATTACH") {
|
|
bond->setProp(common_properties::_MolFileBondAttach, std::string(val));
|
|
}
|
|
++lPos;
|
|
}
|
|
|
|
bond->setBeginAtomIdx(mol->getAtomWithBookmark(a1Idx)->getIdx());
|
|
bond->setEndAtomIdx(mol->getAtomWithBookmark(a2Idx)->getIdx());
|
|
mol->addBond(bond, true);
|
|
mol->setBondBookmark(bond, bondIdx);
|
|
|
|
// set the stereoCare property on the bond if it's not set already and both
|
|
// the beginning and end atoms have it set:
|
|
int care1 = 0;
|
|
int care2 = 0;
|
|
if (!bond->hasProp(common_properties::molStereoCare) &&
|
|
mol->getAtomWithIdx(bond->getBeginAtomIdx())
|
|
->getPropIfPresent(common_properties::molStereoCare, care1) &&
|
|
mol->getAtomWithIdx(bond->getEndAtomIdx())
|
|
->getPropIfPresent(common_properties::molStereoCare, care2)) {
|
|
if (care1 == care2) {
|
|
bond->setProp(common_properties::molStereoCare, care1);
|
|
}
|
|
}
|
|
}
|
|
inl = getV3000Line(inStream, line);
|
|
tempStr = inl;
|
|
if (tempStr.length() < 8 || tempStr.substr(0, 8) != "END BOND") {
|
|
std::ostringstream errout;
|
|
errout << "END BOND line not found at line " << line;
|
|
throw FileParseException(errout.str());
|
|
}
|
|
}
|
|
// The documentation about MRV_COORDINATE_BOND_TYPE in
|
|
// https://docs.chemaxon.com/display/docs/chemaxon-specific-information-in-mdl-mol-files.md
|
|
// seems to be wrong: it says the only data field in this group contains the
|
|
// index for the coordinate atom. But behavior in Marvin Sketch seems to
|
|
// indicate that it references the bond index instead (see
|
|
// https://github.com/rdkit/rdkit/issues/4473)
|
|
|
|
// Applies an MRV_COORDINATE_BOND_TYPE data SGroup: the first data field is
// read as a 1-based bond index, and the referenced bond is replaced by a
// dative bond if its current type is UNSPECIFIED. Anything unexpected (no
// data fields, unknown bond index, bond of another type) produces a warning
// and leaves the molecule unchanged. Does nothing if the SGroup has no
// DATAFIELDS property.
void processMrvCoordinateBond(RWMol &mol, const SubstanceGroup &sg) {
  std::vector<std::string> fields;
  if (!sg.getPropIfPresent("DATAFIELDS", fields)) {
    return;
  }
  if (fields.empty()) {
    BOOST_LOG(rdWarningLog)
        << "ignoring MRV_COORDINATE_BOND_TYPE SGroup without data fields."
        << std::endl;
    return;
  }

  // convert the 1-based value from the file to a 0-based bond index
  auto targetBondIdx = FileParserUtils::toUnsigned(fields[0], true) - 1;

  if (fields.size() > 1) {
    BOOST_LOG(rdWarningLog) << "ignoring extra data fields in "
                               "MRV_COORDINATE_BOND_TYPE SGroup for bond "
                            << targetBondIdx << '.' << std::endl;
  }

  Bond *existing = nullptr;
  try {
    existing = mol.getBondWithIdx(targetBondIdx);
  } catch (const Invar::Invariant &) {
    BOOST_LOG(rdWarningLog)
        << "molecule does not contain a bond matching the "
           "MRV_COORDINATE_BOND_TYPE SGroup for bond "
        << targetBondIdx << ", ignoring." << std::endl;
    return;
  }

  const bool replaceable =
      existing && existing->getBondType() == Bond::BondType::UNSPECIFIED;
  if (!replaceable) {
    BOOST_LOG(rdWarningLog)
        << "MRV_COORDINATE_BOND_TYPE SGroup with value " << targetBondIdx
        << " does not reference a query bond, ignoring." << std::endl;
    return;
  }

  Bond dative(Bond::BondType::DATIVE);
  const bool preserveProps = true;
  const bool keepSGroups = true;
  mol.replaceBond(targetBondIdx, &dative, preserveProps, keepSGroups);
}
|
|
|
|
// Applies a SMARTSQ query SGroup: the first data field is parsed as SMARTS
// and installed as the query on every atom listed in the SGroup. A
// single-atom pattern contributes a copy of that atom's query; a multi-atom
// pattern is wrapped in a RecursiveStructureQuery. Atoms that are not yet
// query atoms are replaced by query atoms first. Problems (unsupported
// QUERYOP, missing/empty data, unparsable SMARTS) are reported as warnings
// and end processing of the SGroup.
void processSMARTSQ(RWMol &mol, const SubstanceGroup &sg) {
  std::string queryOp;
  const bool hasQueryOp = sg.getPropIfPresent("QUERYOP", queryOp);
  if (hasQueryOp && queryOp != "=") {
    BOOST_LOG(rdWarningLog) << "unrecognized QUERYOP '" << queryOp
                            << "' for SMARTSQ. Query ignored." << std::endl;
    return;
  }

  std::vector<std::string> fields;
  const bool hasFields = sg.getPropIfPresent("DATAFIELDS", fields);
  if (!hasFields || fields.empty()) {
    BOOST_LOG(rdWarningLog)
        << "empty FIELDDATA for SMARTSQ. Query ignored." << std::endl;
    return;
  }
  if (fields.size() > 1) {
    BOOST_LOG(rdWarningLog)
        << "multiple FIELDDATA values for SMARTSQ. Taking the first."
        << std::endl;
  }

  const std::string &smarts = fields[0];
  if (smarts.empty()) {
    BOOST_LOG(rdWarningLog)
        << "Skipping empty SMARTS value for SMARTSQ." << std::endl;
    return;
  }

  for (auto atomIdx : sg.getAtoms()) {
    auto target = mol.getAtomWithIdx(atomIdx);

    // the pattern is parsed afresh for each atom because the multi-atom case
    // below hands ownership of it to the query
    std::unique_ptr<RWMol> pattern;
    try {
      pattern.reset(SmartsToMol(smarts));
    } catch (...) {
      // Is this ever used?
    }

    if (!pattern || !pattern->getNumAtoms()) {
      BOOST_LOG(rdWarningLog)
          << "SMARTS for SMARTSQ '" << smarts
          << "' could not be parsed or has no atoms. Ignoring it." << std::endl;
      return;
    }

    if (!target->hasQuery()) {
      // swap in a query atom, then re-fetch the pointer from the molecule
      QueryAtom asQuery(*target);
      int originalIdx = target->getIdx();
      mol.replaceAtom(originalIdx, &asQuery);
      target = mol.getAtomWithIdx(originalIdx);
    }

    QueryAtom::QUERYATOM_QUERY *newQuery = nullptr;
    if (pattern->getNumAtoms() != 1) {
      newQuery = new RecursiveStructureQuery(pattern.release());
    } else {
      newQuery = pattern->getAtomWithIdx(0)->getQuery()->copy();
    }
    target->setQuery(newQuery);
    target->setProp(common_properties::MRV_SMA, smarts);
    target->setProp(common_properties::_MolFileAtomQuery, 1);
  }
}
|
|
|
|
// Applies an MRV_IMPLICIT_H data SGroup. For every data field starting with
// "IMPL_H", the remainder of the field is read as an integer and set as the
// explicit H count on each SGroup atom that has at least one aromatic bond.
// Atoms without aromatic bonds and out-of-range atom indices are reported
// with a warning and skipped. Does nothing if the SGroup has no DATAFIELDS
// property.
void processMrvImplicitH(RWMol &mol, const SubstanceGroup &sg) {
  std::vector<std::string> fields;
  if (!sg.getPropIfPresent("DATAFIELDS", fields)) {
    return;
  }

  // true if any bond on the atom is flagged aromatic or has aromatic type
  const auto touchesAromaticBond = [&mol](const Atom *atom) {
    for (auto bndI : boost::make_iterator_range(mol.getAtomBonds(atom))) {
      auto bnd = (mol)[bndI];
      if (bnd->getIsAromatic() || bnd->getBondType() == Bond::AROMATIC) {
        return true;
      }
    }
    return false;
  };

  for (const auto &field : fields) {
    if (field.substr(0, 6) != "IMPL_H") {
      continue;
    }
    auto hCount = FileParserUtils::toInt(field.substr(6));
    for (auto atIdx : sg.getAtoms()) {
      if (!(atIdx < mol.getNumAtoms())) {
        BOOST_LOG(rdWarningLog)
            << "bad atom index, " << atIdx
            << ", found in MRV_IMPLICIT_H SGroup. Ignoring it." << std::endl;
        continue;
      }
      // if the atom has aromatic bonds to it, then set the explicit
      // value, otherwise skip it.
      auto atom = mol.getAtomWithIdx(atIdx);
      if (touchesAromaticBond(atom)) {
        atom->setNumExplicitHs(hCount);
      } else {
        BOOST_LOG(rdWarningLog)
            << "MRV_IMPLICIT_H SGroup on atom without aromatic "
               "bonds, "
            << atIdx << ", ignored." << std::endl;
      }
    }
  }
}
|
|
|
|
// process (and remove) SGroups which modify the structure
|
|
// and which we can unambiguously apply
|
|
// Applies, and then removes from the molecule, the data SGroups that encode
// structure modifications: MRV_COORDINATE_BOND_TYPE, MRV_IMPLICIT_H and
// SMARTSQ/SQ queries. All other SGroups are left in place.
void processSGroups(RWMol *mol) {
  std::vector<unsigned int> sgsToRemove;
  unsigned int sgIdx = 0;
  for (auto &sg : getSubstanceGroups(*mol)) {
    // Advance the counter for every SGroup, including the ones scheduled for
    // removal: the recorded values must be positions in the current,
    // unmodified list for the back-to-front erase below to hit the right
    // entries.
    const unsigned int thisIdx = sgIdx++;

    if (sg.getProp<std::string>("TYPE") != "DAT") {
      continue;
    }
    std::string field;
    if (sg.getPropIfPresent("FIELDNAME", field)) {
      if (field == "MRV_COORDINATE_BOND_TYPE") {
        // V2000 support for coordinate bonds
        processMrvCoordinateBond(*mol, sg);
        sgsToRemove.push_back(thisIdx);
        continue;
      } else if (field == "MRV_IMPLICIT_H") {
        // CXN extension to specify implicit Hs, used for aromatic rings
        processMrvImplicitH(*mol, sg);
        sgsToRemove.push_back(thisIdx);
        continue;
      }
    }
    if (sg.getPropIfPresent("QUERYTYPE", field) &&
        (field == "SMARTSQ" || field == "SQ")) {
      processSMARTSQ(*mol, sg);
      sgsToRemove.push_back(thisIdx);
      continue;
    }
  }
  // now remove the S groups we processed, we saved indices so do this in
  // backwards
  auto &sgs = getSubstanceGroups(*mol);
  for (auto it = sgsToRemove.rbegin(); it != sgsToRemove.rend(); ++it) {
    sgs.erase(sgs.begin() + *it);
  }
}
|
|
|
|
// Post-processes the per-atom properties read from a mol block and then
// applies the structure-modifying SGroups.
//
// For each atom:
//   - a nonzero substitution count (molSubstCount) becomes an explicit-degree
//     query on the atom: -1 means degree 0, -2 means "as drawn", values of 6
//     or more mean "6 or more", anything else is used as the exact degree
//   - a nonzero total valence (molTotValence) on an atom without the "_ZBO_H"
//     property disables implicit Hs and sets the explicit H count
//   - the molTotValence property is always cleared afterwards
void ProcessMolProps(RWMol *mol) {
  PRECONDITION(mol, "no molecule");
  // index-based iteration is required here: the atom at a given index may be
  // replaced by a query atom while we are looping
  for (unsigned int idx = 0; idx < mol->getNumAtoms(); ++idx) {
    auto atom = mol->getAtomWithIdx(idx);

    int substCount = 0;
    if (atom->getPropIfPresent(common_properties::molSubstCount, substCount) &&
        substCount != 0) {
      if (!atom->hasQuery()) {
        atom = QueryOps::replaceAtomWithQueryAtom(mol, atom);
      }

      bool atLeast = false;
      switch (substCount) {
        case -1:
          // no substitution allowed
          substCount = 0;
          break;
        case -2:
          // as drawn
          substCount = atom->getDegree();
          break;
        default:
          // 6 or more
          atLeast = substCount >= 6;
          break;
      }

      if (atLeast) {
        // build a throwaway query the normal way so that we can borrow its
        // data function and be sure to get the description right
        std::unique_ptr<ATOM_EQUALS_QUERY> prototype{
            makeAtomExplicitDegreeQuery(substCount)};
        atom->expandQuery(makeAtomSimpleQuery<ATOM_LESSEQUAL_QUERY>(
            substCount, prototype->getDataFunc(),
            std::string("less_") + prototype->getDescription()));
      } else {
        atom->expandQuery(makeAtomExplicitDegreeQuery(substCount));
      }
    }

    int totValence = 0;
    if (atom->getPropIfPresent(common_properties::molTotValence, totValence) &&
        totValence != 0 && !atom->hasProp("_ZBO_H")) {
      atom->setNoImplicit(true);
      // 15 (V2000) and -1 (V3000) are the special values handled by setting
      // the explicit H count to zero
      const bool zeroHsRequested = (totValence == 15) || (totValence == -1);
      if (zeroHsRequested) {
        atom->setNumExplicitHs(0);
      } else if (atom->getExplicitValence() > totValence) {
        BOOST_LOG(rdWarningLog)
            << "atom " << atom->getIdx() << " has specified valence ("
            << totValence << ") smaller than the drawn valence "
            << atom->getExplicitValence() << "." << std::endl;
        atom->setNumExplicitHs(0);
      } else {
        atom->setNumExplicitHs(totValence - atom->getExplicitValence());
      }
    }
    atom->clearProp(common_properties::molTotValence);
  }

  processSGroups(mol);
}
|
|
|
|
} // namespace
|
|
namespace FileParserUtils {
|
|
bool ParseV3000CTAB(std::istream *inStream, unsigned int &line, RWMol *mol,
|
|
Conformer *&conf, bool &chiralityPossible,
|
|
unsigned int &nAtoms, unsigned int &nBonds,
|
|
bool strictParsing, bool expectMEND) {
|
|
PRECONDITION(inStream, "bad stream");
|
|
PRECONDITION(mol, "bad molecule");
|
|
|
|
std::string tempStr;
|
|
std::vector<std::string> splitLine;
|
|
|
|
bool fileComplete = false;
|
|
|
|
tempStr = getV3000Line(inStream, line);
|
|
boost::to_upper(tempStr);
|
|
if (tempStr.length() < 10 || tempStr.substr(0, 10) != "BEGIN CTAB") {
|
|
std::ostringstream errout;
|
|
errout << "BEGIN CTAB line not found on line " << line;
|
|
throw FileParseException(errout.str());
|
|
}
|
|
|
|
tempStr = getV3000Line(inStream, line);
|
|
boost::to_upper(tempStr);
|
|
if (tempStr.size() < 8 || tempStr.substr(0, 7) != "COUNTS ") {
|
|
std::ostringstream errout;
|
|
errout << "Bad counts line : '" << tempStr << "' on line " << line;
|
|
throw FileParseException(errout.str());
|
|
}
|
|
std::string trimmed =
|
|
boost::trim_copy(tempStr.substr(7, tempStr.length() - 7));
|
|
boost::split(splitLine, trimmed, boost::is_any_of(" \t"),
|
|
boost::token_compress_on);
|
|
if (splitLine.size() < 2) {
|
|
std::ostringstream errout;
|
|
errout << "Bad counts line : '" << tempStr << "' on line " << line;
|
|
throw FileParseException(errout.str());
|
|
}
|
|
|
|
nAtoms = FileParserUtils::toUnsigned(splitLine[0]);
|
|
nBonds = FileParserUtils::toUnsigned(splitLine[1]);
|
|
conf = new Conformer(nAtoms);
|
|
|
|
unsigned int nSgroups = 0, n3DConstraints = 0, chiralFlag = 0;
|
|
|
|
if (splitLine.size() > 2) {
|
|
nSgroups = FileParserUtils::toUnsigned(splitLine[2]);
|
|
}
|
|
if (splitLine.size() > 3) {
|
|
n3DConstraints = FileParserUtils::toUnsigned(splitLine[3]);
|
|
}
|
|
if (splitLine.size() > 4) {
|
|
chiralFlag = FileParserUtils::toUnsigned(splitLine[4]);
|
|
}
|
|
|
|
mol->setProp(common_properties::_MolFileChiralFlag, chiralFlag);
|
|
|
|
if (nAtoms) {
|
|
ParseV3000AtomBlock(inStream, line, nAtoms, mol, conf, strictParsing);
|
|
}
|
|
if (nBonds) {
|
|
ParseV3000BondBlock(inStream, line, nBonds, mol, chiralityPossible);
|
|
}
|
|
|
|
tempStr = getV3000Line(inStream, line);
|
|
// do link nodes:
|
|
boost::to_upper(tempStr);
|
|
while (tempStr.length() > 8 && tempStr.substr(0, 8) == "LINKNODE") {
|
|
boost::to_upper(tempStr);
|
|
// if the line has nothing on it we just ignore it
|
|
if (tempStr.size() > 9) {
|
|
std::string existing = "";
|
|
if (mol->getPropIfPresent(common_properties::molFileLinkNodes,
|
|
existing)) {
|
|
existing += "|";
|
|
}
|
|
existing += tempStr.substr(9); // skip the "LINKNODE "
|
|
mol->setProp(common_properties::molFileLinkNodes, existing);
|
|
}
|
|
tempStr = getV3000Line(inStream, line);
|
|
}
|
|
|
|
if (nSgroups) {
|
|
boost::to_upper(tempStr);
|
|
if (tempStr.length() < 12 || tempStr.substr(0, 12) != "BEGIN SGROUP") {
|
|
std::ostringstream errout;
|
|
errout << "BEGIN SGROUP line not found on line " << line;
|
|
if (strictParsing) {
|
|
throw FileParseException(errout.str());
|
|
} else {
|
|
BOOST_LOG(rdWarningLog) << errout.str() << std::endl;
|
|
}
|
|
} else {
|
|
tempStr =
|
|
ParseV3000SGroupsBlock(inStream, line, nSgroups, mol, strictParsing);
|
|
boost::to_upper(tempStr);
|
|
if (tempStr.length() < 10 || tempStr.substr(0, 10) != "END SGROUP") {
|
|
std::ostringstream errout;
|
|
errout << "END SGROUP line not found on line " << line;
|
|
if (strictParsing) {
|
|
throw FileParseException(errout.str());
|
|
} else {
|
|
BOOST_LOG(rdWarningLog) << errout.str() << std::endl;
|
|
}
|
|
} else {
|
|
tempStr = getV3000Line(inStream, line);
|
|
}
|
|
}
|
|
}
|
|
|
|
while (tempStr.length() > 5 && tempStr.substr(0, 5) == "BEGIN") {
|
|
if (tempStr.length() > 15 && tempStr.substr(6, 10) == "COLLECTION") {
|
|
tempStr = parseEnhancedStereo(inStream, line, mol);
|
|
} else {
|
|
// skip blocks we don't know how to read
|
|
BOOST_LOG(rdWarningLog) << "skipping block at line " << line << ": '"
|
|
<< tempStr << "'" << std::endl;
|
|
while (tempStr.length() < 3 || tempStr.substr(0, 3) != "END") {
|
|
tempStr = getV3000Line(inStream, line);
|
|
}
|
|
tempStr = getV3000Line(inStream, line);
|
|
}
|
|
}
|
|
|
|
if (n3DConstraints) {
|
|
BOOST_LOG(rdWarningLog)
|
|
<< "3D constraint information in mol block ignored at line " << line
|
|
<< std::endl;
|
|
boost::to_upper(tempStr);
|
|
if (tempStr.length() < 11 || tempStr.substr(0, 11) != "BEGIN OBJ3D") {
|
|
std::ostringstream errout;
|
|
errout << "BEGIN OBJ3D line not found on line " << line;
|
|
if (strictParsing) {
|
|
throw FileParseException(errout.str());
|
|
} else {
|
|
BOOST_LOG(rdWarningLog) << errout.str() << std::endl;
|
|
}
|
|
}
|
|
for (unsigned int i = 0; i < n3DConstraints; ++i) {
|
|
tempStr = getV3000Line(inStream, line);
|
|
}
|
|
tempStr = getV3000Line(inStream, line);
|
|
boost::to_upper(tempStr);
|
|
if (tempStr.length() < 9 || tempStr.substr(0, 9) != "END OBJ3D") {
|
|
std::ostringstream errout;
|
|
errout << "END OBJ3D line not found on line " << line;
|
|
if (strictParsing) {
|
|
throw FileParseException(errout.str());
|
|
} else {
|
|
BOOST_LOG(rdWarningLog) << errout.str() << std::endl;
|
|
}
|
|
} else {
|
|
tempStr = getV3000Line(inStream, line);
|
|
}
|
|
}
|
|
|
|
boost::to_upper(tempStr);
|
|
if (tempStr.length() < 8 || tempStr.substr(0, 8) != "END CTAB") {
|
|
if (strictParsing) {
|
|
throw FileParseException("END CTAB line not found");
|
|
} else {
|
|
BOOST_LOG(rdWarningLog) << "END CTAB line not found." << std::endl;
|
|
}
|
|
}
|
|
|
|
if (expectMEND) {
|
|
tempStr = getLine(inStream);
|
|
++line;
|
|
if (tempStr[0] == 'M' && tempStr.substr(0, 6) == "M END") {
|
|
fileComplete = true;
|
|
}
|
|
} else {
|
|
fileComplete = true;
|
|
}
|
|
|
|
mol->addConformer(conf, true);
|
|
conf = nullptr;
|
|
|
|
return fileComplete;
|
|
} // namespace FileParserUtils
|
|
|
|
bool ParseV2000CTAB(std::istream *inStream, unsigned int &line, RWMol *mol,
|
|
Conformer *&conf, bool &chiralityPossible,
|
|
unsigned int &nAtoms, unsigned int &nBonds,
|
|
bool strictParsing) {
|
|
conf = new Conformer(nAtoms);
|
|
if (nAtoms == 0) {
|
|
conf->set3D(false);
|
|
} else {
|
|
ParseMolBlockAtoms(inStream, line, nAtoms, mol, conf, strictParsing);
|
|
|
|
bool nonzeroZ = hasNonZeroZCoords(*conf);
|
|
if (mol->hasProp(common_properties::_3DConf)) {
|
|
conf->set3D(true);
|
|
mol->clearProp(common_properties::_3DConf);
|
|
if (!nonzeroZ) {
|
|
BOOST_LOG(rdWarningLog)
|
|
<< "Warning: molecule is tagged as 3D, but all Z coords are zero"
|
|
<< std::endl;
|
|
}
|
|
} else {
|
|
conf->set3D(nonzeroZ);
|
|
}
|
|
}
|
|
mol->addConformer(conf, true);
|
|
conf = nullptr;
|
|
|
|
ParseMolBlockBonds(inStream, line, nBonds, mol, chiralityPossible);
|
|
|
|
bool fileComplete =
|
|
ParseMolBlockProperties(inStream, line, mol, strictParsing);
|
|
return fileComplete;
|
|
}
|
|
|
|
//! Runs the post-parsing steps on a freshly read molecule.
/*!
  Clears bookmarks, computes explicit valences, applies the mol file property
  flags, perceives stereochemistry and (optionally) sanitizes and removes Hs.
  A null \c res is a no-op.

  NOTE: if sanitization (or one of the steps grouped with it) throws, \c res
  is deleted here before the exception is rethrown.
*/
void finishMolProcessing(RWMol *res, bool chiralityPossible, bool sanitize,
                         bool removeHs) {
  if (res == nullptr) {
    return;
  }
  res->clearAllAtomBookmarks();
  res->clearAllBondBookmarks();

  // calculate explicit valence on each atom:
  for (auto atom : res->atoms()) {
    atom->calcExplicitValence(false);
  }

  // postprocess mol file flags
  ProcessMolProps(res);

  // Update the chirality and stereochemistry.
  //
  // NOTE: stereochemistry is detected before sanitizing/removing hydrogens
  // because the removal of H atoms may actually remove the wedged bond from
  // the molecule, and that bond can be the only sign that chirality ever
  // existed. So: first perceive chirality, then remove the Hs and sanitize.
  const Conformer &conf = res->getConformer();
  const bool has3DCoords = conf.is3D();
  if (has3DCoords) {
    res->updatePropertyCache(false);
    MolOps::assignChiralTypesFrom3D(*res, conf.getId(), true);
  } else if (chiralityPossible) {
    DetectAtomStereoChemistry(*res, &conf);
  }

  if (!sanitize) {
    // we still need to do something about double bond stereochemistry
    // (was github issue 337)
    // now that atom stereochem has been perceived, the wedging
    // information is no longer needed, so we clear
    // single bond dir flags:
    ClearSingleBondDirFlags(*res);
    MolOps::detectBondStereochemistry(*res);
  } else {
    try {
      if (removeHs) {
        MolOps::removeHs(*res, false, false);
      } else {
        MolOps::sanitizeMol(*res);
      }
      // atom stereochem has been perceived at this point, so the wedging
      // information can go: clear the single bond dir flags
      MolOps::clearSingleBondDirFlags(*res);

      // unlike DetectAtomStereoChemistry, detectBondStereochemistry is called
      // after sanitization because it needs the ring information
      MolOps::detectBondStereochemistry(*res);
    } catch (...) {
      delete res;
      res = nullptr;
      throw;
    }
    MolOps::assignStereochemistry(*res, true, true, true);
  }

  if (res->hasProp(common_properties::_NeedsQueryScan)) {
    res->clearProp(common_properties::_NeedsQueryScan);
    QueryOps::completeMolQueries(res);
  }
}
|
|
} // namespace FileParserUtils
|
|
|
|
//------------------------------------------------
|
|
//
|
|
// Read a molecule from a stream
|
|
//
|
|
//------------------------------------------------
|
|
//! Reads one molecule (V2000 or V3000 mol block) from a stream.
/*!
  \param inStream      the stream to read from; must not be null
  \param line          line counter, incremented as lines are consumed
  \param sanitize      passed on to FileParserUtils::finishMolProcessing()
  \param removeHs      passed on to FileParserUtils::finishMolProcessing()
  \param strictParsing if true, an invalid or unsupported CTAB version string
                       and nonzero counts on a V3000 counts line raise a
                       FileParseException; otherwise they are only logged

  \return a pointer to the new molecule, which the caller owns, or nullptr if
          the stream hit EOF while reading the name line or if the block used
          a CTAB feature that is not handled (that block is skipped)

  Throws FileParseException if the counts line is too short or unparseable, or
  if the block is incomplete ("M  END" missing).
*/
RWMol *MolDataStreamToMol(std::istream *inStream, unsigned int &line,
                          bool sanitize, bool removeHs, bool strictParsing) {
  PRECONDITION(inStream, "no stream");
  std::string tempStr;
  bool fileComplete = false;
  bool chiralityPossible = false;
  Utils::LocaleSwitcher ls;
  // mol name
  line++;
  tempStr = getLine(inStream);
  if (inStream->eof()) {
    return nullptr;
  }
  // owning pointer: whatever is thrown below, the molecule gets cleaned up
  std::unique_ptr<RWMol> res{new RWMol()};
  res->setProp(common_properties::_Name, tempStr);

  // info
  line++;
  tempStr = getLine(inStream);
  res->setProp("_MolFileInfo", tempStr);
  if (tempStr.length() >= 22) {
    std::string dimLabel = tempStr.substr(20, 2);
    // Unless labelled as 3D we assume 2D
    if (dimLabel == "3d" || dimLabel == "3D") {
      res->setProp(common_properties::_3DConf, 1);
    }
  }
  // comments
  line++;
  tempStr = getLine(inStream);
  res->setProp("_MolFileComments", tempStr);

  unsigned int nAtoms = 0, nBonds = 0, nLists = 0, chiralFlag = 0, nsText = 0,
               nRxnComponents = 0;
  int nReactants = 0, nProducts = 0, nIntermediates = 0;
  (void)nLists;  // read from the file but unused
  (void)nsText;
  (void)nRxnComponents;
  (void)nReactants;
  (void)nProducts;
  (void)nIntermediates;
  // counts line, this is where we really get started
  line++;
  tempStr = getLine(inStream);

  if (tempStr.size() < 6) {
    std::ostringstream errout;
    errout << "Counts line too short: '" << tempStr << "' on line " << line;
    throw FileParseException(errout.str());
  }

  // the atom and bond counts are mandatory; spos tracks the field being
  // parsed so that the error message can show the offending text
  unsigned int spos = 0;
  try {
    nAtoms = FileParserUtils::toUnsigned(tempStr.substr(spos, 3), true);
    spos = 3;
    nBonds = FileParserUtils::toUnsigned(tempStr.substr(spos, 3), true);
    spos = 6;
  } catch (const boost::bad_lexical_cast &) {
    std::ostringstream errout;
    errout << "Cannot convert '" << tempStr.substr(spos, 3)
           << "' to unsigned int on line " << line;
    throw FileParseException(errout.str());
  }
  try {
    spos = 6;
    if (tempStr.size() >= 9) {
      nLists = FileParserUtils::toUnsigned(tempStr.substr(spos, 3), true);
    }

    spos = 12;
    if (tempStr.size() >= spos + 3) {
      chiralFlag = FileParserUtils::toUnsigned(tempStr.substr(spos, 3), true);
    }

    spos = 15;
    if (tempStr.size() >= spos + 3) {
      nsText = FileParserUtils::toUnsigned(tempStr.substr(spos, 3), true);
    }

    spos = 18;
    if (tempStr.size() >= spos + 3) {
      nRxnComponents =
          FileParserUtils::toUnsigned(tempStr.substr(spos, 3), true);
    }

    spos = 21;
    if (tempStr.size() >= spos + 3) {
      nReactants = FileParserUtils::toUnsigned(tempStr.substr(spos, 3), true);
    }

    spos = 24;
    if (tempStr.size() >= spos + 3) {
      nProducts = FileParserUtils::toUnsigned(tempStr.substr(spos, 3), true);
    }

    spos = 27;
    if (tempStr.size() >= spos + 3) {
      nIntermediates =
          FileParserUtils::toUnsigned(tempStr.substr(spos, 3), true);
    }

  } catch (const boost::bad_lexical_cast &) {
    // some SD files (such as some from NCI) lack all the extra information
    // on the header line, so ignore problems parsing there.
  }

  unsigned int ctabVersion = 2000;
  if (tempStr.size() > 35) {
    if (tempStr.size() < 39 || tempStr[34] != 'V') {
      std::ostringstream errout;
      errout << "CTAB version string invalid at line " << line;
      if (strictParsing) {
        throw FileParseException(errout.str());
      } else {
        BOOST_LOG(rdWarningLog) << errout.str() << std::endl;
      }
    } else if (tempStr.substr(34, 5) == "V3000") {
      ctabVersion = 3000;
    } else if (tempStr.substr(34, 5) != "V2000") {
      std::ostringstream errout;
      errout << "Unsupported CTAB version: '" << tempStr.substr(34, 5)
             << "' at line " << line;
      if (strictParsing) {
        throw FileParseException(errout.str());
      } else {
        BOOST_LOG(rdWarningLog) << errout.str() << std::endl;
      }
    }
  }

  res->setProp(common_properties::_MolFileChiralFlag, chiralFlag);

  // the CTAB parsers allocate the conformer into this pointer and reset it
  // to nullptr once the molecule has taken it over
  Conformer *conf = nullptr;
  try {
    if (ctabVersion == 2000) {
      fileComplete = FileParserUtils::ParseV2000CTAB(
          inStream, line, res.get(), conf, chiralityPossible, nAtoms, nBonds,
          strictParsing);
    } else {
      if (nAtoms != 0 || nBonds != 0) {
        std::ostringstream errout;
        errout << "V3000 mol blocks should have 0s in the initial counts line. "
                  "(line: "
               << line << ")";
        if (strictParsing) {
          throw FileParseException(errout.str());
        } else {
          BOOST_LOG(rdWarningLog) << errout.str() << std::endl;
        }
      }
      fileComplete = FileParserUtils::ParseV3000CTAB(
          inStream, line, res.get(), conf, chiralityPossible, nAtoms, nBonds,
          strictParsing);
    }
  } catch (const MolFileUnhandledFeatureException &e) {
    // unhandled mol file feature, just delete the result
    res.reset();
    delete conf;
    conf = nullptr;
    BOOST_LOG(rdErrorLog) << " Unhandled CTAB feature: '" << e.what()
                          << "'. Molecule skipped." << std::endl;

    // skip forward to the end of this block so that the stream is left at
    // the start of whatever follows
    if (!inStream->eof()) {
      tempStr = getLine(inStream);
    }
    ++line;
    while (!inStream->eof() && !inStream->fail() &&
           tempStr.substr(0, 6) != "M  END" && tempStr.substr(0, 4) != "$$$$") {
      tempStr = getLine(inStream);
      ++line;
    }
    fileComplete = !inStream->eof() || tempStr.substr(0, 6) == "M  END" ||
                   tempStr.substr(0, 4) == "$$$$";
  } catch (...) {
    // clean up the conformer for any failure (the molecule is released by
    // its owning pointer) and rethrow the original exception object
    delete conf;
    conf = nullptr;
    throw;
  }

  if (!fileComplete) {
    delete conf;
    conf = nullptr;
    std::ostringstream errout;
    errout
        << "Problems encountered parsing Mol data, M  END missing around line "
        << line;
    throw FileParseException(errout.str());
  }

  if (!res) {
    return nullptr;
  }
  // finishMolProcessing() deletes the molecule itself when sanitization
  // fails, so ownership has to be given up before the call to avoid a double
  // delete
  RWMol *finished = res.release();
  FileParserUtils::finishMolProcessing(finished, chiralityPossible, sanitize,
                                       removeHs);
  return finished;
}
|
|
|
|
// Convenience overload taking the stream by reference; all of the work is done
// by the pointer-based overload.
RWMol *MolDataStreamToMol(std::istream &inStream, unsigned int &line,
                          bool sanitize, bool removeHs, bool strictParsing) {
  std::istream *streamPtr = &inStream;
  return MolDataStreamToMol(streamPtr, line, sanitize, removeHs,
                            strictParsing);
}
|
|
//------------------------------------------------
|
|
//
|
|
// Read a molecule from a string
|
|
//
|
|
//------------------------------------------------
|
|
// Parses a molecule from a mol block held in a string by wrapping the string
// in a stream; the line counter starts at zero.
RWMol *MolBlockToMol(const std::string &molBlock, bool sanitize, bool removeHs,
                     bool strictParsing) {
  unsigned int lineCounter = 0;
  std::istringstream blockStream(molBlock);
  return MolDataStreamToMol(blockStream, lineCounter, sanitize, removeHs,
                            strictParsing);
}
|
|
|
|
//------------------------------------------------
|
|
//
|
|
// Read a molecule from a file
|
|
//
|
|
//------------------------------------------------
|
|
// Parses a molecule from the mol file with the given name.
//
// Throws BadFileException if the file cannot be opened for reading. Returns
// nullptr if the stream already reports EOF before anything is read.
RWMol *MolFileToMol(const std::string &fName, bool sanitize, bool removeHs,
                    bool strictParsing) {
  std::ifstream inStream(fName.c_str());
  const bool unusable = inStream.fail() || inStream.bad();
  if (unusable) {
    std::ostringstream errout;
    errout << "Bad input file " << fName;
    throw BadFileException(errout.str());
  }
  if (inStream.eof()) {
    return nullptr;
  }
  unsigned int lineCounter = 0;
  return MolDataStreamToMol(inStream, lineCounter, sanitize, removeHs,
                            strictParsing);
}
|
|
} // namespace RDKit
|