Files
rdkit/Code/GraphMol/FileParsers/TplFileParser.cpp
Greg Landrum fbb1e6baaa Add v2 API to FileParsers (#7077)
* basic approach to v2api

* does not work, backup commit

* well, that now compiles

* cleaner

* more cleanup and testing

* get the SWIG wrappers to build

* swig wrappers now build

* switch back to using references to default values

* adjust to new catch version

* move the implementation to v2 and call it from v1

* update the parameter object too

* move debugParse down

* a couple of review changes

* make v2 naming consistent with Python

* first pass at v2 for the ctab parsers

* move the writing functionality to a different header

this is just an organizational thing at the moment; we still import the writers header in the parsers header so as to not break code

* do v2 of the tpl parser

* fix missing dependency

* Mol2 parser

* move over the XYZ parser

* switch over the PDB parsers too

* get mac and java builds working (hopefully)
2024-01-28 05:11:54 +01:00

318 lines
10 KiB
C++

//
// Copyright (C) 2007-2024 Greg Landrum
//
// @@ All Rights Reserved @@
// This file is part of the RDKit.
// The contents are covered by the terms of the BSD license
// which is included in the file license.txt, found at the root
// of the RDKit source tree.
//
#include <RDGeneral/BoostStartInclude.h>
#include <boost/lexical_cast.hpp>
#include <boost/algorithm/string.hpp>
#include <boost/algorithm/string/trim.hpp>
#include <RDGeneral/BoostEndInclude.h>
#include "FileParsers.h"
#include "FileParserUtils.h"
#include <GraphMol/FileParsers/MolFileStereochem.h>
#include <RDGeneral/StreamOps.h>
#include <fstream>
#include <memory>
#include <RDGeneral/FileParseException.h>
#include <RDGeneral/BadFileException.h>
#include <typeinfo>
namespace RDKit {
void ParseTPLAtomLine(std::string text, unsigned int lineNum, RWMol *mol,
Conformer *conf) {
PRECONDITION(mol, "no molecule");
PRECONDITION(conf, "no conformer");
std::vector<std::string> splitLine;
boost::split(splitLine, text, boost::is_any_of(" \t"),
boost::token_compress_on);
if (splitLine.size() < 8) {
std::ostringstream errout;
errout << "Atom line " << lineNum << " only has " << splitLine.size()
<< " tokens. 8 are required." << std::endl;
throw FileParseException(errout.str());
}
auto *atom = new Atom(splitLine[1]);
unsigned int atomId;
atomId = mol->addAtom(atom, false, true);
atom->setFormalCharge(FileParserUtils::stripSpacesAndCast<int>(splitLine[2]));
auto partialChg = FileParserUtils::stripSpacesAndCast<double>(splitLine[3]);
atom->setProp("TPLCharge", partialChg);
auto xp = FileParserUtils::stripSpacesAndCast<double>(splitLine[4]);
auto yp = FileParserUtils::stripSpacesAndCast<double>(splitLine[5]);
auto zp = FileParserUtils::stripSpacesAndCast<double>(splitLine[6]);
// coords in TPL files are in picometers, adjust:
xp /= 100.;
yp /= 100.;
zp /= 100.;
conf->setAtomPos(atomId, RDGeom::Point3D(xp, yp, zp));
auto nBonds = FileParserUtils::stripSpacesAndCast<unsigned int>(splitLine[7]);
// the only remaining info we care about is stereochem, and then only if
// the number of bonds is 4:
if (nBonds == 4 && splitLine.size() > 8 + nBonds) {
std::string stereoChem = splitLine[8 + nBonds];
atom->setProp("TPLStereoFlag", stereoChem);
}
}
void ParseTPLBondLine(std::string text, unsigned int lineNum, RWMol *mol) {
PRECONDITION(mol, "no molecule");
std::vector<std::string> splitLine;
boost::split(splitLine, text, boost::is_any_of(" \t"),
boost::token_compress_on);
if (splitLine.size() < 5) {
std::ostringstream errout;
errout << "Bond line " << lineNum << " only has " << splitLine.size()
<< " tokens. 5 are required." << std::endl;
throw FileParseException(errout.str());
}
std::string tplOrder = boost::trim_copy(splitLine[1]).substr(0, 3);
Bond::BondType bondOrder;
if (tplOrder == "1.5") {
bondOrder = Bond::AROMATIC;
} else if (tplOrder == "1.0") {
bondOrder = Bond::SINGLE;
} else if (tplOrder == "2.0") {
bondOrder = Bond::DOUBLE;
} else if (tplOrder == "3.0") {
bondOrder = Bond::TRIPLE;
} else {
std::ostringstream errout;
errout << "Bond line " << lineNum << " has unknown order: " << tplOrder
<< std::endl;
throw FileParseException(errout.str());
}
unsigned int idx1, idx2;
idx1 = FileParserUtils::stripSpacesAndCast<unsigned int>(splitLine[2]) - 1;
idx2 = FileParserUtils::stripSpacesAndCast<unsigned int>(splitLine[3]) - 1;
unsigned int bondIdx = mol->addBond(idx1, idx2, bondOrder) - 1;
std::string stereoFlag1 = "";
std::string stereoFlag2 = "";
stereoFlag1 = splitLine[4];
if (splitLine.size() > 5) {
stereoFlag2 = splitLine[5];
}
mol->getBondWithIdx(bondIdx)->setProp("TPLBondDir1", stereoFlag1);
mol->getBondWithIdx(bondIdx)->setProp("TPLBondDir2", stereoFlag2);
}
//! Reads one additional conformer block from a TPL stream.
/*!
  The block is expected to start with a line whose first token is "NAME",
  followed by one coordinate line (x y z, in picometers) for every atom
  currently in \c mol. The text after "NAME" is trimmed and stored on the
  molecule under a property name built from "Conf_", the molecule's current
  number of conformers and common_properties::_Name.

  \param inStream  stream to read from
  \param line      running line counter, incremented for every line read
  \param mol       molecule the conformer belongs to (must not be null); it
                   supplies the atom count and receives the name property
  \param confId    conformer number, used only in error messages

  \return a newly allocated Conformer; the caller takes ownership

  \throws FileParseException if the NAME tag is missing, the stream ends
          before all atoms were read, or a coordinate line has fewer than
          three fields. Exceptions raised while converting coordinates also
          propagate; the partially built conformer is freed in every case.
*/
Conformer *ParseConfData(std::istream &inStream, unsigned int &line, RWMol *mol,
                         unsigned int confId) {
  PRECONDITION(mol, "no mol");
  std::string tempStr;
  std::vector<std::string> splitLine;

  line++;
  tempStr = getLine(inStream);
  boost::split(splitLine, tempStr, boost::is_any_of(" \t"),
               boost::token_compress_on);
  if (splitLine[0] != "NAME") {
    std::ostringstream errout;
    errout << "Did not find NAME tag on line " << line
           << " while reading conformer " << confId << std::endl;
    throw FileParseException(errout.str());
  }

  std::ostringstream propName;
  propName << "Conf_" << mol->getNumConformers() << common_properties::_Name;
  mol->setProp(propName.str(),
               boost::trim_copy(tempStr.substr(4, tempStr.size() - 4)));

  // Owned by a unique_ptr so that every exit by exception - including a
  // failed numeric conversion below - releases the conformer.
  auto conf = std::make_unique<Conformer>(mol->getNumAtoms());
  for (unsigned int i = 0; i < mol->getNumAtoms(); ++i) {
    line++;
    tempStr = getLine(inStream);
    if (inStream.eof()) {
      std::ostringstream errout;
      errout << "EOF hit while reading conformer " << confId << std::endl;
      throw FileParseException(errout.str());
    }
    boost::trim(tempStr);
    boost::split(splitLine, tempStr, boost::is_any_of(" \t"),
                 boost::token_compress_on);
    if (splitLine.size() < 3) {
      std::ostringstream errout;
      errout << "Did not find enough fields on line " << line
             << " while reading conformer " << confId << std::endl;
      throw FileParseException(errout.str());
    }
    auto xp = FileParserUtils::stripSpacesAndCast<double>(splitLine[0]);
    auto yp = FileParserUtils::stripSpacesAndCast<double>(splitLine[1]);
    auto zp = FileParserUtils::stripSpacesAndCast<double>(splitLine[2]);
    // coords in TPL files are in picometers, adjust:
    xp /= 100.;
    yp /= 100.;
    zp /= 100.;
    conf->setAtomPos(i, RDGeom::Point3D(xp, yp, zp));
  }
  // hand ownership over to the caller
  return conf.release();
}
namespace v2 {
namespace FileParsers {
//*************************************
//
// Every effort has been made to adhere to the BioCad tpl definition
//*************************************
std::unique_ptr<RWMol> MolFromTPLDataStream(std::istream &inStream,
unsigned int &line,
const TPLParserParams &params) {
std::string tempStr;
std::vector<std::string> splitText;
// format line:
line++;
tempStr = getLine(inStream);
if (inStream.eof()) {
return nullptr;
}
// comment line:
line++;
tempStr = getLine(inStream);
if (inStream.eof()) {
return nullptr;
}
// optional name line:
line++;
tempStr = getLine(inStream);
if (inStream.eof()) {
return nullptr;
}
auto res = std::make_unique<RWMol>();
if (tempStr.size() >= 4 && tempStr.substr(0, 4) == "NAME") {
tempStr = boost::trim_copy(tempStr.substr(4, tempStr.size() - 4));
res->setProp(common_properties::_Name, tempStr);
line++;
tempStr = getLine(inStream);
if (inStream.eof()) {
return res;
}
}
if (tempStr.size() >= 4 && tempStr.substr(0, 4) == "PROP") {
line++;
tempStr = getLine(inStream);
if (inStream.eof()) {
return res;
}
}
// we're at the counts line:
boost::split(splitText, tempStr, boost::is_any_of(" \t"),
boost::token_compress_on);
unsigned int nAtoms, nBonds;
nAtoms = FileParserUtils::stripSpacesAndCast<unsigned int>(splitText[0]);
nBonds = FileParserUtils::stripSpacesAndCast<unsigned int>(splitText[1]);
auto *conf = new Conformer(nAtoms);
conf->setId(0);
for (unsigned int i = 0; i < nAtoms; ++i) {
line++;
tempStr = getLine(inStream);
if (inStream.eof()) {
delete conf;
throw FileParseException("EOF hit while reading atoms.");
}
ParseTPLAtomLine(tempStr, line, res.get(), conf);
}
res->addConformer(conf, true);
for (unsigned int i = 0; i < nBonds; ++i) {
line++;
tempStr = getLine(inStream);
if (inStream.eof()) {
throw FileParseException("EOF hit while reading bonds.");
}
ParseTPLBondLine(tempStr, line, res.get());
}
line++;
tempStr = getLine(inStream);
if (inStream.eof()) {
return res;
}
unsigned int nConfs = 0;
if (tempStr.size() >= 5 && tempStr.substr(0, 5) == "CONFS") {
boost::split(splitText, tempStr, boost::is_any_of(" \t"),
boost::token_compress_on);
nConfs = FileParserUtils::stripSpacesAndCast<unsigned int>(splitText[1]);
}
for (unsigned int i = 0; i < nConfs; ++i) {
Conformer *conf = ParseConfData(inStream, line, res.get(), i + 1);
if (i > 0 || !params.skipFirstConf) {
conf->setId(i + 1);
res->addConformer(conf, true);
} else {
delete conf;
}
// there should be a blank line:
line++;
tempStr = getLine(inStream);
boost::trim(tempStr);
if (!inStream.eof() && tempStr != "") {
throw FileParseException("Found a non-blank line between conformers.");
}
}
if (params.sanitize) {
MolOps::sanitizeMol(*res);
}
return res;
}
//------------------------------------------------
//
// Read a molecule from a file
//
//------------------------------------------------
//! Opens a TPL file and parses the molecule it contains.
/*!
  \param fName   path of the file to read
  \param params  parser options, forwarded to MolFromTPLDataStream()

  \return the result of MolFromTPLDataStream(), or nullptr if the stream
          already reports EOF right after being opened

  \throws BadFileException if the file stream is in a failed or bad state
          after opening
*/
std::unique_ptr<RWMol> MolFromTPLFile(const std::string &fName,
                                      const TPLParserParams &params) {
  std::ifstream inStream(fName.c_str());
  if (inStream.fail() || inStream.bad()) {
    std::ostringstream msg;
    msg << "Bad input file " << fName;
    throw BadFileException(msg.str());
  }
  if (inStream.eof()) {
    return nullptr;
  }
  // line counting starts at the top of the file
  unsigned int lineCounter = 0;
  return MolFromTPLDataStream(inStream, lineCounter, params);
}
} // namespace FileParsers
} // namespace v2
// NOTE: everything between the `#if 0` below and its matching `#endif` is
// removed by the preprocessor and is never compiled. The region holds two
// wrappers: MolDataStreamToMol(), which forwards a stream reference to a
// pointer-taking overload, and MolBlockToMol(), which wraps a string in an
// std::istringstream and forwards it to MolDataStreamToMol().
#if 0
RWMol *MolDataStreamToMol(std::istream &inStream, unsigned int &line,
bool sanitize){
return MolDataStreamToMol(&inStream,line,sanitize);
};
//------------------------------------------------
//
// Read a molecule from a string
//
//------------------------------------------------
RWMol *MolBlockToMol(const std::string &molBlock, bool sanitize){
std::istringstream inStream(molBlock);
RWMol *res = nullptr;
unsigned int line = 0;
return MolDataStreamToMol(inStream, line, sanitize);
}
#endif
} // namespace RDKit