Files
rdkit/Code/GraphMol/FileParsers/ForwardSDMolSupplier.cpp
Greg Landrum 6e838e3df3 Fixes #2479 (#2482)
* Fixes #2479

* a bit of cleanup

* catch some additional fun edge cases

* change in response to review
2019-06-07 12:44:51 -04:00

309 lines
10 KiB
C++

//
// Copyright (C) 2009-2019 Greg Landrum
//
// @@ All Rights Reserved @@
// This file is part of the RDKit.
// The contents are covered by the terms of the BSD license
// which is included in the file license.txt, found at the root
// of the RDKit source tree.
//
#include <RDGeneral/FileParseException.h>
#include <RDGeneral/BadFileException.h>
#include <RDGeneral/StreamOps.h>
#include <RDGeneral/RDLog.h>
#include <GraphMol/SanitException.h>
#include <RDGeneral/BoostStartInclude.h>
#include <boost/algorithm/string.hpp>
#include <RDGeneral/BoostEndInclude.h>
#include "MolSupplier.h"
#include "FileParsers.h"
#include "FileParserUtils.h"
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
namespace RDKit {
/// Returns a copy of \c orig with leading and trailing spaces, tabs, carriage
/// returns and newlines removed. Whitespace in the interior of the string is
/// left untouched.
///
/// \param orig  the string to trim
/// \return      the trimmed copy; empty if \c orig is empty or all whitespace
std::string strip(const std::string &orig) {
  const char *const whitespace = " \t\r\n";
  // locate the first character that survives; none means nothing survives
  const std::string::size_type first = orig.find_first_not_of(whitespace);
  if (first == std::string::npos) {
    return std::string();
  }
  // a non-whitespace character exists, so this search cannot fail
  const std::string::size_type last = orig.find_last_not_of(whitespace);
  // one substring copy instead of two full trimmed copies
  return orig.substr(first, last - first + 1);
}
/// Builds a supplier that reads SD records from an already-open stream.
///
/// \param inStream       stream to read from; must not be null
/// \param takeOwnership  stored in df_owner (presumably controls whether the
///                       supplier deletes the stream - confirm in the
///                       destructor, which is not in this file)
/// \param sanitize       stored in df_sanitize and forwarded to the mol block
///                       parser by _next()
/// \param removeHs       stored in df_removeHs and forwarded the same way
/// \param strictParsing  stored in df_strictParsing; also makes
///                       readMolProps() throw on malformed data fields
ForwardSDMolSupplier::ForwardSDMolSupplier(std::istream *inStream,
                                           bool takeOwnership, bool sanitize,
                                           bool removeHs, bool strictParsing) {
  PRECONDITION(inStream, "bad stream");

  // start from the default state; init() nulls the stream pointer and clears
  // the ownership flag, so both must be assigned after this call
  init();

  // parsing options
  df_strictParsing = strictParsing;
  df_removeHs = removeHs;
  df_sanitize = sanitize;

  // stream and its ownership flag
  df_owner = takeOwnership;
  dp_inStream = inStream;

  POSTCONDITION(dp_inStream, "bad instream");
}
/// Puts the supplier into its default state: no stream attached, not at the
/// end, line counter at zero and property-list processing switched on.
void ForwardSDMolSupplier::init() {
  // reading position
  d_line = 0;
  df_end = false;

  // no stream attached yet, so there is nothing to own
  df_owner = false;
  dp_inStream = nullptr;

  // by default readMolProps() hands every property it reads to
  // FileParserUtils::processMolPropertyList()
  df_processPropertyLists = true;
}
/// Not supported by this supplier: the body only invokes UNDER_CONSTRUCTION
/// with an explanatory message and never repositions the stream.
void ForwardSDMolSupplier::reset() {
UNDER_CONSTRUCTION("reset() not supported for ForwardSDMolSuppliers();");
}
/// Reads the data items that follow a mol block and stores them on \c mol.
///
/// Lines are consumed from the stream until a line starting with "$$$$" is
/// read or the stream reports EOF. Each data item starts with a header line
/// beginning with '>' that carries the property name between '<' and '>';
/// the lines that follow, up to the next blank line, become the property
/// value (multiple lines are joined with '\n'). d_line is advanced for every
/// line read.
///
/// \param mol  molecule that receives the properties; must not be null
///
/// \throws FileParseException  if df_strictParsing is set and non-blank text
///         is found where a data header is expected, or if an unlabelled data
///         item runs into EOF before its terminating blank line
void ForwardSDMolSupplier::readMolProps(ROMol *mol) {
  PRECONDITION(dp_inStream, "no stream");
  PRECONDITION(mol, "no molecule");
  d_line++;
  bool hasProp = false;
  bool warningIssued = false;
  std::string tempStr;
  std::string dlabel = "";
  std::getline(*dp_inStream, tempStr);
  // FIX: report files missing the $$$$ marker
  // compare() checks the first (up to) four characters without building a
  // temporary substring
  while (!(dp_inStream->eof()) && tempStr.compare(0, 4, "$$$$") != 0) {
    tempStr = strip(tempStr);
    if (!tempStr.empty()) {
      if (tempStr[0] == '>') {  // data header line: start of a data item
        // ignore everything else on the line and look for a data label
        // enclosed by '<' and '>'
        // FIX: "CTfile.pdf" (page 51) says that a data header line does not
        // have to contain a data label (it can instead have something like a
        // field id into a MACCS db). We do not currently know what to do in
        // this situation - so such data items are ignored for now
        hasProp = true;
        warningIssued = false;
        tempStr.erase(0, 1);  // remove the leading '>' sign
        const std::string::size_type sl = tempStr.find('<');  // label start
        // the closing '>' must come after the opening '<'; searching from
        // the start of the line would let a stray earlier '>' produce a
        // label that runs to the end of the line
        const std::string::size_type se =
            (sl == std::string::npos) ? std::string::npos
                                      : tempStr.find('>', sl + 1);
        if ((sl == std::string::npos) || (se == std::string::npos) ||
            (se == (sl + 1))) {
          // we either do not have a data label or the label is empty:
          // ignore everything until the end of this data item,
          // i.e. until we hit a blank line
          d_line++;
          std::getline(*dp_inStream, tempStr);
          std::string stmp = strip(tempStr);
          while (stmp.length() != 0) {
            d_line++;
            std::getline(*dp_inStream, tempStr);
            if (dp_inStream->eof()) {
              throw FileParseException("End of data field name not found");
            }
            // re-evaluate the line just read; without this the loop could
            // only ever terminate through the exception above
            stmp = strip(tempStr);
          }
        } else {
          dlabel = tempStr.substr(sl + 1, se - sl - 1);
          // we know the label - now read in the relevant property lines
          // until we hit a blank line
          d_line++;
          std::getline(*dp_inStream, tempStr);
          std::string prop = "";
          std::string stmp = strip(tempStr);
          int nplines = 0;  // number of lines for this property
          // a line that begins with a space or tab counts as part of the
          // value even when it contains nothing but whitespace
          while (stmp.length() != 0 || tempStr[0] == ' ' ||
                 tempStr[0] == '\t') {
            nplines++;
            if (nplines > 1) {
              prop += "\n";
            }
            // take off \r if it's still in the property:
            if (!tempStr.empty() && tempStr.back() == '\r') {
              tempStr.pop_back();
            }
            prop += tempStr;
            d_line++;
            // clear tempStr in case the file does not end with a newline:
            // getline leaves its argument untouched once the stream has
            // failed, and this loop body does not check for EOF, so a stale
            // line would otherwise keep the loop running forever
            tempStr.erase();
            std::getline(*dp_inStream, tempStr);
            stmp = strip(tempStr);
          }
          mol->setProp(dlabel, prop);
          if (df_processPropertyLists) {
            // apply this as an atom property list if that's appropriate
            FileParserUtils::processMolPropertyList(*mol, dlabel);
          }
        }
      } else {
        if (df_strictParsing) {
          // at this point we should always be at a line starting with '>'
          // following a blank line. If this is not true and df_strictParsing
          // is true, then throw an exception, otherwise truncate the rest of
          // the data field following the blank line until the next '>' or EOF
          // and issue a warning
          // FIX: should we be deleting the molecule (which is probably fine)
          // because we couldn't read the data ???
          throw FileParseException("Problems encountered parsing data fields");
        } else {
          // warn only once per run of stray lines; the flag is re-armed by
          // the next data header
          if (!warningIssued) {
            if (hasProp) {
              BOOST_LOG(rdWarningLog)
                  << "Property <" << dlabel << "> will be truncated after "
                  << "the first blank line" << std::endl;
            } else {
              BOOST_LOG(rdWarningLog)
                  << "Spurious data before the first property will be "
                     "ignored"
                  << std::endl;
            }
            warningIssued = true;
          }
        }
      }
    }
    d_line++;
    std::getline(*dp_inStream, tempStr);
  }
}
/// Returns the next molecule from the stream.
///
/// If the stream is already at EOF the supplier is marked as finished and a
/// null pointer is returned; otherwise the work is delegated to _next().
ROMol *ForwardSDMolSupplier::next() {
  PRECONDITION(dp_inStream, "no stream");
  if (!dp_inStream->eof()) {
    return _next();
  }
  // FIX: we should probably be throwing an exception here
  df_end = true;
  return nullptr;
}
/// Reads one SD record: the mol block via MolDataStreamToMol() and, if that
/// produced a molecule, its data items via readMolProps().
///
/// Whenever the record cannot be read completely (no molecule produced, or an
/// exception raised while parsing or sanitizing) the stream is advanced to
/// the next line that starts with "$$$$" or to EOF, so that the following
/// call starts on a fresh record. Exceptions are logged and swallowed here.
///
/// Side effects: updates d_line, resets and possibly sets df_eofHitOnRead,
/// and sets df_end if the stream is at EOF on exit.
///
/// \return the molecule obtained from the mol block parser (it is returned
///         even if reading its data items raised an exception), or a null
///         pointer if no molecule was produced
ROMol *ForwardSDMolSupplier::_next() {
  PRECONDITION(dp_inStream, "no stream");

  if (dp_inStream->eof()) {
    df_end = true;
    return nullptr;
  }

  std::string scratch;

  // true when the line starts with the "$$$$" record terminator
  auto startsWithTerminator = [](const std::string &text) {
    return text.compare(0, 4, "$$$$") == 0;
  };

  // Consumes lines up to and including the next "$$$$" line (or until EOF).
  // The first read is unconditional; when noteEofOnFirstRead is set and that
  // first read hits EOF, df_eofHitOnRead is raised.
  // FIX: report files missing the $$$$ marker
  auto skipToNextRecord = [&](bool noteEofOnFirstRead) {
    ++d_line;
    std::getline(*dp_inStream, scratch);
    if (noteEofOnFirstRead && dp_inStream->eof()) {
      df_eofHitOnRead = true;
    }
    while (!dp_inStream->eof() && !startsWithTerminator(scratch)) {
      ++d_line;
      std::getline(*dp_inStream, scratch);
    }
  };

  // The mol block parser advances its own counter; if it threw before
  // d_line was updated, catch up with it.
  auto catchUpLineCounter = [&](unsigned int reached) {
    if (d_line < static_cast<int>(reached)) {
      d_line = reached;
    }
  };

  ROMol *mol = nullptr;
  df_eofHitOnRead = false;
  unsigned int lineNo = d_line;
  try {
    mol = MolDataStreamToMol(dp_inStream, lineNo, df_sanitize, df_removeHs,
                             df_strictParsing);
    // there's a special case when trying to read an empty string: we get an
    // empty molecule after only reading a single line without any additional
    // error state.
    if (!mol && dp_inStream->eof() && (lineNo - d_line < 2)) {
      df_eofHitOnRead = true;
    }
    d_line = lineNo;
    if (mol) {
      this->readMolProps(mol);
    } else if (!dp_inStream->eof()) {
      skipToNextRecord(false);
    }
  } catch (FileParseException &fe) {
    catchUpLineCounter(lineNo);
    // we couldn't read a mol block or the data for the molecule. Log the
    // problem and advance in the stream until we hit the next record, which
    // should allow us to read the next molecule.
    BOOST_LOG(rdErrorLog) << "ERROR: " << fe.message() << std::endl;
    BOOST_LOG(rdErrorLog)
        << "ERROR: moving to the begining of the next molecule\n";
    skipToNextRecord(false);
  } catch (MolSanitizeException &se) {
    catchUpLineCounter(lineNo);
    // We couldn't sanitize a molecule we got - write out an error message
    // and move to the beginning of the next molecule
    BOOST_LOG(rdErrorLog)
        << "ERROR: Could not sanitize molecule ending on line " << d_line
        << std::endl;
    BOOST_LOG(rdErrorLog) << "ERROR: " << se.message() << "\n";
    skipToNextRecord(true);
  } catch (...) {
    if (dp_inStream->eof()) {
      df_eofHitOnRead = true;
    }
    catchUpLineCounter(lineNo);
    BOOST_LOG(rdErrorLog) << "Unexpected error hit on line " << d_line
                          << std::endl;
    BOOST_LOG(rdErrorLog)
        << "ERROR: moving to the begining of the next molecule\n";
    skipToNextRecord(true);
  }

  if (dp_inStream->eof()) {
    // FIX: we should probably be throwing an exception here
    df_end = true;
  }
  return mol;
}
/// Decides whether the supplier has run out of data and records the answer
/// in df_end.
///
/// The end is declared when the stream is at EOF, when EOF is reached while
/// looking ahead, or when the next four lines contain nothing but
/// whitespace. Note that the look-ahead consumes the lines it inspects:
/// all four are always read unless EOF interrupts.
/// NOTE(review): getLine() is declared outside this file; it is presumed to
/// read a single line from the stream - confirm in StreamOps.h.
void ForwardSDMolSupplier::checkForEnd() {
  PRECONDITION(dp_inStream, "no stream");

  if (dp_inStream->eof()) {
    df_end = true;
    return;
  }

  // not at EOF yet: inspect the next four lines
  bool onlyBlankLines = true;
  for (unsigned int remaining = 4; remaining > 0; --remaining) {
    const std::string text = getLine(dp_inStream);
    if (dp_inStream->eof()) {
      df_end = true;
      return;
    }
    if (text.find_first_not_of(" \t\r\n") != std::string::npos) {
      // keep reading so that the same number of lines is consumed either way
      onlyBlankLines = false;
    }
  }

  if (onlyBlankLines) {
    df_end = true;
  }
}
/// Reports whether the supplier has been marked as finished.
///
/// This only returns the cached df_end flag (set by next(), _next() and
/// checkForEnd()); it does not touch the stream itself.
bool ForwardSDMolSupplier::atEnd() {
PRECONDITION(dp_inStream, "no stream");
return df_end;
}
} // namespace RDKit