mirror of
https://github.com/rdkit/rdkit.git
synced 2026-06-04 21:54:27 +08:00
* iterators for random-access MolSuppliers add optional caching to SDMolSupplier * add support to SmilesMolSupplier too There is a lot of duplicate code between the random-access suppliers that would be worth trying to remove but at the moment it looks like it would require multiple inheritance, and I think we want to avoid that * add input iterators for ForwardSDMolSupplier() * throw when calling begin() on a used supplier * switch to use the spaceship operator * init() should reset the mol cache * Make SDMolSupplier and SmilesMolSupplier safe for multi-threaded reads * add benchmarking * add TDTMolSupplier support improved testing add benchmarks for parallel iteration optional TBB support * better const handling, add reverse iterators doesn't look like const_iterator is possible since getting data from the underlying supplier object is non-const * improve docs more usings add reverse iterator to TDTMolSupplier * tests only try execution::par when it is there * fix typo * more testing/demo * remove accidentally added files * review changes * add default ctors * disable a false-positive compiler warning it is stupid to have to do this --------- Co-authored-by: = <=>
548 lines
15 KiB
C++
548 lines
15 KiB
C++
//
|
|
// Copyright (C) 2002-2024 Greg Landrum and other RDKit contributors
|
|
//
|
|
// @@ All Rights Reserved @@
|
|
// This file is part of the RDKit.
|
|
// The contents are covered by the terms of the BSD license
|
|
// which is included in the file license.txt, found at the root
|
|
// of the RDKit source tree.
|
|
//
|
|
#include <RDGeneral/BadFileException.h>
|
|
#include <RDGeneral/FileParseException.h>
|
|
#include <RDGeneral/StreamOps.h>
|
|
#include <RDGeneral/RDLog.h>
|
|
#include "MolSupplier.h"
|
|
#include "FileParsers.h"
|
|
#include <boost/tokenizer.hpp>
|
|
// tokenizer type used in this file to split lines into delimiter-separated
// fields
using tokenizer = boost::tokenizer<boost::char_separator<char>>;
|
|
|
|
#include <fstream>
|
|
#include <sstream>
|
|
#include <string>
|
|
#include <cstdlib>
|
|
|
|
namespace RDKit {
|
|
|
|
namespace v2 {
|
|
namespace FileParsers {
|
|
|
|
// Default constructor: puts the supplier into its initial state via init().
SmilesMolSupplier::SmilesMolSupplier() {
  this->init();
}
|
|
|
|
// Constructs a supplier that reads from a file.
//
//   fileName: passed to openAndCheckStream() to obtain the input stream
//   params:   parsing options; copied into d_params
//
// init() marks the stream as owned by this supplier (df_owner = true).
// Fails an invariant check if no stream was obtained or if the stream is
// already at EOF.
SmilesMolSupplier::SmilesMolSupplier(const std::string &fileName,
                                     const SmilesMolSupplierParams &params) {
  init();
  dp_inStream = openAndCheckStream(fileName);
  CHECK_INVARIANT(dp_inStream, "bad instream");
  CHECK_INVARIANT(!(dp_inStream->eof()), "early EOF");

  d_params = params;
  df_end = false;

  // probe for a first readable line; rewinds the stream if one exists
  this->checkForEnd();
  POSTCONDITION(dp_inStream, "bad instream");
}
|
|
|
|
// Constructs a supplier that reads from an existing stream.
//
//   inStream:      stream to read from; must be non-null and not at EOF
//   takeOwnership: stored in df_owner
//   params:        parsing options; copied into d_params
SmilesMolSupplier::SmilesMolSupplier(std::istream *inStream, bool takeOwnership,
                                     const SmilesMolSupplierParams &params) {
  CHECK_INVARIANT(inStream, "bad instream");
  CHECK_INVARIANT(!(inStream->eof()), "early EOF");

  // init() resets dp_inStream and df_owner, so both are assigned afterwards
  init();
  dp_inStream = inStream;
  df_owner = takeOwnership;
  d_params = params;
  df_end = false;

  // probe for a first readable line; rewinds the stream if one exists
  this->checkForEnd();
  POSTCONDITION(dp_inStream, "bad instream");
}
|
|
|
|
// Puts every piece of supplier state back to its "nothing read yet" value.
// Note that this only forgets the stream pointer; it does not delete it.
void SmilesMolSupplier::init() {
  // stream bookkeeping
  dp_inStream = nullptr;
  df_owner = true;
  df_end = false;

  // counters: -1 means "unknown" / "not started"
  d_next = -1;
  d_line = -1;
  d_len = -1;

  // index of molecule positions and their line numbers
  d_lineNums.clear();
  d_molpos.clear();

  // the molecule cache is guarded by its own mutex in thread-safe builds
#ifdef RDK_BUILD_THREADSAFE_SSS
  const std::lock_guard<std::mutex> cacheLock(d_cacheMutex);
#endif
  d_molCache.clear();
}
|
|
|
|
void SmilesMolSupplier::setData(const std::string &text,
|
|
const SmilesMolSupplierParams ¶ms) {
|
|
if (dp_inStream && df_owner) {
|
|
delete dp_inStream;
|
|
}
|
|
init();
|
|
|
|
dp_inStream = new std::stringstream(text);
|
|
|
|
d_params = params;
|
|
df_end = false;
|
|
|
|
this->checkForEnd();
|
|
POSTCONDITION(dp_inStream, "bad instream");
|
|
}
|
|
|
|
// ensures that there is a line available to be read
|
|
// from the file:
|
|
void SmilesMolSupplier::checkForEnd() {
|
|
PRECONDITION(dp_inStream, "no stream");
|
|
int pos = this->skipComments();
|
|
if (pos != -1) {
|
|
d_line = -1;
|
|
dp_inStream->seekg(0);
|
|
df_end = false;
|
|
}
|
|
}
|
|
|
|
void SmilesMolSupplier::reset() {
|
|
PRECONDITION(dp_inStream, "no stream");
|
|
dp_inStream->clear();
|
|
|
|
df_end = 0;
|
|
if (d_molpos.size() > 0) {
|
|
dp_inStream->seekg(d_molpos.front());
|
|
d_next = 0;
|
|
d_line = 0;
|
|
} else {
|
|
dp_inStream->seekg(0);
|
|
d_next = -1;
|
|
d_line = -1;
|
|
}
|
|
}
|
|
|
|
std::unique_ptr<RWMol> SmilesMolSupplier::processLine(std::string inLine) {
|
|
std::unique_ptr<RWMol> res;
|
|
|
|
try {
|
|
// -----------
|
|
// tokenize the input line:
|
|
// -----------
|
|
boost::char_separator<char> sep(d_params.delimiter.c_str(), "",
|
|
boost::keep_empty_tokens);
|
|
tokenizer tokens(inLine, sep);
|
|
STR_VECT recs;
|
|
for (tokenizer::iterator tokIter = tokens.begin(); tokIter != tokens.end();
|
|
++tokIter) {
|
|
std::string rec = strip(*tokIter);
|
|
recs.push_back(rec);
|
|
}
|
|
if (recs.size() <= static_cast<unsigned int>(d_params.smilesColumn)) {
|
|
std::ostringstream errout;
|
|
errout << "ERROR: line #" << d_line << "does not contain enough tokens\n";
|
|
throw FileParseException(errout.str());
|
|
}
|
|
|
|
// -----------
|
|
// get the smiles and create a molecule
|
|
// -----------
|
|
res = MolFromSmiles(recs[d_params.smilesColumn], d_params.parseParameters);
|
|
if (!res) {
|
|
std::stringstream errout;
|
|
errout << "Cannot create molecule from : '" << recs[d_params.smilesColumn]
|
|
<< "'";
|
|
throw SmilesParseException(errout.str());
|
|
}
|
|
|
|
// -----------
|
|
// get the name (if there's a name column)
|
|
// -----------
|
|
if (d_params.nameColumn == -1) {
|
|
// if no name defaults it to the line number we read it from string
|
|
std::ostringstream tstr;
|
|
tstr << d_line;
|
|
std::string mname = tstr.str();
|
|
res->setProp(common_properties::_Name, mname);
|
|
} else {
|
|
if (d_params.nameColumn >= static_cast<int>(recs.size())) {
|
|
BOOST_LOG(rdWarningLog)
|
|
<< "WARNING: no name column found on line " << d_line << std::endl;
|
|
} else {
|
|
res->setProp(common_properties::_Name, recs[d_params.nameColumn]);
|
|
}
|
|
}
|
|
|
|
// -----------
|
|
// read in the properties
|
|
// -----------
|
|
for (unsigned int col = 0; col < recs.size(); col++) {
|
|
if (static_cast<int>(col) == d_params.smilesColumn ||
|
|
static_cast<int>(col) == d_params.nameColumn) {
|
|
continue;
|
|
}
|
|
std::string pname, pval;
|
|
if (d_props.size() > col) {
|
|
pname = d_props[col];
|
|
}
|
|
if (pname.empty()) {
|
|
pname = "Column_";
|
|
pname += std::to_string(col);
|
|
}
|
|
|
|
pval = recs[col];
|
|
res->setProp(pname, pval);
|
|
}
|
|
|
|
} catch (const SmilesParseException &pe) {
|
|
// Couldn't parse the passed in smiles
|
|
// Simply print out a message
|
|
BOOST_LOG(rdErrorLog) << "ERROR: Smiles parse error on line " << d_line
|
|
<< "\n";
|
|
BOOST_LOG(rdErrorLog) << "ERROR: " << pe.what() << "\n";
|
|
res.reset();
|
|
} catch (const MolSanitizeException &se) {
|
|
// We couldn't sanitize the molecule
|
|
// write out an error message
|
|
BOOST_LOG(rdErrorLog) << "ERROR: Could not sanitize molecule on line "
|
|
<< d_line << std::endl;
|
|
BOOST_LOG(rdErrorLog) << "ERROR: " << se.what() << "\n";
|
|
res.reset();
|
|
} catch (...) {
|
|
// write out an error message
|
|
BOOST_LOG(rdErrorLog) << "ERROR: Could not process molecule on line "
|
|
<< d_line << std::endl;
|
|
res.reset();
|
|
}
|
|
|
|
return res;
|
|
}
|
|
|
|
// --------------------------------------------------
|
|
//
|
|
// Returns the next available line in the input stream.
|
|
//
|
|
// Side-effects:
|
|
// - If EOF is hit without reading anything, the df_end
|
|
// flag will be set.
|
|
// - If a real line is read, our d_line counter is
|
|
// incremented
|
|
//
|
|
// --------------------------------------------------
|
|
std::string SmilesMolSupplier::nextLine() {
|
|
PRECONDITION(dp_inStream, "bad stream");
|
|
if (df_end) {
|
|
return "";
|
|
}
|
|
std::string tempStr = getLine(dp_inStream);
|
|
|
|
if (tempStr == "") {
|
|
// got an empty string, check to see if we hit EOF:
|
|
if (dp_inStream->eof() || dp_inStream->bad()) {
|
|
// yes, set our flag:
|
|
df_end = true;
|
|
}
|
|
} else if (dp_inStream->eof()) {
|
|
// we got some data before hitting EOF. So clear the
|
|
// flag on inStream
|
|
dp_inStream->clear();
|
|
}
|
|
d_line++;
|
|
return tempStr;
|
|
}
|
|
|
|
// --------------------------------------------------
|
|
//
|
|
// Returns the position of the beginning of the next
|
|
// non-comment line in the input stream. -1 is returned if
|
|
// no line could be read;
|
|
//
|
|
// Side-effects:
|
|
// - If EOF is hit without finding a valid line, the df_end
|
|
// flag will be set.
|
|
// - Our d_line counter is incremented for each line read
|
|
//
|
|
long int SmilesMolSupplier::skipComments() {
|
|
PRECONDITION(dp_inStream, "bad stream");
|
|
if (this->atEnd()) {
|
|
return -1;
|
|
}
|
|
|
|
std::streampos prev = dp_inStream->tellg();
|
|
std::string tempStr = this->nextLine();
|
|
if (!df_end) {
|
|
// if we didn't immediately hit EOF, loop until we get a valid line:
|
|
while ((tempStr[0] == '#') || (strip(tempStr).size() == 0)) {
|
|
prev = dp_inStream->tellg();
|
|
tempStr = this->nextLine();
|
|
if (this->atEnd()) {
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
// if we hit EOF without getting a proper line, return -1:
|
|
if (tempStr.empty() || (tempStr[0] == '#') || (strip(tempStr).size() == 0)) {
|
|
return -1;
|
|
}
|
|
return static_cast<long int>(prev);
|
|
}
|
|
|
|
// --------------------------------------------------
|
|
//
|
|
// Reads and processes the title line
|
|
//
|
|
void SmilesMolSupplier::processTitleLine() {
|
|
PRECONDITION(dp_inStream, "bad stream");
|
|
int pos = this->skipComments();
|
|
if (pos >= 0) {
|
|
dp_inStream->seekg(pos);
|
|
|
|
std::string tempStr = getLine(dp_inStream);
|
|
boost::char_separator<char> sep(d_params.delimiter.c_str(), "",
|
|
boost::keep_empty_tokens);
|
|
tokenizer tokens(tempStr, sep);
|
|
for (tokenizer::iterator tokIter = tokens.begin(); tokIter != tokens.end();
|
|
++tokIter) {
|
|
std::string pname = strip(*tokIter);
|
|
d_props.push_back(pname);
|
|
}
|
|
}
|
|
}
|
|
|
|
std::string SmilesMolSupplier::getItemText(unsigned int idx) {
|
|
PRECONDITION(dp_inStream, "no stream");
|
|
unsigned int holder = d_next;
|
|
bool endHolder = df_end;
|
|
// this throws the relevant exception if we go too far:
|
|
moveTo(idx);
|
|
std::string res = getLine(dp_inStream);
|
|
d_next = holder;
|
|
df_end = endHolder;
|
|
return res;
|
|
}
|
|
|
|
// --------------------------------------------------
|
|
//
|
|
// Moves to the position of a particular entry in the
|
|
// stream.
|
|
//
|
|
// If insufficient entries are present, a FileParseException
|
|
// will be thrown
|
|
//
|
|
void SmilesMolSupplier::moveTo(unsigned int idx) {
|
|
PRECONDITION(dp_inStream, "bad instream");
|
|
// get the easy situations (boundary conditions) out of the
|
|
// way first:
|
|
if (d_len > -1 && idx >= static_cast<unsigned int>(d_len)) {
|
|
df_end = true;
|
|
std::ostringstream errout;
|
|
errout << "ERROR: Index error (idx = " << idx << "): "
|
|
<< "ran out of lines\n";
|
|
throw FileParseException(errout.str());
|
|
}
|
|
|
|
// dp_inStream->seekg() is called for all idx values
|
|
// and earlier calls to next() may have put the stream into a bad state
|
|
dp_inStream->clear();
|
|
|
|
// -----------
|
|
// Case 1: we have already read the particular entry:
|
|
//
|
|
// Set the stream position and return
|
|
// -----------
|
|
if (!d_molpos.empty() && d_molpos.size() > idx) {
|
|
dp_inStream->clear(); // clear the EOF tag if it has been set
|
|
df_end = false;
|
|
dp_inStream->seekg(d_molpos[idx]);
|
|
d_next = idx;
|
|
d_line = d_lineNums[idx];
|
|
return;
|
|
}
|
|
|
|
// -----------
|
|
// Case 2: we haven't read the entry, so move forward until
|
|
// we've gone far enough.
|
|
// -----------
|
|
if (d_molpos.empty()) {
|
|
// if we are just starting out, process the title line
|
|
dp_inStream->seekg(0);
|
|
if (d_params.titleLine) {
|
|
this->processTitleLine();
|
|
}
|
|
} else {
|
|
// move to the last position we've seen:
|
|
dp_inStream->seekg(d_molpos.back());
|
|
// read that line:
|
|
std::string tmp = getLine(dp_inStream);
|
|
}
|
|
|
|
// the stream pointer is now at the last thing we read in
|
|
while (d_molpos.size() <= idx) {
|
|
std::streampos nextP = this->skipComments();
|
|
if (nextP < 0) {
|
|
std::ostringstream errout;
|
|
errout << "ERROR: Index error (idx = " << idx << "): "
|
|
<< "ran out of lines\n";
|
|
throw FileParseException(errout.str());
|
|
} else {
|
|
d_molpos.emplace_back(nextP);
|
|
d_lineNums.push_back(d_line);
|
|
if (d_molpos.size() == idx + 1 && df_end) {
|
|
// boundary condition: we could read the point we were looking for
|
|
// but not the next one.
|
|
// indicate that we've reached EOF:
|
|
dp_inStream->clear();
|
|
dp_inStream->seekg(0, std::ios_base::end);
|
|
d_len = d_molpos.size();
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
POSTCONDITION(d_molpos.size() > idx, "not enough lines");
|
|
dp_inStream->seekg(d_molpos[idx]);
|
|
d_next = idx;
|
|
return;
|
|
}
|
|
|
|
// ----------------------------------------------------------------------
|
|
//
|
|
// Grabs and returns the next molecule from the input stream.
|
|
// After processing the line, the file is advanced to the next
|
|
// position in the file (skipping blank and comment lines).
|
|
//
|
|
// Throws a FileParseException if EOF has already been hit.
|
|
//
|
|
std::unique_ptr<RWMol> SmilesMolSupplier::next() {
|
|
PRECONDITION(dp_inStream, "no stream");
|
|
|
|
if (d_next < 0) {
|
|
d_next = 0;
|
|
}
|
|
|
|
// This throws an exception if it fails:
|
|
moveTo(d_next);
|
|
CHECK_INVARIANT(static_cast<int>(d_molpos.size()) > d_next,
|
|
"bad index length");
|
|
|
|
// ---------
|
|
// if we get here we can just build the molecule:
|
|
// ---------
|
|
// set the stream to the relevant position:
|
|
dp_inStream->clear(); // clear the EOF tag if it has been set
|
|
dp_inStream->seekg(d_molpos[d_next]);
|
|
d_line = d_lineNums[d_next];
|
|
// grab the line:
|
|
std::string inLine = getLine(dp_inStream);
|
|
// and process it:
|
|
auto res = this->processLine(inLine);
|
|
// if we don't already know the length of the supplier,
|
|
// check if we can read another line:
|
|
if (d_len < 0 && this->skipComments() < 0) {
|
|
d_len = d_molpos.size();
|
|
}
|
|
|
|
// make sure the line number is correct:
|
|
if (d_next < static_cast<int>(d_lineNums.size())) {
|
|
d_line = d_lineNums[d_next];
|
|
}
|
|
|
|
++d_next;
|
|
// if we just hit the last one, simulate EOF:
|
|
if (d_len > 0 && d_next == d_len) {
|
|
df_end = true;
|
|
}
|
|
return res;
|
|
}
|
|
|
|
// ----------------------------------------------------------------------
|
|
//
|
|
// Grabs and returns a particular molecule from the input stream.
|
|
//
|
|
// Raises a FileParseException on failure.
|
|
//
|
|
std::unique_ptr<RWMol> SmilesMolSupplier::operator[](unsigned int idx) {
|
|
PRECONDITION(dp_inStream, "no stream");
|
|
|
|
#ifdef RDK_BUILD_THREADSAFE_SSS
|
|
const std::lock_guard<std::mutex> guard(d_readMutex);
|
|
#endif
|
|
// ---------
|
|
// move to the appropriate location in the file:
|
|
// ---------
|
|
moveTo(idx);
|
|
|
|
// ---------
|
|
// and then pull the molecule:
|
|
// ---------
|
|
auto res = next();
|
|
|
|
return res;
|
|
}
|
|
std::shared_ptr<RWMol> SmilesMolSupplier::getShared(unsigned int idx) {
|
|
PRECONDITION(dp_inStream, "no stream");
|
|
if (d_cacheMolecules) {
|
|
#ifdef RDK_BUILD_THREADSAFE_SSS
|
|
const std::lock_guard<std::mutex> guard(d_cacheMutex);
|
|
#endif
|
|
if (d_molCache.size() > idx && d_molCache[idx]) {
|
|
return d_molCache[idx].value();
|
|
}
|
|
}
|
|
// get the molecule with index idx
|
|
std::shared_ptr<RWMol> res;
|
|
{
|
|
#ifdef RDK_BUILD_THREADSAFE_SSS
|
|
const std::lock_guard<std::mutex> guard(d_readMutex);
|
|
#endif
|
|
moveTo(idx);
|
|
res.reset(next().release());
|
|
}
|
|
if (d_cacheMolecules) {
|
|
#ifdef RDK_BUILD_THREADSAFE_SSS
|
|
const std::lock_guard<std::mutex> guard(d_cacheMutex);
|
|
#endif
|
|
if (d_molCache.size() <= idx) {
|
|
constexpr unsigned int molCacheAllocChunkSize = 1000;
|
|
d_molCache.resize(idx + molCacheAllocChunkSize);
|
|
}
|
|
d_molCache[idx] = res;
|
|
}
|
|
return res;
|
|
}
|
|
|
|
// ----------------------------------------------------------------------
|
|
//
|
|
// Returns the number of entries in the input stream
|
|
//
|
|
unsigned int SmilesMolSupplier::length() {
|
|
PRECONDITION(dp_inStream, "no stream");
|
|
// return the number of molecule lines in the file
|
|
if (d_len > 0) {
|
|
return d_len;
|
|
} else {
|
|
std::streampos oPos = dp_inStream->tellg();
|
|
if (d_molpos.size()) {
|
|
// we've already read some molecules, go to the last
|
|
// one and read it in to initialize our location:
|
|
dp_inStream->seekg(d_molpos.back());
|
|
// skip that line and then continue:
|
|
this->skipComments();
|
|
} else {
|
|
// process the title line if need be:
|
|
if (d_params.titleLine) {
|
|
this->processTitleLine();
|
|
}
|
|
}
|
|
int pos = this->skipComments();
|
|
while (pos >= 0) {
|
|
d_molpos.emplace_back(pos);
|
|
d_lineNums.push_back(d_line);
|
|
pos = this->skipComments();
|
|
}
|
|
// now remember to set the stream to its original position:
|
|
dp_inStream->seekg(oPos);
|
|
d_len = d_molpos.size();
|
|
return d_len;
|
|
}
|
|
}
|
|
|
|
// Reports the supplier's end-of-data flag (df_end).
bool SmilesMolSupplier::atEnd() {
  return df_end;
}
|
|
} // namespace FileParsers
|
|
} // namespace v2
|
|
} // namespace RDKit
|