mirror of
https://github.com/rdkit/rdkit.git
synced 2026-06-04 21:54:27 +08:00
* iterators for random-access MolSuppliers add optional caching to SDMolSupplier * add support to SmilesMolSupplier too There is a lot of duplicate code between the random-access suppliers that would be worth trying to remove but at the moment it looks like it would require multiple inheritance, and I think we want to avoid that * add input iterators for ForwardSDMolSupplier() * throw when calling begin() on a used supplier * switch to use the spaceship operator * init() should reset the mol cache * Make SDMolSupplier and SmilesMolSupplier safe for multi-threaded reads * add benchmarking * add TDTMolSupplier support improved testing add benchmarks for parallel iteration optional TBB support * better const handling, add reverse iterators doesn't look like const_iterator is possible since getting data from the underlying supplier object is non-const * improve docs more usings add reverse iterator to TDTMolSupplier * tests only try execution::par when it is there * fix typo * more testing/demo * remove accidentally added files * review changes * add default ctors * disable a false-positive compiler warning it is stupid to have to do this --------- Co-authored-by: = <=>
450 lines
13 KiB
C++
450 lines
13 KiB
C++
//
|
|
// Copyright (C) 2005-2024 Greg Landrum and other RDKit contributors
|
|
//
|
|
// @@ All Rights Reserved @@
|
|
// This file is part of the RDKit.
|
|
// The contents are covered by the terms of the BSD license
|
|
// which is included in the file license.txt, found at the root
|
|
// of the RDKit source tree.
|
|
//
|
|
#include <RDGeneral/BoostStartInclude.h>
|
|
#include <boost/tokenizer.hpp>
|
|
#include <boost/algorithm/string.hpp>
|
|
#include <boost/lexical_cast.hpp>
|
|
#include <RDGeneral/BoostEndInclude.h>
|
|
|
|
#include <RDGeneral/BadFileException.h>
|
|
#include <RDGeneral/FileParseException.h>
|
|
#include <RDGeneral/RDLog.h>
|
|
#include "MolSupplier.h"
|
|
#include "FileParsers.h"
|
|
#include <GraphMol/SmilesParse/SmilesParse.h>
|
|
#include <RDGeneral/LocaleSwitcher.h>
|
|
|
|
#include <fstream>
|
|
#include <sstream>
|
|
#include <string>
|
|
|
|
namespace RDKit {
|
|
namespace TDTParseUtils {
|
|
// Tokenizer used to split the numeric lists found in TDT records; the
// splitting rules are those of boost::escaped_list_separator<char>.
using CommaTokenizer = boost::tokenizer<boost::escaped_list_separator<char>>;
|
|
|
|
/*
|
|
* if inStream is valid, we'll allow the numbers to be broken across multiple
|
|
* lines.
|
|
*
|
|
* This will throw a boost::bad_lexical_cast exception if it hits a bogus number
|
|
*
|
|
*/
|
|
template <typename T>
|
|
void ParseNumberList(std::string inLine, std::vector<T> &res,
|
|
std::istream *inStream = nullptr) {
|
|
bool foundEnd = false;
|
|
while (!foundEnd) {
|
|
CommaTokenizer commaTok(inLine);
|
|
for (CommaTokenizer::const_iterator commaTokIt = commaTok.begin();
|
|
commaTokIt != commaTok.end(); commaTokIt++) {
|
|
std::string number = *commaTokIt;
|
|
bool atEnd = number.find(";>") != std::string::npos;
|
|
boost::trim_if(number, boost::is_any_of(" \r\n\t;>"));
|
|
if (number != "" && !atEnd) {
|
|
res.push_back(boost::lexical_cast<T>(number));
|
|
} else if (atEnd) {
|
|
// that's it, we're done:
|
|
foundEnd = true;
|
|
break;
|
|
}
|
|
}
|
|
if (foundEnd || !inStream || inStream->eof()) {
|
|
break;
|
|
} else {
|
|
std::getline(*inStream, inLine);
|
|
}
|
|
}
|
|
if (!foundEnd) {
|
|
throw FileParseException("no end tag found for numeric list");
|
|
}
|
|
}
|
|
|
|
} // end of namespace TDTParseUtils
|
|
|
|
namespace v2 {
|
|
namespace FileParsers {
|
|
// Default construction: no stream is attached, the members are just put into
// the state established by init().
TDTMolSupplier::TDTMolSupplier() {
  this->init();
}
|
|
|
|
// Constructs a supplier reading from the file named fileName.
//
// fileName : passed to openAndCheckStream() to obtain the input stream
// params   : parsing parameters, copied into the supplier
//
// The stream is flagged as owned by the supplier (df_owner). It is then
// advanced to the first line starting with "$SMI<" and that position is
// stored as the start of record 0.
TDTMolSupplier::TDTMolSupplier(const std::string &fileName,
                               const TDTMolSupplierParams &params) {
  d_params = params;
  init();
  dp_inStream = openAndCheckStream(fileName);
  df_owner = true;

  this->advanceToNextRecord();
  d_molpos.push_back(dp_inStream->tellg());
  this->checkForEnd();
}
|
|
|
|
// Constructs a supplier reading from an already opened stream.
//
// inStream      : the stream to read from; must be non-null and must not
//                 already be flagged as EOF (both are invariant checks)
// takeOwnership : stored in df_owner
// params        : parsing parameters, copied into the supplier
//
// The stream is advanced to the first line starting with "$SMI<" and that
// position is stored as the start of record 0.
TDTMolSupplier::TDTMolSupplier(std::istream *inStream, bool takeOwnership,
                               const TDTMolSupplierParams &params) {
  CHECK_INVARIANT(inStream, "bad instream");
  CHECK_INVARIANT(!(inStream->eof()), "early EOF");
  d_params = params;
  init();
  dp_inStream = inStream;
  df_owner = takeOwnership;
  this->advanceToNextRecord();
  d_molpos.push_back(dp_inStream->tellg());
  this->checkForEnd();
}
|
|
|
|
// Puts all bookkeeping members into their starting state: no stream, no
// ownership, not at the end, length -1, no stored record positions and an
// empty molecule cache.
void TDTMolSupplier::init() {
  // record bookkeeping
  d_molpos.clear();
  d_line = 0;
  d_last = 0;
  d_len = -1;

  // stream bookkeeping
  df_end = false;
  df_owner = false;
  dp_inStream = nullptr;

  // the cache is cleared while holding its mutex in thread-safe builds
#ifdef RDK_BUILD_THREADSAFE_SSS
  const std::lock_guard<std::mutex> cacheLock(d_cacheMutex);
#endif
  d_molCache.clear();
}
|
|
|
|
void TDTMolSupplier::setData(const std::string &text,
|
|
const TDTMolSupplierParams ¶ms) {
|
|
if (dp_inStream && df_owner) {
|
|
delete dp_inStream;
|
|
}
|
|
d_params = params;
|
|
init();
|
|
std::istream *tmpStream = nullptr;
|
|
tmpStream = static_cast<std::istream *>(
|
|
new std::istringstream(text, std::ios_base::binary));
|
|
dp_inStream = tmpStream;
|
|
df_owner = true;
|
|
this->advanceToNextRecord();
|
|
d_molpos.push_back(dp_inStream->tellg());
|
|
this->checkForEnd();
|
|
POSTCONDITION(dp_inStream, "bad instream");
|
|
}
|
|
|
|
// Moves the stream forward to the next line that starts with "$SMI<".
//
// Returns true if such a line was found; the stream state is then cleared and
// the read position is placed at the beginning of that line. Returns false if
// the stream cannot be read any further before such a line is found; in that
// case the stream is left wherever the search stopped.
bool TDTMolSupplier::advanceToNextRecord() {
  PRECONDITION(dp_inStream, "no stream");
  std::streampos pos;
  while (true) {
    // good() also covers a failbit-only stream (e.g. after a failed seek).
    // Only testing eof()/bad() would spin forever there because every
    // subsequent read fails without ever setting either of those flags.
    if (!dp_inStream->good()) {
      return false;
    }
    pos = dp_inStream->tellg();
    std::string inL;
    std::getline(*dp_inStream, inL);
    // prefix test only; there is no need to scan the rest of the line
    if (inL.compare(0, 5, "$SMI<") == 0) {
      break;
    }
  }
  // reading the line may have set EOF; clear it so that the seek can work
  dp_inStream->clear();
  dp_inStream->seekg(pos);
  return true;
}
|
|
|
|
void TDTMolSupplier::checkForEnd() {
|
|
PRECONDITION(dp_inStream, "no stream");
|
|
if (dp_inStream->eof() || dp_inStream->bad()) {
|
|
df_end = true;
|
|
// the -1 here is because by the time we get here we've already pushed on
|
|
// the
|
|
// position of the next line:
|
|
d_len = d_molpos.size() - 1;
|
|
return;
|
|
}
|
|
|
|
// we are not at the end of file, but check for blank lines:
|
|
std::string tempStr;
|
|
std::getline(*dp_inStream, tempStr);
|
|
|
|
boost::trim_left_if(tempStr, boost::is_any_of(std::string(" \t\r\n")));
|
|
|
|
if (tempStr.length() == 0) {
|
|
df_end = true;
|
|
// the -1 here is because by the time we get here we've already pushed on
|
|
// the
|
|
// position of the next line:
|
|
d_len = d_molpos.size() - 1;
|
|
}
|
|
return;
|
|
}
|
|
|
|
// Rewinds the supplier: stream flags are cleared, the read position goes back
// to the beginning of the stream and the line/record counters and the end
// flag are reset. The stored record positions are kept.
void TDTMolSupplier::reset() {
  PRECONDITION(dp_inStream, "no stream");

  // the flags have to be cleared before seeking
  dp_inStream->clear();
  dp_inStream->seekg(0, std::ios::beg);

  d_line = 0;
  d_last = 0;
  df_end = false;
}
|
|
|
|
std::unique_ptr<RWMol> TDTMolSupplier::parseMol(std::string inLine) {
|
|
PRECONDITION(dp_inStream, "no stream");
|
|
Utils::LocaleSwitcher ls;
|
|
std::size_t startP = inLine.find("<");
|
|
std::size_t endP = inLine.find_last_of(">");
|
|
std::string smiles = inLine.substr(startP + 1, endP - startP - 1);
|
|
auto res = v2::SmilesParse::MolFromSmiles(smiles, d_params.parseParameters);
|
|
|
|
if (res && res->getNumAtoms() > 0) {
|
|
// -----------
|
|
// Process the properties:
|
|
d_line++;
|
|
std::getline(*dp_inStream, inLine);
|
|
while (!dp_inStream->eof() && !dp_inStream->fail() &&
|
|
inLine.find("|") != 0) {
|
|
endP = inLine.find("<");
|
|
std::string propName = inLine.substr(0, endP);
|
|
boost::trim_if(propName, boost::is_any_of(" \t"));
|
|
startP = endP + 1;
|
|
|
|
if (propName == common_properties::TWOD && d_params.confId2D >= 0) {
|
|
std::string rest = inLine.substr(startP, inLine.size() - startP);
|
|
std::vector<double> coords;
|
|
TDTParseUtils::ParseNumberList(rest, coords, dp_inStream);
|
|
auto *conf = new Conformer(res->getNumAtoms());
|
|
conf->setId(d_params.confId2D);
|
|
conf->set3D(false);
|
|
for (unsigned int atIdx = 0; atIdx < res->getNumAtoms(); atIdx++) {
|
|
if (2 * atIdx + 1 < coords.size()) {
|
|
conf->setAtomPos(
|
|
atIdx,
|
|
RDGeom::Point3D(coords[2 * atIdx], coords[2 * atIdx + 1], 0.0));
|
|
} else {
|
|
// we're going to let this slide... but maybe we should do something
|
|
// else?
|
|
}
|
|
}
|
|
res->addConformer(conf, false);
|
|
} else if (propName == "3D" && d_params.confId3D >= 0) {
|
|
std::string rest = inLine.substr(startP, inLine.size() - startP);
|
|
std::vector<double> coords;
|
|
TDTParseUtils::ParseNumberList(rest, coords, dp_inStream);
|
|
auto *conf = new Conformer(res->getNumAtoms());
|
|
conf->setId(d_params.confId3D);
|
|
conf->set3D(true);
|
|
for (unsigned int atIdx = 0; atIdx < res->getNumAtoms(); atIdx++) {
|
|
if (3 * atIdx + 2 < coords.size()) {
|
|
conf->setAtomPos(
|
|
atIdx, RDGeom::Point3D(coords[3 * atIdx], coords[3 * atIdx + 1],
|
|
coords[3 * atIdx + 2]));
|
|
} else {
|
|
// we're going to let this slide... but maybe we should do something
|
|
// else?
|
|
}
|
|
}
|
|
res->addConformer(conf, false);
|
|
} else {
|
|
endP = inLine.find_last_of(">");
|
|
if (endP == std::string::npos) {
|
|
std::ostringstream errout;
|
|
errout << "no end tag found for property" << propName;
|
|
throw FileParseException(errout.str());
|
|
} else {
|
|
std::string propVal = inLine.substr(startP, endP - startP);
|
|
res->setProp(propName, propVal);
|
|
if (propName == d_params.nameRecord) {
|
|
res->setProp(common_properties::_Name, propVal);
|
|
}
|
|
}
|
|
}
|
|
std::getline(*dp_inStream, inLine);
|
|
}
|
|
}
|
|
|
|
return res;
|
|
}
|
|
|
|
std::unique_ptr<RWMol> TDTMolSupplier::next() {
|
|
PRECONDITION(dp_inStream, "no stream");
|
|
// set the stream to the appropriate position
|
|
dp_inStream->seekg(d_molpos[d_last]);
|
|
|
|
// finally if we reached the end of the file set end to be true
|
|
if (dp_inStream->eof()) {
|
|
// FIX: we should probably be throwing an exception here
|
|
df_end = true;
|
|
d_len = d_molpos.size();
|
|
return nullptr;
|
|
}
|
|
|
|
// start by finding the $SMI element (we're assuming that this starts the
|
|
// block)
|
|
std::string tempp;
|
|
d_line++;
|
|
std::getline(*dp_inStream, tempp);
|
|
while (tempp.find("$SMI<") != 0 && !dp_inStream->eof() &&
|
|
!dp_inStream->fail()) {
|
|
d_line++;
|
|
std::getline(*dp_inStream, tempp);
|
|
}
|
|
std::unique_ptr<RWMol> res;
|
|
if (tempp.find("$SMI<") == 0) {
|
|
try {
|
|
parseMol(tempp).swap(res);
|
|
} catch (MolSanitizeException &se) {
|
|
// We couldn't sanitize a molecule we got - write out an error message and
|
|
// move to
|
|
BOOST_LOG(rdErrorLog)
|
|
<< "ERROR: Could not sanitize molecule ending on line " << d_line
|
|
<< std::endl;
|
|
BOOST_LOG(rdErrorLog) << "ERROR: " << se.what() << "\n";
|
|
std::string tempStr;
|
|
while (!dp_inStream->eof() && !dp_inStream->fail() &&
|
|
tempStr.find("|") != 0) {
|
|
d_line++;
|
|
std::getline(*dp_inStream, tempStr);
|
|
}
|
|
}
|
|
}
|
|
d_last++;
|
|
if (d_last >= static_cast<int>(d_molpos.size())) {
|
|
d_molpos.push_back(dp_inStream->tellg());
|
|
}
|
|
this->checkForEnd();
|
|
return res;
|
|
}
|
|
|
|
std::string TDTMolSupplier::getItemText(unsigned int idx) {
|
|
PRECONDITION(dp_inStream, "no stream");
|
|
#ifdef RDK_BUILD_THREADSAFE_SSS
|
|
const std::lock_guard<std::mutex> guard(d_readMutex);
|
|
#endif
|
|
unsigned int holder = d_last;
|
|
moveTo(idx);
|
|
std::streampos begP = d_molpos[idx];
|
|
bool endHolder = df_end;
|
|
std::streampos endP;
|
|
try {
|
|
moveTo(idx + 1);
|
|
endP = d_molpos[idx + 1];
|
|
} catch (FileParseException &) {
|
|
dp_inStream->clear();
|
|
dp_inStream->seekg(0, std::ios_base::end);
|
|
endP = dp_inStream->tellg();
|
|
}
|
|
d_last = holder;
|
|
df_end = endHolder;
|
|
auto *buff = new char[endP - begP];
|
|
dp_inStream->seekg(begP);
|
|
dp_inStream->read(buff, endP - begP);
|
|
std::string res(buff, endP - begP);
|
|
delete[] buff;
|
|
return res;
|
|
}
|
|
|
|
void TDTMolSupplier::moveTo(unsigned int idx) {
|
|
PRECONDITION(dp_inStream, "no stream");
|
|
|
|
// dp_inStream->seekg() is called for all idx values
|
|
// and earlier calls to next() may have put the stream into a bad state
|
|
dp_inStream->clear();
|
|
|
|
// move until we hit the desired idx
|
|
if (idx < d_molpos.size()) {
|
|
dp_inStream->seekg(d_molpos[idx]);
|
|
d_last = idx;
|
|
} else {
|
|
std::string tempStr;
|
|
d_last = d_molpos.size() - 1;
|
|
dp_inStream->seekg(d_molpos.back());
|
|
while (d_last < static_cast<int>(idx) && !dp_inStream->eof() &&
|
|
!dp_inStream->fail()) {
|
|
d_line++;
|
|
std::getline(*dp_inStream, tempStr);
|
|
|
|
if (tempStr.find("|") == 0) {
|
|
d_molpos.push_back(dp_inStream->tellg());
|
|
d_last++;
|
|
}
|
|
}
|
|
// if we reached end of file without reaching "idx" we have an index error
|
|
if (dp_inStream->eof()) {
|
|
d_len = d_molpos.size();
|
|
std::ostringstream errout;
|
|
errout << "ERROR: Index error (idx = " << idx << ") : "
|
|
<< " we do no have enough molecule blocks";
|
|
throw FileParseException(errout.str());
|
|
}
|
|
}
|
|
}
|
|
|
|
// Random access: moves to record idx and returns the molecule read from it.
// In thread-safe builds the move and the read happen while holding
// d_readMutex.
std::unique_ptr<RWMol> TDTMolSupplier::operator[](unsigned int idx) {
  PRECONDITION(dp_inStream, "no stream");
#ifdef RDK_BUILD_THREADSAFE_SSS
  const std::lock_guard<std::mutex> readLock(d_readMutex);
#endif
  this->moveTo(idx);
  return this->next();
}
|
|
std::shared_ptr<RWMol> TDTMolSupplier::getShared(unsigned int idx) {
|
|
PRECONDITION(dp_inStream, "no stream");
|
|
if (d_cacheMolecules) {
|
|
#ifdef RDK_BUILD_THREADSAFE_SSS
|
|
const std::lock_guard<std::mutex> guard(d_cacheMutex);
|
|
#endif
|
|
if (d_molCache.size() > idx && d_molCache[idx]) {
|
|
return d_molCache[idx].value();
|
|
}
|
|
}
|
|
// get the molecule with index idx
|
|
std::shared_ptr<RWMol> res;
|
|
{
|
|
#ifdef RDK_BUILD_THREADSAFE_SSS
|
|
const std::lock_guard<std::mutex> guard(d_readMutex);
|
|
#endif
|
|
moveTo(idx);
|
|
res.reset(next().release());
|
|
}
|
|
if (d_cacheMolecules) {
|
|
#ifdef RDK_BUILD_THREADSAFE_SSS
|
|
const std::lock_guard<std::mutex> guard(d_cacheMutex);
|
|
#endif
|
|
if (d_molCache.size() <= idx) {
|
|
constexpr unsigned int molCacheAllocChunkSize = 1000;
|
|
d_molCache.resize(idx + molCacheAllocChunkSize);
|
|
}
|
|
d_molCache[idx] = res;
|
|
}
|
|
return res;
|
|
}
|
|
unsigned int TDTMolSupplier::length() {
|
|
PRECONDITION(dp_inStream, "no stream");
|
|
// return the number of mol blocks in the sdfile
|
|
if (d_len > 0) {
|
|
return d_len;
|
|
} else {
|
|
std::string tempStr;
|
|
d_len = d_molpos.size();
|
|
dp_inStream->seekg(d_molpos.back());
|
|
std::string inL;
|
|
std::getline(*dp_inStream, inL);
|
|
while (this->advanceToNextRecord()) {
|
|
d_molpos.push_back(dp_inStream->tellg());
|
|
d_len++;
|
|
std::getline(*dp_inStream, inL);
|
|
}
|
|
// now remember to set the stream to the last position we want to read
|
|
dp_inStream->clear();
|
|
dp_inStream->seekg(d_molpos[d_last]);
|
|
return d_len;
|
|
}
|
|
}
|
|
|
|
bool TDTMolSupplier::atEnd() {
|
|
PRECONDITION(dp_inStream, "no stream");
|
|
return df_end;
|
|
}
|
|
} // namespace FileParsers
|
|
} // namespace v2
|
|
} // namespace RDKit
|