Files
rdkit/Code/GraphMol/FileParsers/SDMolSupplier.cpp
Raul Sofia 372fbad131 Extended fix for #9101 (#9255)
* fix extended boundary issue (3 mols)

* clang pass

* no change. retrigger CI for failed java test

There's a failing Java test that seems to fail by chance rather than because of these changes, as it depends on an RNG. This commit only retriggers the CI pipeline to confirm that.

* no change. retrigger the CI (yet again)

* raw strings and removed garbage collector
2026-05-06 06:10:37 +02:00

450 lines
13 KiB
C++

//
// Copyright (C) 2002-2022 Greg Landrum and other RDKit contributors
//
// @@ All Rights Reserved @@
// This file is part of the RDKit.
// The contents are covered by the terms of the BSD license
// which is included in the file license.txt, found at the root
// of the RDKit source tree.
//
#include <RDGeneral/FileParseException.h>
#include <RDGeneral/BadFileException.h>
#include <RDGeneral/StreamOps.h>
#include <RDGeneral/RDLog.h>
#include <GraphMol/SanitException.h>
#include <boost/algorithm/string.hpp>
#include "MolSupplier.h"
#include "FileParsers.h"
#include <fstream>
#include <sstream>
#include <string>
namespace RDKit {
namespace v2 {
namespace FileParsers {
// Builds a supplier that reads from the file called fileName.
//
// The stream returned by openAndCheckStream() is owned by the supplier. The
// stream's initial position is recorded as the offset of the first record and
// the input is probed once so that df_end is meaningful straight away.
SDMolSupplier::SDMolSupplier(const std::string &fileName,
                             const MolFileParserParams &params) {
  init();
  dp_inStream = openAndCheckStream(fileName);
  df_owner = true;
  d_params = params;
  d_molpos.push_back(dp_inStream->tellg());
  checkForEnd();
  // checkForEnd() stores the number of indexed offsets in d_len when it finds
  // the end of the input. At this point that count only reflects the offset
  // pushed above, so report zero molecules instead (was GitHub issue 19).
  if (df_end) {
    d_len = 0;
  }
  POSTCONDITION(dp_inStream, "bad instream");
}
// Builds a supplier that reads from an already opened stream.
//
// inStream must not be null. takeOwnership is stored in df_owner; setData()
// consults that flag before deleting the stream. The stream's current position
// is recorded as the offset of the first record and the input is probed once
// so that df_end is meaningful straight away.
SDMolSupplier::SDMolSupplier(std::istream *inStream, bool takeOwnership,
                             const MolFileParserParams &params) {
  PRECONDITION(inStream, "bad stream");
  init();
  dp_inStream = inStream;
  df_owner = takeOwnership;
  d_params = params;
  d_molpos.push_back(dp_inStream->tellg());
  checkForEnd();
  // checkForEnd() stores the number of indexed offsets in d_len when it finds
  // the end of the input. At this point that count only reflects the offset
  // pushed above, so report zero molecules instead (was GitHub issue 19).
  if (df_end) {
    d_len = 0;
  }
  POSTCONDITION(dp_inStream, "bad instream");
}
// Puts the supplier's bookkeeping back into its initial state: runs the base
// class initialisation, forgets the cached length and read cursor, and drops
// every cached molecule.
void SDMolSupplier::init() {
  ForwardSDMolSupplier::init();
  d_last = 0;
  d_len = -1;  // length() only trusts d_len when it is > 0 (or 0 with df_end)
#ifdef RDK_BUILD_THREADSAFE_SSS
  // the molecule cache is shared state; hold its mutex while emptying it
  const std::lock_guard<std::mutex> cacheLock(d_cacheMutex);
#endif
  d_molCache.clear();
}
void SDMolSupplier::setData(const std::string &text) {
if (dp_inStream && df_owner) {
delete dp_inStream;
}
init();
std::istream *tmpStream = nullptr;
tmpStream = static_cast<std::istream *>(
new std::istringstream(text, std::ios_base::binary));
dp_inStream = tmpStream;
df_owner = true;
d_molpos.push_back(dp_inStream->tellg());
this->checkForEnd();
if (df_end) {
// checkForEnd() sets d_len if we're at EOF. undo that (was GitHub issue
// 19):
d_len = 0;
}
POSTCONDITION(dp_inStream, "bad instream");
}
void SDMolSupplier::setData(const std::string &text,
const MolFileParserParams &params) {
d_params = params;
setData(text);
}
void SDMolSupplier::checkForEnd() {
PRECONDITION(dp_inStream, "no stream");
// we will call it end of file if we have more than 4 contiguous empty lines
// or we reach end of file in the meantime
if (dp_inStream->eof()) {
df_end = true;
d_len = rdcast<int>(d_molpos.size());
return;
}
// we are not at the end of file, check for blank lines
unsigned int nempty = 0;
std::string tempStr, stmp;
for (unsigned int i = 0; i < 4; i++) {
tempStr = getLine(dp_inStream);
if (dp_inStream->eof()) {
df_end = true;
d_len = rdcast<int>(d_molpos.size());
return;
}
if (tempStr.find_first_not_of(" \t\r\n") == std::string::npos) {
++nempty;
}
}
if (nempty == 4) {
df_end = true;
d_len = rdcast<int>(d_molpos.size());
}
}
void SDMolSupplier::peekCheckForEnd(char *bufPtr, char *bufEnd,
std::streampos molStartPos) {
PRECONDITION(dp_inStream, "no stream");
int emptyLines = 0;
char *p = bufPtr;
while (p < bufEnd) {
if (!std::isspace(*p)) {
return;
}
if (*p == '\n') {
++emptyLines;
if (emptyLines >= 4) { // the 4th empty line found
this->df_end = true;
this->d_len = rdcast<int>(this->d_molpos.size());
return;
}
}
++p;
}
// buffer was exhausted without finding 4 empty lines or data. Need to check
// the stream.
std::streampos saveBlockPos = dp_inStream->tellg();
dp_inStream->clear();
dp_inStream->seekg(molStartPos);
// run the standard (slow) logic
this->checkForEnd();
// restore the stream position
dp_inStream->clear();
dp_inStream->seekg(saveBlockPos);
}
// Rewinds the supplier: clears the stream's error state, seeks back to the
// beginning of the stream and zeroes the read cursor, line counter and
// end-of-input flag. The record index (d_molpos) is left untouched.
void SDMolSupplier::reset() {
  PRECONDITION(dp_inStream, "no stream");
  dp_inStream->clear();
  dp_inStream->seekg(0, std::ios::beg);
  d_line = 0;
  d_last = 0;
  df_end = false;
}
std::unique_ptr<RWMol> SDMolSupplier::next() {
PRECONDITION(dp_inStream, "no stream");
if (df_end && d_last >= d_len) {
throw FileParseException("EOF hit.");
}
// set the stream to the current position
dp_inStream->seekg(d_molpos[d_last]);
std::string tempStr;
// finally if we reached the end of the file set end to be true
if (dp_inStream->eof()) {
// FIX: we should probably be throwing an exception here
df_end = true;
d_len = rdcast<int>(d_molpos.size());
return nullptr;
}
auto res = _next();
++d_last;
std::streampos posHold = dp_inStream->tellg();
this->checkForEnd();
if (!this->df_end && d_last >= static_cast<int>(d_molpos.size())) {
d_molpos.push_back(posHold);
}
return res;
}
std::string SDMolSupplier::getItemText(unsigned int idx) {
PRECONDITION(dp_inStream, "no stream");
#ifdef RDK_BUILD_THREADSAFE_SSS
const std::lock_guard<std::mutex> guard(d_readMutex);
#endif
unsigned int holder = d_last;
moveTo(idx);
std::streampos begP = d_molpos[idx];
std::streampos endP;
try {
moveTo(idx + 1);
endP = d_molpos[idx + 1];
} catch (FileParseException &) {
dp_inStream->clear();
dp_inStream->seekg(0, std::ios_base::end);
endP = dp_inStream->tellg();
}
d_last = holder;
auto *buff = new char[endP - begP];
dp_inStream->seekg(begP);
dp_inStream->read(buff, endP - begP);
std::string res(buff, endP - begP);
delete[] buff;
return res;
}
// Positions the stream and the read cursor on record idx.
//
// If idx is not indexed yet the index is extended with buildIndexTo(). When
// the input holds fewer than idx + 1 records, df_end and d_len are updated
// and a FileParseException is thrown.
void SDMolSupplier::moveTo(unsigned int idx) {
  PRECONDITION(dp_inStream, "no stream");
  // seekg() is used for every idx value and earlier calls to next() may have
  // left the stream in a bad state
  dp_inStream->clear();
  if (idx >= d_molpos.size()) {
    // not indexed yet: scan forward (buffered) until idx is known
    buildIndexTo(idx);
    if (idx >= d_molpos.size()) {
      // The FileParseException thrown below is not caught and re-raised
      // directly on the Python side, so df_end is also set here as a
      // workaround to signal that the end of the file was reached. A comment
      // in MolSupplier.h about problems with Boost exception handling has the
      // full explanation; that is the only reason for the next line.
      df_end = true;
      // running out of input before reaching idx is an index error
      d_len = rdcast<int>(d_molpos.size());
      std::ostringstream errout;
      errout << "ERROR: Index error (idx = " << idx << ") : "
             << " we do not have enough mol blocks";
      throw FileParseException(errout.str());
    }
    // the scan may have left error flags behind
    dp_inStream->clear();
  }
  dp_inStream->seekg(d_molpos[idx]);
  d_last = idx;
}
// Random access: positions the supplier on record idx with moveTo() and
// returns the result of reading it with next(). Exceptions from either call
// propagate to the caller.
std::unique_ptr<RWMol> SDMolSupplier::operator[](unsigned int idx) {
  PRECONDITION(dp_inStream, "no stream");
#ifdef RDK_BUILD_THREADSAFE_SSS
  // seek + read must not interleave with another reader
  const std::lock_guard<std::mutex> readLock(d_readMutex);
#endif
  moveTo(idx);
  return next();
}
// Returns record idx as a shared pointer.
//
// When d_cacheMolecules is set, a previously stored entry for idx is returned
// without touching the stream; otherwise the record is read with
// moveTo() + next() and, if caching is enabled, stored for later calls.
// Exceptions from moveTo()/next() propagate to the caller.
std::shared_ptr<RWMol> SDMolSupplier::getShared(unsigned int idx) {
  PRECONDITION(dp_inStream, "no stream");
  if (d_cacheMolecules) {
#ifdef RDK_BUILD_THREADSAFE_SSS
    const std::lock_guard<std::mutex> guard(d_cacheMutex);
#endif
    if (d_molCache.size() > idx && d_molCache[idx]) {
      return d_molCache[idx].value();
    }
  }
  std::shared_ptr<RWMol> res;
  {
#ifdef RDK_BUILD_THREADSAFE_SSS
    const std::lock_guard<std::mutex> guard(d_readMutex);
#endif
    // get the molecule with index idx
    moveTo(idx);
    res = next();
  }
  if (d_cacheMolecules) {
#ifdef RDK_BUILD_THREADSAFE_SSS
    // The cache is read under d_cacheMutex above, so growing and writing it
    // must hold the same mutex; otherwise a concurrent lookup can race with
    // the resize. The read mutex has already been released, so the two locks
    // are never held together here.
    const std::lock_guard<std::mutex> guard(d_cacheMutex);
#endif
    if (d_molCache.size() <= idx) {
      // grow in chunks so sequential access does not resize on every call
      constexpr unsigned int molCacheAllocChunkSize = 1000;
      d_molCache.resize(idx + molCacheAllocChunkSize);
    }
    d_molCache[idx] = res;
  }
  return res;
}
unsigned int SDMolSupplier::length() {
PRECONDITION(dp_inStream, "no stream");
// return the number of mol blocks in the sdfile
if (d_len > 0 || (df_end && d_len == 0)) {
return d_len;
} else {
int old_last = d_last;
buildIndexTo(UINT32_MAX);
d_len = rdcast<int>(d_molpos.size());
// safeguard to restore the pointer to the last read molecule
d_last = old_last;
dp_inStream->clear();
dp_inStream->seekg(d_molpos[d_last]);
df_end = false;
return d_len;
}
}
// Extends the record index (d_molpos) by scanning the stream forward from the
// last offset that is already indexed.
//
// The stream is read in CHUNK_SIZE pieces. Every "$$$$" that directly follows
// a '\n' is treated as a record delimiter, and the position after the end of
// that delimiter line is appended to d_molpos as the start of the next record
// (unless only blank lines / EOF follow). d_last is advanced together with
// the index, and scanning stops as soon as d_last equals targetIdx or no more
// bytes can be read.
//
// Side effects: moves the stream position, changes d_last, may set df_end and
// d_len (through peekCheckForEnd()), and may leave error flags on the stream.
// Callers in this file call clear() and seek again afterwards.
//
// NOTE(review): d_molpos.back() assumes the index is never empty here; the
// constructors and setData() always push one offset, but setStreamIndices()
// with an empty vector would break that assumption - confirm.
void SDMolSupplier::buildIndexTo(unsigned int targetIdx) {
// resume at the start of the last record that is already indexed
dp_inStream->seekg(d_molpos.back());
d_last = rdcast<int>(d_molpos.size()) - 1;
const size_t CHUNK_SIZE = 65536;
const size_t OVERLAP =
4; // to catch "$$$$" at chunk boundaries ("...\n$$ <new chunk> $$...")
// buffer layout: [OVERLAP bytes: tail of the previous chunk][current chunk]
std::vector<char> buffer(CHUNK_SIZE + OVERLAP);
std::fill(buffer.begin(), buffer.begin() + OVERLAP, '\n'); // safe init
std::streampos currentStreamPos = dp_inStream->tellg();
bool foundTarget = false;
while (dp_inStream->good() && !foundTarget) {
std::streampos chunkStartPos = currentStreamPos;
dp_inStream->read(&buffer[OVERLAP], CHUNK_SIZE);
std::streamsize bytesRead = dp_inStream->gcount();
if (bytesRead == 0) {
break; // EOF
}
std::streampos chunkEndPos = dp_inStream->tellg();
// check if the stream is "honest" (binary or text mode with 1 byte newlines
// (like UNIX), meaning read bytes map 1:1 to disk bytes)
// NOTE(review): after a short final read the stream is presumably in a
// failed state, so tellg() would not return a usable position and this
// comparison would be false - i.e. the last chunk takes the slow path.
// Confirm.
bool isBinaryLike = (bytesRead == (chunkEndPos - chunkStartPos));
char *bufStart = &buffer[0];
char *bufEnd = bufStart + OVERLAP + bytesRead;
// start at index 1 so that *(match - 1) below never reads before the buffer
char *ptr = bufStart + 1;
bool needEOL = false;
while (true) {
constexpr char dollarSigns[]{"$$$$"};
auto match = std::search(ptr, bufEnd, dollarSigns, dollarSigns + 4);
if (match == bufEnd) {
break;
}
if (*(match - 1) == '\n') { // ensure $$$$ is at start of line
// nlPos ends up one past the '\n' that terminates the delimiter line, or at
// bufEnd when that '\n' is not in the buffer
char *nlPos = match + 4;
if (nlPos == bufEnd) {
// corner case, $$$$ is EXACTLY at the end of the buffer
// we need the next char in the stream to be a "\n", this is resolved
// below.
needEOL = true;
} else {
while (nlPos < bufEnd && *nlPos != '\n') {
++nlPos;
}
if (nlPos < bufEnd) {
++nlPos;
}
}
// posHold: stream position of the byte that nlPos refers to
std::streampos posHold;
if (isBinaryLike &&
!needEOL) { // fast path, math checks out, no need to seek
posHold = chunkStartPos + std::streamoff(nlPos - bufStart - OVERLAP);
} else { // slow path, there is byte translation going on, need to seek
// and use the std translation magic to find the actual byte
// position
dp_inStream->clear();
dp_inStream->seekg(
chunkStartPos); // rollback to the start of the chunk
dp_inStream->ignore(
nlPos - bufStart -
OVERLAP); // advance but with the magic translation in effect now
posHold =
dp_inStream
->tellg(); // this is the physical position on disk we want
}
// a short read with nothing after the delimiter line means the delimiter
// closes the last record: there is no following record to index
bool atTrueEOF =
(bytesRead < static_cast<std::streamsize>(CHUNK_SIZE)) &&
(nlPos >= bufEnd);
if (!atTrueEOF) {
if (needEOL) {
// the slow path above left the stream right after "$$$$"; step over the
// line terminator if it is the very next character
// NOTE(review): only a bare '\n' is handled here. If the next character is
// '\r' (CRLF input read in binary mode), or if the delimiter line's '\n'
// falls into the next chunk in the non-needEOL case, posHold appears to
// stay inside the delimiter line - confirm.
char c = dp_inStream->peek();
if (c == '\n') {
posHold = posHold + std::streamoff(1);
needEOL = false;
}
}
// only index the position if something other than blank lines follows
this->peekCheckForEnd(nlPos, bufEnd,
posHold); // the optimized peek version
if (!this->df_end) {
d_molpos.push_back(posHold);
++d_last;
if (static_cast<unsigned int>(d_last) ==
targetIdx) { // not really needed but this way we only index as
// much as needed
foundTarget = true;
break;
}
}
}
}
// keep searching this chunk after the current match
ptr = match + 4;
}
if (foundTarget) {
break;
}
if (!isBinaryLike) { // need to seek to the end of the chunk again to make
// sure next read is from the right position
dp_inStream->clear();
dp_inStream->seekg(chunkEndPos);
}
// carry the last OVERLAP bytes over so that a "$$$$" split across two chunks
// is still found on the next iteration
if (bytesRead >= static_cast<std::streamsize>(OVERLAP)) {
std::memcpy(&buffer[0], bufEnd - OVERLAP, OVERLAP);
}
currentStreamPos = chunkEndPos;
}
}
// Reports the end-of-input flag (df_end) as last set by checkForEnd(),
// peekCheckForEnd(), next() or moveTo(). Requires a stream to be attached.
bool SDMolSupplier::atEnd() {
  PRECONDITION(dp_inStream, "no stream");
  const bool finished = df_end;
  return finished;
}
void SDMolSupplier::setStreamIndices(const std::vector<std::streampos> &locs) {
d_molpos.clear();
d_molpos.resize(locs.size());
std::copy(locs.begin(), locs.end(), d_molpos.begin());
this->reset();
d_len = rdcast<int>(d_molpos.size());
}
} // namespace FileParsers
} // namespace v2
} // namespace RDKit