// // Copyright (C) 2002-2022 Greg Landrum and other RDKit contributors // // @@ All Rights Reserved @@ // This file is part of the RDKit. // The contents are covered by the terms of the BSD license // which is included in the file license.txt, found at the root // of the RDKit source tree. // #include #include #include #include #include #include #include "MolSupplier.h" #include "FileParsers.h" #include #include #include namespace RDKit { namespace v2 { namespace FileParsers { SDMolSupplier::SDMolSupplier(const std::string &fileName, const MolFileParserParams ¶ms) { init(); dp_inStream = openAndCheckStream(fileName); df_owner = true; d_molpos.push_back(dp_inStream->tellg()); d_params = params; this->checkForEnd(); if (df_end) { // checkForEnd() sets d_len if we're at EOF. undo that (was GitHub issue // 19): d_len = 0; } POSTCONDITION(dp_inStream, "bad instream"); } SDMolSupplier::SDMolSupplier(std::istream *inStream, bool takeOwnership, const MolFileParserParams ¶ms) { PRECONDITION(inStream, "bad stream"); init(); dp_inStream = inStream; df_owner = takeOwnership; d_molpos.push_back(dp_inStream->tellg()); d_params = params; this->checkForEnd(); if (df_end) { // checkForEnd() sets d_len if we're at EOF. undo that (was GitHub issue // 19): d_len = 0; } POSTCONDITION(dp_inStream, "bad instream"); } void SDMolSupplier::init() { ForwardSDMolSupplier::init(); d_len = -1; d_last = 0; #ifdef RDK_BUILD_THREADSAFE_SSS const std::lock_guard guard(d_cacheMutex); #endif d_molCache.clear(); } void SDMolSupplier::setData(const std::string &text) { if (dp_inStream && df_owner) { delete dp_inStream; } init(); std::istream *tmpStream = nullptr; tmpStream = static_cast( new std::istringstream(text, std::ios_base::binary)); dp_inStream = tmpStream; df_owner = true; d_molpos.push_back(dp_inStream->tellg()); this->checkForEnd(); if (df_end) { // checkForEnd() sets d_len if we're at EOF. undo that (was GitHub issue // 19): d_len = 0; } POSTCONDITION(dp_inStream, "bad instream"); } void SDMolSupplier::setData(const std::string &text, const MolFileParserParams ¶ms) { d_params = params; setData(text); } void SDMolSupplier::checkForEnd() { PRECONDITION(dp_inStream, "no stream"); // we will call it end of file if we have more than 4 contiguous empty lines // or we reach end of file in the meantime if (dp_inStream->eof()) { df_end = true; d_len = rdcast(d_molpos.size()); return; } // we are not at the end of file, check for blank lines unsigned int nempty = 0; std::string tempStr, stmp; for (unsigned int i = 0; i < 4; i++) { tempStr = getLine(dp_inStream); if (dp_inStream->eof()) { df_end = true; d_len = rdcast(d_molpos.size()); return; } if (tempStr.find_first_not_of(" \t\r\n") == std::string::npos) { ++nempty; } } if (nempty == 4) { df_end = true; d_len = rdcast(d_molpos.size()); } } void SDMolSupplier::peekCheckForEnd(char *bufPtr, char *bufEnd, std::streampos molStartPos) { PRECONDITION(dp_inStream, "no stream"); int emptyLines = 0; char *p = bufPtr; while (p < bufEnd) { if (!std::isspace(*p)) { return; } if (*p == '\n') { ++emptyLines; if (emptyLines >= 4) { // the 4th empty line found this->df_end = true; this->d_len = rdcast(this->d_molpos.size()); return; } } ++p; } // buffer was exhausted without finding 4 empty lines or data. Need to check // the stream. std::streampos saveBlockPos = dp_inStream->tellg(); dp_inStream->clear(); dp_inStream->seekg(molStartPos); // run the standard (slow) logic this->checkForEnd(); // restore the stream position dp_inStream->clear(); dp_inStream->seekg(saveBlockPos); } void SDMolSupplier::reset() { PRECONDITION(dp_inStream, "no stream"); dp_inStream->clear(); dp_inStream->seekg(0, std::ios::beg); df_end = false; d_last = 0; d_line = 0; } std::unique_ptr SDMolSupplier::next() { PRECONDITION(dp_inStream, "no stream"); if (df_end && d_last >= d_len) { throw FileParseException("EOF hit."); } // set the stream to the current position dp_inStream->seekg(d_molpos[d_last]); std::string tempStr; // finally if we reached the end of the file set end to be true if (dp_inStream->eof()) { // FIX: we should probably be throwing an exception here df_end = true; d_len = rdcast(d_molpos.size()); return nullptr; } auto res = _next(); ++d_last; std::streampos posHold = dp_inStream->tellg(); this->checkForEnd(); if (!this->df_end && d_last >= static_cast(d_molpos.size())) { d_molpos.push_back(posHold); } return res; } std::string SDMolSupplier::getItemText(unsigned int idx) { PRECONDITION(dp_inStream, "no stream"); #ifdef RDK_BUILD_THREADSAFE_SSS const std::lock_guard guard(d_readMutex); #endif unsigned int holder = d_last; moveTo(idx); std::streampos begP = d_molpos[idx]; std::streampos endP; try { moveTo(idx + 1); endP = d_molpos[idx + 1]; } catch (FileParseException &) { dp_inStream->clear(); dp_inStream->seekg(0, std::ios_base::end); endP = dp_inStream->tellg(); } d_last = holder; auto *buff = new char[endP - begP]; dp_inStream->seekg(begP); dp_inStream->read(buff, endP - begP); std::string res(buff, endP - begP); delete[] buff; return res; } void SDMolSupplier::moveTo(unsigned int idx) { PRECONDITION(dp_inStream, "no stream"); // dp_inStream->seekg() is called for all idx values // and earlier calls to next() may have put the stream into a bad state dp_inStream->clear(); // move until we hit the desired idx if (idx < d_molpos.size()) { dp_inStream->seekg(d_molpos[idx]); d_last = idx; } // actually scan with buffering else { buildIndexTo(idx); if (idx < d_molpos.size()) { dp_inStream->clear(); dp_inStream->seekg(d_molpos[idx]); d_last = idx; } else { /*Unfortunately, the FileParseException is not being catched and thrown on python directly. Instead, we use this df_end flag workaround to indicate that we reached the end of file (and signal the error). There's a comment on MolSupplier.h about problems with Boost exception handling and the full explanation That's the only reason for the following line*/ df_end = true; // if we reached end of file without reaching "idx" we have an index error d_len = rdcast(d_molpos.size()); std::ostringstream errout; errout << "ERROR: Index error (idx = " << idx << ") : " << " we do not have enough mol blocks"; throw FileParseException(errout.str()); } } } std::unique_ptr SDMolSupplier::operator[](unsigned int idx) { PRECONDITION(dp_inStream, "no stream"); #ifdef RDK_BUILD_THREADSAFE_SSS const std::lock_guard guard(d_readMutex); #endif // std::cerr << "get molecule with index " << idx << std::endl; // get the molecule with index idx moveTo(idx); auto res = next(); return res; } std::shared_ptr SDMolSupplier::getShared(unsigned int idx) { PRECONDITION(dp_inStream, "no stream"); if (d_cacheMolecules) { #ifdef RDK_BUILD_THREADSAFE_SSS const std::lock_guard guard(d_cacheMutex); #endif if (d_molCache.size() > idx && d_molCache[idx]) { return d_molCache[idx].value(); } } std::shared_ptr res; { #ifdef RDK_BUILD_THREADSAFE_SSS const std::lock_guard guard(d_readMutex); #endif // get the molecule with index idx moveTo(idx); res.reset(next().release()); } if (d_cacheMolecules) { if (d_molCache.size() <= idx) { constexpr unsigned int molCacheAllocChunkSize = 1000; d_molCache.resize(idx + molCacheAllocChunkSize); } d_molCache[idx] = res; } return res; } unsigned int SDMolSupplier::length() { PRECONDITION(dp_inStream, "no stream"); // return the number of mol blocks in the sdfile if (d_len > 0 || (df_end && d_len == 0)) { return d_len; } else { int old_last = d_last; buildIndexTo(UINT32_MAX); d_len = rdcast(d_molpos.size()); // safeguard to restore the pointer to the last read molecule d_last = old_last; dp_inStream->clear(); dp_inStream->seekg(d_molpos[d_last]); df_end = false; return d_len; } } void SDMolSupplier::buildIndexTo(unsigned int targetIdx) { dp_inStream->seekg(d_molpos.back()); d_last = rdcast(d_molpos.size()) - 1; const size_t CHUNK_SIZE = 65536; const size_t OVERLAP = 4; // to catch "$$$$" at chunk boundaries ("...\n$$ $$...") std::vector buffer(CHUNK_SIZE + OVERLAP); std::fill(buffer.begin(), buffer.begin() + OVERLAP, '\n'); // safe init std::streampos currentStreamPos = dp_inStream->tellg(); bool foundTarget = false; while (dp_inStream->good() && !foundTarget) { std::streampos chunkStartPos = currentStreamPos; dp_inStream->read(&buffer[OVERLAP], CHUNK_SIZE); std::streamsize bytesRead = dp_inStream->gcount(); if (bytesRead == 0) { break; // EOF } std::streampos chunkEndPos = dp_inStream->tellg(); // check if the stream is "honest" (binary or text mode with 1 byte newlines // (like UNIX), meaning read bytes map 1:1 to disk bytes) bool isBinaryLike = (bytesRead == (chunkEndPos - chunkStartPos)); char *bufStart = &buffer[0]; char *bufEnd = bufStart + OVERLAP + bytesRead; char *ptr = bufStart + 1; bool needEOL = false; while (true) { constexpr char dollarSigns[]{"$$$$"}; auto match = std::search(ptr, bufEnd, dollarSigns, dollarSigns + 4); if (match == bufEnd) { break; } if (*(match - 1) == '\n') { // ensure $$$$ is at start of line char *nlPos = match + 4; if (nlPos == bufEnd) { // corner case, $$$$ is EXACTLY at the end of the buffer // we need the next char in the stream to be a "\n", this is resolved // below. needEOL = true; } else { while (nlPos < bufEnd && *nlPos != '\n') { ++nlPos; } if (nlPos < bufEnd) { ++nlPos; } } std::streampos posHold; if (isBinaryLike && !needEOL) { // fast path, math checks out, no need to seek posHold = chunkStartPos + std::streamoff(nlPos - bufStart - OVERLAP); } else { // slow path, there is byte translation going on, need to seek // and use the std translation magic to find the actual byte // position dp_inStream->clear(); dp_inStream->seekg( chunkStartPos); // rollback to the start of the chunk dp_inStream->ignore( nlPos - bufStart - OVERLAP); // advance but with the magic translation in effect now posHold = dp_inStream ->tellg(); // this is the physical position on disk we want } bool atTrueEOF = (bytesRead < static_cast(CHUNK_SIZE)) && (nlPos >= bufEnd); if (!atTrueEOF) { if (needEOL) { char c = dp_inStream->peek(); if (c == '\n') { posHold = posHold + std::streamoff(1); needEOL = false; } } this->peekCheckForEnd(nlPos, bufEnd, posHold); // the optimized peek version if (!this->df_end) { d_molpos.push_back(posHold); ++d_last; if (static_cast(d_last) == targetIdx) { // not really needed but this way we only index as // much as needed foundTarget = true; break; } } } } ptr = match + 4; } if (foundTarget) { break; } if (!isBinaryLike) { // need to seek to the end of the chunk again to make // sure next read is from the right position dp_inStream->clear(); dp_inStream->seekg(chunkEndPos); } if (bytesRead >= static_cast(OVERLAP)) { std::memcpy(&buffer[0], bufEnd - OVERLAP, OVERLAP); } currentStreamPos = chunkEndPos; } } bool SDMolSupplier::atEnd() { PRECONDITION(dp_inStream, "no stream"); return df_end; } void SDMolSupplier::setStreamIndices(const std::vector &locs) { d_molpos.clear(); d_molpos.resize(locs.size()); std::copy(locs.begin(), locs.end(), d_molpos.begin()); this->reset(); d_len = rdcast(d_molpos.size()); } } // namespace FileParsers } // namespace v2 } // namespace RDKit