From bc9e1145d19d0519d0ca6e0d8c001c019ad25a0b Mon Sep 17 00:00:00 2001 From: Raul Sofia <67133355+RaulSofia@users.noreply.github.com> Date: Wed, 6 May 2026 05:10:37 +0100 Subject: [PATCH] Extended fix for #9101 (#9255) * fix extended boundary issue (3 mols) * clang pass * no change. retrigger CI for failed java test there's a failing java test that seems to be failing by chance rather than by changes, as it depends on rng. this is just to retrigger the CI pipeline to confirm this * no change. retrigger the CI (yet again) * raw strings and removed garbage collector --- .gitignore | 2 + Code/GraphMol/FileParsers/SDMolSupplier.cpp | 6 +- Code/GraphMol/FileParsers/testMolSupplier.cpp | 65 +++++++++++++++++++ rdkit/Chem/UnitTestFeatFinderCLI.py | 12 ++-- 4 files changed, 77 insertions(+), 8 deletions(-) diff --git a/.gitignore b/.gitignore index 2a321f0c6..c9e5c78c5 100644 --- a/.gitignore +++ b/.gitignore @@ -133,6 +133,8 @@ __pycache__/ /Data/eht_parms.dat +/External/pubchem_shape/test_data/bulk.pubchem_out.sdf + /Projects/DbCLI/testData/bzr/ /rdkit/sping/tests/testallps.ps diff --git a/Code/GraphMol/FileParsers/SDMolSupplier.cpp b/Code/GraphMol/FileParsers/SDMolSupplier.cpp index 22df8a6ad..09c9d8fde 100644 --- a/Code/GraphMol/FileParsers/SDMolSupplier.cpp +++ b/Code/GraphMol/FileParsers/SDMolSupplier.cpp @@ -398,11 +398,9 @@ void SDMolSupplier::buildIndexTo(unsigned int targetIdx) { posHold = posHold + std::streamoff(1); needEOL = false; } - this->checkForEnd(); - } else { - this->peekCheckForEnd(nlPos, bufEnd, - posHold); // the optimized peek version } + this->peekCheckForEnd(nlPos, bufEnd, + posHold); // the optimized peek version if (!this->df_end) { d_molpos.push_back(posHold); ++d_last; diff --git a/Code/GraphMol/FileParsers/testMolSupplier.cpp b/Code/GraphMol/FileParsers/testMolSupplier.cpp index e87cf12e4..e52de4c46 100644 --- a/Code/GraphMol/FileParsers/testMolSupplier.cpp +++ b/Code/GraphMol/FileParsers/testMolSupplier.cpp @@ -2719,3 +2719,68 @@ TEST_CASE("github9101 - $$$$ at buffer end") { REQUIRE(mol); delete mol; } + +// extends the "github9101 - $$$$ at buffer end" test case +// just two molecules werent enough to reliably trigger the issue +// (although the second created drift it only took effect on the next molecule +// which would be the third). this creates the edge case but with three +TEST_CASE("chunk boundary stream drift with 3+ molecules") { + std::string m1 = R"CTAB(mol1 + RDKit 3D + + 1 0 0 0 0 0 0 0 0 0999 V2000 + 0.0000 0.0000 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 +M END +> +)CTAB"; + std::string m1tail = R"CTAB( +$$$$ +)CTAB"; + std::string m2 = R"CTAB(mol2 + RDKit 3D + + 1 0 0 0 0 0 0 0 0 0999 V2000 + 0.0000 0.0000 0.0000 O 0 0 0 0 0 0 0 0 0 0 0 0 +M END +$$$$ +)CTAB"; + std::string m3 = R"CTAB(mol3 + RDKit 3D + + 1 0 0 0 0 0 0 0 0 0999 V2000 + 0.0000 0.0000 0.0000 N 0 0 0 0 0 0 0 0 0 0 0 0 +M END +$$$$ +)CTAB"; + + // first separator must hit exactly at the 65536 byte boundary (the others + // could too but no need) + size_t paddingSize = 65536 - m1.size() - 5; + std::string padding(paddingSize, 'x'); + std::string data = m1 + padding + m1tail + m2 + m3; + + size_t firstDollar = data.find("$$$$"); + REQUIRE(firstDollar != std::string::npos); + REQUIRE(firstDollar + 4 == 65536); + + SDMolSupplier supplier; + supplier.setData(data); + + // trigger indexing (that mangles the positions if the issue is present) + CHECK(supplier.length() == 3); + + auto *mol = supplier[0]; + REQUIRE(mol); + CHECK(mol->getProp("_Name") == "mol1"); + delete mol; + + mol = supplier[1]; + REQUIRE(mol); + CHECK(mol->getProp("_Name") == "mol2"); + delete mol; + + mol = supplier[2]; + REQUIRE(mol); + CHECK(mol->getProp("_Name") == "mol3"); + delete mol; +} diff --git a/rdkit/Chem/UnitTestFeatFinderCLI.py b/rdkit/Chem/UnitTestFeatFinderCLI.py index 436943d0d..32bc57726 100644 --- a/rdkit/Chem/UnitTestFeatFinderCLI.py +++ b/rdkit/Chem/UnitTestFeatFinderCLI.py @@ -17,7 +17,8 @@ class TestCase(unittest.TestCase): parser = FeatFinderCLI.initParser() cmd = '-n 10 {0} {1}'.format(featureFile, smilesFile) with outputRedirect() as (out, err): - args = parser.parse_args(cmd.split()) + args_list = ['-n', '10', featureFile, smilesFile] + args = parser.parse_args(args_list) FeatFinderCLI.processArgs(args, parser) out = out.getvalue() err = err.getvalue() @@ -29,7 +30,8 @@ class TestCase(unittest.TestCase): cmd = '-n 2 -r {0} {1}'.format(featureFile, smilesFile) with outputRedirect() as (out, err): - args = parser.parse_args(cmd.split()) + args_list = ['-n', '2', '-r', featureFile, smilesFile] + args = parser.parse_args(args_list) FeatFinderCLI.processArgs(args, parser) out = out.getvalue() err = err.getvalue() @@ -46,13 +48,15 @@ class TestCase(unittest.TestCase): parser = FeatFinderCLI.initParser() cmd = '-n 10 {0} {1}'.format(smilesFile, smilesFile) with self.assertRaises(SystemExit), outputRedirect() as (_, err): - args = parser.parse_args(cmd.split()) + args_list = ['-n', '10', smilesFile, smilesFile] + args = parser.parse_args(args_list) FeatFinderCLI.processArgs(args, parser) self.assertIn('error', err.getvalue()) cmd = '-n 10 {0} {1}'.format(featureFile, 'incorrectFilename') with self.assertRaises(SystemExit), outputRedirect() as (_, err): - args = parser.parse_args(cmd.split()) + args_list = ['-n', '10', featureFile, 'incorrectFilename'] + args = parser.parse_args(args_list) FeatFinderCLI.processArgs(args, parser) self.assertIn('error', err.getvalue())