// // Copyright (c) 2025 Glysade Inc and other RDkit contributors // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: // // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above // copyright notice, this list of conditions and the following // disclaimer in the documentation and/or other materials provided // with the distribution. // * Neither the name of Novartis Institutes for BioMedical Research Inc. // nor the names of its contributors may be used to endorse or promote // products derived from this software without specific prior written // permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // #include "chemdraw.h" #include "chemdraw_doc.h" #include #include "RDGeneral/test.h" #include #include #include #include #include #include #include #include #include #include #include #include "ChemDrawStartInclude.h" #include "chemdraw/CDXStdObjects.h" #include "ChemDrawEndInclude.h" #include using namespace RDKit; using namespace RDKit::v2; namespace { std::string replace(std::string &istr, const std::string &from, const std::string &to) { std::string str(istr); size_t start_pos = str.find(from); if (start_pos == std::string::npos) { return str; } str.replace(start_pos, from.length(), to); return str; } bool hasNonSupportedFeatures(CDXDocument &document, const std::string &fname) { // check for monomers std::ifstream ifs(fname); std::stringstream xml; xml << ifs.rdbuf(); // We should be able to figure this out from the node but... if (xml.str().find("monomerAttachmentStructure_") != std::string::npos || xml.str().find("Name=\"monomerAttachments") != std::string::npos) { return true; } for (auto node : document.ContainedObjects()) { auto id = (CDXDatumID)node.second->GetTag(); switch (id) { case kCDXObj_Page: for (auto frag : node.second->ContainedObjects()) { auto id = (CDXDatumID)frag.second->GetTag(); if (id == kCDXObj_Fragment) { auto &fragment = (CDXFragment &)(*frag.second); if (fragment.m_sequenceType == kCDXSeqType_Unknown) { return true; } } else if (id == kCDXObj_BracketAttachment || id == kCDXObj_BracketedGroup) { return true; } } break; case kCDXObj_ObjectTag: { CDXObject &object = *((CDXObject *)node.second); id = (CDXDatumID)object.GetTag(); // Check for monomers break; } default: break; } } return false; } bool hasNonSupportedFeatures(const std::string &fname) { auto doc = ChemDraw::ChemDrawToDocument(fname); return hasNonSupportedFeatures(*doc, fname); } TEST_CASE("Round TRIP") { std::string path = std::string(getenv("RDBASE")) + "/External/ChemDraw/test_data/CDXML6K/"; SECTION("round trip") { // if we can't find the CDXML6K path, then don't run the test if (!std::filesystem::exists(path)) { return; } int failed = 0; int saniFailed = 0; int total = 0; int parseable = 0; int nomol = 0; int badparse = 0; int success = 0; int smimatches = 0; int nonSupported = 0; int no_mol_in_doc = 0; int bad_chemdraw_mol = 0; RDLog::LogStateSetter blocker; std::string cdxpath = path + "CDXML/"; std::string molpath = path + "mol/"; std::string smipath = path + "smiles/"; std::string failpath = path + "FAILED/"; std::string nomolpath = path + "NOMOL/"; std::string badparsepath = path + "BADPARSE/"; std::string sanitizationpath = path + "SANI/"; std::set known_failures{ "INDMUMLL1117_2025-01-24-17-23-14_304.cdxml", // Dative oxygen gets set // to a radical "INDMUMLL1117_2025-01-24-17-26-06_1010.cdxml", // The next batch has a // type of stereochem I // don't know how to // parse yet "INDMUMLL1117_2025-01-24-17-26-06_1012.cdxml", "INDMUMLL1117_2025-01-24-17-26-06_1022.cdxml", "INDMUMLL1117_2025-01-24-17-26-06_1024.cdxml", "INDMUMLL1117_2025-01-24-17-26-06_1026.cdxml", "INDMUMLL1117_2025-01-24-17-26-06_1032.cdxml", "INDMUMLL1117_2025-01-24-17-26-06_1034.cdxml", "INDMUMLL1117_2025-01-24-17-26-06_1036.cdxml", "INDMUMLL1117_2025-01-24-17-26-06_1040.cdxml", "INDMUMLL1117_2025-01-24-17-26-06_1042.cdxml", "INDMUMLL1117_2025-01-24-17-26-06_1048.cdxml", // Stereo chem batch // ends here "INDMUMLL1117_2025-01-24-17-26-13_1690.cdxml", // RDKit shows a radical // for the dative ->[O] "INDMUMLL1117_2025-01-24-17-27-11_6877.cdxml", // The next batch has a // type of stereochem I // don't know how to // parse yet (same as // before) "INDMUMLL1117_2025-01-24-17-27-11_6878.cdxml", "INDMUMLL1117_2025-01-24-17-27-11_6883.cdxml", "INDMUMLL1117_2025-01-24-17-27-11_6884.cdxml", "INDMUMLL1117_2025-01-24-17-27-11_6889.cdxml", "INDMUMLL1117_2025-01-24-17-27-11_6896.cdxml", "INDMUMLL1117_2025-01-24-17-27-30_8574.cdxml", // Stereo chem batch // ends here "INDMUMLL1117_2025-01-24-17-27-31_8633.cdxml", // RDkit is missing a // dummy atom molecule "INDMUMLL1117_2025-01-24-17-27-31_8651.cdxml", // RDkit is missing a // dummy atom molecule "INDMUMLL1117_2025-01-24-17-27-53_10330.cdxml", // 2D projection of 3D // stereo, we fail this // one "INDMUMLL1117_2025-01-24-17-27-53_10332.cdxml", // 2D projection of 3D // stereo, we fail this // one "INDMUMLL1117_2025-01-24-17-27-54_10336.cdxml", // RDKit Smiles keeps // any bonds ~, // ChemDraw doesn't "INDMUMLL1117_2025-01-24-17-28-02_10942.cdxml", // Chemdraw smiles // doesn't support // quadruple bond $ "INDMUMLL1117_2025-01-24-17-28-15_11666.cdxml", // RDKit Smiles keeps // any bonds ~, // ChemDraw doesn't "INDMUMLL1117_2025-01-24-17-28-20_12011.cdxml", // RDKit gets stereo // from the 3D data and // the wedging "INDMUMLL1117_2025-01-24-17-28-20_12012.cdxml", // RDKit gets stereo // from the 3D data and // the wedging "INDMUMLL1117_2025-01-24-17-28-21_12031.cdxml", // 2D projection of 3D // stereo, we fail this // one "INDMUMLL1117_2025-01-24-17-28-30_12568.cdxml", // 2D projection of 3D // stereo, we fail this // one "INDMUMLL1117_2025-01-24-17-29-06_14654.cdxml", // Dative oxygen gets // set to a radical "INDMUMLL1117_2025-01-24-17-29-08_14775.cdxml", // RDKit Smiles keeps // any bonds ~, // ChemDraw doesn't "INDMUMLL1117_2025-01-24-17-29-09_14896.cdxml", // We apparently do a // bit of a better job // than chemdraw here // in parsing R/S "INDMUMLL1117_2025-01-24-17-29-09_14897.cdxml" // RDKit just gets very // different stereo // chem, no idea why }; for (auto p : {failpath, nomolpath, badparsepath, sanitizationpath}) { if (std::filesystem::exists(p)) { std::filesystem::remove_all(p); } std::filesystem::create_directory(p); } for (const auto &entry : std::filesystem::recursive_directory_iterator(cdxpath)) { if (entry.is_regular_file()) { std::string fname = entry.path().filename().string(); // issue here - graphite nanotube if (fname == "INDMUMLL1117_2025-01-24-17-28-02_10946.cdxml") { continue; // nanotube takes forever } auto molfname = molpath + replace(fname, ".cdxml", ".mol"); auto smifname = smipath + replace(fname, ".cdxml", ".smi"); // if chemscript couldn't make an output, ignore it total++; if (!std::filesystem::exists(molfname) || !std::filesystem::exists(smifname)) { no_mol_in_doc++; continue; } // Get the ChemScript mol and smiles std::unique_ptr mol; //= nullptr; try { mol.reset(MolFileToMol(molfname)); } catch (...) { bad_chemdraw_mol++; continue; } // REQUIRE(mols.size()); std::ifstream ifs(smifname); std::string smiles_in; ifs >> smiles_in; std::string smiles; { try { auto smimol = SmilesToMol(smiles_in); if (!smimol) { smiles = smiles_in; } else { smiles = MolToSmiles(*smimol); delete smimol; } } catch (...) { smiles = smiles_in; } } parseable++; // Read the cdxml std::vector> mols; bool santizationFailure = false; try { mols = MolsFromChemDrawFile(entry.path().string()); if (mols.size() == 0) { ChemDrawParserParams params; params.sanitize = false; mols = MolsFromChemDrawFile(entry.path().string(), params); santizationFailure = true; } if (!mols.size()) { if (smiles.size() == 0) { // At least we match the chemscript non-mol success++; } else if (hasNonSupportedFeatures(entry.path().string())) { // std::cerr << "[NOMOL (Unsupported)]: " << entry.path().string() // << std::endl; nonSupported++; } else { std::cerr << "[NOMOL]: " << entry.path().string() << std::endl; std::filesystem::copy( entry.path().string(), nomolpath + entry.path().filename().string()); nomol++; } continue; } } catch (...) { std::cerr << "[BADPARSE]: " << entry.path().string() << std::endl; std::filesystem::copy( entry.path(), badparsepath + entry.path().filename().string()); badparse++; continue; } std::unique_ptr m = std::make_unique(*mols[0]); for (size_t i = 1; i < mols.size(); i++) { m.reset(combineMols(*m, *mols[i])); } auto rdkit_smi = MolToSmiles(*m); auto mol_smi = mol.get() ? MolToSmiles(*mol) : ""; if (mol_smi != rdkit_smi) { // Do we match chemscripts smiles output at least? if (rdkit_smi == smiles) { smimatches++; continue; } if (hasNonSupportedFeatures(entry.path().string())) { nonSupported++; continue; // has unsupported features } if (santizationFailure) { std::cerr << "[SANI]: " << entry.path() << std::endl; std::filesystem::copy( entry.path(), sanitizationpath + entry.path().filename().string()); saniFailed++; } else { if (known_failures.find(entry.path().filename().string()) != known_failures.end()) { continue; // we know this failure and it's ok for now } std::cerr << "[FAIL]: " << entry.path() << std::endl; std::filesystem::copy(entry.path(), failpath + entry.path().filename().string()); failed++; } std::cerr << "rdkit: " << rdkit_smi << std::endl; std::cerr << "chemscript (mol): " << mol_smi << std::endl; std::cerr << "chemscript (smiles): " << smiles << std::endl; std::cerr << molfname << std::endl; std::cerr << smifname << std::endl; } else { success++; } } } std::cerr << "Total:" << total << std::endl; std::cerr << "Parseable (has chemscript output):" << parseable << std::endl; std::cerr << "Success:" << success + smimatches << std::endl; std::cerr << "skipped (non supported features):" << nonSupported << std::endl; std::cerr << "skipped (no mol in doc):" << no_mol_in_doc << std::endl; std::cerr << "Chemscript smiles matches not chemscript mol: " << smimatches << std::endl; std::cerr << "Failed:" << failed << std::endl; std::cerr << "Sanitization:" << saniFailed << std::endl; std::cerr << "Nomol:" << nomol << std::endl; std::cerr << "Badparse:" << badparse << std::endl; std::cerr << "Bad ChemDraw Mol:" << bad_chemdraw_mol << std::endl; REQUIRE(failed == 0); } } } // namespace