From cf269aa8139c7203abfa54c673fde81277c66270 Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Thu, 28 Aug 2025 22:39:22 -0400 Subject: [PATCH] Expose CDX support to FileParsers and ChemDraw to SWIG (#8681) * First pass at CDX support * Enable CDX support for reading (also) in the CDXMLParser API * Add cdxml test files * Update swig wrappers for CDXMLFormat and Parameters * Add constructor to ChemDrawParserParams * Add Java SWIG support for ChemDraw * Add chemdraw define to rdconfig * Add missing chemdraw deps * Remove direct expat link * Fix Java linkages for ChemDraw * Remove bad merge code * Remove bad merge code * Fix csharp builds * Add sniffer for the ChemDraw DataStream * Include filesystem * Fix test on windows * Add more CDX tests * Ensure streams are open in binary mode to support CDX on windows * Fix text to show that a Block is the text input, not a file * Fix CSharp test * Disable CDX tests when not building chemdraw * Turn back on chemdraw * Response to review * Turn off chemdraw support for the limited external test --------- Co-authored-by: Brian Kelley --- .../linux_build_limitexternal.yml | 1 + CMakeLists.txt | 2 +- Code/GraphMol/FileParsers/CDXMLParser.cpp | 45 ++++++- Code/GraphMol/FileParsers/CDXMLParser.h | 71 +++++++--- Code/GraphMol/FileParsers/CMakeLists.txt | 4 + .../FileParsers/cdxml_parser_catch.cpp | 117 ++++++++++++++++- Code/GraphMol/Wrap/rdmolfiles.cpp | 121 +++++++++++++++++- Code/GraphMol/Wrap/test_cdxml.py | 95 ++++++++++++++ Code/GraphMol/test_data/CDX/structure_1.cdx | Bin 0 -> 568 bytes Code/GraphMol/test_data/CDX/structure_2.cdx | Bin 0 -> 620 bytes Code/GraphMol/test_data/CDX/structure_3.cdx | Bin 0 -> 483 bytes Code/GraphMol/test_data/CDX/structure_4.cdx | Bin 0 -> 402 bytes Code/GraphMol/test_data/CDX/structure_5.cdx | Bin 0 -> 620 bytes Code/GraphMol/test_data/CDX/structure_6.cdx | Bin 0 -> 620 bytes .../GraphMol/test_data/CDXML/ring-stereo1.cdx | Bin 0 -> 1812 bytes Code/JavaWrappers/CMakeLists.txt | 8 ++ 
Code/JavaWrappers/ChemDraw.i | 82 ++++++++++++ Code/JavaWrappers/RWMol.i | 73 ++++++++++- .../csharp_wrapper/CMakeLists.txt | 17 ++- .../RdkitTests/MolToFromByteArray.cs | 13 +- Code/JavaWrappers/gmwrapper/CMakeLists.txt | 27 +++- Code/JavaWrappers/gmwrapper/GraphMolJava.i | 4 +- .../src-test/org/RDKit/ChemDrawTest.java | 91 +++++++++++++ .../src-test/org/RDKit/WrapperTests.java | 34 ++++- Code/RDGeneral/RDConfig.h.cmake | 2 + External/ChemDraw/Wrap/rdChemDraw.cpp | 6 +- External/ChemDraw/chemdraw.cpp | 47 ++++++- External/ChemDraw/chemdraw.h | 13 +- 28 files changed, 819 insertions(+), 54 deletions(-) create mode 100644 Code/GraphMol/test_data/CDX/structure_1.cdx create mode 100644 Code/GraphMol/test_data/CDX/structure_2.cdx create mode 100644 Code/GraphMol/test_data/CDX/structure_3.cdx create mode 100644 Code/GraphMol/test_data/CDX/structure_4.cdx create mode 100644 Code/GraphMol/test_data/CDX/structure_5.cdx create mode 100644 Code/GraphMol/test_data/CDX/structure_6.cdx create mode 100644 Code/GraphMol/test_data/CDXML/ring-stereo1.cdx create mode 100644 Code/JavaWrappers/ChemDraw.i create mode 100644 Code/JavaWrappers/gmwrapper/src-test/org/RDKit/ChemDrawTest.java diff --git a/.azure-pipelines/linux_build_limitexternal.yml b/.azure-pipelines/linux_build_limitexternal.yml index 0a41d35af..7c2e21b74 100644 --- a/.azure-pipelines/linux_build_limitexternal.yml +++ b/.azure-pipelines/linux_build_limitexternal.yml @@ -28,6 +28,7 @@ steps: -DRDK_INSTALL_STATIC_LIBS=OFF \ -DRDK_BUILD_CPP_TESTS=ON \ -DRDK_BUILD_PYTHON_WRAPPERS=ON \ + -DRDK_BUILD_CHEMDRAW_SUPPORT=OFF \ -DRDK_BUILD_COORDGEN_SUPPORT=OFF \ -DRDK_BUILD_MAEPARSER_SUPPORT=OFF \ -DRDK_OPTIMIZE_POPCNT=ON \ diff --git a/CMakeLists.txt b/CMakeLists.txt index 2566c91ea..e4e8f257e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -54,7 +54,7 @@ option(RDK_BUILD_TEST_GZIP "Build the gzip'd stream test" OFF) option(RDK_OPTIMIZE_POPCNT "Use SSE4.2 popcount instruction while compiling." 
ON) option(RDK_USE_STRICT_ROTOR_DEFINITION "Use the most strict rotatable bond definition" ON) option(RDK_BUILD_DESCRIPTORS3D "Build the 3D descriptors calculators, requires Eigen3 to be installed" ON) -option(RDK_BUILD_CHEMDRAW_SUPPORT "build support for the Revvity ChemDraw document format" OFF ) +option(RDK_BUILD_CHEMDRAW_SUPPORT "build support for the Revvity ChemDraw document format" ON ) option(RDK_BUILD_FREESASA_SUPPORT "build the rdkit freesasa wrapper" OFF ) option(RDK_BUILD_COORDGEN_SUPPORT "build the rdkit coordgen wrapper" ON ) option(RDK_BUILD_MAEPARSER_SUPPORT "build the rdkit MAE parser wrapper" ON ) diff --git a/Code/GraphMol/FileParsers/CDXMLParser.cpp b/Code/GraphMol/FileParsers/CDXMLParser.cpp index b9d101d6c..1aa5f5281 100644 --- a/Code/GraphMol/FileParsers/CDXMLParser.cpp +++ b/Code/GraphMol/FileParsers/CDXMLParser.cpp @@ -707,10 +707,15 @@ void visit_children( namespace v2 { namespace CDXMLParser { +bool hasChemDrawCDXSupport() { return false; } std::vector> MolsFromCDXMLDataStream( std::istream &inStream, const CDXMLParserParams ¶ms) { // populate tree structure pt + if (params.format == CDXMLFormat::CDX) { + throw FileParseException("Full ChemDraw support is not enabled, cannot parse CDX files"); + } + using boost::property_tree::ptree; ptree pt; try { @@ -829,7 +834,9 @@ std::vector> MolsFromCDXMLFile( } std::vector> MolsFromCDXML( - const std::string &cdxml, const CDXMLParserParams ¶ms) { + + const std::string &cdxml, const CDXMLParserParams ¶ms) { + std::stringstream iss(cdxml); return MolsFromCDXMLDataStream(iss, params); } @@ -839,29 +846,57 @@ std::vector> MolsFromCDXML( #else #include #include +#include // For std::filesystem::path +#include // For std::transform +#include // For std::tolower namespace RDKit{ + + namespace v2 { namespace CDXMLParser { - +bool hasChemDrawCDXSupport() { return true; } + std::vector> MolsFromCDXMLDataStream( std::istream &inStream, const CDXMLParserParams ¶ms) { // populate tree structure pt 
ChemDrawParserParams chemdraw_params; chemdraw_params.sanitize = params.sanitize; chemdraw_params.removeHs = params.removeHs; + switch(params.format) { + case CDXMLFormat::CDX: { + chemdraw_params.format = CDXFormat::CDX; + break; + } + case CDXMLFormat::CDXML: { + chemdraw_params.format = CDXFormat::CDXML; break; + } + case CDXMLFormat::Auto: + { + chemdraw_params.format = CDXFormat::AUTO; break; + } + } + return MolsFromChemDrawDataStream(inStream, chemdraw_params); } std::vector> MolsFromCDXMLFile( const std::string &fileName, const CDXMLParserParams ¶ms) { - std::ifstream ifs(fileName); + std::ifstream ifs(fileName, std::ios::binary); if (!ifs || ifs.bad()) { std::ostringstream errout; - errout << "Bad input file " << fileName; + errout << "Bad inxput file " << fileName; throw BadFileException(errout.str()); } - return MolsFromCDXMLDataStream(ifs, params); + + try { + return MolsFromCDXMLDataStream(ifs, params); + } catch (const std::exception &ex) { + std::ostringstream errout; + errout << "Bad input file " << fileName << " " << ex.what(); + + throw FileParseException(errout.str()); + } } std::vector> MolsFromCDXML( diff --git a/Code/GraphMol/FileParsers/CDXMLParser.h b/Code/GraphMol/FileParsers/CDXMLParser.h index 670be2a42..dd2899daa 100644 --- a/Code/GraphMol/FileParsers/CDXMLParser.h +++ b/Code/GraphMol/FileParsers/CDXMLParser.h @@ -21,15 +21,34 @@ class RWMol; namespace v2 { namespace CDXMLParser { + +enum class CDXMLFormat { + CDXML = 0, + CDX = 1, + Auto = 2 +}; + +//! \brief Returns true if the RDKit was build with ChemDraw CDX support +RDKIT_FILEPARSERS_EXPORT bool hasChemDrawCDXSupport(); + struct RDKIT_FILEPARSERS_EXPORT CDXMLParserParams { bool sanitize = true; bool removeHs = true; + CDXMLFormat format = CDXMLFormat::Auto; + + CDXMLParserParams() = default; + CDXMLParserParams(bool sanitize, bool removeHs, CDXMLFormat format) : + sanitize(sanitize), removeHs(removeHs), format(format) {} }; + //! \brief construct molecules from a CDXML file -//! 
Note that the CDXML format is large and complex, the RDKit doesn't -//! support -//! full functionality, just the base ones required for molecule and -//! reaction parsing. +//! The RDKit is optionally built with the Revvity ChemDraw parser +//! If this is available, CDX and CDXML can be read, see CDXMLParserParams +//! Note that the CDXML format is large and complex, the RDKit doesn't +//! support full functionality, just the base ones required for molecule and +//! reaction parsing. +//! Note: If the ChemDraw extensions are available, this auto detects between +//! CDXML and CDX /*! * \param inStream - string containing the mol block * \param params - parameters controlling the parsing and post-processing @@ -38,30 +57,34 @@ RDKIT_FILEPARSERS_EXPORT std::vector> MolsFromCDXMLDataStream(std::istream &inStream, const CDXMLParserParams ¶ms = CDXMLParserParams()); //! \brief construct molecules from a CDXML file -//! Note that the CDXML format is large and complex, the RDKit doesn't -//! support -//! full functionality, just the base ones required for molecule and -//! reaction parsing. +//! The RDKit is optionally built with the Revvity ChemDraw parser +//! If this is available, CDX and CDXML can be read, see CDXMLParserParams +//! Note that the CDXML format is large and complex, the RDKit doesn't +//! support full functionality, just the base ones required for molecule and +//! reaction parsing. /*! * \param fileName - cdxml fileName * \param params - parameters controlling the parsing and post-processing */ RDKIT_FILEPARSERS_EXPORT std::vector> MolsFromCDXMLFile( const std::string &filename, - const CDXMLParserParams ¶ms = CDXMLParserParams()); + const CDXMLParserParams ¶ms = CDXMLParserParams(true, true, CDXMLFormat::Auto)); -//! \brief construct molecules from a CDXML file -//! Note that the CDXML format is large and complex, the RDKit doesn't -//! support -//! full functionality, just the base ones required for molecule and -//! reaction parsing. +//! 
\brief construct molecules from a CDXML block +//! The RDKit is optionally built with the Revvity ChemDraw parser +//! If this is available, CDX and CDXML can be read, see CDXMLParserParams +//! Note that the CDXML format is large and complex, the RDKit doesn't +//! support full functionality, just the base ones required for molecule and +//! reaction parsing. +//! Note: If the ChemDraw extensions are available, +//! CDXMLFormat::Auto attempts to see if the input string is CDXML or CDX /*! * \param cdxml - string containing the mol block * \param params - parameters controlling the parsing and post-processing */ RDKIT_FILEPARSERS_EXPORT std::vector> MolsFromCDXML( const std::string &cdxml, - const CDXMLParserParams ¶ms = CDXMLParserParams()); + const CDXMLParserParams ¶ms = CDXMLParserParams(true, true, v2::CDXMLParser::CDXMLFormat::Auto)); } // namespace CDXMLParser } // namespace v2 @@ -71,6 +94,8 @@ inline namespace v1 { //! Note that the CDXML format is large and complex, the RDKit doesn't support //! full functionality, just the base ones required for molecule and //! reaction parsing. +//! Note: If the ChemDraw extensions are available, this auto detects between +//! CDXML and CDX /*! * \param inStream - string containing the mol block * \param sanitize - toggles sanitization and stereochemistry @@ -81,9 +106,8 @@ inline namespace v1 { */ inline std::vector> CDXMLDataStreamToMols( std::istream &inStream, bool sanitize = true, bool removeHs = true) { - v2::CDXMLParser::CDXMLParserParams params; - params.sanitize = sanitize; - params.removeHs = removeHs; + v2::CDXMLParser::CDXMLParserParams params( + sanitize, removeHs, v2::CDXMLParser::CDXMLFormat::Auto); return v2::CDXMLParser::MolsFromCDXMLDataStream(inStream, params); } @@ -91,6 +115,9 @@ inline std::vector> CDXMLDataStreamToMols( //! Note that the CDXML format is large and complex, the RDKit doesn't support //! full functionality, just the base ones required for molecule and //! reaction parsing. +//! 
Note: If the ChemDraw extensions are available, +//! This function uses the file extension to determine the file type, .cdx or .cdxml +//! If not, it defaults to CDXML /*! * \param fileName - cdxml fileName * \param sanitize - toggles sanitization and stereochemistry @@ -104,13 +131,15 @@ inline std::vector> CDXMLFileToMols( v2::CDXMLParser::CDXMLParserParams params; params.sanitize = sanitize; params.removeHs = removeHs; + params.format = v2::CDXMLParser::CDXMLFormat::Auto; return v2::CDXMLParser::MolsFromCDXMLFile(filename, params); } -//! \brief construct molecules from a CDXML file +//! \brief construct molecules from a CDXML block //! Note that the CDXML format is large and complex, the RDKit doesn't support //! full functionality, just the base ones required for molecule and //! reaction parsing. +//! Note: to parse CDX files see the CDXParserParams variant of this function /*! * \param cdxml - string containing the mol block * \param sanitize - toggles sanitization and stereochemistry @@ -125,8 +154,10 @@ inline std::vector> CDXMLToMols(const std::string &cdxml, v2::CDXMLParser::CDXMLParserParams params; params.sanitize = sanitize; params.removeHs = removeHs; + params.format = v2::CDXMLParser::CDXMLFormat::Auto; return v2::CDXMLParser::MolsFromCDXML(cdxml, params); } } // namespace v1 + } // namespace RDKit -#endif //  _RD_CDXML_FILEPARSERS_H +#endif // RD_CDXML_FILEPARSERS_H diff --git a/Code/GraphMol/FileParsers/CMakeLists.txt b/Code/GraphMol/FileParsers/CMakeLists.txt index 4fda68533..655389d65 100644 --- a/Code/GraphMol/FileParsers/CMakeLists.txt +++ b/Code/GraphMol/FileParsers/CMakeLists.txt @@ -22,6 +22,10 @@ if(RDK_BUILD_MAEPARSER_SUPPORT) endif() endif() +if(RDK_BUILD_CHEMDRAW_SUPPORT) + add_definitions("-DRDK_BUILD_CHEMDRAW_SUPPORT") +endif() + rdkit_library(FileParsers CDXMLParser.cpp Mol2FileParser.cpp MolFileParser.cpp diff --git a/Code/GraphMol/FileParsers/cdxml_parser_catch.cpp b/Code/GraphMol/FileParsers/cdxml_parser_catch.cpp index 
18ada8af8..9a259fed3 100644 --- a/Code/GraphMol/FileParsers/cdxml_parser_catch.cpp +++ b/Code/GraphMol/FileParsers/cdxml_parser_catch.cpp @@ -19,6 +19,7 @@ #include #include #include +#include using namespace RDKit; using namespace RDKit::v2::CDXMLParser; @@ -865,7 +866,8 @@ TEST_CASE("CDXML") { #ifndef RDK_BUILD_CHEMDRAW_SUPPORT CHECK(std::string(e.what()) == "expected > at line: 373"); #else - CHECK(std::string(e.what()) == "Bad Input File"); + CHECK(std::string(e.what()).find("Failed parsing XML with error code 5") != + std::string::npos); #endif } } @@ -1320,3 +1322,116 @@ TEST_CASE("Github #7501 - dative bonds") { // Osmium } } + +#ifdef RDK_BUILD_CHEMDRAW_SUPPORT +struct format_check { + std::string filename; + bool stream, iscdx, cdxres, cdxmlres, autores; +}; + +TEST_CASE("CDX and Formats") { + std::string cdxmlbase = + std::string(getenv("RDBASE")) + "/Code/GraphMol/test_data/CDXML/"; + + std::string cdxbase = + std::string(getenv("RDBASE")) + "/Code/GraphMol/test_data/CDX/"; + + SECTION("READ CDX") { + auto cdxfname = cdxmlbase + "ring-stereo1.cdx"; + auto cdxmlfname = cdxmlbase + "ring-stereo1.cdxml"; + // should default to CDXFormat Auto + auto mols1 = MolsFromCDXMLFile(cdxfname); + auto mols2 = MolsFromCDXMLFile(cdxmlfname); + CHECK(MolToSmiles(*mols1[0]) == MolToSmiles(*mols2[0])); + } + + SECTION("Read CDX Files/Streams") { + const std::vector> tests { + {"structure_1.cdx", "C1CCOC1"}, + {"structure_2.cdx", "C1=CCN=C1"}, + {"structure_3.cdx", "CS(C)=O"}, + {"structure_4.cdx", "CCO"}, + {"structure_5.cdx", "c1cc[nH]c1"}, + {"structure_6.cdx", "c1ccoc1"} + }; + + for(const auto &test : tests) { + auto fname = cdxbase + test.first; + // Read the file + auto m = MolsFromCDXMLFile(fname); + CHECK(MolToSmiles(*m[0]) == test.second); + + // Read the CDX stream + auto size = std::filesystem::file_size(fname); + std::string content(size, '\0'); + std::ifstream in(fname, std::ios::binary); + in.read(&content[0], size); + + auto m2 = MolsFromCDXML(content); + 
CHECK(MolToSmiles(*m2[0]) == test.second); + } + } + + SECTION("READ CDX/CDXML Blocks") { + auto cdxfname = cdxmlbase + "ring-stereo1.cdx"; + auto cdxmlfname = cdxmlbase + "ring-stereo1.cdxml"; + + auto size = std::filesystem::file_size(cdxfname); + std::string content(size, '\0'); + std::ifstream in(cdxfname, std::ios::binary); + in.read(&content[0], size); + auto mols1 = MolsFromCDXML(content); + + auto size2 = std::filesystem::file_size(cdxmlfname); + std::string content2(size2, ' '); + std::ifstream in2(cdxmlfname, std::ios::binary); + in2.read(&content2[0], size2); + auto mols2 = MolsFromCDXML(content2); + CHECK(MolToSmiles(*mols1[0]) == MolToSmiles(*mols2[0])); + } + + SECTION("Check Formats") { + auto cdxfilename = cdxmlbase + "ring-stereo1.cdx"; + auto cdxmlfilename = cdxmlbase + "ring-stereo1.cdxml"; + std::vector checks { {cdxfilename, true, true, true, false, false}, + {cdxfilename, false, true, true, false, true}, + {cdxmlfilename, true, false, false, true, true}, + {cdxmlfilename, false, false, false, true, true}, + }; + std::vector formats = { CDXMLFormat::CDX, CDXMLFormat::CDXML, CDXMLFormat::Auto }; + + for(auto &check : checks) { + if(check.stream) { + } else { + for(auto format : formats) { + bool hasmols = false; + bool exception = false; + try { + auto mols = MolsFromCDXMLFile(check.filename, CDXMLParserParams(true, true, format)); + hasmols = mols.size() > 0; + // std::cerr << check.filename << " not stream " << (unsigned)format << " hasmols: " << hasmols << std::endl; + + } catch (...) 
{ + exception = true; + // std::cerr << check.filename << " not stream " << (unsigned)format << " exception" << std::endl; + } + + bool expected = false; + if(format == CDXMLFormat::CDX) + expected = check.cdxres; + else if(format == CDXMLFormat::CDXML) + expected = check.cdxmlres; + else + expected = check.autores; + + if (exception) + CHECK(expected == false); + else + CHECK(expected == hasmols); + } + } + } + } +} +#endif + diff --git a/Code/GraphMol/Wrap/rdmolfiles.cpp b/Code/GraphMol/Wrap/rdmolfiles.cpp index 7145fb01a..e087db509 100644 --- a/Code/GraphMol/Wrap/rdmolfiles.cpp +++ b/Code/GraphMol/Wrap/rdmolfiles.cpp @@ -740,6 +740,47 @@ python::object MolsFromCDXMLFile(const std::string &filename, bool sanitize, return python::tuple(res); } +python::tuple MolsFromCDXMLHelper(python::object cdxml, python::object pyParams) { + RDKit::v2::CDXMLParser::CDXMLParserParams params; + if (pyParams) { + params = python::extract(pyParams); + } + auto mols = RDKit::v2::CDXMLParser::MolsFromCDXML(pyObjectToString(cdxml), params); + python::list res; + for (auto &mol : mols) { + // take ownership of the data from the unique_ptr + ROMOL_SPTR sptr(static_cast(mol.release())); + res.append(sptr); + } + return python::tuple(res); +} + + python::object MolsFromCDXMLFileHelper(const std::string &filename, + python::object pyParams) { + RDKit::v2::CDXMLParser::CDXMLParserParams params( + true, true, RDKit::v2::CDXMLParser::CDXMLFormat::Auto); + if (pyParams) { + params = python::extract(pyParams); + } + std::vector> mols; + try { + mols = RDKit::v2::CDXMLParser::MolsFromCDXMLFile(filename, params); + } catch (RDKit::BadFileException &e) { + PyErr_SetString(PyExc_IOError, e.what()); + throw python::error_already_set(); + } catch (RDKit::FileParseException &e) { + BOOST_LOG(rdWarningLog) << e.what() << std::endl; + } catch (...) 
{ + } + python::list res; + for (auto &mol : mols) { + // take ownership of the data from the unique_ptr + ROMOL_SPTR sptr(static_cast(mol.release())); + res.append(sptr); + } + return python::tuple(res); +} + python::tuple MolsFromCDXML(python::object cdxml, bool sanitize, bool removeHs) { auto mols = CDXMLToMols(pyObjectToString(cdxml), sanitize, removeHs); @@ -751,7 +792,6 @@ python::tuple MolsFromCDXML(python::object cdxml, bool sanitize, } return python::tuple(res); } - namespace { PyObject *translateMetadata( const std::vector> &metadata, @@ -2599,6 +2639,85 @@ BOOST_PYTHON_MODULE(rdmolfiles) { python::arg("removeHs") = true), docString.c_str()); + python::enum_("CDXMLFormat") + .value("CDXML", + RDKit::v2::CDXMLParser::CDXMLFormat::CDXML) + .value("CDX", + RDKit::v2::CDXMLParser::CDXMLFormat::CDX) + .value("Auto", RDKit::v2::CDXMLParser::CDXMLFormat::Auto); + + python::class_( + "CDXMLParserParams", + "Parameters controlling conversion of a CDXML document to molecules", + python::init<>(python::args("self"), "Construct a default CDXMLFormat")) + .def(python::init( + python::args("self", "sanitize", "removeHs", "format"))) + .def_readwrite( + "sanitize", + &RDKit::v2::CDXMLParser::CDXMLParserParams::sanitize, + "controls whether or not the molecule is sanitized before " + "being returned") + .def_readwrite( + "removeHs", + &RDKit::v2::CDXMLParser::CDXMLParserParams::removeHs, + "controls whether or not Hs are removed before the " + "molecule is returned") + .def_readwrite( + "format", + &RDKit::v2::CDXMLParser::CDXMLParserParams::format, + "ChemDraw format One of Auto, CDXML, CDX. For data streams, Auto defaults to CDXML"); + + docString = + R"DOC(Construct a molecule from a cdxml file. + + Note: that the CDXML format is large and complex, the RDKit doesn't support + full functionality, just the base ones required for molecule and + reaction parsing. 
+ + Note: If the ChemDraw extensions are available, + CDXMLFormat::Auto attempts to see if the input string is CDXML or CDX, + If not, it defaults to CDXML + + ARGUMENTS: + + - filename: the cdxml filename + + - pyParams: CDXParserParams, see CDXParserParams for usage + + RETURNS: + a tuple of parsed Mol objects.)DOC"; + + python::def("MolsFromCDXMLFile", MolsFromCDXMLFileHelper, + (python::arg("filename"), + python::arg("params")), + docString.c_str()); + + docString = + R"DOC(Construct a molecule from a cdxml string. + + Note that the CDXML format is large and complex, the RDKit doesn't support + full functionality, just the base ones required for molecule and + reaction parsing. + + Note: in this function CDXMLFormat::Auto currently defaults to CDXML + + ARGUMENTS: + + - cdxml: the cdxml string + + - pyParams: CDXParserParams, see CDXParserParams for usage + + RETURNS: + a tuple of parsed Mol objects.)DOC"; + + python::def("MolsFromCDXML", MolsFromCDXMLHelper, + (python::arg("cdxml"), + python::arg("params")), + docString.c_str()); + + docString = "Returns true if the RDKit is built with ChemDraw CDX support"; + python::def("HasChemDrawCDXSupport", RDKit::v2::CDXMLParser::hasChemDrawCDXSupport, docString.c_str()); + #ifdef RDK_USE_BOOST_IOSTREAMS docString = R"DOC(Adds molecular metadata to PNG data read from a file. 
diff --git a/Code/GraphMol/Wrap/test_cdxml.py b/Code/GraphMol/Wrap/test_cdxml.py index ec14b4c96..c61fa0e74 100644 --- a/Code/GraphMol/Wrap/test_cdxml.py +++ b/Code/GraphMol/Wrap/test_cdxml.py @@ -281,6 +281,101 @@ class TestCase(unittest.TestCase): self.assertEqual(len(mols), 1) self.assertEqual(Chem.MolToSmiles(mols[0]), "CC(C)(C)OC(=O)C1CCCCCC1") + mols = Chem.MolsFromCDXML(cdxml, True, False) + self.assertEqual(len(mols), 1) + self.assertEqual(Chem.MolToSmiles(mols[0]), "CC(C)(C)OC(=O)C1CCCCCC1") + + mols = Chem.MolsFromCDXML(cdxml, False, False) + self.assertEqual(len(mols), 1) + self.assertEqual(Chem.MolToSmiles(mols[0]), "CC(C)(C)OC(=O)C1CCCCCC1") + + params = Chem.CDXMLParserParams() + mols = Chem.MolsFromCDXML(cdxml, params) + self.assertEqual(len(mols), 1) + self.assertEqual(Chem.MolToSmiles(mols[0]), "CC(C)(C)OC(=O)C1CCCCCC1") + + params.sanitize = True + params.removeHs = False + mols = Chem.MolsFromCDXML(cdxml, params) + self.assertEqual(len(mols), 1) + self.assertEqual(Chem.MolToSmiles(mols[0]), "CC(C)(C)OC(=O)C1CCCCCC1") + + params.sanitize = False + params.removeHs = False + + mols = Chem.MolsFromCDXML(cdxml, params) + self.assertEqual(len(mols), 1) + self.assertEqual(Chem.MolToSmiles(mols[0]), "CC(C)(C)OC(=O)C1CCCCCC1") + + + + def test_cdxml(self): + try: from rdkit.Chem import rdChemDraw + except: + return + + rdbase = os.environ['RDBASE'] + cdxfilename = os.path.join(rdbase, + 'Code/GraphMol/test_data/CDXML/ring-stereo1.cdx') + mols = Chem.MolsFromCDXMLFile(cdxfilename) + filename = os.path.join(rdbase, + 'Code/GraphMol/test_data/CDXML/ring-stereo1.cdxml') + mols2 = Chem.MolsFromCDXMLFile(filename) + smi1 = [Chem.MolToSmiles(m) for m in mols] + smi2 = [Chem.MolToSmiles(m) for m in mols2] + self.assertEqual(smi1, smi2) + + self.assertEqual(smi1, ['C1CC[C@H]2CCCC[C@H]2C1']) + with open(cdxfilename, 'rb') as f: + data = f.read() + params = Chem.CDXMLParserParams(True, True, Chem.CDXMLFormat.CDX) + mols3 = Chem.MolsFromCDXML(data, params) + smi3 = 
[Chem.MolToSmiles(m) for m in mols3] + self.assertEqual(smi1, smi3) + + + def test_formats(self): + try: + from rdkit.Chem import rdChemDraw + self.assertEqual(Chem.HasChemDrawCDXSupport(),True) + except: + self.assertEqual(Chem.HasChemDrawCDXSupport(),False) + return + + rdbase = os.environ['RDBASE'] + cdxfilename = os.path.join(rdbase, + 'Code/GraphMol/test_data/CDXML/ring-stereo1.cdx') + mols = Chem.MolsFromCDXMLFile(cdxfilename) + cdxmlfilename = os.path.join(rdbase, + 'Code/GraphMol/test_data/CDXML/ring-stereo1.cdxml') + + tests = [ + # we can deduce extensions from filenames, but not from streams (yet!) + # filename, Stream, IsCDX, CDX res, CDXML res, Auto Res + (cdxfilename, True, True, True, False, False), + (cdxfilename, False, True, True, False, True), + (cdxmlfilename, True, False, False, True, True), + (cdxmlfilename, False, False, False, True, True), + ] + + for filename, stream, iscdx, cdxres, cdxmlres, autores in tests: + for format, res in zip([Chem.CDXMLFormat.CDX, Chem.CDXMLFormat.CDXML, Chem.CDXMLFormat.Auto], + [cdxres, cdxmlres, autores]): + if stream: + with open(filename, 'rb') as f: + data = f.read() + try: + mols = Chem.MolsFromCDXML(data, Chem.CDXMLParserParams(True, True, format)) + if res: assert mols + except RuntimeError: + assert res == False + else: + mols = Chem.MolsFromCDXMLFile(filename, Chem.CDXMLParserParams(True, True, format)) + if res: assert mols + + + + if __name__ == '__main__': if "RDTESTCASE" in os.environ: diff --git a/Code/GraphMol/test_data/CDX/structure_1.cdx b/Code/GraphMol/test_data/CDX/structure_1.cdx new file mode 100644 index 0000000000000000000000000000000000000000..979a67df92d699c95f4056c2490004c2f0311e4b GIT binary patch literal 568 zcmZWm%Sr<=6upy4>eL5x?V>~x7cMk2)52HjlbA?td`I&yyB5j=d zt{)zU5;gfo7-}sx32T8a&QL||M^M!`zl=yTUd|z}>!?y-<0{6iE8Zk3k8E{Qq1|k?$HJKK1t| zr4>33m33f8MFVt02XnCcO2N|<^2`lZUmSQEMV>i%Uk`YiN1i$P@bRXiq3idr48ex{ E0=i^FIRF3v literal 0 HcmV?d00001 diff --git 
a/Code/GraphMol/test_data/CDX/structure_2.cdx b/Code/GraphMol/test_data/CDX/structure_2.cdx new file mode 100644 index 0000000000000000000000000000000000000000..41a0b3b4dd8af88d46609772f646dacdd1d8786d GIT binary patch literal 620 zcmZuuy-EW?5T5Mqo)@E#%EBV7V55cZT@pB>g_wY1CxZ9@NnsGdfY$peK7&~L4*Cjq zK7!TxeY+f<#({5lX7-!EovZnH;&SJ7qFVAJJ{ZSC*)n28K{;|mTR2y0)s?r;*Gi;^ zsE_LT=~P;6)`+4Sy2;oJcu9dSsvwb7A->k=re2@XuBw(-@FqUNsXa4^%H;lQ2TD&4 zHHJRoaiGa?Bynt#+{Ou%Wl~NO|D`6e`@fkAlK63Tx&KIglETy|$5NZ`MI)2a3V$Cv z8(K%fTHwg`Z2IpG_Iup)UMf6#ca6j~{G2m)Fb_{HYQpowSR$xN1g6F`lbt P1S>LA7=I~(ef*YhXEjCR literal 0 HcmV?d00001 diff --git a/Code/GraphMol/test_data/CDX/structure_3.cdx b/Code/GraphMol/test_data/CDX/structure_3.cdx new file mode 100644 index 0000000000000000000000000000000000000000..5ceeda515b2f958ce7fda6e309ea294d928a0f7b GIT binary patch literal 483 zcmZXR!AiqG5QZnan>6**lZeo=1uuHAo3uz%JqRf(3LX^12S^VV@leonUPT|kix;29 zLk~WN>;EU#6gn`w*_kghv;W@B^QlXmGg<}GSXmuT_oZ({i$NlUNAMOrhPUBc@V(hn zv3)bY&~kuNerxmTy)70GFSCUm9y&X)m(P#ZCDx56qiixtZR*_EsE&MpeR}V^qGah` zudXF%vu0ds5#@!6{$RvSH2y;qqSS@x+xwaDCQa44UP_3<5!&saW8CfrG=#<_6)=&h e)DE#%^F|69F-y&vWxxL{;(Q0j*`kT#P}whF$V86- literal 0 HcmV?d00001 diff --git a/Code/GraphMol/test_data/CDX/structure_4.cdx b/Code/GraphMol/test_data/CDX/structure_4.cdx new file mode 100644 index 0000000000000000000000000000000000000000..72c6f51d8dbfe99a8815fcb280a131117a260b86 GIT binary patch literal 402 zcmZWlJxc>Y5S`ndi?Op-2ph1n(799LE~XGJpxB8Zf^AY5#3G>e{s#Y#m7PCB{2%Lk zw;T~1n9qIlW_Iq^-NGs}^{$Yu#6G#4%G8S!#X|`lL3f~I=;7*R`FXp(_Hu-*y9?do zAuKmfZ>vo>J+^QXZeE{*Rl%CsSu?BJP@6UC_T=N>D~U|d^_}(~ucX)|Ul#W#F2|e; zoUbu4^^#Eu{X@hu$h1+kLoI328`Dz?2@>Dv~d_MS|A9b$5I`a~~r$n6P7sK>8!vFvP literal 0 HcmV?d00001 diff --git a/Code/GraphMol/test_data/CDX/structure_5.cdx b/Code/GraphMol/test_data/CDX/structure_5.cdx new file mode 100644 index 0000000000000000000000000000000000000000..66c90e4defe5ca0cf866f93e3988e6201a4d6052 GIT binary patch literal 620 
zcmZWn%}N6?5T0g}wY3U8dni%Fg9pvFE38$pxIL8KL=YdK1zUd(1wH2}d=C#kfls0b zUn73shSJs!eA$`go5@VRy_=N-mpP{sRg;SvK z;pAofaWfkxa*U|FwdLU6j_1?2$=r5NoISOp*C*>T>xy1p^x$-yE3|6K@ANZ>w2}1_ z9v+8MYw(U}YQ;Bo_5xngK^LL2$O_0`c-*GHSYsSQ%`5CCKEbLzD~YPh*Ygpawp?fo zeaYoqli^6>*d!gauHkHwGLrZ%B<;PAU6CVGbwU3 W;SvK z;pAofaWfkxa*U|FwdLU6j_1?2$=r5NoISOp*C*>T>xy1p^qjLD=L)S_@;m(uB5h>- zgonqW)Eayvnp*KqoxOmUbkIepEV2Uf7aou4FV+}`Q1c3>iBGU<&q|`|^7VWKr!5y6 zLtk<^*JL=7I5tTKt!p@&q>Lnf7n1hgNAle-HY#;U;)f%tdzA}E{Qq2+q-tuA_|$qO zr4=TQo#l|jQvjXnC=T|j(^QF)X4=ACl!OvRGoC&($TZliwo!=^@|-cxt4`M{F_R)k WGoC&($Ye$o%Xt0-t3546Wab~|`9qul literal 0 HcmV?d00001 diff --git a/Code/GraphMol/test_data/CDXML/ring-stereo1.cdx b/Code/GraphMol/test_data/CDXML/ring-stereo1.cdx new file mode 100644 index 0000000000000000000000000000000000000000..2dca0711ac53969cf5409ffa9b8fdc611a9bfa2a GIT binary patch literal 1812 zcmZ9M-)|E~5XWct?(FlO6PtuYK_IjeND2LsCgCB{wL#<;SO`l|s6f17!Zo-$+vseF z(JM-Du-I>|D{N?J}-2B{} zZD~bW>tSfoILp?U;qrv-7hHvx_c`u{y3dw?A%o!Z>Wr&u-o5+PX|*&ns%_ z+u!N0GvBDa+bc>HT=(|0ww+!JZaORp#=y-kkv$H{gzb;*)hmrn3;oFWUF z!NE>Jd;qZk-G?k@K4Rf%x6EEo4*iwIHzLiFQEP!*>PurKw|;*hlYV#xE&tUxuT^}X z@^pbJVRR?#)VFHnc98+13jet2|Mc1Z-~CsXr#G!_XU6)M?>A({3Frp@*_*;xO_wm{ zQPwmaQQyh*Y^oe4Sbfij>-+v;mPkJzCo<<+apQJW3(WWaDC^g|wXKe+*4pjB{2*kt z6@^<&#O6v^-@e%i%#VJa$;KyN1?DGz5{sg+5jH#QZMj*CcH4Dc{J=lNmX%gL;)*gG zt$LGJJ``m&j5_~QJ`&|>Tt|E&^Lf~~6`04v%8fW=pHF;bDkU$sqQE>A-TC_U>us~% z4CC$Hz&!H{%w25pzJYn}=Xg@x49uQi;!!1R#=FyPbFsD)n4e{JC9HK!>kElRV1DsY z-kpt0%Ype-LiU?Tzldt^iX7JhM6u{FH*?~Y>NR zB2`opY4xV|R~~$09+!b4yC+H1IM7wK_s_C|UehAK7FD%vM$qgY5=(+zP=CIi7R<^p zF*An38(>#?<)~mb&9S#UTsrf1nMN2vCEz20hHYdW**!eJWSis88^URvY#T8%hPWi# zEO~zjmdh|PGlskjupO0P`3w^?V<@%(_O(hjbu&!Nj3Gvex8!bx19!un@Rh?0K*QZ+ z9oapUOtMW9LND7!%#3*{wv#5r&oD8Q({QNU;=_r ztDGh~1Cb4YqXGf|h+wC_P}o39g1`g>fld)rCy=rrFabf3Qv@#-$RR;s0)hai2%-mM zOc0oWAchAHjZgy|7ce +%} +%ignore RDKit::v2::MolsFromChemDrawDataStream; +%ignore RDKit::v2::MolsFromChemDrawFile; +%ignore RDKit::v2::MolsFromChemDrawBlock; +%rename("ChemDraw") 
RDKit::v2; +%rename(CDXFormat) RDKit::v2::CDXFormat; +%rename(ChemDrawParserParams) RDKit::v2::ChemDrawParserParams; + +%include + +%{ +std::vector MolsFromChemDrawBlockHelper( + const std::string &text, + const RDKit::v2::ChemDrawParserParams ¶ms=RDKit::v2::ChemDrawParserParams()) { + auto res = RDKit::v2::MolsFromChemDrawBlock(text, params); + std::vector mols; + for(auto &mol: res) { + mols.emplace_back(mol.release()); + } + return mols; + +} + +std::vector MolsFromChemDrawFileHelper( + const std::string &filename, + const RDKit::v2::ChemDrawParserParams ¶ms=RDKit::v2::ChemDrawParserParams()) { + auto res = RDKit::v2::MolsFromChemDrawFile(filename, params); + std::vector mols; + for(auto &mol: res) { + mols.emplace_back(mol.release()); + } + return mols; +} + +%} + +%rename("MolsFromChemDrawBlock") MolsFromChemDrawBlockHelper; +%rename("MolsFromChemDrawFile") MolsFromChemDrawFileHelper; + +std::vector MolsFromChemDrawBlockHelper( + const std::string &text, + const RDKit::v2::ChemDrawParserParams ¶ms=RDKit::v2::ChemDrawParserParams()); + +std::vector MolsFromChemDrawFileHelper( + const std::string &filename, + const RDKit::v2::ChemDrawParserParams ¶ms=RDKit::v2::ChemDrawParserParams()); + diff --git a/Code/JavaWrappers/RWMol.i b/Code/JavaWrappers/RWMol.i index cec414c2b..6e2599095 100644 --- a/Code/JavaWrappers/RWMol.i +++ b/Code/JavaWrappers/RWMol.i @@ -36,6 +36,7 @@ #include #include #include +#include #include #include #include @@ -58,10 +59,41 @@ #if swifjava %javaconst(1); #endif -%include -%ignore RDKit::v2; + %ignore RDKit::v2::SmilesParse; + +%ignore RDKit::v1::CDXMLDataStreamToMols; +%ignore RDKit::v1::CDXMLFileToMols; +%ignore RDKit::v1::CDXMLToMols; + +%ignore RDKit::v2::CDXMLParser::MolsFromCDXMLDataStream; +%ignore RDKit::v2::CDXMLParser::MolsFromCDXML; +%ignore RDKit::v2::CDXMLParser::MolsFromCDXMLFile; +%ignore *::MolsFromCDXMLDataStream; + +%include +%include %include + + +%typemap(cscode) RDKit::RWMol %{ + public static RWMol_Vect 
MolsFromCDXMLByteArray( + byte[] text, bool sanitize=true, bool removeHs=true) { + UChar_Vect vec = null; + try { + vec = new UChar_Vect(); + vec.Capacity = text.Length; + for (int i = 0; i < text.Length; ++i) { + vec.Add((byte)text[i]); + } + return RWMol.MolsFromCDXML(vec, sanitize, removeHs); + } finally { + if (vec != null) { + vec.Dispose(); + } + } + } +%} %include %extend RDKit::RWMol { @@ -140,8 +172,8 @@ static RDKit::RWMOL_SPTR MolFromHELM(const std::string &text, } static std::vector MolsFromCDXML(const std::string &text, - bool sanitize=true){ - auto res = RDKit::CDXMLToMols(text, sanitize); + bool sanitize=true, bool removeHs=true){ + auto res = RDKit::CDXMLToMols(text, sanitize, removeHs); std::vector mols; for(auto &mol: res) { mols.emplace_back(mol.release()); @@ -150,9 +182,36 @@ static std::vector MolsFromCDXML(const std::string &text, } -static std::vector MolsFromCDXMLFile(const std::string &text, - bool sanitize=true){ - auto res = RDKit::CDXMLFileToMols(text, sanitize); +static std::vector MolsFromCDXML( + const std::vector &text, bool sanitize=true, bool removeHs=true) { + std::string str(text.begin(), text.end()); + RDKit::v2::CDXMLParser::CDXMLParserParams params; + params.sanitize=sanitize; + params.removeHs=removeHs; + auto res = RDKit::v2::CDXMLParser::MolsFromCDXML(str, params); + std::vector mols; + for(auto &mol: res) { + mols.emplace_back(mol.release()); + } + return mols; +} + +static std::vector MolsFromCDXML( + const std::string &text, + const RDKit::v2::CDXMLParser::CDXMLParserParams ¶ms=RDKit::v2::CDXMLParser::CDXMLParserParams()) { + auto res = RDKit::v2::CDXMLParser::MolsFromCDXML(text, params); + std::vector mols; + for(auto &mol: res) { + mols.emplace_back(mol.release()); + } + return mols; + +} + +static std::vector MolsFromCDXMLFile( + const std::string &filename, + const RDKit::v2::CDXMLParser::CDXMLParserParams ¶ms=RDKit::v2::CDXMLParser::CDXMLParserParams()) { + auto res = 
RDKit::v2::CDXMLParser::MolsFromCDXMLFile(filename, params); std::vector mols; for(auto &mol: res) { mols.emplace_back(mol.release()); diff --git a/Code/JavaWrappers/csharp_wrapper/CMakeLists.txt b/Code/JavaWrappers/csharp_wrapper/CMakeLists.txt index 42c46e9f5..6881b72fe 100644 --- a/Code/JavaWrappers/csharp_wrapper/CMakeLists.txt +++ b/Code/JavaWrappers/csharp_wrapper/CMakeLists.txt @@ -54,7 +54,9 @@ endif() if (RDK_BUILD_CAIRO_SUPPORT) SET(CMAKE_SWIG_FLAGS "-DRDK_BUILD_CAIRO_SUPPORT" ${CMAKE_SWIG_FLAGS} ) endif() - +if (RDK_BUILD_CHEMDRAW_SUPPORT) + SET(CMAKE_SWIG_FLAGS "-DRDK_BUILD_CHEMDRAW_SUPPORT" ${CMAKE_SWIG_FLAGS} ) +endif() FILE(GLOB SWIG_SRC_FILES "${CMAKE_CURRENT_SOURCE_DIR}/../*.i") @@ -67,6 +69,10 @@ if(NOT RDK_BUILD_INCHI_SUPPORT) LIST(REMOVE_ITEM SWIG_SRC_FILES "${CMAKE_CURRENT_SOURCE_DIR}/../Inchi.i") endif() +if(NOT RDK_BUILD_CHEMDRAW_SUPPORT) +LIST(REMOVE_ITEM SWIG_SRC_FILES "${CMAKE_CURRENT_SOURCE_DIR}/../ChemDraw.i") +endif() + SET(SWIG_MODULE_RDKFuncs_EXTRA_DEPS ${SWIG_SRC_FILES} ) SWIG_ADD_LIBRARY(RDKFuncs TYPE MODULE LANGUAGE CSharp SOURCES GraphMolCSharp.i ) @@ -75,8 +81,13 @@ SWIG_ADD_LIBRARY(RDKFuncs TYPE MODULE LANGUAGE CSharp SOURCES GraphMolCSharp.i ) # it doesnt seem like the threading libs should need to be here, but # as of Oct 2012 using boost 1.51 under at least ubuntu 12.04 we get a # link error if they aren't there. 
-SWIG_LINK_LIBRARIES(RDKFuncs ${RDKit_Wrapper_Libs} - rdkit_base ${RDKit_THREAD_LIBS} ) +if(RDK_BUILD_CHEMDRAW_SUPPORT) + SWIG_LINK_LIBRARIES(RDKFuncs ${RDKit_Wrapper_Libs} + rdkit_base ${RDKit_THREAD_LIBS} ChemDraw expat) +else () + SWIG_LINK_LIBRARIES(RDKFuncs ${RDKit_Wrapper_Libs} + rdkit_base ${RDKit_THREAD_LIBS}) +endif() INSTALL(TARGETS RDKFuncs DESTINATION ${CMAKE_CURRENT_SOURCE_DIR} ) diff --git a/Code/JavaWrappers/csharp_wrapper/RdkitTests/MolToFromByteArray.cs b/Code/JavaWrappers/csharp_wrapper/RdkitTests/MolToFromByteArray.cs index fafa551b8..2f9b08639 100644 --- a/Code/JavaWrappers/csharp_wrapper/RdkitTests/MolToFromByteArray.cs +++ b/Code/JavaWrappers/csharp_wrapper/RdkitTests/MolToFromByteArray.cs @@ -59,5 +59,16 @@ namespace RdkitTests } } + [Fact] + public void MolFromCDX() { + var fileName = + Path.Combine(Environment.GetEnvironmentVariable("RDBASE"), + "Code", "GraphMol", "test_data", "CDX", "structure_1.cdx"); + + byte[] pkl = File.ReadAllBytes(fileName); + var mols = RWMol.MolsFromCDXMLByteArray(pkl); + Assert.True(mols.Count >= 1); + + } } -} \ No newline at end of file +} diff --git a/Code/JavaWrappers/gmwrapper/CMakeLists.txt b/Code/JavaWrappers/gmwrapper/CMakeLists.txt index 889e568e9..8c3ce7898 100644 --- a/Code/JavaWrappers/gmwrapper/CMakeLists.txt +++ b/Code/JavaWrappers/gmwrapper/CMakeLists.txt @@ -92,6 +92,9 @@ endif() if (RDK_BUILD_CAIRO_SUPPORT) SET(CMAKE_SWIG_FLAGS "-DRDK_BUILD_CAIRO_SUPPORT" ${CMAKE_SWIG_FLAGS} ) endif() +if(RDK_BUILD_CHEMDRAW_SUPPORT) + SET(CMAKE_SWIG_FLAGS "-DRDK_BUILD_CHEMDRAW_SUPPORT" ${CMAKE_SWIG_FLAGS} ) +endif() # enable this line to build the ErrorGenerator class for testing handling of C++ errors in the JNI layer @@ -108,6 +111,11 @@ if(NOT RDK_BUILD_INCHI_SUPPORT) LIST(REMOVE_ITEM SWIG_SRC_FILES "${CMAKE_CURRENT_SOURCE_DIR}/../Inchi.i") endif() +if(NOT RDK_BUILD_CHEMDRAW_SUPPORT) +LIST(REMOVE_ITEM SWIG_SRC_FILES "${CMAKE_CURRENT_SOURCE_DIR}/../ChemDraw.i") +endif() + + 
SET(SWIG_MODULE_GraphMolWrap_EXTRA_DEPS ${SWIG_SRC_FILES} ) SWIG_ADD_LIBRARY(GraphMolWrap TYPE MODULE LANGUAGE java SOURCES GraphMolJava.i ) @@ -115,8 +123,14 @@ SWIG_ADD_LIBRARY(GraphMolWrap TYPE MODULE LANGUAGE java SOURCES GraphMolJava.i ) # it doesnt seem like the threading libs should need to be here, but # as of Oct 2012 using boost 1.51 under at least ubuntu 12.04 we get a # link error if they aren't there. -SWIG_LINK_LIBRARIES(GraphMolWrap ${RDKit_Wrapper_Libs} + +if(RDK_BUILD_CHEMDRAW_SUPPORT) + SWIG_LINK_LIBRARIES(GraphMolWrap ${RDKit_Wrapper_Libs} + rdkit_base ${RDKit_THREAD_LIBS} ChemDraw expat) +else () + SWIG_LINK_LIBRARIES(GraphMolWrap ${RDKit_Wrapper_Libs} rdkit_base ${RDKit_THREAD_LIBS}) +endif() # code adapted from the wrapper code for # GDCM: http://gdcm.svn.sf.net/viewvc/gdcm/trunk/Wrapping/Java/CMakeLists.txt?view=markup @@ -169,6 +183,10 @@ if(NOT RDK_BUILD_INCHI_SUPPORT) LIST(REMOVE_ITEM JAVA_TEST_FILES "${CMAKE_CURRENT_SOURCE_DIR}/src-test/org/RDKit/InchiTests.java") endif() +if(NOT RDK_BUILD_CHEMDRAW_SUPPORT) +LIST(REMOVE_ITEM JAVA_TEST_FILES "${CMAKE_CURRENT_SOURCE_DIR}/src-test/org/RDKit/ChemDrawTests.java") +endif() + if(NOT RDK_USE_BOOST_IOSTREAMS) LIST(REMOVE_ITEM JAVA_TEST_FILES "${CMAKE_CURRENT_SOURCE_DIR}/src-test/org/RDKit/GzStreamTests.java") endif() @@ -323,6 +341,13 @@ if (RDK_BUILD_INCHI_SUPPORT) org.RDKit.InchiTests) endif (RDK_BUILD_INCHI_SUPPORT) +if (RDK_BUILD_CHEMDRAW_SUPPORT) + ADD_TEST(JavaChemDrawTests + java -Djava.library.path=${CMAKE_CURRENT_SOURCE_DIR} + -cp "${JUNIT_JAR}${PATH_SEP}${CMAKE_JAVA_TEST_OUTDIR}${PATH_SEP}${CMAKE_CURRENT_SOURCE_DIR}/org.RDKit.jar" + org.RDKit.ChemDrawTest) +endif (RDK_BUILD_INCHI_SUPPORT) + #ADD_TEST(JavaMemoryTests # java -Djava.library.path=${CMAKE_CURRENT_SOURCE_DIR} # -cp "${JUNIT_JAR}${PATH_SEP}${CMAKE_JAVA_TEST_OUTDIR}${PATH_SEP}${CMAKE_CURRENT_SOURCE_DIR}/org.RDKit.jar" diff --git a/Code/JavaWrappers/gmwrapper/GraphMolJava.i b/Code/JavaWrappers/gmwrapper/GraphMolJava.i index 
ab1312cac..8f350857f 100644 --- a/Code/JavaWrappers/gmwrapper/GraphMolJava.i +++ b/Code/JavaWrappers/gmwrapper/GraphMolJava.i @@ -310,7 +310,9 @@ typedef unsigned long long int uintmax_t; #ifdef RDK_BUILD_INCHI_SUPPORT %include "../Inchi.i" #endif - +#ifdef RDK_BUILD_CHEMDRAW_SUPPORT +%include "../ChemDraw.i" +#endif %include "../DiversityPick.i" %{ diff --git a/Code/JavaWrappers/gmwrapper/src-test/org/RDKit/ChemDrawTest.java b/Code/JavaWrappers/gmwrapper/src-test/org/RDKit/ChemDrawTest.java new file mode 100644 index 000000000..5846896d8 --- /dev/null +++ b/Code/JavaWrappers/gmwrapper/src-test/org/RDKit/ChemDrawTest.java @@ -0,0 +1,91 @@ +/* + * Copyright (c) 2025, Glysade Inc + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * * Neither the name of Novartis Institutes for BioMedical Research Inc. + * nor the names of its contributors may be used to endorse or promote + * products derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +package org.RDKit; +import static org.junit.Assert.*; +import java.io.File; + +import org.junit.Test; + +public class ChemDrawTest extends GraphMolTest { + @Test public void testChemDrawReader() { + String rdpath = System.getenv("RDBASE"); + if (rdpath == null) + org.junit.Assert.fail("No definition for RDBASE"); + File base = new File(rdpath); + File testFile = new File(base, "Code" + File.separator + "GraphMol" + + File.separator + "test_data" + File.separator + + "CDXML" + File.separator + "beta-cypermethrin.cdxml"); + String fn = testFile.getAbsolutePath(); + RWMol_Vect prods = RDKFuncs.MolsFromChemDrawFile(fn); + assertEquals(prods.size(), 1); + for(int idx = 0; idx < prods.size(); idx++) { + if(idx == 0) { + System.out.print(prods.get(idx).MolToSmiles(true)); + System.out.print("\n"); + assertEquals(prods.get(idx).MolToSmiles(true), "CC1(C)[C@H](C=C(Cl)Cl)[C@H]1C(=O)O[C@@H](C#N)c1cccc(Oc2ccccc2)c1"); + } + } + ChemDrawParserParams params = new ChemDrawParserParams(); + prods = RDKFuncs.MolsFromChemDrawFile(fn, params); + assertEquals(prods.size(), 1); + for(int idx = 0; idx < prods.size(); idx++) { + if(idx == 0) { + System.out.print(prods.get(idx).MolToSmiles(true)); + System.out.print("\n"); + assertEquals(prods.get(idx).MolToSmiles(true), "CC1(C)[C@H](C=C(Cl)Cl)[C@H]1C(=O)O[C@@H](C#N)c1cccc(Oc2ccccc2)c1"); + } + } + + testFile = new File(base, "Code" + File.separator + "GraphMol" + + File.separator + "test_data" + 
File.separator + + "CDXML" + File.separator + "ring-stereo1.cdx"); + fn = testFile.getAbsolutePath(); + params = new ChemDrawParserParams(true, true, CDXFormat.CDX); + prods = RDKFuncs.MolsFromChemDrawFile(fn, params); + assertEquals(prods.size(), 1); + + params = new ChemDrawParserParams(true, true, CDXFormat.CDXML); + boolean e = false; + try { + prods = RDKFuncs.MolsFromChemDrawFile(fn, params); + } catch(GenericRDKitException ex) { + e = true; + } + assertEquals(true, e); + + } + + + public static void main(String args[]) { + org.junit.runner.JUnitCore.main("org.RDKit.ChemDrawTest"); + } + +} diff --git a/Code/JavaWrappers/gmwrapper/src-test/org/RDKit/WrapperTests.java b/Code/JavaWrappers/gmwrapper/src-test/org/RDKit/WrapperTests.java index 3b2faec17..a09e91d56 100644 --- a/Code/JavaWrappers/gmwrapper/src-test/org/RDKit/WrapperTests.java +++ b/Code/JavaWrappers/gmwrapper/src-test/org/RDKit/WrapperTests.java @@ -347,9 +347,39 @@ public class WrapperTests extends GraphMolTest { assertEquals(prods.get(idx).MolToSmiles(true), "CC1(C)[C@H](C=C(Cl)Cl)[C@H]1C(=O)O[C@@H](C#N)c1cccc(Oc2ccccc2)c1"); } } + CDXMLParserParams params = new CDXMLParserParams(); + prods = RWMol.MolsFromCDXMLFile(fn, params); + assertEquals(prods.size(), 1); + for(int idx = 0; idx < prods.size(); idx++) { + if(idx == 0) { + System.out.print(prods.get(idx).MolToSmiles(true)); + System.out.print("\n"); + assertEquals(prods.get(idx).MolToSmiles(true), "CC1(C)[C@H](C=C(Cl)Cl)[C@H]1C(=O)O[C@@H](C#N)c1cccc(Oc2ccccc2)c1"); + } + } - } - + testFile = new File(base, "Code" + File.separator + "GraphMol" + + File.separator + "test_data" + File.separator + + "CDXML" + File.separator + "ring-stereo1.cdx"); + fn = testFile.getAbsolutePath(); + params = new CDXMLParserParams(true, true, CDXMLFormat.CDX); + prods = RWMol.MolsFromCDXMLFile(fn, params); + assertEquals(prods.size(), 1); + + params = new CDXMLParserParams(true, true, CDXMLFormat.Auto); + prods = RWMol.MolsFromCDXMLFile(fn, params); + 
assertEquals(prods.size(), 1); + + params = new CDXMLParserParams(true, true, CDXMLFormat.CDXML); + boolean e = false; + try { + prods = RWMol.MolsFromCDXMLFile(fn, params); + } catch(GenericRDKitException ex) { + e = true; + } + assertEquals(true, e); + + } public static void main(String args[]) { org.junit.runner.JUnitCore.main("org.RDKit.WrapperTests"); } diff --git a/Code/RDGeneral/RDConfig.h.cmake b/Code/RDGeneral/RDConfig.h.cmake index fb302cd99..741468fef 100644 --- a/Code/RDGeneral/RDConfig.h.cmake +++ b/Code/RDGeneral/RDConfig.h.cmake @@ -45,3 +45,5 @@ #cmakedefine RDK_BUILD_YAEHMOP_SUPPORT #cmakedefine RDK_BUILD_XYZ2MOL_SUPPORT + +#cmakedefine RDK_BUILD_CHEMDRAW_SUPPORT \ No newline at end of file diff --git a/External/ChemDraw/Wrap/rdChemDraw.cpp b/External/ChemDraw/Wrap/rdChemDraw.cpp index f24defa60..dd6b37aa7 100644 --- a/External/ChemDraw/Wrap/rdChemDraw.cpp +++ b/External/ChemDraw/Wrap/rdChemDraw.cpp @@ -70,7 +70,8 @@ python::object MolsFromChemDrawBlockHelper(const std::string &filename, bool san bool removeHs) { std::vector> mols; try { - mols = RDKit::v2::MolsFromChemDrawBlock(filename, {sanitize, removeHs}); + mols = RDKit::v2::MolsFromChemDrawBlock(filename, + {sanitize, removeHs, RDKit::v2::CDXFormat::CDXML}); } catch (RDKit::BadFileException &e) { PyErr_SetString(PyExc_IOError, e.what()); throw python::error_already_set(); @@ -89,7 +90,8 @@ python::object MolsFromChemDrawBlockHelper(const std::string &filename, bool san python::tuple MolsFromChemDrawFileHelper(python::object cdxml, bool sanitize, bool removeHs) { - auto mols = RDKit::v2::MolsFromChemDrawFile(pyObjectToString(cdxml), {sanitize, removeHs}); + auto mols = RDKit::v2::MolsFromChemDrawFile(pyObjectToString(cdxml), + {sanitize, removeHs, RDKit::v2::CDXFormat::CDXML}); python::list res; for (auto &mol : mols) { // take ownership of the data from the unique_ptr diff --git a/External/ChemDraw/chemdraw.cpp b/External/ChemDraw/chemdraw.cpp index 3b1892887..b82d908b5 100644 --- 
a/External/ChemDraw/chemdraw.cpp +++ b/External/ChemDraw/chemdraw.cpp @@ -237,8 +237,41 @@ void visit_children( } } +CDXFormat sniff_format(std::istream &is) { + // Remember the current read position + std::streampos start_pos = is.tellg(); + if (start_pos == -1) { + // Some streams (like std::cin) may not support tellg + return CDXFormat::AUTO; // here it simply means we failed + } + + // CDX header consists of: + // 8 bytes with the value "VjCD0100" (hex: 56 6A 43 44 30 31 30 30). + CDXFormat format = CDXFormat::CDXML; + const std::vector header{86, 106, 67, 68, 48, 49, 48, 48}; + std::vector buf(8); + is.read(buf.data(), 8); + if (buf == header) { + format = CDXFormat::CDX; + } + + // Reset the stream position + is.clear(); // clear EOF flag if we hit it + is.seekg(start_pos); + return format; +} + std::unique_ptr streamToCDXDocument(std::istream &inStream, CDXFormat format) { + if(format == CDXFormat::AUTO) { + format = sniff_format(inStream); + if(format == CDXFormat::AUTO) { + const std::string msg = " Failed deducing whether the input stream is CDXML or CDX"; + BOOST_LOG(rdErrorLog) << msg << std::endl; + throw FileParseException(msg); + } + } + if (format == CDXFormat::CDXML) { CDXMLParser parser; // populate tree structure pt @@ -249,17 +282,21 @@ std::unique_ptr streamToCDXDocument(std::istream &inStream, static_cast(data.size()), HaveAllXml)) { auto error = XML_GetErrorCode(parser); - BOOST_LOG(rdErrorLog) << "Failed parsing XML with error code " << error; - throw FileParseException("Bad Input File"); + std::stringstream msg; + msg << "Failed parsing XML with error code " << error; + BOOST_LOG(rdErrorLog) << msg.str() << std::endl; + throw FileParseException(msg.str()); } return parser.ReleaseDocument(); } else { - throw FileParseException("Can't handle cdx yet"); - return std::unique_ptr(); + CDXistream input(inStream); + const bool doThrow = true; + std::unique_ptr doc(CDXReadDocFromStorage(input, doThrow)); + return doc; } } - + // may raise 
FileParseException std::vector> molsFromCDXMLDataStream( std::istream &inStream, const ChemDrawParserParams ¶ms) { diff --git a/External/ChemDraw/chemdraw.h b/External/ChemDraw/chemdraw.h index c4e1884d9..af43aef43 100644 --- a/External/ChemDraw/chemdraw.h +++ b/External/ChemDraw/chemdraw.h @@ -41,13 +41,18 @@ namespace RDKit { namespace v2 { enum class CDXFormat { CDX = 1, - CDXML = 2 + CDXML = 2, + AUTO = 3 }; struct RDKIT_RDCHEMDRAWLIB_EXPORT ChemDrawParserParams { - bool sanitize = true; - bool removeHs = true; - CDXFormat format = CDXFormat::CDXML; + bool sanitize; + bool removeHs; + CDXFormat format; + ChemDrawParserParams() : sanitize(true), removeHs(true), format(CDXFormat::AUTO) {} + ChemDrawParserParams(bool sanitize, bool removeHs, CDXFormat format) : + sanitize(sanitize), removeHs(removeHs), format(format) {} + }; std::vector> RDKIT_RDCHEMDRAWLIB_EXPORT