Expose CDX support to FileParsers and ChemDraw to SWIG (#8681)

* First pass at CDX support

* Enable CDX support for reading (also) in the CDXMLParser API

* Add cdxml test files

* Update swig wrappers for CDXMLFormat and Parameters

* Add constructor to ChemDrawParserParams

* Add Java SWIG support for ChemDraw

* Add chemdraw define to rdconfig

* Add missing chemdraw deps

* Remove direct expat link

* Fix Java linkages for ChemDraw

* Remove bad merge code

* Remove bad merge code

* Fix csharp builds

* Add sniffer for the ChemDraw DataStream

* Include filesystem

* Fix test on windows

* Add more CDX tests

* Ensure streams are open in binary mode to support CDX on windows

* Fix text to show that a Block is the text input, not a file

* Fix CSharp test

* Disable CDX tests when not building chemdraw

* Turn back on chemdraw

* Response to review

* Turn off chemdraw support for the limited external test

---------

Co-authored-by: Brian Kelley <bkelley@glysade.com>
This commit is contained in:
Brian Kelley
2025-08-28 22:39:22 -04:00
committed by GitHub
parent dbd972497f
commit cf269aa813
28 changed files with 819 additions and 54 deletions

View File

@@ -70,7 +70,8 @@ python::object MolsFromChemDrawBlockHelper(const std::string &filename, bool san
bool removeHs) {
std::vector<std::unique_ptr<RWMol>> mols;
try {
mols = RDKit::v2::MolsFromChemDrawBlock(filename, {sanitize, removeHs});
mols = RDKit::v2::MolsFromChemDrawBlock(filename,
{sanitize, removeHs, RDKit::v2::CDXFormat::CDXML});
} catch (RDKit::BadFileException &e) {
PyErr_SetString(PyExc_IOError, e.what());
throw python::error_already_set();
@@ -89,7 +90,8 @@ python::object MolsFromChemDrawBlockHelper(const std::string &filename, bool san
python::tuple MolsFromChemDrawFileHelper(python::object cdxml, bool sanitize,
bool removeHs) {
auto mols = RDKit::v2::MolsFromChemDrawFile(pyObjectToString(cdxml), {sanitize, removeHs});
auto mols = RDKit::v2::MolsFromChemDrawFile(pyObjectToString(cdxml),
{sanitize, removeHs, RDKit::v2::CDXFormat::CDXML});
python::list res;
for (auto &mol : mols) {
// take ownership of the data from the unique_ptr

View File

@@ -237,8 +237,41 @@ void visit_children(
}
}
// Peeks at the first 8 bytes available on `is` to tell binary CDX apart from
// CDXML, then rewinds the stream to where it was on entry.
//
// Returns:
//   CDXFormat::CDX   - the bytes read equal the CDX magic "VjCD0100"
//   CDXFormat::CDXML - anything else (including a read shorter than 8 bytes)
//   CDXFormat::AUTO  - tellg() reported -1, so no decision could be made
CDXFormat sniff_format(std::istream &is) {
  // Bookmark the caller's read position so it can be restored afterwards.
  const std::streampos bookmark = is.tellg();
  if (bookmark == -1) {
    // Some streams (like std::cin) may not support tellg; AUTO is used here
    // purely as a "sniffing failed" marker.
    return CDXFormat::AUTO;
  }

  // A CDX header starts with the 8 bytes "VjCD0100"
  // (hex: 56 6A 43 44 30 31 30 30).
  const std::vector<char> cdxMagic{'V', 'j', 'C', 'D', '0', '1', '0', '0'};

  // The buffer is zero-filled, so a short read can never match the magic.
  std::vector<char> leading(cdxMagic.size());
  is.read(leading.data(), static_cast<std::streamsize>(leading.size()));
  const bool isBinaryCdx = (leading == cdxMagic);

  // Drop any EOF/fail flag raised by the read, then rewind to the bookmark.
  is.clear();
  is.seekg(bookmark);

  return isBinaryCdx ? CDXFormat::CDX : CDXFormat::CDXML;
}
std::unique_ptr<CDXDocument> streamToCDXDocument(std::istream &inStream,
CDXFormat format) {
if(format == CDXFormat::AUTO) {
format = sniff_format(inStream);
if(format == CDXFormat::AUTO) {
const std::string msg = " Failed deducing whether the input stream is CDXML or CDX";
BOOST_LOG(rdErrorLog) << msg << std::endl;
throw FileParseException(msg);
}
}
if (format == CDXFormat::CDXML) {
CDXMLParser parser;
// populate tree structure pt
@@ -249,17 +282,21 @@ std::unique_ptr<CDXDocument> streamToCDXDocument(std::istream &inStream,
static_cast<int>(data.size()),
HaveAllXml)) {
auto error = XML_GetErrorCode(parser);
BOOST_LOG(rdErrorLog) << "Failed parsing XML with error code " << error;
throw FileParseException("Bad Input File");
std::stringstream msg;
msg << "Failed parsing XML with error code " << error;
BOOST_LOG(rdErrorLog) << msg.str() << std::endl;
throw FileParseException(msg.str());
}
return parser.ReleaseDocument();
} else {
throw FileParseException("Can't handle cdx yet");
return std::unique_ptr<CDXDocument>();
CDXistream input(inStream);
const bool doThrow = true;
std::unique_ptr<CDXDocument> doc(CDXReadDocFromStorage(input, doThrow));
return doc;
}
}
// may raise FileParseException
std::vector<std::unique_ptr<RWMol>> molsFromCDXMLDataStream(
std::istream &inStream, const ChemDrawParserParams &params) {

View File

@@ -41,13 +41,18 @@ namespace RDKit {
namespace v2 {
enum class CDXFormat {
CDX = 1,
CDXML = 2
CDXML = 2,
AUTO = 3
};
struct RDKIT_RDCHEMDRAWLIB_EXPORT ChemDrawParserParams {
bool sanitize = true;
bool removeHs = true;
CDXFormat format = CDXFormat::CDXML;
bool sanitize;
bool removeHs;
CDXFormat format;
ChemDrawParserParams() : sanitize(true), removeHs(true), format(CDXFormat::AUTO) {}
ChemDrawParserParams(bool sanitize, bool removeHs, CDXFormat format) :
sanitize(sanitize), removeHs(removeHs), format(format) {}
};
std::vector<std::unique_ptr<RWMol>> RDKIT_RDCHEMDRAWLIB_EXPORT