// // Copyright (c) 2024 Glysade Inc and other RDkit contributors // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: // // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above // copyright notice, this list of conditions and the following // disclaimer in the documentation and/or other materials provided // with the distribution. // * Neither the name of Novartis Institutes for BioMedical Research Inc. // nor the names of its contributors may be used to endorse or promote // products derived from this software without specific prior written // permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // #include #include #include #include "ChemDrawStartInclude.h" #include "chemdraw/CDXMLParser.h" #include "chemdraw/CDXStdObjects.h" #include "ChemDrawEndInclude.h" #include "bracket.h" #include "chemdraw.h" #include "chemdraw_doc.h" #include "fragment.h" #include "reaction.h" #include "utils.h" #include #include #include #include #include #include #include #include #include #include // #define DEBUG 1 namespace { using namespace RDKit; using namespace RDKit::v2; using namespace RDKit::ChemDraw; // The parsing of fragments needed to be moved to a recursive function since // they may be embedded further in the document, i.e. a group may hold multiple // fragments // // Additionally, a grouped_fragments map is included to group fragments together // for the purposes of reactions. // // Ungrouped fragments will end up as vectors of size 1 in the grouped_fragement // list. The reaction schemes in the CDXML docs appear to use the fragment id // for ungrouped fragments and the grouped id for grouped fragments, so the // grouped_fragments holds both for ease of bookkeeping. void visit_children( CDXObject &node, PageData &pagedata, int &missing_frag_id, // if we don't have a fragment id, start at -1 and // decrement double bondLength, // bond length of the document for assigning coordinates const ChemDrawParserParams ¶ms, // parser parameters int group_id = -1) { // current group id for this set of subnodes MolzipParams molzip_params; molzip_params.label = MolzipLabel::AtomProperty; molzip_params.atomProperty = FUSE_LABEL; molzip_params.enforceValenceRules = false; for (auto frag : node.ContainedObjects()) { auto id = (CDXDatumID)frag.second->GetTag(); if (id == kCDXObj_Fragment) { std::unique_ptr mol = std::make_unique(); if (!parseFragment(*mol, (CDXFragment &)(*frag.second), pagedata, missing_frag_id)) { continue; } unsigned int frag_id = mol->getProp(CDX_FRAG_ID); pagedata.fragmentLookup[frag_id] = pagedata.mols.size(); if (group_id != -1) { pagedata.groupedFragments[group_id].push_back(frag_id); } else { pagedata.groupedFragments[frag_id].push_back(frag_id); } if (mol->hasProp(NEEDS_FUSE)) { mol->clearProp(NEEDS_FUSE); std::unique_ptr fused; try { replaceFragments(*mol); fused = molzip(*mol, molzip_params); } catch (Invar::Invariant &) { BOOST_LOG(rdWarningLog) << "Failed fusion of fragment skipping... " << frag_id << std::endl; // perhaps have an option to extract all fragments? // mols.push_back(std::move(mol)); continue; } fused->setProp(CDX_FRAG_ID, static_cast(frag_id)); pagedata.mols.emplace_back(dynamic_cast(fused.release())); } else { pagedata.mols.push_back(std::move(mol)); } RWMol *res = pagedata.mols.back().get(); auto conf = std::make_unique(res->getNumAtoms()); conf->set3D(false); bool hasConf = false; bool is3D = false; for (auto &atm : res->atoms()) { RDGeom::Point3D p{0.0, 0.0, 0.0}; if (atm->hasProp(CDX_ATOM_POS)) { hasConf = true; const auto coord = atm->getProp>(CDX_ATOM_POS); p.x = coord[0]; p.y = -1 * coord[1]; // CDXML uses an inverted coordinate // system, so we need to reverse that if (coord.size() == 2) { p.z = 0.0; } else { p.z = coord[2]; is3D = true; } } conf->setAtomPos(atm->getIdx(), p); atm->clearProp(CDX_ATOM_POS); } if (hasConf) { if (!is3D) { scaleBonds(*res, *conf, RDKIT_DEPICT_BONDLENGTH, bondLength); } conf->set3D(is3D); auto confidx = res->addConformer(conf.release()); if (is3D) { res->updatePropertyCache(false); MolOps::assignChiralTypesFrom3D(*res, confidx, true); } else { MolOps::assignChiralTypesFromBondDirs(*res, confidx, true); } Atropisomers::detectAtropisomerChirality(*res, &res->getConformer(confidx)); } else { // no Conformer Atropisomers::detectAtropisomerChirality(*res, nullptr); } // now that atom stereochem has been perceived, the wedging // information is no longer needed, so we clear // single bond dir flags: MolOps::clearSingleBondDirFlags(*res); if (params.sanitize) { try { if (params.removeHs) { // Bond stereo detection must happen before H removal, or // else we might be removing stereogenic H atoms in double // bonds (e.g. imines). But before we run stereo detection, // we need to run mol cleanup so don't have trouble with // e.g. nitro groups. Sadly, this a;; means we will find // run both cleanup and ring finding twice (a fast find // rings in bond stereo detection, and another in // sanitization's SSSR symmetrization). unsigned int failedOp = 0; MolOps::sanitizeMol(*res, failedOp, MolOps::SANITIZE_CLEANUP); MolOps::detectBondStereochemistry(*res); MolOps::removeHs(*res); } else { MolOps::sanitizeMol(*res); MolOps::detectBondStereochemistry(*res); } } catch (...) { BOOST_LOG(rdWarningLog) << "CDXMLParser: failed sanitizing skipping fragment " << frag_id << std::endl; pagedata.mols.pop_back(); continue; } MolOps::assignStereochemistry(*res, true, true, true); // Sometimes ChemDraw just marks with R and S, so let's assign // these as long as they were not already determined checkChemDrawTetrahedralGeometries(*res); } else { MolOps::detectBondStereochemistry(*res); } } else if (id == kCDXObj_ReactionScheme) { // get the reaction info auto &scheme = (CDXReactionScheme &)(*frag.second); pagedata.schemes.emplace_back(scheme); /* int scheme_id = scheme.GetObjectID(); //frag.second.template get(".id", -1); for (auto &rxnNode : scheme.ContainedObjects()) { CDXDatumID type_id = (CDXDatumID)rxnNode.second->GetTag(); if (type_id == kCDXObj_ReactionStep) { CDXReactionStep &step = (CDXReactionStep&)(*rxnNode.second); auto step_id = step.GetObjectID(); SchemeInfo scheme; scheme.scheme_id = scheme_id; scheme.step_id = step_id; scheme.ReactionStepProducts = step.m_products; scheme.ReactionStepReactants = step.m_reactants; scheme.ReactionStepObjectsBelowArrow = step.m_objectsBelowArrow; scheme.ReactionStepAtomMap = step.m_aamap; schemes.push_back(scheme); } } */ } else if (id == kCDXObj_Group) { auto &group = (CDXGroup &)(*frag.second); group_id = frag.second->GetObjectID(); visit_children(group, pagedata, missing_frag_id, bondLength, params, group_id); } else if (id == kCDXObj_BracketedGroup) { auto &bracketgroup = (CDXBracketedGroup &)(*frag.second); parseBracket(bracketgroup, pagedata); } } } CDXFormat sniff_format(std::istream &is) { // Remember the current read position std::streampos start_pos = is.tellg(); if (start_pos == -1) { // Some streams (like std::cin) may not support tellg return CDXFormat::AUTO; // here it simply means we failed } // CDX header consists of: // 8 bytes with the value "VjCD0100" (hex: 56 6A 43 44 30 31 30 30). CDXFormat format = CDXFormat::CDXML; const std::vector header{86, 106, 67, 68, 48, 49, 48, 48}; std::vector buf(8); is.read(buf.data(), 8); if (buf == header) { format = CDXFormat::CDX; } // Reset the stream position is.clear(); // clear EOF flag if we hit it is.seekg(start_pos); return format; } std::unique_ptr streamToCDXDocument(std::istream &inStream, CDXFormat format) { if (format == CDXFormat::AUTO) { format = sniff_format(inStream); if (format == CDXFormat::AUTO) { const std::string msg = " Failed deducing whether the input stream is CDXML or CDX"; BOOST_LOG(rdErrorLog) << msg << std::endl; throw FileParseException(msg); } } if (format == CDXFormat::CDXML) { CDXMLParser parser; // populate tree structure pt std::string data = std::string(std::istreambuf_iterator(inStream), std::istreambuf_iterator()); const bool HaveAllXml = true; if (XML_STATUS_OK != parser.XML_Parse(data.c_str(), static_cast(data.size()), HaveAllXml)) { auto error = XML_GetErrorCode(parser); std::stringstream msg; msg << "Failed parsing XML with error code " << error; BOOST_LOG(rdErrorLog) << msg.str() << std::endl; throw FileParseException(msg.str()); } return parser.ReleaseDocument(); } else { CDXistream input(inStream); const bool doThrow = true; // if we aren't opened in binary mode on windows, we are going to have a bad // time... try { std::unique_ptr doc(CDXReadDocFromStorage(input, doThrow)); return doc; } catch (const std::exception &ex) { throw FileParseException( std::string(ex.what()) + "\nPlease ensure for CDX data streams, the stream is in binary mode."); } } } // may raise FileParseException std::vector> molsFromCDXMLDataStream( std::istream &inStream, const ChemDrawParserParams ¶ms) { std::unique_ptr document = streamToCDXDocument(inStream, params.format); if (!document) { // error return std::vector>(); } PageData pagedata; auto bondLength = document->m_bondLength; int missing_frag_id = -1; for (auto node : document->ContainedObjects()) { auto id = (CDXDatumID)node.second->GetTag(); switch (id) { case kCDXObj_Page: visit_children(*node.second, pagedata, missing_frag_id, bondLength, params); break; default: break; } } for (auto &scheme : pagedata.schemes) { scheme.set_reaction_steps(pagedata.groupedFragments, pagedata.mols); } pagedata.clearCDXProps(); return std::move(pagedata.mols); } } // namespace namespace RDKit { namespace ChemDraw { std::unique_ptr ChemDrawToDocument(std::istream &inStream, CDXFormat format) { return streamToCDXDocument(inStream, format); } std::unique_ptr ChemDrawToDocument(const std::string &filename) { std::fstream chemdrawfile(filename); std::string ext = std::filesystem::path(filename).extension().string(); boost::algorithm::to_lower(ext); if (ext == ".cdxml") { return streamToCDXDocument(chemdrawfile, CDXFormat::CDXML); } else if (ext == ".cdx") { return streamToCDXDocument(chemdrawfile, CDXFormat::CDX); } std::string msg = std::string("Unknoen filetype ") + (std::string)std::filesystem::path(filename).extension().string(); throw FileParseException(msg.c_str()); } } // namespace ChemDraw namespace v2 { std::vector> MolsFromChemDrawDataStream( std::istream &inStream, const ChemDrawParserParams ¶ms) { return molsFromCDXMLDataStream(inStream, params); } std::vector> MolsFromChemDrawBlock( const std::string &block, const ChemDrawParserParams ¶ms) { std::stringstream ss; ss << block; return MolsFromChemDrawDataStream(ss, params); } std::vector> MolsFromChemDrawFile( const std::string &filename, const ChemDrawParserParams ¶ms) { ChemDrawParserParams realparams{params}; // Always open in Binary mode std::fstream chemdrawfile(filename, std::ios::in | std::ios::binary); if (!chemdrawfile) { throw BadFileException(filename + " does not exist"); } // need to reopen in binary mode if (params.format == CDXFormat::AUTO && sniff_format(chemdrawfile) == CDXFormat::CDX) { realparams.format = CDXFormat::CDX; } return molsFromCDXMLDataStream(chemdrawfile, realparams); } } // namespace v2 } // namespace RDKit