From 2b99ee477c19cb7bf4e826b386ab465cbc21bb2f Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Sun, 23 Jun 2024 01:02:19 -0400 Subject: [PATCH] Allow fragments to be grouped in cdxml (#7529) * Allow fragments to be groups in CDXML * Add support for grouped reactants * run clang-format * Change github issue to 7528 * Add documents to the code * response to review, check grouped reactants in cdxml against rxn file * Remove unused code * Add missing file --------- Co-authored-by: Brian Kelley --- Code/GraphMol/ChemReactions/CDXMLParser.cpp | 31 +- Code/GraphMol/ChemReactions/catch_tests.cpp | 102 ++- Code/GraphMol/FileParsers/CDXMLParser.cpp | 354 +++++----- .../FileParsers/cdxml_parser_catch.cpp | 58 +- .../CDXML/github7467-grouped-fragments.cdxml | 608 ++++++++++++++++++ .../github7467-ungrouped-fragments.cdxml | 608 ++++++++++++++++++ .../reaction-with-grouped-templates.cdxml | 335 ++++++++++ 7 files changed, 1907 insertions(+), 189 deletions(-) create mode 100644 Code/GraphMol/test_data/CDXML/github7467-grouped-fragments.cdxml create mode 100644 Code/GraphMol/test_data/CDXML/github7467-ungrouped-fragments.cdxml create mode 100644 Code/GraphMol/test_data/CDXML/reaction-with-grouped-templates.cdxml diff --git a/Code/GraphMol/ChemReactions/CDXMLParser.cpp b/Code/GraphMol/ChemReactions/CDXMLParser.cpp index 898839377..47d19d713 100644 --- a/Code/GraphMol/ChemReactions/CDXMLParser.cpp +++ b/Code/GraphMol/ChemReactions/CDXMLParser.cpp @@ -8,6 +8,7 @@ // of the RDKit source tree. // #include +#include #include #include #include @@ -34,6 +35,17 @@ void make_query_atoms(RWMol &mol) { QueryOps::replaceAtomWithQueryAtom(&mol, atom); } } + +void add_template(const std::string &prop, std::map &templates, + std::unique_ptr &mol) { + auto reactant_idx = mol->getProp(prop); + if (templates.find(reactant_idx) != templates.end()) { + templates[reactant_idx] = + ROMOL_SPTR(combineMols(*templates[reactant_idx], *mol)); + } else { + templates[reactant_idx] = ROMOL_SPTR(std::move(mol)); + } +} } // namespace //! Parse a text stream with CDXML data into a ChemicalReaction @@ -46,6 +58,10 @@ CDXMLDataStreamToChemicalReactions(std::istream &inStream, bool sanitize, std::map, std::vector> schemes; std::set used; + std::map reactant_templates; + std::map product_templates; + std::map agent_templates; + for (size_t i = 0; i < mols.size(); ++i) { unsigned int step = 0; unsigned int scheme = 0; @@ -69,17 +85,26 @@ CDXMLDataStreamToChemicalReactions(std::istream &inStream, bool sanitize, if (mols[idx]->hasProp("CDX_REAGENT_ID")) { used.insert(idx); make_query_atoms(*mols[idx]); - res->addReactantTemplate(ROMOL_SPTR(std::move(mols[idx]))); + add_template("CDX_REAGENT_ID", reactant_templates, mols[idx]); } else if (mols[idx]->hasProp("CDX_AGENT_ID")) { used.insert(idx); make_query_atoms(*mols[idx]); - res->addAgentTemplate(ROMOL_SPTR(std::move(mols[idx]))); + add_template("CDX_AGENT_ID", agent_templates, mols[idx]); } else if (mols[idx]->hasProp("CDX_PRODUCT_ID")) { used.insert(idx); make_query_atoms(*mols[idx]); - res->addProductTemplate(ROMOL_SPTR(std::move(mols[idx]))); + add_template("CDX_PRODUCT_ID", product_templates, mols[idx]); } } + for (auto reactant : reactant_templates) { + res->addReactantTemplate(reactant.second); + } + for (auto reactant : agent_templates) { + res->addAgentTemplate(reactant.second); + } + for (auto reactant : product_templates) { + res->addProductTemplate(reactant.second); + } updateProductsStereochem(res); // CDXML-based reactions do not have implicit properties res->setImplicitPropertiesFlag(false); diff --git a/Code/GraphMol/ChemReactions/catch_tests.cpp b/Code/GraphMol/ChemReactions/catch_tests.cpp index 37779a458..df2d21a32 100644 --- a/Code/GraphMol/ChemReactions/catch_tests.cpp +++ b/Code/GraphMol/ChemReactions/catch_tests.cpp @@ -1306,6 +1306,83 @@ TEST_CASE("CDXML Parser") { smarts == "[#6&D2:2]1:[#6&D2:3]:[#6&D2:4]:[#6&D3:1](:[#6&D2:5]:[#6&D2:6]:1)-[#17&D1].[#6&D3](-[#5&D2]-[#6&D3:7]1:[#6&D2:8]:[#6&D2:9]:[#6&D2:10]:[#6&D2:11]:[#6&D2:12]:1)(-[#8&D1])-[#8&D1]>>[#6&D2:1]1:[#6&D2:5]:[#6&D3:6](:[#6&D2:2]:[#6&D2:3]:[#6&D2:4]:1)-[#6&D3:7]1:[#6&D2:8]:[#6&D2:9]:[#6&D2:10]:[#6&D2:11]:[#6&D2:12]:1"); } + + SECTION("Github #7528 CDXML Grouped Agents in Reactions") { + // The failing case had fragments grouped with labels, ensure the grouped cersion and the ungrouped + // versions have the same results + auto fname = cdxmlbase + "github7467-grouped-fragments.cdxml"; + auto rxns = CDXMLFileToChemicalReactions(fname); + CHECK(rxns.size() == 1); + fname = cdxmlbase + "github7467-ungrouped-fragments.cdxml"; + auto rxns2 = CDXMLFileToChemicalReactions(fname); + + CHECK(ChemicalReactionToRxnSmarts(*rxns[0]) == ChemicalReactionToRxnSmarts(*rxns2[0])); + + // Check to see if our understanding of grouped reagents in reactions is correct + fname = cdxmlbase + "reaction-with-grouped-templates.cdxml"; + auto rxns3 = CDXMLFileToChemicalReactions(fname); + CHECK(rxns3.size() == 1); + std::string rxnb = R"RXN($RXN + + Mrv2004 062120241319 + + 2 0 +$MOL + + Mrv2004 06212413192D + + 5 5 0 0 0 0 999 V2000 + 2.6221 -4.6475 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 + 2.6221 -5.4725 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 + 3.4070 -5.7274 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 + 3.8918 -5.0600 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 + 3.4070 -4.3926 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 + 1 2 2 0 0 0 0 + 2 3 1 0 0 0 0 + 3 4 2 0 0 0 0 + 4 5 1 0 0 0 0 + 5 1 1 0 0 0 0 +M END +$MOL + + Mrv2004 06212413192D + + 11 11 0 0 0 0 999 V2000 + 6.9305 -4.5100 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 + 6.9305 -5.3350 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 + 7.6450 -5.7475 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 + 8.3594 -5.3350 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 + 8.3594 -4.5100 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 + 7.6450 -4.0975 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 + 8.6171 -4.4825 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 + 8.6171 -5.3075 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 + 9.4020 -5.5624 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 + 9.8868 -4.8950 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 + 9.4020 -4.2276 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 + 1 2 1 0 0 0 0 + 6 1 1 0 0 0 0 + 2 3 1 0 0 0 0 + 3 4 1 0 0 0 0 + 4 5 1 0 0 0 0 + 5 6 1 0 0 0 0 + 7 8 2 0 0 0 0 + 11 7 1 0 0 0 0 + 8 9 1 0 0 0 0 + 9 10 2 0 0 0 0 + 10 11 1 0 0 0 0 +M END +)RXN"; + std::unique_ptr rxn_mb{RxnBlockToChemicalReaction(rxnb)}; + // CDXMLToReaction is sanitized by default, this might be a mistake... + unsigned int failed; + RxnOps::sanitizeRxn( + *rxn_mb, failed, + RxnOps::SANITIZE_ADJUST_REACTANTS | RxnOps::SANITIZE_ADJUST_PRODUCTS, + RxnOps::MatchOnlyAtRgroupsAdjustParams()); + + CHECK(rxns3[0]->getNumReactantTemplates() == rxn_mb->getNumReactantTemplates()); + CHECK(ChemicalReactionToRxnSmarts(*rxns3[0]) == ChemicalReactionToRxnSmarts(*rxn_mb)); + } } TEST_CASE("Github #5785: separateAgents ignored for V3000 RXN files") { @@ -1800,17 +1877,26 @@ TEST_CASE("sanitizeRxnAsMols") { TEST_CASE("Github #7372: SMILES output option to disable dative bonds") { SECTION("basics") { - auto rxn = "[C:1]-[C:2].[NH3:3]->[Fe:4]-[NH2:5]>>[C:1]=[C:2].[NH3:3]->[Fe:4]-[NH2:5]"_rxnsmarts; + auto rxn = + "[C:1]-[C:2].[NH3:3]->[Fe:4]-[NH2:5]>>[C:1]=[C:2].[NH3:3]->[Fe:4]-[NH2:5]"_rxnsmarts; REQUIRE(rxn); auto smi = ChemicalReactionToRxnSmiles(*rxn); - CHECK(smi == "[CH3:1][CH3:2].[NH3:3]->[Fe:4][NH2:5]>>[CH2:1]=[CH2:2].[NH3:3]->[Fe:4][NH2:5]"); + CHECK( + smi == + "[CH3:1][CH3:2].[NH3:3]->[Fe:4][NH2:5]>>[CH2:1]=[CH2:2].[NH3:3]->[Fe:4][NH2:5]"); smi = ChemicalReactionToRxnSmarts(*rxn); - CHECK(smi == "[C:1]-[C:2].[N&H3:3]->[#26:4]-[N&H2:5]>>[C:1]=[C:2].[N&H3:3]->[#26:4]-[N&H2:5]"); + CHECK( + smi == + "[C:1]-[C:2].[N&H3:3]->[#26:4]-[N&H2:5]>>[C:1]=[C:2].[N&H3:3]->[#26:4]-[N&H2:5]"); SmilesWriteParams ps; ps.includeDativeBonds = false; - smi = ChemicalReactionToRxnSmiles(*rxn,ps); - CHECK(smi == "[CH3:1][CH3:2].[NH3:3][Fe:4][NH2:5]>>[CH2:1]=[CH2:2].[NH3:3][Fe:4][NH2:5]"); - smi = ChemicalReactionToRxnSmarts(*rxn,ps); - CHECK(smi == "[C:1]-[C:2].[N&H3:3]-[#26:4]-[N&H2:5]>>[C:1]=[C:2].[N&H3:3]-[#26:4]-[N&H2:5]"); + smi = ChemicalReactionToRxnSmiles(*rxn, ps); + CHECK( + smi == + "[CH3:1][CH3:2].[NH3:3][Fe:4][NH2:5]>>[CH2:1]=[CH2:2].[NH3:3][Fe:4][NH2:5]"); + smi = ChemicalReactionToRxnSmarts(*rxn, ps); + CHECK( + smi == + "[C:1]-[C:2].[N&H3:3]-[#26:4]-[N&H2:5]>>[C:1]=[C:2].[N&H3:3]-[#26:4]-[N&H2:5]"); } -} \ No newline at end of file +} diff --git a/Code/GraphMol/FileParsers/CDXMLParser.cpp b/Code/GraphMol/FileParsers/CDXMLParser.cpp index 74903a9cc..8dc6aa6f4 100644 --- a/Code/GraphMol/FileParsers/CDXMLParser.cpp +++ b/Code/GraphMol/FileParsers/CDXMLParser.cpp @@ -29,6 +29,7 @@ namespace RDKit { namespace { const std::string NEEDS_FUSE("CDXML_NEEDS_FUSE"); const std::string CDXML_FRAG_ID("CDXML_FRAG_ID"); +const std::string CDXML_GROUP_ID("CDXML_GROUP_ID"); const std::string FUSE_LABEL("CDXML_NODE_ID"); const std::string CDX_SCHEME_ID("CDX_SCHEME_ID"); const std::string CDX_STEP_ID("CDX_STEP_ID"); @@ -495,28 +496,200 @@ bool parse_fragment(RWMol &mol, ptree &frag, return !skip_fragment; } -void set_reaction_data(std::string type, std::string prop, SchemeInfo &scheme, - const std::vector &frag_ids, - const std::map &fragments, - const std::vector> &mols) { +void set_reaction_data( + std::string type, std::string prop, SchemeInfo &scheme, + const std::vector &frag_ids, + const std::map &fragments, + std::map> &grouped_fragments, + const std::vector> &mols) { unsigned int reagent_idx = 0; for (auto idx : frag_ids) { - auto iter = fragments.find(idx); - if (iter == fragments.end()) { + auto iter = grouped_fragments.find(idx); + if (iter == grouped_fragments.end()) { BOOST_LOG(rdWarningLog) << "CDXMLParser: Schema " << scheme.scheme_id << " step " - << scheme.step_id << " " << type << " fragment " << idx + << scheme.step_id << " " << type << " reaction fragment " << idx << " not found in document." << std::endl; continue; } - if (iter->second >= mols.size()) { - // shouldn't get here - continue; + for (auto reaction_fragment_id : iter->second) { + auto fragment = fragments.find(reaction_fragment_id); + if (fragment == fragments.end()) { + BOOST_LOG(rdWarningLog) + << "CDXMLParser: Schema " << scheme.scheme_id << " step " + << scheme.step_id << " " << type << " fragment " << idx + << " not found in document." << std::endl; + continue; + } + auto &mol = mols[fragment->second]; + mol->setProp(CDX_SCHEME_ID, scheme.scheme_id); + mol->setProp(CDX_STEP_ID, scheme.step_id); + mol->setProp(prop, reagent_idx); + } + reagent_idx += 1; + } +} + +// The parsing of fragments needed to be moved to a recursive function since they may be +// embedded further in the documentation, i.e. a group may hold multiple fragments +// +// Additionally, a grouped_fragments map is included to group fragments together for the +// purposes of reactions. +// +// Ungrouped fragments will end up as vectors of size 1 in the grouped_fragement list. +// The reaction schemes in the CDXML docs appear to use the fragment id for ungrouped +// fragments and the grouped id for grouped fragments, so the grouped_fragments +// holds both for ease of bookkeeping. +template +void visit_children(T &node, std::map &ids, + std::vector> &mols, // All molecules found in the doc + std::map &fragment_lookup, // fragment.id->molecule index + std::map> &grouped_fragments, //grouped.id -> [fragment.id] + std::vector &schemes, // reaction schemes found + int &missing_frag_id, // if we don't have a fragment id, start at -1 and decrement + double bondLength, // bond length of the document for assigning coordinates + const v2::CDXMLParser::CDXMLParserParams ¶ms, // parser parameters + int group_id = -1) { // current group id for this set of subnodes + MolzipParams molzip_params; + molzip_params.label = MolzipLabel::AtomProperty; + molzip_params.atomProperty = FUSE_LABEL; + molzip_params.enforceValenceRules = false; + for (auto &frag : node.second) { + if (frag.first == "fragment") { // chemical matter + std::unique_ptr mol = std::make_unique(); + if (!parse_fragment(*mol, frag.second, ids, missing_frag_id)) { + continue; + } + unsigned int frag_id = mol->getProp(CDXML_FRAG_ID); + fragment_lookup[frag_id] = mols.size(); + if (group_id != -1) { + grouped_fragments[group_id].push_back(frag_id); + } else { + grouped_fragments[frag_id].push_back(frag_id); + } + if (mol->hasProp(NEEDS_FUSE)) { + mol->clearProp(NEEDS_FUSE); + std::unique_ptr fused; + try { + fused = molzip(*mol, molzip_params); + } catch (Invar::Invariant &) { + BOOST_LOG(rdWarningLog) << "Failed fusion of fragment skipping... " + << frag_id << std::endl; + // perhaps have an option to extract all fragments? + // mols.push_back(std::move(mol)); + continue; + } + fused->setProp(CDXML_FRAG_ID, static_cast(frag_id)); + mols.emplace_back(dynamic_cast(fused.release())); + } else { + mols.push_back(std::move(mol)); + } + RWMol *res = mols.back().get(); + auto conf = std::make_unique(res->getNumAtoms()); + conf->set3D(false); + + bool hasConf = false; + for (auto &atm : res->atoms()) { + RDGeom::Point3D p{0.0, 0.0, 0.0}; + + if (atm->hasProp(CDX_ATOM_POS)) { + hasConf = true; + const std::vector coord = + atm->getProp>(CDX_ATOM_POS); + + if (coord.size() == 2) { + p.x = coord[0]; + p.y = -1 * coord[1]; // CDXML uses an inverted coordinate + // system, so we need to reverse that + p.z = 0.0; + } + } + conf->setAtomPos(atm->getIdx(), p); + atm->clearProp(CDX_ATOM_POS); + } + + if (hasConf) { + scaleBonds(*res, *conf, RDKIT_DEPICT_BONDLENGTH, bondLength); + auto confidx = res->addConformer(conf.release()); + DetectAtomStereoChemistry(*res, &res->getConformer(confidx)); + + Atropisomers::detectAtropisomerChirality(*res, + &res->getConformer(confidx)); + } else { // no Conformer + Atropisomers::detectAtropisomerChirality(*res, nullptr); + } + + // now that atom stereochem has been perceived, the wedging + // information is no longer needed, so we clear + // single bond dir flags: + MolOps::clearSingleBondDirFlags(*res); + + if (params.sanitize) { + try { + if (params.removeHs) { + // Bond stereo detection must happen before H removal, or + // else we might be removing stereogenic H atoms in double + // bonds (e.g. imines). But before we run stereo detection, + // we need to run mol cleanup so don't have trouble with + // e.g. nitro groups. Sadly, this a;; means we will find + // run both cleanup and ring finding twice (a fast find + // rings in bond stereo detection, and another in + // sanitization's SSSR symmetrization). + unsigned int failedOp = 0; + MolOps::sanitizeMol(*res, failedOp, MolOps::SANITIZE_CLEANUP); + MolOps::detectBondStereochemistry(*res); + MolOps::removeHs(*res, false, false); + } else { + MolOps::sanitizeMol(*res); + MolOps::detectBondStereochemistry(*res); + } + } catch (...) { + BOOST_LOG(rdWarningLog) + << "CDXMLParser: failed sanitizing skipping fragment " << frag_id + << std::endl; + mols.pop_back(); + continue; + } + MolOps::assignStereochemistry(*res, true, true, true); + } else { + MolOps::detectBondStereochemistry(*res); + } + } else if (frag.first == "scheme") { // get the reaction info + int scheme_id = frag.second.template get(".id", -1); + for (auto &node : frag.second) { + if (node.first == "step") { + auto step_id = node.second.template get(".id", -1); + SchemeInfo scheme; + scheme.scheme_id = scheme_id; + scheme.step_id = step_id; + for (auto &attrib : node.second.get_child("")) { + if (attrib.first == "ReactionStepProducts") { + scheme.ReactionStepProducts = + to_vec(attrib.second.data()); + } else if (attrib.first == "ReactionStepReactants") { + scheme.ReactionStepReactants = + to_vec(attrib.second.data()); + } else if (attrib.first == "ReactionStepObjectsAboveArrow") { + scheme.ReactionStepObjectsAboveArrow = + to_vec(attrib.second.data()); + } else if (attrib.first == "ReactionStepObjectsBelowArrow") { + scheme.ReactionStepObjectsBelowArrow = + to_vec(attrib.second.data()); + } else if (attrib.first == "ReactionStepAtomMap") { + scheme.ReactionStepAtomMap = + to_vec(attrib.second.data()); + } + } + schemes.push_back(std::move(scheme)); + } + } + } else { + if (frag.first == "group") { + group_id = frag.second.template get(".id"); + } + visit_children(frag, ids, mols, fragment_lookup, grouped_fragments, + schemes, missing_frag_id, bondLength, params, group_id); } - auto &mol = mols[iter->second]; - mol->setProp(CDX_SCHEME_ID, scheme.scheme_id); - mol->setProp(CDX_STEP_ID, scheme.step_id); - mol->setProp(prop, reagent_idx++); } } } // namespace @@ -541,151 +714,20 @@ std::vector> MolsFromCDXMLDataStream( throw FileParseException(e.what()); } - std::map ids; - std::vector> mols; - std::map fragment_lookup; - std::vector schemes; + std::map ids; // atom.id to atom in fragment (used for linkages) + std::vector> mols; // All molecules found in the doc + std::map fragment_lookup; // fragment.id->molecule index + std::map> grouped_fragments; //grouped.id -> [fragment.id] + std::vector schemes; // reaction schemes found - MolzipParams molzip_params; - molzip_params.label = MolzipLabel::AtomProperty; - molzip_params.atomProperty = FUSE_LABEL; - molzip_params.enforceValenceRules = false; int missing_frag_id = -1; for (auto &cdxml : pt) { if (cdxml.first == "CDXML") { double bondLength = cdxml.second.get(".BondLength"); for (auto &node : cdxml.second) { if (node.first == "page") { - for (auto &frag : node.second) { - if (frag.first == "fragment") { // chemical matter - std::unique_ptr mol = std::make_unique(); - if (!parse_fragment(*mol, frag.second, ids, missing_frag_id)) { - continue; - } - unsigned int frag_id = mol->getProp(CDXML_FRAG_ID); - fragment_lookup[frag_id] = mols.size(); - if (mol->hasProp(NEEDS_FUSE)) { - mol->clearProp(NEEDS_FUSE); - std::unique_ptr fused; - try { - fused = molzip(*mol, molzip_params); - } catch (Invar::Invariant &) { - BOOST_LOG(rdWarningLog) - << "Failed fusion of fragment skipping... " << frag_id - << std::endl; - // perhaps have an option to extract all fragments? - // mols.push_back(std::move(mol)); - continue; - } - fused->setProp(CDXML_FRAG_ID, static_cast(frag_id)); - mols.emplace_back(dynamic_cast(fused.release())); - } else { - mols.push_back(std::move(mol)); - } - RWMol *res = mols.back().get(); - auto conf = std::make_unique(res->getNumAtoms()); - conf->set3D(false); - - bool hasConf = false; - for (auto &atm : res->atoms()) { - RDGeom::Point3D p{0.0, 0.0, 0.0}; - - if (atm->hasProp(CDX_ATOM_POS)) { - hasConf = true; - const std::vector coord = - atm->getProp>(CDX_ATOM_POS); - - if (coord.size() == 2) { - p.x = coord[0]; - p.y = -1 * coord[1]; // CDXML uses an inverted coordinate - // system, so we need to reverse that - p.z = 0.0; - } - } - conf->setAtomPos(atm->getIdx(), p); - atm->clearProp(CDX_ATOM_POS); - } - - if (hasConf) { - scaleBonds(*res, *conf, RDKIT_DEPICT_BONDLENGTH, bondLength); - auto confidx = res->addConformer(conf.release()); - DetectAtomStereoChemistry(*res, &res->getConformer(confidx)); - - Atropisomers::detectAtropisomerChirality( - *res, &res->getConformer(confidx)); - } else { // no Conformer - Atropisomers::detectAtropisomerChirality(*res, nullptr); - } - - // now that atom stereochem has been perceived, the wedging - // information is no longer needed, so we clear - // single bond dir flags: - MolOps::clearSingleBondDirFlags(*res); - - if (params.sanitize) { - try { - if (params.removeHs) { - // Bond stereo detection must happen before H removal, or - // else we might be removing stereogenic H atoms in double - // bonds (e.g. imines). But before we run stereo detection, - // we need to run mol cleanup so don't have trouble with - // e.g. nitro groups. Sadly, this a;; means we will find - // run both cleanup and ring finding twice (a fast find - // rings in bond stereo detection, and another in - // sanitization's SSSR symmetrization). - unsigned int failedOp = 0; - MolOps::sanitizeMol(*res, failedOp, - MolOps::SANITIZE_CLEANUP); - MolOps::detectBondStereochemistry(*res); - MolOps::removeHs(*res, false, false); - } else { - MolOps::sanitizeMol(*res); - MolOps::detectBondStereochemistry(*res); - } - } catch (...) { - BOOST_LOG(rdWarningLog) - << "CDXMLParser: failed sanitizing skipping fragment " - << frag_id << std::endl; - mols.pop_back(); - continue; - } - MolOps::assignStereochemistry(*res, true, true, true); - } else { - MolOps::detectBondStereochemistry(*res); - } - } else if (frag.first == "scheme") { // get the reaction info - int scheme_id = frag.second.get(".id", -1); - for (auto &node : frag.second) { - if (node.first == "step") { - auto step_id = node.second.get(".id", -1); - SchemeInfo scheme; - scheme.scheme_id = scheme_id; - scheme.step_id = step_id; - for (auto &attrib : node.second.get_child("")) { - if (attrib.first == "ReactionStepProducts") { - scheme.ReactionStepProducts = - to_vec(attrib.second.data()); - } else if (attrib.first == "ReactionStepReactants") { - scheme.ReactionStepReactants = - to_vec(attrib.second.data()); - } else if (attrib.first == - "ReactionStepObjectsAboveArrow") { - scheme.ReactionStepObjectsAboveArrow = - to_vec(attrib.second.data()); - } else if (attrib.first == - "ReactionStepObjectsBelowArrow") { - scheme.ReactionStepObjectsBelowArrow = - to_vec(attrib.second.data()); - } else if (attrib.first == "ReactionStepAtomMap") { - scheme.ReactionStepAtomMap = - to_vec(attrib.second.data()); - } - } - schemes.push_back(std::move(scheme)); - } - } - } - } + visit_children(node, ids, mols, fragment_lookup, grouped_fragments, + schemes, missing_frag_id, bondLength, params); } } } @@ -708,15 +750,17 @@ std::vector> MolsFromCDXMLDataStream( for (auto &scheme : schemes) { // Set the molecule properties set_reaction_data("ReactionStepReactants", CDX_REAGENT_ID, scheme, - scheme.ReactionStepReactants, fragments, mols); + scheme.ReactionStepReactants, fragments, + grouped_fragments, mols); set_reaction_data("ReactionStepProducts", CDX_PRODUCT_ID, scheme, - scheme.ReactionStepProducts, fragments, mols); + scheme.ReactionStepProducts, fragments, + grouped_fragments, mols); auto agents = scheme.ReactionStepObjectsAboveArrow; agents.insert(agents.end(), scheme.ReactionStepObjectsBelowArrow.begin(), scheme.ReactionStepObjectsBelowArrow.end()); set_reaction_data("ReactionStepAgents", CDX_AGENT_ID, scheme, agents, - fragments, mols); + fragments, grouped_fragments, mols); // Set the Atom Maps int sz = scheme.ReactionStepAtomMap.size(); if (sz % 2) { diff --git a/Code/GraphMol/FileParsers/cdxml_parser_catch.cpp b/Code/GraphMol/FileParsers/cdxml_parser_catch.cpp index bf2bd0395..22d392576 100644 --- a/Code/GraphMol/FileParsers/cdxml_parser_catch.cpp +++ b/Code/GraphMol/FileParsers/cdxml_parser_catch.cpp @@ -40,7 +40,6 @@ void check_smiles_and_roundtrip(const RWMol &m, const std::string &expected) { TEST_CASE("CDXML") { std::string cdxmlbase = std::string(getenv("RDBASE")) + "/Code/GraphMol/test_data/CDXML/"; - SECTION("SIMPLE") { std::string cdxml1 = R"( @@ -423,29 +422,30 @@ TEST_CASE("CDXML") { } } SECTION("Queries") { - { - auto fname = cdxmlbase + "query-atoms.cdxml"; - - std::vector expected = {"*c1ccccc1", "*c1ccccc1", "*c1ccccc1"}; - std::vector expected_smarts = { - "[#6]1:[#6]:[#6]:[#6]:[#6]:[#6]:1-*", - "[#6]1:[#6]:[#6]:[#6]:[#6]:[#6]:1-[!#1]", - "[#6]1:[#6]:[#6]:[#6]:[#6]:[#6]:1-[!#6&!#1]"}; - auto mols = MolsFromCDXMLFile(fname); - CHECK(mols.size() == expected.size()); - int i = 0; - for (auto &mol : mols) { - CHECK(MolToSmarts(*mol) == expected_smarts[i]); - CHECK(MolToSmiles(*mol) == expected[i++]); - } - } - { - auto fname = cdxmlbase + "anybond.cdxml"; - auto mols = MolsFromCDXMLFile(fname); - CHECK(mols.size() == 1); - CHECK(MolToSmiles(*mols[0]) == "C1CCC~CC1"); - CHECK(MolToSmarts(*mols[0]) == "[#6]1~[#6]-[#6]-[#6]-[#6]-[#6]-1"); + { + auto fname = cdxmlbase + "query-atoms.cdxml"; + + std::vector expected = {"*c1ccccc1", "*c1ccccc1", + "*c1ccccc1"}; + std::vector expected_smarts = { + "[#6]1:[#6]:[#6]:[#6]:[#6]:[#6]:1-*", + "[#6]1:[#6]:[#6]:[#6]:[#6]:[#6]:1-[!#1]", + "[#6]1:[#6]:[#6]:[#6]:[#6]:[#6]:1-[!#6&!#1]"}; + auto mols = MolsFromCDXMLFile(fname); + CHECK(mols.size() == expected.size()); + int i = 0; + for (auto &mol : mols) { + CHECK(MolToSmarts(*mol) == expected_smarts[i]); + CHECK(MolToSmiles(*mol) == expected[i++]); } + } + { + auto fname = cdxmlbase + "anybond.cdxml"; + auto mols = MolsFromCDXMLFile(fname); + CHECK(mols.size() == 1); + CHECK(MolToSmiles(*mols[0]) == "C1CCC~CC1"); + CHECK(MolToSmarts(*mols[0]) == "[#6]1~[#6]-[#6]-[#6]-[#6]-[#6]-1"); + } } SECTION("ElementList") { auto fname = cdxmlbase + "element-list.cdxml"; @@ -1175,3 +1175,15 @@ TEST_CASE("Github #6887: and1 or1 in same mol") { "CO[C@H](C)C[C@H](Cl)C[C@H](C)Br |o1:5,o2:8,&1:2|"); } } + +TEST_CASE("Github #7528 - read fragments in groups") { + std::string cdxmlbase = + std::string(getenv("RDBASE")) + "/Code/GraphMol/test_data/CDXML/"; + SECTION("case 1") { + auto fname = cdxmlbase + "github7467-grouped-fragments.cdxml"; + CDXMLParserParams params; + params.sanitize = false; + auto mols = MolsFromCDXMLFile(fname, params); + REQUIRE(mols.size() == 2); + } +} diff --git a/Code/GraphMol/test_data/CDXML/github7467-grouped-fragments.cdxml b/Code/GraphMol/test_data/CDXML/github7467-grouped-fragments.cdxml new file mode 100644 index 000000000..589d30f10 --- /dev/null +++ b/Code/GraphMol/test_data/CDXML/github7467-grouped-fragments.cdxml @@ -0,0 +1,608 @@ + + + + + + + + + + + + + + +SiH2OSiH2OSiH2OSiH2OSiH2OSiH2OSiH2OSiH2OCaution: Valence appears to be exceeded2,4,6,8-tetramethylcyclotetrasiloxane+Caution: Valence appears to be exceeded2,4,6,8-tetramethylcyclotetrasiloxane[II] \ No newline at end of file diff --git a/Code/GraphMol/test_data/CDXML/github7467-ungrouped-fragments.cdxml b/Code/GraphMol/test_data/CDXML/github7467-ungrouped-fragments.cdxml new file mode 100644 index 000000000..f6a80dfcb --- /dev/null +++ b/Code/GraphMol/test_data/CDXML/github7467-ungrouped-fragments.cdxml @@ -0,0 +1,608 @@ + + + + + + + + + + + + + + +SiH2OSiH2OSiH2OSiH2OSiH2OSiH2OSiH2OSiH2OCaution: Valence appears to be exceeded2,4,6,8-tetramethylcyclotetrasiloxane+Caution: Valence appears to be exceeded2,4,6,8-tetramethylcyclotetrasiloxane[II] \ No newline at end of file diff --git a/Code/GraphMol/test_data/CDXML/reaction-with-grouped-templates.cdxml b/Code/GraphMol/test_data/CDXML/reaction-with-grouped-templates.cdxml new file mode 100644 index 000000000..7799f46ac --- /dev/null +++ b/Code/GraphMol/test_data/CDXML/reaction-with-grouped-templates.cdxml @@ -0,0 +1,335 @@ + + + + + + + + + + + + + + + ++ \ No newline at end of file