diff --git a/Code/GraphMol/CMakeLists.txt b/Code/GraphMol/CMakeLists.txt index 3c4d46b71..d6188bf20 100644 --- a/Code/GraphMol/CMakeLists.txt +++ b/Code/GraphMol/CMakeLists.txt @@ -8,7 +8,7 @@ rdkit_library(GraphMol Renumber.cpp AdjustQuery.cpp Resonance.cpp StereoGroup.cpp new_canon.cpp SubstanceGroup.cpp FindStereo.cpp MonomerInfo.cpp NontetrahedralStereo.cpp Atropisomers.cpp - WedgeBonds.cpp MolProps.cpp + WedgeBonds.cpp MolProps.cpp Subset.cpp SHARED LINK_LIBRARIES RDGeometryLib RDGeneral) target_compile_definitions(GraphMol PRIVATE RDKIT_GRAPHMOL_BUILD) @@ -49,6 +49,7 @@ rdkit_headers(Atom.h MonomerInfo.h new_canon.h MolBundle.h + Subset.h DEST GraphMol) add_subdirectory(SmilesParse) diff --git a/Code/GraphMol/MolOps.cpp b/Code/GraphMol/MolOps.cpp index d37574db9..4bcce3003 100644 --- a/Code/GraphMol/MolOps.cpp +++ b/Code/GraphMol/MolOps.cpp @@ -41,6 +41,7 @@ #include #include #include +#include "Subset.h" const int ci_LOCAL_INF = static_cast(1e8); @@ -700,6 +701,7 @@ std::vector> getTheFrags( } int nFrags = getMolFrags(mol, *frags); std::vector> res; + if (nFrags == 1) { res.emplace_back(new RWMol(mol)); if (fragsMolAtomMapping) { @@ -777,35 +779,14 @@ std::vector> getTheFrags( // empirical. This is mainly intended to catch situations like proteins // where you have a bunch of single-atom fragments (waters); the // standard approach below ends up being horribly inefficient there - res.emplace_back(new RWMol()); - auto &frag = res.back(); - std::map atomIdxMap; - for (auto aid : comp) { - atomIdxMap[aid] = - frag->addAtom(mol.getAtomWithIdx(aid)->copy(), false, true); - } - for (auto bond : mol.bonds()) { - if (atomsInFrag[bond->getBeginAtomIdx()] && - atomsInFrag[bond->getEndAtomIdx()]) { - auto bondCopy = bond->copy(); - bondCopy->setBeginAtomIdx(atomIdxMap[bond->getBeginAtomIdx()]); - bondCopy->setEndAtomIdx(atomIdxMap[bond->getEndAtomIdx()]); - frag->addBond(bondCopy, true); - } - } - if (copyConformers) { - for (auto cit = mol.beginConformers(); cit != mol.endConformers(); - ++cit) { - auto *conf = new Conformer(frag->getNumAtoms()); - conf->setId((*cit)->getId()); - conf->set3D((*cit)->is3D()); - unsigned int cidx = 0; - for (auto ai : comp) { - conf->setAtomPos(cidx++, (*cit)->getAtomPos(ai)); - } - frag->addConformer(conf); - } - } + SubsetOptions opts{.sanitize = sanitizeFrags, + .clearComputedProps = true, + .copyCoordinates = copyConformers, + .method = SubsetMethod::BONDS_BETWEEN_ATOMS}; + std::vector atoms{comp.begin(), comp.end()}; + SubsetInfo info; + auto submol = copyMolSubset(mol, atoms, info, opts); + res.push_back(std::move(submol)); } else { res.emplace_back(new RWMol(mol)); auto &frag = res.back(); @@ -842,6 +823,7 @@ std::vector> getTheFrags( } return finalRes; } + } // namespace std::vector getMolFrags(const ROMol &mol, bool sanitizeFrags, INT_VECT *frags, diff --git a/Code/GraphMol/Subgraphs/SubgraphUtils.cpp b/Code/GraphMol/Subgraphs/SubgraphUtils.cpp index c5391e6ee..40771525c 100644 --- a/Code/GraphMol/Subgraphs/SubgraphUtils.cpp +++ b/Code/GraphMol/Subgraphs/SubgraphUtils.cpp @@ -12,9 +12,11 @@ #include "Subgraphs.h" #include #include +#include #include #include #include +#include #include #include @@ -27,84 +29,20 @@ ROMol *pathToSubmol(const ROMol &mol, const PATH_TYPE &path, bool useQuery) { ROMol *pathToSubmol(const ROMol &mol, const PATH_TYPE &path, bool useQuery, INT_MAP_INT &atomIdxMap) { - auto *subMol = new RWMol(); - // path needs to be in sorted order to preserve chirality - PATH_TYPE sorted_path(path); - std::sort(sorted_path.begin(), sorted_path.end()); + SubsetOptions options; + options.copyAsQuery = useQuery; + options.copyCoordinates = true; + options.method = SubsetMethod::BONDS; + std::vector upath{path.begin(), path.end()}; + SubsetInfo subsetInfo; + auto res = copyMolSubset(mol, upath, subsetInfo, options); atomIdxMap.clear(); - - if (useQuery) { - // have to do this in two different blocks because of issues with variable - // scopes. - for (auto bondidx : sorted_path) { - auto *bond = new QueryBond(*(mol.getBondWithIdx(bondidx))); - - int begIdx = bond->getBeginAtomIdx(); - int endIdx = bond->getEndAtomIdx(); - - if (atomIdxMap.find(begIdx) == atomIdxMap.end()) { - auto *atom = new QueryAtom(*(mol.getAtomWithIdx(begIdx))); - int newAtomIdx = subMol->addAtom(atom, false, true); - atomIdxMap[begIdx] = newAtomIdx; - } - begIdx = atomIdxMap.find(begIdx)->second; - if (atomIdxMap.find(endIdx) == atomIdxMap.end()) { - auto *atom = new QueryAtom(*(mol.getAtomWithIdx(endIdx))); - int newAtomIdx = subMol->addAtom(atom, false, true); - atomIdxMap[endIdx] = newAtomIdx; - } - endIdx = atomIdxMap.find(endIdx)->second; - - bond->setOwningMol(subMol); - bond->setBeginAtomIdx(begIdx); - bond->setEndAtomIdx(endIdx); - subMol->addBond(bond, true); - } - } else { - for (auto bondidx : sorted_path) { - Bond *bond = mol.getBondWithIdx(bondidx)->copy(); - - int begIdx = bond->getBeginAtomIdx(); - int endIdx = bond->getEndAtomIdx(); - - if (atomIdxMap.find(begIdx) == atomIdxMap.end()) { - Atom *atom = mol.getAtomWithIdx(begIdx)->copy(); - int newAtomIdx = subMol->addAtom(atom, false, true); - atomIdxMap[begIdx] = newAtomIdx; - } - begIdx = atomIdxMap.find(begIdx)->second; - if (atomIdxMap.find(endIdx) == atomIdxMap.end()) { - Atom *atom = mol.getAtomWithIdx(endIdx)->copy(); - int newAtomIdx = subMol->addAtom(atom, false, true); - atomIdxMap[endIdx] = newAtomIdx; - } - endIdx = atomIdxMap.find(endIdx)->second; - - bond->setOwningMol(subMol); - bond->setBeginAtomIdx(begIdx); - bond->setEndAtomIdx(endIdx); - subMol->addBond(bond, true); - } + for (auto mapping : subsetInfo.atomMapping) { + atomIdxMap[mapping.first] = mapping.second; } - if (mol.getNumConformers()) { - // copy coordinates over: - for (auto confIt = mol.beginConformers(); confIt != mol.endConformers(); - ++confIt) { - auto *conf = new Conformer(subMol->getNumAtoms()); - conf->set3D((*confIt)->is3D()); - for (INT_MAP_INT::const_iterator mapIt = atomIdxMap.begin(); - mapIt != atomIdxMap.end(); ++mapIt) { - conf->setAtomPos(mapIt->second, (*confIt)->getAtomPos(mapIt->first)); - } - conf->setId((*confIt)->getId()); - subMol->addConformer(conf, false); - } - } - // clear computed properties - subMol->clearComputedProps(true); - return subMol; + return res.release(); } PATH_TYPE bondListFromAtomList(const ROMol &mol, const PATH_TYPE &atomIds) { diff --git a/Code/GraphMol/Subgraphs/catch_tests.cpp b/Code/GraphMol/Subgraphs/catch_tests.cpp index 1c89d9f77..37d7f0bf5 100644 --- a/Code/GraphMol/Subgraphs/catch_tests.cpp +++ b/Code/GraphMol/Subgraphs/catch_tests.cpp @@ -57,4 +57,4 @@ TEST_CASE("shortestPathsOnly") { CHECK(ps.size() == 1); CHECK(ps[3].size() == 2); } -} \ No newline at end of file +} diff --git a/Code/GraphMol/Subset.cpp b/Code/GraphMol/Subset.cpp new file mode 100644 index 000000000..f2a225728 --- /dev/null +++ b/Code/GraphMol/Subset.cpp @@ -0,0 +1,338 @@ +// +// Copyright (C) 2025 Hussein Faara, Brian Kelley and other RDKit Contributors +// +// @@ All Rights Reserved @@ +// This file is part of the RDKit. +// The contents are covered by the terms of the BSD license +// which is included in the file license.txt, found at the root +// of the RDKit source tree. +// +#include +#include +#include +#include +#include "Subset.h" + +namespace RDKit { +namespace { + +inline void copyComputedProps(const ROMol &src, ROMol &dst) { + dst.updateProps(src); + for (auto &v : dst.getPropList(true, false)) { + if (v != RDKit::detail::computedPropName) dst.clearProp(v); + } +} + +static void copySelectedAtomsAndBonds(RWMol &extracted_mol, + const RDKit::ROMol &reference_mol, + SubsetInfo &selection_info, + const SubsetOptions &options) { + auto &[selectedAtoms, selectedBonds, atomMapping, bondMapping] = + selection_info; + for (const auto &ref_atom : reference_mol.atoms()) { + if (!selectedAtoms[ref_atom->getIdx()]) { + continue; + } + + std::unique_ptr extracted_atom{ + options.copyAsQuery ? new QueryAtom(*ref_atom) : ref_atom->copy()}; + + constexpr bool updateLabel = false; + constexpr bool takeOwnership = true; + atomMapping[ref_atom->getIdx()] = extracted_mol.addAtom( + extracted_atom.release(), updateLabel, takeOwnership); + } + + for (const auto &ref_bond : reference_mol.bonds()) { + if (!selectedBonds[ref_bond->getIdx()]) { + continue; + } + + std::unique_ptr extracted_bond{ + options.copyAsQuery ? new QueryBond(*ref_bond) : ref_bond->copy()}; + + // Check the stereo atoms + auto &atoms = extracted_bond->getStereoAtoms(); + if (atoms.size() == 2) { + auto map1 = atomMapping.find(atoms[0]); + auto map2 = atomMapping.find(atoms[1]); + if (map1 != atomMapping.end() && map2 != atomMapping.end()) { + atoms[0] = map1->second; + atoms[1] = map2->second; + } else { + atoms.clear(); // We couldn't map the stereo atoms + } + } + + for (auto &atomidx : atoms) { + auto map = atomMapping.find(atomidx); + if (map != atomMapping.end()) { + atomidx = map->second; + } + } + + extracted_bond->setBeginAtomIdx(atomMapping[ref_bond->getBeginAtomIdx()]); + extracted_bond->setEndAtomIdx(atomMapping[ref_bond->getEndAtomIdx()]); + + constexpr bool takeOwnership = true; + auto num_bonds = + extracted_mol.addBond(extracted_bond.release(), takeOwnership); + bondMapping[ref_bond->getIdx()] = num_bonds - 1; + } + // we need to update rings now + + if (selectedBonds.any() && reference_mol.getRingInfo()->isInitialized()) { + extracted_mol.getRingInfo()->reset(); + } +} + +static bool isSelectedSGroup(const SubstanceGroup &sgroup, + const SubsetInfo &selection_info) { + auto is_selected_component = [](auto &indices, auto &selection_test) { + return indices.empty() || + std::all_of(indices.begin(), indices.end(), selection_test); + }; + + auto atom_test = [&](int idx) { return selection_info.selectedAtoms[idx]; }; + auto bond_test = [&](int idx) { return selection_info.selectedBonds[idx]; }; + return is_selected_component(sgroup.getAtoms(), atom_test) && + is_selected_component(sgroup.getBonds(), bond_test) && + is_selected_component(sgroup.getParentAtoms(), atom_test); +} + +static void copySelectedSubstanceGroups(RWMol &extracted_mol, + const RDKit::ROMol &reference_mol, + const SubsetInfo &selection_info, + const SubsetOptions &) { + auto update_indices = [](auto &sgroup, auto getter, auto setter, + auto &mapping) { + auto indices = getter(sgroup); + std::for_each(indices.begin(), indices.end(), + [&](auto &idx) { idx = mapping.at(idx); }); + setter(sgroup, std::move(indices)); + }; + + const auto &[selectedAtoms, selectedBonds, atomMapping, bondMapping] = + selection_info; + for (const auto &sgroup : getSubstanceGroups(reference_mol)) { + if (!isSelectedSGroup(sgroup, selection_info)) { + continue; + } + + SubstanceGroup extracted_sgroup(sgroup); + extracted_sgroup.setOwningMol(&extracted_mol); + + update_indices(extracted_sgroup, std::mem_fn(&SubstanceGroup::getAtoms), + std::mem_fn(&SubstanceGroup::setAtoms), atomMapping); + update_indices(extracted_sgroup, + std::mem_fn(&SubstanceGroup::getParentAtoms), + std::mem_fn(&SubstanceGroup::setParentAtoms), atomMapping); + update_indices(extracted_sgroup, std::mem_fn(&SubstanceGroup::getBonds), + std::mem_fn(&SubstanceGroup::setBonds), bondMapping); + + addSubstanceGroup(extracted_mol, std::move(extracted_sgroup)); + } +} + +static void copySelectedStereoGroups(RWMol &extracted_mol, + const RDKit::ROMol &reference_mol, + const SubsetInfo &selection_info) { + auto is_selected_component = [](auto &objects, auto &selected_indices) { + return objects.empty() || + std::any_of(objects.begin(), objects.end(), [&](auto &object) { + return selected_indices[object->getIdx()]; + }); + }; + + auto is_selected_stereo_group = [&](const auto &stereo_group) { + return is_selected_component(stereo_group.getAtoms(), + selection_info.selectedAtoms) && + is_selected_component(stereo_group.getBonds(), + selection_info.selectedBonds); + }; + + std::vector extracted_atoms(extracted_mol.getNumAtoms()); + for (const auto &atom : extracted_mol.atoms()) { + extracted_atoms[atom->getIdx()] = atom; + } + + std::vector extracted_bonds(extracted_mol.getNumBonds()); + for (const auto &bond : extracted_mol.bonds()) { + extracted_bonds[bond->getIdx()] = bond; + } + + const auto &[selectedAtoms, selectedBonds, atomMapping, bondMapping] = + selection_info; + std::vector extracted_stereo_groups; + for (const auto &stereo_group : reference_mol.getStereoGroups()) { + if (!is_selected_stereo_group(stereo_group)) { + continue; + } + + std::vector atoms; + for (const auto &atom : stereo_group.getAtoms()) { + auto mapping = atomMapping.find(atom->getIdx()); + if (mapping != atomMapping.end()) { + atoms.push_back(extracted_atoms[mapping->second]); + } + } + + std::vector bonds; + for (const auto &bond : stereo_group.getBonds()) { + auto mapping = bondMapping.find(bond->getIdx()); + if (mapping != bondMapping.end()) { + bonds.push_back(extracted_bonds[mapping->second]); + } + } + + extracted_stereo_groups.push_back({stereo_group.getGroupType(), + std::move(atoms), std::move(bonds), + stereo_group.getReadId()}); + extracted_stereo_groups.back().setWriteId(stereo_group.getWriteId()); + } + + extracted_mol.setStereoGroups(std::move(extracted_stereo_groups)); +} + +static void getSubsetInfo(SubsetInfo &selection_info, const RDKit::ROMol &mol, + const std::vector &path, + const SubsetOptions &options) { + const auto num_atoms = mol.getNumAtoms(); + const auto num_bonds = mol.getNumBonds(); + selection_info.selectedAtoms.clear(); + selection_info.selectedAtoms.resize(num_atoms); + selection_info.selectedBonds.clear(); + selection_info.selectedBonds.resize(num_bonds); + + selection_info.atomMapping.clear(); + selection_info.bondMapping.clear(); + + auto &[selectedAtoms, selectedBonds, atomMapping, bondMapping] = + selection_info; + + if (options.method == SubsetMethod::BONDS_BETWEEN_ATOMS) { + for (const auto &atom_idx : path) { + if (atom_idx < num_atoms) { + selectedAtoms.set(atom_idx); + } + } + for (const auto &bond : mol.bonds()) { + if (selectedAtoms[bond->getBeginAtomIdx()] && + selectedAtoms[bond->getEndAtomIdx()]) { + selectedBonds.set(bond->getIdx()); + } + } + } else if (options.method == SubsetMethod::BONDS) { + for (const auto &bond_idx : path) { + if (bond_idx < num_bonds) { + selectedBonds.set(bond_idx); + const auto &bnd = mol.getBondWithIdx(bond_idx); + selectedAtoms.set(bnd->getBeginAtomIdx()); + selectedAtoms.set(bnd->getEndAtomIdx()); + } + } + } +} + +void copyCoords(RDKit::RWMol ©, const RDKit::RWMol &mol, + SubsetInfo &subset_info) { + if (mol.getNumConformers()) { + // copy coordinates over: + for (auto confIt = mol.beginConformers(); confIt != mol.endConformers(); + ++confIt) { + auto *conf = new Conformer(copy.getNumAtoms()); + conf->set3D((*confIt)->is3D()); + for (auto &mapping : subset_info.atomMapping) { + conf->setAtomPos(mapping.second, (*confIt)->getAtomPos(mapping.first)); + } + conf->setId((*confIt)->getId()); + copy.addConformer(conf, false); + } + } +} + +std::unique_ptr copyMolSubset(const RDKit::ROMol &mol, + SubsetInfo &selection_info, + const SubsetOptions &options) { + auto extracted_mol = std::make_unique(); + copySelectedAtomsAndBonds(*extracted_mol, mol, selection_info, options); + copySelectedSubstanceGroups(*extracted_mol, mol, selection_info, options); + copySelectedStereoGroups(*extracted_mol, mol, selection_info); + if (options.copyCoordinates) { + copyCoords(*extracted_mol, mol, selection_info); + } + + if (options.sanitize) { + MolOps::sanitizeMol(*extracted_mol); + } + + if (options.clearComputedProps) { + // this clears atom/bond and molecule computed props + extracted_mol->clearComputedProps(); + } else { + // this copies the mol computed props over, the atoms/bonds already have + // their's + copyComputedProps(mol, *extracted_mol); + } + + return extracted_mol; +} + +} // namespace + +std::unique_ptr copyMolSubset( + const RDKit::ROMol &mol, const std::vector &atoms, + const std::vector &bonds, const SubsetOptions &options) { + SubsetInfo mappings; + return copyMolSubset(mol, atoms, bonds, mappings, options); +} + +std::unique_ptr copyMolSubset( + const RDKit::ROMol &mol, const std::vector &atoms, + const std::vector &bonds, SubsetInfo &selection_info, + const SubsetOptions &options) { + const auto natoms = mol.getNumAtoms(); + const auto nbonds = mol.getNumBonds(); + + if ((atoms.size() == natoms && + options.method == SubsetMethod::BONDS_BETWEEN_ATOMS) || + (bonds.size() == nbonds && options.method == SubsetMethod::BONDS) || + (atoms.size() == natoms && bonds.size() == nbonds)) { + // optimization to copy the entire thing + return std::make_unique(mol); + } + + selection_info.selectedAtoms.reset(); + selection_info.selectedAtoms.resize(natoms); + selection_info.selectedBonds.reset(); + selection_info.selectedBonds.resize(nbonds); + selection_info.atomMapping.clear(); + selection_info.bondMapping.clear(); + + for (auto v : atoms) { + selection_info.selectedAtoms.set(v); + } + for (auto v : bonds) { + selection_info.selectedBonds.set(v); + } + + auto res = copyMolSubset(mol, selection_info, options); + return res; +} + +std::unique_ptr copyMolSubset( + const RDKit::ROMol &mol, const std::vector &path, + const SubsetOptions &options) { + SubsetInfo selection_info; + return copyMolSubset(mol, path, selection_info, options); +} + +std::unique_ptr copyMolSubset( + const RDKit::ROMol &mol, const std::vector &path, + SubsetInfo &selection_info, const SubsetOptions &options) { + getSubsetInfo(selection_info, mol, path, options); + auto res = copyMolSubset(mol, selection_info, options); + return res; +} + +} // namespace RDKit diff --git a/Code/GraphMol/Subset.h b/Code/GraphMol/Subset.h new file mode 100644 index 000000000..6b7e79a7f --- /dev/null +++ b/Code/GraphMol/Subset.h @@ -0,0 +1,129 @@ +// +// Copyright (C) 2025 Hussein Faara, Brian Kelley and other RDKit contributors +// +// @@ All Rights Reserved @@ +// This file is part of the RDKit. +// The contents are covered by the terms of the BSD license +// which is included in the file license.txt, found at the root +// of the RDKit source tree. +// +#ifndef RD_SUBSET_H +#define RD_SUBSET_H + +#include +#include + +#include +#include + +namespace RDKit { +class RWMol; + +//! Subsetting Methods for copyMolSubset +/* + * These control what the path structures mean in copyMolSubset + * + * NOTE: when the atoms and bonds are fully specified, the subset method is + * ignored. + */ +enum class SubsetMethod { + BONDS_BETWEEN_ATOMS = + 0, // selectedAtoms; /**< atoms selected in the subset */ + boost::dynamic_bitset<> selectedBonds; /**< bonds selected in the subset */ + std::map + atomMapping; /**< mapping of original atom indices to new ones */ + std::map + bondMapping; /**< mapping of original bond indices to new ones */ +}; + +//! +/* + * Extract a subgraph from an ROMol. Bonds, atoms, substance groups and + * stereo groups are only extracted to the subgraph if all participant entities + * are contained within the given atoms and bonds. + * + * \param mol - starting mol + * \param atoms - atoms to extract + * \param bonds - bonds to extract + * \param subsetInfo - optional subsetInfo to record the atoms and bonds used + * \param options - subset options, note the method is ignored since all the + * atoms and bonds are specified + * + * + * NOTE: Bookmarks are currently copied, StereoGroups that are not entirely + * included in the subset are not copied. + * + * NOTE: when using this method the SubsetOptions.method is ignored + * for resolving the atoms and bonds to use to subset + */ + +RDKIT_GRAPHMOL_EXPORT +std::unique_ptr copyMolSubset( + const RDKit::ROMol &mol, const std::vector &atoms, + const std::vector &bonds, + const SubsetOptions &options = SubsetOptions()); + +RDKIT_GRAPHMOL_EXPORT +std::unique_ptr copyMolSubset( + const RDKit::ROMol &mol, const std::vector &atoms, + const std::vector &bonds, SubsetInfo &subsetInfo, + const SubsetOptions &options = SubsetOptions()); + +//! +/* + * Extract a subgraph from an ROMol. Bonds, substance groups and + * stereo groups are only extracted to the subgraph if all participant entities + * are selected by the `path` parameter. + * + * \param mol - starting mol + * \param path - the indices of atoms or bonds to extract. If an index falls + * outside of the acceptable indices, it is ignored. + * use SubsetMethod::BONDS to indicate a bond path and + * SubsetMethod::BONDS_BETWEEN_ATOMS to indicate an atom path + * and any bond that includes both atoms in the path + * \param subsetInfo - optional subsetInfo to record the atoms and bonds used + * \param option - optional SubsetOptions to control how the subset is created + * + * NOTE: Bookmarks are currently copied, StereoGroups that are not entirely + * included in the subset are not copied. + * + */ +RDKIT_GRAPHMOL_EXPORT std::unique_ptr copyMolSubset( + const RDKit::ROMol &mol, const std::vector &path, + const SubsetOptions &options = SubsetOptions()); + +RDKIT_GRAPHMOL_EXPORT std::unique_ptr copyMolSubset( + const RDKit::ROMol &mol, const std::vector &path, + SubsetInfo &subsetInfo, const SubsetOptions &options = SubsetOptions()); + +} // namespace RDKit +#endif diff --git a/Code/GraphMol/Wrap/CMakeLists.txt b/Code/GraphMol/Wrap/CMakeLists.txt index 5f01f1e68..189baafa7 100644 --- a/Code/GraphMol/Wrap/CMakeLists.txt +++ b/Code/GraphMol/Wrap/CMakeLists.txt @@ -86,3 +86,6 @@ add_pytest(pyTestPropertyLists add_pytest(pyCDXMLTest ${CMAKE_CURRENT_SOURCE_DIR}/test_cdxml.py) + +add_pytest(pySubsetTest + ${CMAKE_CURRENT_SOURCE_DIR}/test_subset.py) diff --git a/Code/GraphMol/Wrap/MolOps.cpp b/Code/GraphMol/Wrap/MolOps.cpp index 02cda207b..189a7589f 100644 --- a/Code/GraphMol/Wrap/MolOps.cpp +++ b/Code/GraphMol/Wrap/MolOps.cpp @@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include @@ -38,6 +39,8 @@ #include #include +#include + namespace python = boost::python; using boost_adaptbx::python::streambuf; @@ -1098,6 +1101,60 @@ python::object findMesoHelper(const ROMol &mol, bool includeIsotopes, return python::tuple(res); } +ROMol *copyMolSubsetHelper1(const ROMol &mol, + python::object pyAtomIndices, + python::object pyBondIndices, + const SubsetOptions &options = SubsetOptions()) { + auto atomIndices = pythonObjectToVect(pyAtomIndices); + auto bondIndices = pythonObjectToVect(pyBondIndices); + if (!atomIndices.get()) { + atomIndices = std::make_unique>(); + } + if (!bondIndices.get()) { + bondIndices = std::make_unique>(); + } + + return copyMolSubset(mol, *atomIndices, *bondIndices, options).release(); +} + +ROMol *copyMolSubsetHelper2(const ROMol &mol, + python::object pyAtomIndices, + python::object pyBondIndices, + SubsetInfo &info, + const SubsetOptions &options = SubsetOptions()) { + auto atomIndices = pythonObjectToVect(pyAtomIndices); + auto bondIndices = pythonObjectToVect(pyBondIndices); + if (!atomIndices.get()) { + atomIndices = std::make_unique>(); + } + if (!bondIndices.get()) { + bondIndices = std::make_unique>(); + } + + return copyMolSubset(mol, *atomIndices, *bondIndices, info, options).release(); +} + + +ROMol *copyMolSubsetHelper3(const ROMol &mol, + python::object path, + const SubsetOptions &options = SubsetOptions()) { + auto pathvect = pythonObjectToVect(path); + if (!pathvect.get()) + pathvect = std::make_unique>(); + return copyMolSubset(mol, *pathvect, options).release(); +} + +ROMol *copyMolSubsetHelper4(const ROMol &mol, + python::object path, + SubsetInfo &selectionInfo, + const SubsetOptions &options = SubsetOptions()) { + auto pathvect = pythonObjectToVect(path); + if (!pathvect.get()) + pathvect = std::make_unique>(); + return copyMolSubset(mol, *pathvect, selectionInfo, options).release(); +} + + struct molops_wrapper { static void wrap() { std::string docString; @@ -3327,6 +3384,93 @@ enantiomer" or "OR enantiomer". CIP labels, if present, are removed. "AtomHasConjugatedBond", MolOps::atomHasConjugatedBond, (python::arg("atom")), "returns whether or not the atom is involved in a conjugated bond"); + + + python::enum_("SubsetMethod") + .value("BONDS_BETWEEN_ATOMS", + RDKit::SubsetMethod::BONDS_BETWEEN_ATOMS) + .value("BONDS", RDKit::SubsetMethod::BONDS); + + python::class_>("BoolVector") + .def(python::vector_indexing_suite>()); + + python::class_("SubsetOptions") + .def_readwrite("sanitize", + &RDKit::SubsetOptions::sanitize, "Sanitize the resulting subset") + .def_readwrite("clearComputedProps", &RDKit::SubsetOptions::clearComputedProps, + "clear all computed props on the subsetted molecule") + .def_readwrite("copyAsQuery", &RDKit::SubsetOptions::copyAsQuery, + "Return the subset as a query") + .def_readwrite("copyCoordinates", &RDKit::SubsetOptions::copyCoordinates, + "Copy the active coordinates from the molecule") + .def_readwrite("conformerIdx", &RDKit::SubsetOptions::conformerIdx, + "What conformer idx to use for the coordinates default is -1") + .def_readwrite("method", &RDKit::SubsetOptions::method, + "Subsetting method to use"); + + if (!is_python_converter_registered>()) { + python::class_>("UIntUIntMap") + .def(python::map_indexing_suite, true>()); + } + + python::class_("SubsetInfo") + .def_readwrite("atomMapping", &RDKit::SubsetInfo::atomMapping, + "mapping from the original atom index to the subset atom index") + .def_readwrite("bondMapping", &RDKit::SubsetInfo::bondMapping, + " mapping from the original bond index to the subset bond index"); + + + + docString = "Extract a subgraph from an ROMol. Bonds, atoms, substance groups and \n\ +stereo groups are only extracted to the subgraph if all participant entities \n\ +are contained within the given atoms and bonds. \n\ +\n\ +ARGUMENTS:\n\ + - mol - starting mol \n\ + - atoms - indices atoms to extract \n\ + - bonds - indices bonds to extract \n\ + - subsetInfo - optional subsetInfo to record the atoms and bonds used \n\ + - options - optional subset options, note the method is ignored since all the atoms and bonds are sp \n\ +\n"; + + python::def("CopyMolSubset", copyMolSubsetHelper1, + (python::arg("mol"), python::arg("atomIndices"), python::arg("bondIndices"), + python::arg("options")=SubsetOptions()), + docString.c_str(), + python::return_value_policy()); + python::def("CopyMolSubset", copyMolSubsetHelper2, + (python::arg("mol"), python::arg("atomIndices"), python::arg("bondIndices"), + python::arg("subsetInfo"), + python::arg("options")=SubsetOptions()), + docString.c_str(), + python::return_value_policy()); + + docString = "Extract a subgraph from an ROMol. Bonds, atoms, substance groups and \n\ +stereo groups are only extracted to the subgraph if all participant entities \n\ +are contained within the given atoms and bonds. \n\ +\n\ +ARGUMENTS:\n\ + - mol - starting mol \n\ + - path - the indices of atoms or bonds to extract. If an index falls \n\ + outside of the acceptable indices, it is ignored.yes \n\ + Use SubsetMethod.BONDS to indicate a bond path and BONDS_BETWEEN_ATOMS \n\ + to indicate an atom path with any bond that includes atoms in the path. \n\ + - subsetInfo - optional subsetInfo to record the atoms and bonds used \n\ + - options - optional subset options, note the method is ignored since all the atoms and bonds are sp \n\ +\n"; + + python::def("CopyMolSubset", copyMolSubsetHelper3, + (python::arg("mol"), python::arg("path"), + python::arg("options")=SubsetOptions()), + "copy a subset of a molecule", + python::return_value_policy()); + python::def("CopyMolSubset", copyMolSubsetHelper4, + (python::arg("mol"), python::arg("path"), + python::arg("subsetInfo"), + python::arg("options")=SubsetOptions()), + "copy a subset of a molecule", + python::return_value_policy()); + } }; } // namespace RDKit diff --git a/Code/GraphMol/Wrap/test_subset.py b/Code/GraphMol/Wrap/test_subset.py new file mode 100644 index 000000000..c0892449f --- /dev/null +++ b/Code/GraphMol/Wrap/test_subset.py @@ -0,0 +1,75 @@ +# +# Copyright (C) 2025 Hussein Faara, Brian Kelley and other RDKit contributors +# All Rights Reserved +# +import doctest +import gc +import gzip +import logging +import os +import sys +import tempfile +import unittest +from io import StringIO + +from rdkit import Chem + +def check(info, selection): + for i,v in enumerate(selection): + assert (v and i in info) or (i not in info) + +class TestCase(unittest.TestCase): + + def test_subset(self): + smi = "c1ccccc1CCN" + m = Chem.MolFromSmiles(smi) + N = list(range(6)) + sub = Chem.CopyMolSubset(m, N) + self.assertEqual(Chem.MolToSmiles(sub), "c1ccccc1") + + info = Chem.SubsetInfo() + sub = Chem.CopyMolSubset(m, N, info) + check(info.atomMapping, [True]*6 + [False]*3) + check(info.bondMapping, + [True, True, True, True, True, False, False, False, True]) + atoms = [(0,0),(1,1),(2,2),(3,3),(4,4),(5,5)] + bonds = [(0,0),(1,1),(2,2),(3,3),(4,4),(8,5)] + + for src,dst in atoms: + self.assertEqual(info.atomMapping[src], dst) + for src,dst in bonds: + self.assertEqual(info.bondMapping[src], dst) + + opts = Chem.SubsetOptions() + opts.method = Chem.SubsetMethod.BONDS; + sub = Chem.CopyMolSubset(m, N, opts) + self.assertEqual(Chem.MolToSmiles(sub), "ccccccC") + + sub = Chem.CopyMolSubset(m, N, info, opts) + check(info.atomMapping, [True]*7 + [False]*2) + check(info.bondMapping, [True]*6 + [False]*3) + for i in N: + self.assertEqual(info.atomMapping[i], i) + self.assertEqual(info.bondMapping[i], i) + + N = list(range(6,11)) + sub = Chem.CopyMolSubset(m, N) + self.assertEqual(Chem.MolToSmiles(sub), "CCN") + info = Chem.SubsetInfo() + sub = Chem.CopyMolSubset(m, N, info) + self.assertEqual(Chem.MolToSmiles(sub), "CCN") + check(info.atomMapping, [False]*6 + [True]*3) + check(info.bondMapping, [False]*6+ [True]*2 + [False]) + + sub = Chem.CopyMolSubset(m, N, info, opts) + self.assertEqual(Chem.MolToSmiles(sub), "CCN.cc") + check(info.atomMapping, [True, False, False, False, False, True, True, True, True]) + check(info.bondMapping, [False, False, False, False, False, False, True, True, True]) + + m.SetProp("foo", "bar", computed=True) + sub = Chem.CopyMolSubset(m, N) + self.assertTrue(sub.HasProp("foo")) + + opts.clearComputedProps = True + sub = Chem.CopyMolSubset(m, N, opts) + self.assertFalse(sub.HasProp("foo")) diff --git a/Code/GraphMol/catch_molops.cpp b/Code/GraphMol/catch_molops.cpp index 087f3a1ba..d7665f550 100644 --- a/Code/GraphMol/catch_molops.cpp +++ b/Code/GraphMol/catch_molops.cpp @@ -12,6 +12,7 @@ #include #include +#include #include #include @@ -417,6 +418,208 @@ M END)CTAB"; } } +// helper api to get test data for copyMolSubset +struct SelectedComponents { + std::vector selected_atoms; + std::vector selected_bonds; +}; + +// helper api to get test mol for copyMolSubset api. +static std::unique_ptr getTestMol() { + std::unique_ptr mol{RDKit::SmilesToMol("CCCCCCCCCCCCCCC")}; + for (auto &atom : mol->atoms()) { + atom->setProp("orig_idx", atom->getIdx()); + } + + for (auto &bond : mol->bonds()) { + bond->setProp("orig_idx", bond->getIdx()); + } + + return mol; +} + +// Helper api to get the included atoms and bonds from test atom indices. +[[nodiscard]] static SelectedComponents get_selected_components( + ::RDKit::RWMol &mol, const std::vector &atom_ids) { + const auto num_atoms = mol.getNumAtoms(); + std::vector selected_atoms(num_atoms); + + for (auto &atom_idx : atom_ids) { + if (atom_idx < num_atoms) { + selected_atoms[atom_idx] = true; + } + } + + std::vector selected_bonds(mol.getNumBonds()); + for (auto &bond : mol.bonds()) { + if (selected_atoms[bond->getBeginAtomIdx()] && + selected_atoms[bond->getEndAtomIdx()]) { + selected_bonds[bond->getIdx()] = true; + } + } + + return {std::move(selected_atoms), std::move(selected_bonds)}; +} + +TEST_CASE("test_extract_atoms", "[copyMolSubset]") { + auto selected_atoms = GENERATE( + // unique values + std::vector{0, 2, 4, 6, 8, 10, 12}, + // duplicate values + std::vector{0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12}, + // values outside of atom indices + std::vector{0, 2, 4, 6, 8, 10, 12, 100, 200, 300}); + + std::vector expected_atoms{0, 2, 4, 6, 8, 10, 12}; + + auto mol = getTestMol(); + auto extracted_mol = copyMolSubset(*mol, selected_atoms); + REQUIRE(extracted_mol->getNumAtoms() == expected_atoms.size()); + + std::vector extracted_atoms; + for (auto &atom : extracted_mol->atoms()) { + extracted_atoms.push_back(atom->template getProp("orig_idx")); + } + + CHECK(extracted_atoms == expected_atoms); +} + +TEST_CASE("test_extract_bonds", "[copyMolSubset]") { + auto test_mol = getTestMol(); + + for (auto &bond : test_mol->bonds()) { + bond->setProp("test_prop", true); + } + + for (auto &bond : test_mol->bonds()) { + auto begin_idx = bond->getBeginAtomIdx(); + auto end_idx = bond->getEndAtomIdx(); + auto m = copyMolSubset(*test_mol, {begin_idx, end_idx}); + + REQUIRE(m->getNumBonds() == 1); + CHECK(m->getBondWithIdx(0)->getProp("test_prop") == true); + CHECK(m->getNumAtoms() == 2); + CHECK(m->getAtomWithIdx(0)->getProp("orig_idx") == begin_idx); + CHECK(m->getAtomWithIdx(1)->getProp("orig_idx") == end_idx); + } +} + +TEST_CASE("test_extract_substance_groups", "[copyMolSubset]") { + auto mol = getTestMol(); + ::RDKit::SubstanceGroup sgroup{mol.get(), "COP"}; + + auto test_sgroup_atoms = GENERATE(std::vector{}, + std::vector{0, 1, 2, 3, 4}, + std::vector{9, 10, 11}); + sgroup.setAtoms(test_sgroup_atoms); + + auto test_sgroup_bonds = + GENERATE(std::vector{}, std::vector{0, 1, 2}, + std::vector{3, 4, 5}); + sgroup.setBonds(test_sgroup_bonds); + + auto test_sgroup_patoms = + GENERATE(std::vector{}, std::vector{3, 4}, + std::vector{5, 6}); + sgroup.setParentAtoms(test_sgroup_patoms); + + ::RDKit::addSubstanceGroup(*mol, std::move(sgroup)); + + auto has_selected_components = [&](auto &components, auto &ref_bitset) { + return components.empty() || + std::ranges::all_of(components, [&](auto &idx) { + return idx < ref_bitset.size() && ref_bitset[idx]; + }); + }; + + auto test_selected_atoms = GENERATE( + std::vector{0, 1, 2, 3, 4, 5}, + std::vector{0, 2, 4, 6, 8, 10, 12}, + std::vector{0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12}, + std::vector{0, 2, 4, 6, 8, 10, 12, 100, 200, 300}); + + auto extracted_mol = copyMolSubset(*mol, test_selected_atoms); + + auto [selected_atoms, selected_bonds] = + get_selected_components(*mol, test_selected_atoms); + auto flag = ::RDKit::getSubstanceGroups(*extracted_mol).size() == 1; + REQUIRE(flag == + (has_selected_components(test_sgroup_atoms, selected_atoms) && + has_selected_components(test_sgroup_patoms, selected_atoms) && + has_selected_components(test_sgroup_bonds, selected_bonds))); + + if (flag) { + auto &extracted_sgroup = ::RDKit::getSubstanceGroups(*extracted_mol)[0]; + for (auto &idx : extracted_sgroup.getAtoms()) { + auto atom = extracted_mol->getAtomWithIdx(idx); + CHECK(selected_atoms[atom->template getProp("orig_idx")] == + true); + } + + for (auto &idx : extracted_sgroup.getParentAtoms()) { + auto atom = extracted_mol->getAtomWithIdx(idx); + CHECK(selected_atoms[atom->template getProp("orig_idx")] == + true); + } + + for (auto &idx : extracted_sgroup.getBonds()) { + auto bond = extracted_mol->getBondWithIdx(idx); + CHECK(selected_bonds[bond->template getProp("orig_idx")] == + true); + } + } +} + +TEST_CASE("test_extract_stereo_groups", "[copyMolSubset]") { + auto mol = getTestMol(); + + auto test_stereo_group_atoms = GENERATE( + std::vector{}, std::vector{0, 1, 2, 3, 4}, + std::vector{9, 10, 11}); + + std::vector<::RDKit::Atom *> sg_atoms; + for (auto &idx : test_stereo_group_atoms) { + sg_atoms.push_back(mol->getAtomWithIdx(idx)); + } + + auto test_stereo_group_bonds = + GENERATE(std::vector{}, std::vector{0, 1, 2}, + std::vector{3, 4, 5}); + + std::vector<::RDKit::Bond *> sg_bonds; + for (auto &idx : test_stereo_group_bonds) { + sg_bonds.push_back(mol->getBondWithIdx(idx)); + } + + ::RDKit::StereoGroup stereo_group{::RDKit::StereoGroupType::STEREO_ABSOLUTE, + std::move(sg_atoms), std::move(sg_bonds)}; + mol->setStereoGroups({std::move(stereo_group)}); + + auto test_selected_atoms = GENERATE( + std::vector{0, 1, 2, 3, 4, 5}, + std::vector{0, 2, 4, 6, 8, 10, 12}, + std::vector{0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12}, + std::vector{0, 2, 4, 6, 8, 10, 12, 100, 200, 300}); + + auto extracted_mol = copyMolSubset(*mol, test_selected_atoms); + + auto [selected_atoms, selected_bonds] = + get_selected_components(*mol, test_selected_atoms); + + auto flag = extracted_mol->getStereoGroups().size() == 1; + + if (flag) { + auto &extracted_stereo_group = extracted_mol->getStereoGroups()[0]; + for (auto &atom : extracted_stereo_group.getAtoms()) { + CHECK(selected_atoms[atom->template getProp("orig_idx")] == true); + } + + for (auto &bond : extracted_stereo_group.getBonds()) { + CHECK(selected_bonds[bond->template getProp("orig_idx")] == true); + } + } +} + TEST_CASE("GitHub #8726: Do not remove hydrides by default") { auto m = "[OH+][H-]"_smiles; REQUIRE(m); @@ -435,4 +638,4 @@ TEST_CASE("Github #8945") { CHECK(MolToSmiles(*m1) == MolToSmiles(*m2)); } -} \ No newline at end of file +} diff --git a/Code/JavaWrappers/Subset.i b/Code/JavaWrappers/Subset.i new file mode 100644 index 000000000..6403dbe78 --- /dev/null +++ b/Code/JavaWrappers/Subset.i @@ -0,0 +1,73 @@ +// +// Copyright (C) 2025 and other RDKit contributors +// +// @@ All Rights Reserved @@ +// This file is part of the RDKit. +// The contents are covered by the terms of the BSD license +// which is included in the file license.txt, found at the root +// of the RDKit source tree. +// +%{ +#include +%} + +%ignore RDKit::copyMolSubset; +%include + +%{ +RDKit::ROMol *copyMolSubsetHelper(const RDKit::ROMol& mol, + const std::vector &atoms, + const std::vector &bonds, + const RDKit::SubsetOptions &options=RDKit::SubsetOptions() + ) +{ + return copyMolSubset(mol, atoms, bonds, options).release(); +} + + +RDKit::ROMol *copyMolSubsetHelper(const RDKit::ROMol& mol, + const std::vector &atoms, + const std::vector &bonds, + RDKit::SubsetInfo &subsetInfo, + const RDKit::SubsetOptions &options=RDKit::SubsetOptions() + ) { + return copyMolSubset(mol, atoms, bonds, subsetInfo, options).release(); +} + +RDKit::ROMol *copyMolSubsetHelper(const RDKit::ROMol& mol, + const std::vector& path, + const RDKit::SubsetOptions &options = RDKit::SubsetOptions()) { + return copyMolSubset(mol, path, options).release(); +} + + +RDKit::ROMol *copyMolSubsetHelper(const RDKit::ROMol& mol, + const std::vector& path, + RDKit::SubsetInfo &subsetInfo, + const RDKit::SubsetOptions &options = RDKit::SubsetOptions()) { + return copyMolSubset(mol, path, subsetInfo, options).release(); +} + +%} + +%rename("copyMolSubset") copyMolSubsetHelper; + +RDKit::ROMol *copyMolSubsetHelper(const RDKit::ROMol& mol, + const std::vector &atoms, + const std::vector &bonds, + const RDKit::SubsetOptions &options=RDKit::SubsetOptions() + ); +RDKit::ROMol *copyMolSubsetHelper(const RDKit::ROMol& mol, + const std::vector &atoms, + const std::vector &bonds, + RDKit::SubsetInfo &subsetInfo, + const RDKit::SubsetOptions &options=SRDKit::ubsetOptions() + ); +RDKit::ROMol *copyMolSubsetHelper(const RDKit::ROMol& mol, + const std::vector& path, + const RDKit::SubsetOptions &options = SubsetOptions()); + +RDKit::ROMol *copyMolSubsetHelper(const RDKit::ROMol& mol, + const std::vector& path, + RDKit::SubsetInfo &subsetInfo, + const RDKit::SubsetOptions &options = RDKit::SubsetOptions()); diff --git a/Code/JavaWrappers/gmwrapper/GraphMolJava.i b/Code/JavaWrappers/gmwrapper/GraphMolJava.i index 8f350857f..1c37b5fb4 100644 --- a/Code/JavaWrappers/gmwrapper/GraphMolJava.i +++ b/Code/JavaWrappers/gmwrapper/GraphMolJava.i @@ -227,6 +227,7 @@ typedef unsigned long long int uintmax_t; %include "../GeneralizedSubstruct.i" %include "../RascalMCES.i" %include "../Queries.i" +%include "../Subset.i" // Create a class to throw various sorts of errors for testing. Required for unit tests in ErrorHandlingTests.java #ifdef INCLUDE_ERROR_GENERATOR diff --git a/Code/JavaWrappers/gmwrapper/src-test/org/RDKit/ChemTransformsTests.java b/Code/JavaWrappers/gmwrapper/src-test/org/RDKit/ChemTransformsTests.java index 1ac243f59..546ddc980 100644 --- a/Code/JavaWrappers/gmwrapper/src-test/org/RDKit/ChemTransformsTests.java +++ b/Code/JavaWrappers/gmwrapper/src-test/org/RDKit/ChemTransformsTests.java @@ -70,6 +70,19 @@ public class ChemTransformsTests extends GraphMolTest { } + @Test + public void testSubset() { + ROMol mol = RWMol.MolFromSmiles("c1ccccc1CCN"); + UInt_Vect vect = new UInt_Vect(mol.getNumAtoms()); + for(int i=0;i<6;++i) { + vect.add(i); + } + // atom copy + ROMol sub = RDKFuncs.copyMolSubset(mol, vect); + assertEquals("c1ccccc1", sub.MolToSmiles()); + + } + public static void main(String args[]) { org.junit.runner.JUnitCore.main("org.RDKit.ChemTransformsTests"); }