// // Copyright (C) 2003-2025 Greg Landrum and other RDKit contributors // @@ All Rights Reserved @@ // This file is part of the RDKit. // The contents are covered by the terms of the BSD license // which is included in the file license.txt, found at the root // of the RDKit source tree. // #include "SubstructUtils.h" #include #include #include #include #include #include #include #include #include #include namespace RDKit { namespace detail { // Helper class used by the sortMatchesByDegreeOfCoreSubstitution // and getMostSubstitutedCoreMatch functions. A penalty of 1.0 is assigned // to matches for each terminal dummy atom matching hydrogen. // To make the sort stable in case of ties, a fraction of 1.0 // is added to each score based on match indices. class ScoreMatchesByDegreeOfCoreSubstitution { public: typedef std::pair IdxScorePair; ScoreMatchesByDegreeOfCoreSubstitution( const RDKit::ROMol &mol, const RDKit::ROMol &query, const std::vector &matches) : d_mol(mol), d_query(query), d_matches(matches), d_sumIndices(0.0), d_minIdx(-1), d_isSorted(false) { PRECONDITION(!matches.empty(), "matches must not be empty"); auto na = d_mol.getNumAtoms(); d_sumIndices = static_cast(na * (na + 1) / 2); unsigned int i = 0; d_matchIdxVsScore.reserve(d_matches.size()); for (const auto &match : d_matches) { d_matchIdxVsScore.emplace_back(i++, computeScore(match)); } } const RDKit::MatchVectType &getMostSubstitutedCoreMatch() { if (d_minIdx == -1) { d_minIdx = std::min_element(d_matchIdxVsScore.begin(), d_matchIdxVsScore.end(), compare) ->first; } return d_matches.at(d_minIdx); } std::vector sortMatchesByDegreeOfCoreSubstitution() { if (!d_isSorted) { std::sort(d_matchIdxVsScore.begin(), d_matchIdxVsScore.end(), compare); d_isSorted = true; d_minIdx = d_matchIdxVsScore.front().first; } std::vector res(d_matches.size()); std::transform( d_matchIdxVsScore.begin(), d_matchIdxVsScore.end(), res.begin(), [this](const IdxScorePair &pair) { return d_matches.at(pair.first); }); return res; } private: static bool compare(const IdxScorePair &aPair, const IdxScorePair &bPair) { return (aPair.second < bPair.second); } bool doesRGroupMatchHydrogen(const std::pair &pair) const { const auto queryAtom = d_query.getAtomWithIdx(pair.first); const auto molAtom = d_mol.getAtomWithIdx(pair.second); return (molAtom->getAtomicNum() == 1 && isAtomTerminalRGroupOrQueryHydrogen(queryAtom)); } double computeScore(const RDKit::MatchVectType &match) const { double penalty = 0.0; double i = 0.0; for (const auto &pair : match) { i += static_cast(pair.second); if (doesRGroupMatchHydrogen(pair)) { penalty += 1.0; } } penalty += i / d_sumIndices; return penalty; } const RDKit::ROMol &d_mol; const RDKit::ROMol &d_query; const std::vector &d_matches; std::vector d_matchIdxVsScore; double d_sumIndices; int d_minIdx; bool d_isSorted; }; } // namespace detail bool propertyCompat(const RDProps *r1, const RDProps *r2, const std::vector &properties) { PRECONDITION(r1, "bad RDProps"); PRECONDITION(r2, "bad RDProps"); for (const auto &prop : properties) { std::string prop1; bool hasprop1 = r1->getPropIfPresent(prop, prop1); std::string prop2; bool hasprop2 = r2->getPropIfPresent(prop, prop2); if (hasprop1 && hasprop2) { if (prop1 != prop2) { return false; } } else if (hasprop1 || hasprop2) { // only one has the property return false; } } return true; } bool atomCompat(const Atom *a1, const Atom *a2, const SubstructMatchParameters &ps) { PRECONDITION(a1, "bad atom"); PRECONDITION(a2, "bad atom"); // std::cerr << "\t\tatomCompat: "<< a1 << " " << a1->getIdx() << "-" << a2 << // " " << a2->getIdx() << std::endl; if (ps.extraAtomCheckOverridesDefaultCheck && ps.extraAtomCheck) { return ps.extraAtomCheck(*a1, *a2); } bool res; if (ps.useQueryQueryMatches && a1->hasQuery() && a2->hasQuery()) { res = static_cast(a1)->QueryMatch( static_cast(a2)); } else { res = a1->Match(a2); } if (!res) { return false; } if (!ps.atomProperties.empty()) { if (!propertyCompat(a1, a2, ps.atomProperties)) { return false; } } if (ps.extraAtomCheck && !ps.extraAtomCheck(*a1, *a2)) { return false; } return res; } bool chiralAtomCompat(const Atom *&a1, const Atom *&a2) { /// DEPRECATED PRECONDITION(a1, "bad atom"); PRECONDITION(a2, "bad atom"); bool res = a1->Match(a2); if (res) { std::string s1, s2; bool hascode1 = a1->getPropIfPresent(common_properties::_CIPCode, s1); bool hascode2 = a2->getPropIfPresent(common_properties::_CIPCode, s2); if (hascode1 || hascode2) { res = hascode1 && hascode2 && s1 == s2; } } std::cerr << "\t\tchiralAtomCompat: " << a1 << " " << a1->getIdx() << "-" << a2 << " " << a2->getIdx() << std::endl; std::cerr << "\t\t " << res << std::endl; return res; } bool bondCompat(const Bond *b1, const Bond *b2, const SubstructMatchParameters &ps) { PRECONDITION(b1, "bad bond"); PRECONDITION(b2, "bad bond"); if (ps.extraBondCheckOverridesDefaultCheck && ps.extraBondCheck) { return ps.extraBondCheck(*b1, *b2); } bool res; auto isConjugatedSingleOrDoubleBond([](const Bond *bond) { return bond->getIsConjugated() && (bond->getBondType() == Bond::SINGLE || bond->getBondType() == Bond::DOUBLE); }); auto isSingleOrDoubleBond([](const Bond *bond) { return (bond->getBondType() == Bond::SINGLE || bond->getBondType() == Bond::DOUBLE); }); if (ps.useQueryQueryMatches && b1->hasQuery() && b2->hasQuery()) { res = static_cast(b1)->QueryMatch( static_cast(b2)); } else if (ps.aromaticMatchesConjugated && !b1->hasQuery() && !b2->hasQuery() && ((b1->getBondType() == Bond::AROMATIC && b2->getBondType() == Bond::AROMATIC) || (b1->getBondType() == Bond::AROMATIC && isConjugatedSingleOrDoubleBond(b2)) || (b2->getBondType() == Bond::AROMATIC && isConjugatedSingleOrDoubleBond(b1)))) { res = true; } else if (ps.aromaticMatchesSingleOrDouble && !b1->hasQuery() && !b2->hasQuery() && ((b1->getBondType() == Bond::AROMATIC && b2->getBondType() == Bond::AROMATIC) || (b1->getBondType() == Bond::AROMATIC && isSingleOrDoubleBond(b2)) || (b2->getBondType() == Bond::AROMATIC && isSingleOrDoubleBond(b1)))) { res = true; } else { res = b1->Match(b2); } if (!res) { return false; } if (b1->getBondType() == Bond::DATIVE && b2->getBondType() == Bond::DATIVE) { // for dative bonds we need to make sure that the direction also matches: if (!b1->getBeginAtom()->Match(b2->getBeginAtom()) || !b1->getEndAtom()->Match(b2->getEndAtom())) { return false; } } if (!ps.bondProperties.empty()) { if (!propertyCompat(b1, b2, ps.bondProperties)) { return false; } } if (ps.extraBondCheck && !ps.extraBondCheck(*b1, *b2)) { return false; } return res; } void removeDuplicates(std::vector &matches, unsigned int nAtoms) { // // This works by tracking the indices of the atoms in each match vector. // This can lead to unexpected behavior when looking at rings and queries // that don't specify bond orders. For example querying this molecule: // C1CCC=1 // with the pattern constructed from SMARTS C~C~C~C will return a // single match, despite the fact that there are 4 different paths // when valence is considered. The defense of this behavior is // that the 4 paths are equivalent in the semantics of the query. // Also, OELib returns the same results // std::unordered_set seen; std::vector res; res.reserve(matches.size()); seen.reserve(matches.size()); for (const auto &match : matches) { std::string val(nAtoms, '0'); for (const auto &ci : match) { val[ci.second] = '1'; } const bool inserted = seen.insert(std::move(val)).second; if (inserted) { res.push_back(match); } } res.shrink_to_fit(); matches = std::move(res); } const MatchVectType &getMostSubstitutedCoreMatch( const ROMol &mol, const ROMol &core, const std::vector &matches) { detail::ScoreMatchesByDegreeOfCoreSubstitution matchScorer(mol, core, matches); return matchScorer.getMostSubstitutedCoreMatch(); } std::vector sortMatchesByDegreeOfCoreSubstitution( const ROMol &mol, const ROMol &core, const std::vector &matches) { detail::ScoreMatchesByDegreeOfCoreSubstitution matchScorer(mol, core, matches); return matchScorer.sortMatchesByDegreeOfCoreSubstitution(); } bool isAtomTerminalRGroupOrQueryHydrogen(const Atom *atom) { return (atom->getDegree() == 1 && isAtomDummy(atom)) || (atom->hasQuery() && describeQuery(atom).find("AtomAtomicNum 1 = val") != std::string::npos); } #define PT_OPT_GET(opt) params.opt = pt.get(#opt, params.opt) #define PT_OPT_PUT(opt) pt.put(#opt, params.opt); void updateSubstructMatchParamsFromJSON(SubstructMatchParameters ¶ms, const std::string &json) { if (json.empty()) { return; } std::istringstream ss; ss.str(json); boost::property_tree::ptree pt; boost::property_tree::read_json(ss, pt); PT_OPT_GET(useChirality); PT_OPT_GET(useEnhancedStereo); PT_OPT_GET(aromaticMatchesConjugated); PT_OPT_GET(useQueryQueryMatches); PT_OPT_GET(recursionPossible); PT_OPT_GET(uniquify); PT_OPT_GET(maxMatches); PT_OPT_GET(maxRecursiveMatches); PT_OPT_GET(numThreads); PT_OPT_GET(specifiedStereoQueryMatchesUnspecified); PT_OPT_GET(aromaticMatchesSingleOrDouble); } std::string substructMatchParamsToJSON(const SubstructMatchParameters ¶ms) { boost::property_tree::ptree pt; PT_OPT_PUT(useChirality); PT_OPT_PUT(useEnhancedStereo); PT_OPT_PUT(aromaticMatchesConjugated); PT_OPT_PUT(useQueryQueryMatches); PT_OPT_PUT(recursionPossible); PT_OPT_PUT(uniquify); PT_OPT_PUT(maxMatches); PT_OPT_PUT(maxRecursiveMatches); PT_OPT_PUT(numThreads); PT_OPT_PUT(specifiedStereoQueryMatchesUnspecified); PT_OPT_PUT(aromaticMatchesSingleOrDouble); std::stringstream ss; boost::property_tree::json_parser::write_json(ss, pt); return ss.str(); } } // namespace RDKit