mirror of
https://github.com/rdkit/rdkit.git
synced 2026-06-04 21:54:27 +08:00
* backup * basic tests pass * add JSON out to substruct match parameters * serialize the substruct match parameters in reactions * add that to the python wrapper * more testing
260 lines
8.9 KiB
C++
260 lines
8.9 KiB
C++
//
|
|
// Copyright (C) 2003-2021 Greg Landrum and Rational Discovery LLC
|
|
//
|
|
// @@ All Rights Reserved @@
|
|
// This file is part of the RDKit.
|
|
// The contents are covered by the terms of the BSD license
|
|
// which is included in the file license.txt, found at the root
|
|
// of the RDKit source tree.
|
|
//
|
|
#include "SubstructUtils.h"
|
|
#include <set>
|
|
#include <RDGeneral/utils.h>
|
|
#include <GraphMol/RDKitBase.h>
|
|
#include <GraphMol/RDKitQueries.h>
|
|
#include <GraphMol/Substruct/SubstructUtils.h>
|
|
|
|
#include <RDGeneral/BoostStartInclude.h>
|
|
#include <boost/dynamic_bitset.hpp>
|
|
#include <boost/lexical_cast.hpp>
|
|
#include <boost/property_tree/ptree.hpp>
|
|
#include <boost/property_tree/json_parser.hpp>
|
|
#include <RDGeneral/BoostEndInclude.h>
|
|
|
|
namespace RDKit {
|
|
|
|
namespace detail {
|
|
// Helper class used by the sortMatchesByDegreeOfCoreSubstitution
|
|
// and getMostSubstitutedCoreMatch functions. A penalty of 1.0 is assigned
|
|
// to matches for each terminal dummy atom matching hydrogen.
|
|
// To make the sort stable in case of ties, a fraction of 1.0
|
|
// is added to each score based on match indices.
|
|
class ScoreMatchesByDegreeOfCoreSubstitution {
|
|
public:
|
|
typedef std::pair<unsigned int, double> IdxScorePair;
|
|
ScoreMatchesByDegreeOfCoreSubstitution(
|
|
const RDKit::ROMol &mol, const RDKit::ROMol &query,
|
|
const std::vector<RDKit::MatchVectType> &matches)
|
|
: d_mol(mol),
|
|
d_query(query),
|
|
d_matches(matches),
|
|
d_sumIndices(0.0),
|
|
d_minIdx(-1),
|
|
d_isSorted(false) {
|
|
PRECONDITION(!matches.empty(), "matches must not be empty");
|
|
for (unsigned int i = 0; i < d_mol.getNumAtoms(); ++i) {
|
|
d_sumIndices += static_cast<double>(i);
|
|
}
|
|
unsigned int i = 0;
|
|
d_matchIdxVsScore.reserve(d_matches.size());
|
|
for (const auto &match : d_matches) {
|
|
d_matchIdxVsScore.emplace_back(std::make_pair(i++, computeScore(match)));
|
|
}
|
|
}
|
|
const RDKit::MatchVectType &getMostSubstitutedCoreMatch() {
|
|
if (d_minIdx == -1) {
|
|
d_minIdx = std::min_element(d_matchIdxVsScore.begin(),
|
|
d_matchIdxVsScore.end(), compare)
|
|
->first;
|
|
}
|
|
return d_matches.at(d_minIdx);
|
|
}
|
|
std::vector<MatchVectType> sortMatchesByDegreeOfCoreSubstitution() {
|
|
if (!d_isSorted) {
|
|
std::sort(d_matchIdxVsScore.begin(), d_matchIdxVsScore.end(), compare);
|
|
d_isSorted = true;
|
|
d_minIdx = d_matchIdxVsScore.front().first;
|
|
}
|
|
std::vector<MatchVectType> res(d_matches.size());
|
|
std::transform(
|
|
d_matchIdxVsScore.begin(), d_matchIdxVsScore.end(), res.begin(),
|
|
[this](const IdxScorePair &pair) { return d_matches.at(pair.first); });
|
|
return res;
|
|
}
|
|
|
|
private:
|
|
static bool compare(const IdxScorePair &aPair, const IdxScorePair &bPair) {
|
|
return (aPair.second < bPair.second);
|
|
}
|
|
bool doesRGroupMatchHydrogen(const std::pair<int, int> &pair) const {
|
|
const auto queryAtom = d_query.getAtomWithIdx(pair.first);
|
|
const auto molAtom = d_mol.getAtomWithIdx(pair.second);
|
|
return (queryAtom->getAtomicNum() == 0 && queryAtom->getDegree() == 1 &&
|
|
molAtom->getAtomicNum() == 1);
|
|
}
|
|
double computeScore(const RDKit::MatchVectType &match) const {
|
|
double penalty = 0.0;
|
|
double i = 0.0;
|
|
for (const auto &pair : match) {
|
|
i += static_cast<double>(pair.second);
|
|
if (doesRGroupMatchHydrogen(pair)) {
|
|
penalty += 1.0;
|
|
}
|
|
}
|
|
penalty += i / d_sumIndices;
|
|
return penalty;
|
|
}
|
|
const RDKit::ROMol &d_mol;
|
|
const RDKit::ROMol &d_query;
|
|
const std::vector<RDKit::MatchVectType> &d_matches;
|
|
std::vector<IdxScorePair> d_matchIdxVsScore;
|
|
double d_sumIndices;
|
|
int d_minIdx;
|
|
bool d_isSorted;
|
|
};
|
|
} // namespace detail
|
|
|
|
bool atomCompat(const Atom *a1, const Atom *a2,
|
|
const SubstructMatchParameters &ps) {
|
|
PRECONDITION(a1, "bad atom");
|
|
PRECONDITION(a2, "bad atom");
|
|
// std::cerr << "\t\tatomCompat: "<< a1 << " " << a1->getIdx() << "-" << a2 <<
|
|
// " " << a2->getIdx() << std::endl;
|
|
bool res;
|
|
if (ps.useQueryQueryMatches && a1->hasQuery() && a2->hasQuery()) {
|
|
res = static_cast<const QueryAtom *>(a1)->QueryMatch(
|
|
static_cast<const QueryAtom *>(a2));
|
|
} else {
|
|
res = a1->Match(a2);
|
|
}
|
|
return res;
|
|
}
|
|
|
|
bool chiralAtomCompat(const Atom *&a1, const Atom *&a2) {
|
|
PRECONDITION(a1, "bad atom");
|
|
PRECONDITION(a2, "bad atom");
|
|
bool res = a1->Match(a2);
|
|
if (res) {
|
|
std::string s1, s2;
|
|
bool hascode1 = a1->getPropIfPresent(common_properties::_CIPCode, s1);
|
|
bool hascode2 = a2->getPropIfPresent(common_properties::_CIPCode, s2);
|
|
if (hascode1 || hascode2) {
|
|
res = hascode1 && hascode2 && s1 == s2;
|
|
}
|
|
}
|
|
std::cerr << "\t\tchiralAtomCompat: " << a1 << " " << a1->getIdx() << "-"
|
|
<< a2 << " " << a2->getIdx() << std::endl;
|
|
std::cerr << "\t\t " << res << std::endl;
|
|
return res;
|
|
}
|
|
|
|
bool bondCompat(const Bond *b1, const Bond *b2,
|
|
const SubstructMatchParameters &ps) {
|
|
PRECONDITION(b1, "bad bond");
|
|
PRECONDITION(b2, "bad bond");
|
|
bool res;
|
|
if (ps.useQueryQueryMatches && b1->hasQuery() && b2->hasQuery()) {
|
|
res = static_cast<const QueryBond *>(b1)->QueryMatch(
|
|
static_cast<const QueryBond *>(b2));
|
|
} else if (ps.aromaticMatchesConjugated && !b1->hasQuery() &&
|
|
!b2->hasQuery() &&
|
|
((b1->getBondType() == Bond::AROMATIC &&
|
|
b2->getBondType() == Bond::AROMATIC) ||
|
|
(b1->getBondType() == Bond::AROMATIC && b2->getIsConjugated()) ||
|
|
(b2->getBondType() == Bond::AROMATIC && b1->getIsConjugated()))) {
|
|
res = true;
|
|
} else {
|
|
res = b1->Match(b2);
|
|
}
|
|
if (res && b1->getBondType() == Bond::DATIVE &&
|
|
b2->getBondType() == Bond::DATIVE) {
|
|
// for dative bonds we need to make sure that the direction also matches:
|
|
if (!b1->getBeginAtom()->Match(b2->getBeginAtom()) ||
|
|
!b1->getEndAtom()->Match(b2->getEndAtom())) {
|
|
res = false;
|
|
}
|
|
}
|
|
// std::cerr << "\t\tbondCompat: " << b1->getIdx() << "-" << b2->getIdx() <<
|
|
// ":"
|
|
// << res << std::endl;
|
|
return res;
|
|
}
|
|
|
|
void removeDuplicates(std::vector<MatchVectType> &matches,
|
|
unsigned int nAtoms) {
|
|
//
|
|
// This works by tracking the indices of the atoms in each match vector.
|
|
// This can lead to unexpected behavior when looking at rings and queries
|
|
// that don't specify bond orders. For example querying this molecule:
|
|
// C1CCC=1
|
|
// with the pattern constructed from SMARTS C~C~C~C will return a
|
|
// single match, despite the fact that there are 4 different paths
|
|
// when valence is considered. The defense of this behavior is
|
|
// that the 4 paths are equivalent in the semantics of the query.
|
|
// Also, OELib returns the same results
|
|
//
|
|
std::set<boost::dynamic_bitset<>> seen;
|
|
std::vector<MatchVectType> res;
|
|
res.reserve(matches.size());
|
|
for (auto &&match : matches) {
|
|
boost::dynamic_bitset<> val(nAtoms);
|
|
for (const auto &ci : match) {
|
|
val.set(ci.second);
|
|
}
|
|
auto pos = seen.lower_bound(val);
|
|
if (pos == seen.end() || *pos != val) {
|
|
res.push_back(std::move(match));
|
|
seen.insert(pos, std::move(val));
|
|
}
|
|
}
|
|
res.shrink_to_fit();
|
|
matches = std::move(res);
|
|
}
|
|
|
|
// Returns a reference to the element of matches that has the lowest score
// according to detail::ScoreMatchesByDegreeOfCoreSubstitution, i.e. the match
// with the fewest terminal dummy atoms of core mapped onto hydrogens of mol.
// The returned reference points into matches, which must not be empty.
const MatchVectType &getMostSubstitutedCoreMatch(
    const ROMol &mol, const ROMol &core,
    const std::vector<MatchVectType> &matches) {
  // the scorer only refers to matches, so the reference it hands back stays
  // valid after the scorer itself is gone
  return detail::ScoreMatchesByDegreeOfCoreSubstitution(mol, core, matches)
      .getMostSubstitutedCoreMatch();
}
|
|
|
|
std::vector<MatchVectType> sortMatchesByDegreeOfCoreSubstitution(
|
|
const ROMol &mol, const ROMol &core,
|
|
const std::vector<MatchVectType> &matches) {
|
|
detail::ScoreMatchesByDegreeOfCoreSubstitution matchScorer(mol, core,
|
|
matches);
|
|
return matchScorer.sortMatchesByDegreeOfCoreSubstitution();
|
|
}
|
|
|
|
// Helpers for moving one option between a parameter object named `params`
// and a property tree named `pt`; the tree key is the name of the field.
// Both macros expand to a single expression WITHOUT a trailing semicolon, so
// the call site supplies it and the macros behave like ordinary statements
// (for instance inside an unbraced if/else).
// PT_OPT_GET: reads the option, passing the current value as the default.
#define PT_OPT_GET(opt) params.opt = pt.get(#opt, params.opt)
// PT_OPT_PUT: stores the current value of the option in the tree.
#define PT_OPT_PUT(opt) pt.put(#opt, params.opt)
|
|
|
|
void updateSubstructMatchParamsFromJSON(SubstructMatchParameters ¶ms,
|
|
const std::string &json) {
|
|
if (json.empty()) {
|
|
return;
|
|
}
|
|
std::istringstream ss;
|
|
ss.str(json);
|
|
boost::property_tree::ptree pt;
|
|
boost::property_tree::read_json(ss, pt);
|
|
PT_OPT_GET(useChirality);
|
|
PT_OPT_GET(useEnhancedStereo);
|
|
PT_OPT_GET(aromaticMatchesConjugated);
|
|
PT_OPT_GET(useQueryQueryMatches);
|
|
PT_OPT_GET(recursionPossible);
|
|
PT_OPT_GET(uniquify);
|
|
PT_OPT_GET(maxMatches);
|
|
PT_OPT_GET(numThreads);
|
|
}
|
|
|
|
// Serializes params to a JSON string.
// Each supported option (useChirality, useEnhancedStereo,
// aromaticMatchesConjugated, useQueryQueryMatches, recursionPossible,
// uniquify, maxMatches, numThreads) is stored in a boost property tree under
// the key with the same name, and the tree is written out with
// boost::property_tree::json_parser::write_json.
// These are the same keys that updateSubstructMatchParamsFromJSON() reads.
std::string substructMatchParamsToJSON(const SubstructMatchParameters &params) {
  boost::property_tree::ptree pt;

  PT_OPT_PUT(useChirality);
  PT_OPT_PUT(useEnhancedStereo);
  PT_OPT_PUT(aromaticMatchesConjugated);
  PT_OPT_PUT(useQueryQueryMatches);
  PT_OPT_PUT(recursionPossible);
  PT_OPT_PUT(uniquify);
  PT_OPT_PUT(maxMatches);
  PT_OPT_PUT(numThreads);

  std::stringstream ss;
  boost::property_tree::json_parser::write_json(ss, pt);
  return ss.str();
}
|
|
|
|
} // namespace RDKit
|