Files
rdkit/Code/GraphMol/Substruct/SubstructUtils.cpp
Paolo Tosco 19c9a3905c Enhanced generateDepictionMatching2DStructure functionality (#3811)
* - generateDepictionMatching2DStructure can be used with referencePattern smaller than reference
  to only use part of a scaffold
- adds generateDepictionMatching2DStructure overload to pass a matchVect instead of doing a substructure match
- adds allowRGroups parameter to enable using a scaffold bearing R groups as reference

* changes in response to review

* added comments

* fixes failing doctest

* - reverted change committed accidentally
- fixed get_sss_json for the case where R groups are not included in the match
- added tests for the return value of generate_aligned_coords

* Documented the value returned by GenerateDepictionMatching2DStructure (Python)

* changes in response to review

* - changes in response to review
- fixed sortMatchesByDegreeOfCoreSubstitution that was not working
- added Python wrappers for sortMatchesByDegreeOfCoreSubstitution and getMostSubstitutedCoreMatch
- added C++ and Python unit tests for the above

* added missing variable initialization

Co-authored-by: Tosco, Paolo <paolo.tosco@novartis.com>
2021-02-24 05:37:31 +01:00

211 lines
7.4 KiB
C++

//
// Copyright (C) 2003-2019 Greg Landrum and Rational Discovery LLC
//
// @@ All Rights Reserved @@
// This file is part of the RDKit.
// The contents are covered by the terms of the BSD license
// which is included in the file license.txt, found at the root
// of the RDKit source tree.
//
#include "SubstructUtils.h"
#include <algorithm>
#include <set>
#include <utility>
#include <RDGeneral/utils.h>
#include <GraphMol/RDKitBase.h>
#include <GraphMol/RDKitQueries.h>
#include <boost/dynamic_bitset.hpp>
namespace RDKit {
namespace detail {
// Helper class used by the sortMatchesByDegreeOfCoreSubstitution
// and getMostSubstitutedCoreMatch functions. A penalty of 1.0 is assigned
// to matches for each terminal dummy atom matching hydrogen.
// To make the sort stable in case of ties, a fraction of 1.0
// is added to each score based on match indices.
class ScoreMatchesByDegreeOfCoreSubstitution {
public:
typedef std::pair<unsigned int, double> IdxScorePair;
ScoreMatchesByDegreeOfCoreSubstitution(
const RDKit::ROMol &mol, const RDKit::ROMol &query,
const std::vector<RDKit::MatchVectType> &matches)
: d_mol(mol),
d_query(query),
d_matches(matches),
d_sumIndices(0.0),
d_minIdx(-1),
d_isSorted(false) {
PRECONDITION(!matches.empty(), "matches must not be empty");
for (unsigned int i = 0; i < d_mol.getNumAtoms(); ++i) {
d_sumIndices += static_cast<double>(i);
}
unsigned int i = 0;
d_matchIdxVsScore.reserve(d_matches.size());
for (const auto &match : d_matches) {
d_matchIdxVsScore.emplace_back(std::make_pair(i++, computeScore(match)));
}
}
const RDKit::MatchVectType &getMostSubstitutedCoreMatch() {
if (d_minIdx == -1) {
d_minIdx = std::min_element(d_matchIdxVsScore.begin(),
d_matchIdxVsScore.end(), compare)
->first;
}
return d_matches.at(d_minIdx);
}
std::vector<MatchVectType> sortMatchesByDegreeOfCoreSubstitution() {
if (!d_isSorted) {
std::sort(d_matchIdxVsScore.begin(), d_matchIdxVsScore.end(), compare);
d_isSorted = true;
d_minIdx = d_matchIdxVsScore.front().first;
}
std::vector<MatchVectType> res(d_matches.size());
std::transform(
d_matchIdxVsScore.begin(), d_matchIdxVsScore.end(), res.begin(),
[this](const IdxScorePair &pair) { return d_matches.at(pair.first); });
return res;
}
private:
static bool compare(const IdxScorePair &aPair, const IdxScorePair &bPair) {
return (aPair.second < bPair.second);
}
bool doesRGroupMatchHydrogen(const std::pair<int, int> &pair) const {
const auto queryAtom = d_query.getAtomWithIdx(pair.first);
const auto molAtom = d_mol.getAtomWithIdx(pair.second);
return (queryAtom->getAtomicNum() == 0 && queryAtom->getDegree() == 1 &&
molAtom->getAtomicNum() == 1);
}
double computeScore(const RDKit::MatchVectType &match) const {
double penalty = 0.0;
double i = 0.0;
for (const auto &pair : match) {
i += static_cast<double>(pair.second);
if (doesRGroupMatchHydrogen(pair)) {
penalty += 1.0;
}
}
penalty += i / d_sumIndices;
return penalty;
}
const RDKit::ROMol &d_mol;
const RDKit::ROMol &d_query;
const std::vector<RDKit::MatchVectType> &d_matches;
std::vector<IdxScorePair> d_matchIdxVsScore;
double d_sumIndices;
int d_minIdx;
bool d_isSorted;
};
} // namespace detail
// Decides whether atom a1 is compatible with atom a2 for substructure
// matching. If query-vs-query matching is enabled in ps and both atoms carry
// a query, the two queries are compared with QueryMatch(); in every other
// case the decision is delegated to a1->Match(a2).
bool atomCompat(const Atom *a1, const Atom *a2,
                const SubstructMatchParameters &ps) {
  PRECONDITION(a1, "bad atom");
  PRECONDITION(a2, "bad atom");
  // std::cerr << "\t\tatomCompat: "<< a1 << " " << a1->getIdx() << "-" << a2 <<
  // " " << a2->getIdx() << std::endl;
  const bool compareAsQueries =
      ps.useQueryQueryMatches && a1->hasQuery() && a2->hasQuery();
  if (!compareAsQueries) {
    return a1->Match(a2);
  }
  const auto *query1 = static_cast<const QueryAtom *>(a1);
  const auto *query2 = static_cast<const QueryAtom *>(a2);
  return query1->QueryMatch(query2);
}
// Decides whether atom a1 is compatible with atom a2 when chirality is taken
// into account.
//
// The atoms must first match via a1->Match(a2). If at least one of them has
// the _CIPCode property set, they are only compatible when both have it and
// the two codes are identical. If neither atom has a CIP code the result of
// the plain match is returned.
bool chiralAtomCompat(const Atom *&a1, const Atom *&a2) {
  PRECONDITION(a1, "bad atom");
  PRECONDITION(a2, "bad atom");
  bool res = a1->Match(a2);
  if (res) {
    std::string s1, s2;
    bool hascode1 = a1->getPropIfPresent(common_properties::_CIPCode, s1);
    bool hascode2 = a2->getPropIfPresent(common_properties::_CIPCode, s2);
    if (hascode1 || hascode2) {
      res = hascode1 && hascode2 && s1 == s2;
    }
  }
  // debugging trace, disabled like the ones in atomCompat/bondCompat so that
  // library calls do not write to stderr:
  // std::cerr << "\t\tchiralAtomCompat: " << a1 << " " << a1->getIdx() << "-"
  //           << a2 << " " << a2->getIdx() << std::endl;
  // std::cerr << "\t\t " << res << std::endl;
  return res;
}
bool bondCompat(const Bond *b1, const Bond *b2,
const SubstructMatchParameters &ps) {
PRECONDITION(b1, "bad bond");
PRECONDITION(b2, "bad bond");
bool res;
if (ps.useQueryQueryMatches && b1->hasQuery() && b2->hasQuery()) {
res = static_cast<const QueryBond *>(b1)->QueryMatch(
static_cast<const QueryBond *>(b2));
} else if (ps.aromaticMatchesConjugated && !b1->hasQuery() &&
!b2->hasQuery() &&
((b1->getBondType() == Bond::AROMATIC &&
b2->getBondType() == Bond::AROMATIC) ||
(b1->getBondType() == Bond::AROMATIC && b2->getIsConjugated()) ||
(b2->getBondType() == Bond::AROMATIC && b1->getIsConjugated()))) {
res = true;
} else {
res = b1->Match(b2);
}
if (res && b1->getBondType() == Bond::DATIVE &&
b2->getBondType() == Bond::DATIVE) {
// for dative bonds we need to make sure that the direction also matches:
if (!b1->getBeginAtom()->Match(b2->getBeginAtom()) ||
!b1->getEndAtom()->Match(b2->getEndAtom())) {
res = false;
}
}
// std::cerr << "\t\tbondCompat: " << b1->getIdx() << "-" << b2->getIdx() <<
// ":"
// << res << std::endl;
return res;
}
void removeDuplicates(std::vector<MatchVectType> &v, unsigned int nAtoms) {
//
// This works by tracking the indices of the atoms in each match vector.
// This can lead to unexpected behavior when looking at rings and queries
// that don't specify bond orders. For example querying this molecule:
// C1CCC=1
// with the pattern constructed from SMARTS C~C~C~C will return a
// single match, despite the fact that there are 4 different paths
// when valence is considered. The defense of this behavior is
// that the 4 paths are equivalent in the semantics of the query.
// Also, OELib returns the same results
//
std::vector<boost::dynamic_bitset<>> seen;
std::vector<MatchVectType> res;
for (std::vector<MatchVectType>::const_iterator i = v.begin(); i != v.end();
++i) {
boost::dynamic_bitset<> val(nAtoms);
for (const auto &ci : *i) {
val.set(ci.second);
}
if (std::find(seen.begin(), seen.end(), val) == seen.end()) {
// it's something new
res.push_back(*i);
seen.push_back(val);
}
}
v = res;
}
// Returns the match that detail::ScoreMatchesByDegreeOfCoreSubstitution
// ranks first (i.e. the one with the lowest score) among matches.
// The scorer only keeps a reference to matches, so the returned reference
// refers to an element of the vector supplied by the caller.
// The scorer's constructor requires matches to be non-empty.
const MatchVectType &getMostSubstitutedCoreMatch(
    const ROMol &mol, const ROMol &core,
    const std::vector<MatchVectType> &matches) {
  using Scorer = detail::ScoreMatchesByDegreeOfCoreSubstitution;
  return Scorer(mol, core, matches).getMostSubstitutedCoreMatch();
}
std::vector<MatchVectType> sortMatchesByDegreeOfCoreSubstitution(
const ROMol &mol, const ROMol &core,
const std::vector<MatchVectType> &matches) {
detail::ScoreMatchesByDegreeOfCoreSubstitution matchScorer(mol, core,
matches);
return matchScorer.sortMatchesByDegreeOfCoreSubstitution();
}
} // namespace RDKit