* Fixes #888

* support older versions of boost
support for hashing dynamic_bitset was not added until v1.71

* changes in response to review
This commit is contained in:
Greg Landrum
2023-01-30 17:18:22 +01:00
committed by GitHub
parent 0f88141c09
commit 4e1a590b9f
3 changed files with 134 additions and 13 deletions

View File

@@ -197,7 +197,7 @@ MolMatchFinalCheckFunctor::MolMatchFinalCheckFunctor(
}
bool MolMatchFinalCheckFunctor::operator()(const std::uint32_t q_c[],
const std::uint32_t m_c[]) const {
const std::uint32_t m_c[]) {
if (d_params.extraFinalCheck || d_params.useGenericMatchers) {
// EFF: we can no-doubt do better than this
std::vector<unsigned int> aids(m_c, m_c + d_query.getNumAtoms());
@@ -212,7 +212,27 @@ bool MolMatchFinalCheckFunctor::operator()(const std::uint32_t q_c[],
return false;
}
}
HashedStorageType match;
if (d_params.uniquify) {
match.resize(d_mol.getNumAtoms());
#ifdef RDK_INTERNAL_BITSET_HAS_HASH
match.reset();
#else
std::fill(match.begin(), match.end(), 0);
#endif
for (unsigned int i = 0; i < d_query.getNumAtoms(); ++i) {
match[m_c[i]] = 1;
}
if (matchesSeen.find(match) != matchesSeen.end()) {
return false;
}
}
if (!d_params.useChirality) {
if (d_params.uniquify) {
matchesSeen.insert(match);
}
return true;
}
@@ -356,7 +376,9 @@ bool MolMatchFinalCheckFunctor::operator()(const std::uint32_t q_c[],
return false;
}
}
if (d_params.uniquify) {
matchesSeen.insert(match);
}
return true;
}
@@ -502,9 +524,6 @@ std::vector<MatchVectType> SubstructMatch(
}
matches.push_back(matchVect);
}
if (params.uniquify) {
removeDuplicates(matches, mol.getNumAtoms());
}
}
return matches;
}
@@ -597,16 +616,24 @@ unsigned int RecursiveMatcher(const ROMol &mol, const ROMol &query,
SUBQUERY_MAP &subqueryMap,
const SubstructMatchParameters &params,
std::vector<RecursiveStructureQuery *> &locked) {
SubstructMatchParameters lparams = params;
// NOTE: maxMatches and recursive queries is problematic. To make this really
// cover all cases we'd need a separate parameter for the number of possible
// recursive matches. We will add that for the 2023.03 release; for now
// we can still fix #888 without introducing any new problems using this
// heuristic:
lparams.maxMatches = std::max(1000u, params.maxMatches);
lparams.uniquify = false;
ROMol::ConstAtomIterator atIt;
for (atIt = query.beginAtoms(); atIt != query.endAtoms(); atIt++) {
if ((*atIt)->getQuery()) {
MatchSubqueries(mol, (*atIt)->getQuery(), params, subqueryMap, locked);
MatchSubqueries(mol, (*atIt)->getQuery(), lparams, subqueryMap, locked);
}
}
detail::AtomLabelFunctor atomLabeler(query, mol, params);
detail::BondLabelFunctor bondLabeler(query, mol, params);
MolMatchFinalCheckFunctor matchChecker(query, mol, params);
detail::AtomLabelFunctor atomLabeler(query, mol, lparams);
detail::BondLabelFunctor bondLabeler(query, mol, lparams);
MolMatchFinalCheckFunctor matchChecker(query, mol, lparams);
matches.clear();
matches.resize(0);

View File

@@ -13,12 +13,20 @@
// std bits
#include <vector>
#include <unordered_set>
#include <functional>
#include <unordered_map>
#include <cstdint>
#include "GraphMol/StereoGroup.h"
#include <string>
#include <boost/dynamic_bitset.hpp>
#if BOOST_VERSION >= 107100
#define RDK_INTERNAL_BITSET_HAS_HASH
#endif
#include <GraphMol/StereoGroup.h>
namespace RDKit {
class ROMol;
class Atom;
@@ -218,13 +226,21 @@ class RDKIT_SUBSTRUCTMATCH_EXPORT MolMatchFinalCheckFunctor {
MolMatchFinalCheckFunctor(const ROMol &query, const ROMol &mol,
const SubstructMatchParameters &ps);
bool operator()(const std::uint32_t q_c[], const std::uint32_t m_c[]) const;
bool operator()(const std::uint32_t q_c[], const std::uint32_t m_c[]);
private:
const ROMol &d_query;
const ROMol &d_mol;
const SubstructMatchParameters &d_params;
std::unordered_map<unsigned int, StereoGroup const *> d_molStereoGroups;
#ifdef RDK_INTERNAL_BITSET_HAS_HASH
// Boost 1.71 added support for std::hash with dynamic_bitset.
using HashedStorageType = boost::dynamic_bitset<>;
#else
// otherwise we use a less elegant solution
using HashedStorageType = std::string;
#endif
std::unordered_set<HashedStorageType> matchesSeen;
};
} // namespace RDKit

View File

@@ -1,5 +1,5 @@
//
// Copyright (C) 2019 Greg Landrum
// Copyright (C) 2019-2023 Greg Landrum
//
// @@ All Rights Reserved @@
// This file is part of the RDKit.
@@ -335,4 +335,82 @@ TEST_CASE("Github #4558: GetSubstructMatches() loops at 43690 iterations",
auto matches = SubstructMatch(*mol, *qry, ps);
CHECK(matches.size() == num_mols * 2);
}
}
TEST_CASE(
"Github #888: GetSubstructMatches uniquify and maxMatches don't work well together ") {
SECTION("Basics") {
auto m = "CCCCCC"_smiles;
auto q = "CC"_smarts;
REQUIRE(m);
REQUIRE(q);
SubstructMatchParameters ps;
ps.uniquify = false;
ps.maxMatches = 4;
auto matches = SubstructMatch(*m, *q, ps);
CHECK(matches.size() == 4);
ps.uniquify = true;
matches = SubstructMatch(*m, *q, ps);
CHECK(matches.size() == 4);
ps.useChirality = true;
matches = SubstructMatch(*m, *q, ps);
CHECK(matches.size() == 4);
}
SECTION("interaction with chirality") {
auto m = "C/C=C/C=C/C=C\\C=C/C=C/C"_smiles;
auto q = "C/C=C/C"_smarts;
REQUIRE(m);
REQUIRE(q);
SubstructMatchParameters ps;
ps.uniquify = false;
ps.maxMatches = 2;
auto matches = SubstructMatch(*m, *q, ps);
CHECK(matches.size() == 2);
ps.uniquify = true;
matches = SubstructMatch(*m, *q, ps);
CHECK(matches.size() == 2);
ps.useChirality = true;
matches = SubstructMatch(*m, *q, ps);
CHECK(matches.size() == 2);
ps.maxMatches = 1000;
matches = SubstructMatch(*m, *q, ps);
CHECK(matches.size() == 3);
}
SECTION("interactions with recursive SMARTS") {
{
auto m = "N#CC#N"_smiles;
REQUIRE(m);
auto q = "[!$(*#*)]"_smarts;
REQUIRE(q);
SubstructMatchParameters ps;
ps.uniquify = true;
ps.maxMatches = 1;
auto matches = SubstructMatch(*m, *q, ps);
CHECK(matches.empty());
}
{
auto m = "N#CC#N"_smiles;
REQUIRE(m);
auto q = "[$(*#*)&!D1]"_smarts;
REQUIRE(q);
SubstructMatchParameters ps;
ps.uniquify = true;
auto matches = SubstructMatch(*m, *q, ps);
CHECK(matches.size() == 2);
}
{
auto m = "N#CCC#N"_smiles;
REQUIRE(m);
auto q = "[!$(*#*)&!D1]-&!@[!$(*#*)&!D1]"_smarts;
REQUIRE(q);
SubstructMatchParameters ps;
auto matches = SubstructMatch(*m, *q, ps);
CHECK(matches.empty());
}
}
}