Files
rdkit/Code/GraphMol/FMCS/SubstructureCache.h
Paolo Tosco 350370abe3 - Changed all unsigned to unsigned int for clarity (#6646)
- Switched from dynamic to static allocation for an instance of `MCSParameters`
- Switched to using `auto` where possible
- Added a few `CHECK_INVARIANT` where appropriate before dereferencing pointers
- Moved some inline comments to the previous line to improve readability
- Added an early check for `CompleteRingsOnly` in `checkBondRingMatch()` to improve computational efficiency
- Removed `RingMatchTableSet` entirely as 1) it is unnecessary since its functionality is already provided by `RingInfo` 2) it abused the `userData` pointer. This allows cleaning up and simplifying the code, particularly the Python wrappers which had a significant amount of added complexity to support it
- Removed all the code that was deprecated several releases ago
- Reimplemented ringFusionCheck() from scratch to address several bug reports; also switched from std::set to boost::dynamic_bitset for better efficiency
- Replaced boost::tie with boost::make_iterator_range
- Modernized `for` loops where possible
- Removed entirely the QueryRings structure as its functionality is already available in RingInfo
- Removed entirely the _DFS() function since the same algorithm can be implemented in a simpler and more efficient way using RingInfo (from 2m28.441s to 2m9.859s for the same task)
- Replaced std::vector<bool> with boost::dynamic_bitset
- Replaced C-style casts with C++ casts
- Replaced some size_t with unsigned int
- Refactored checkIfRingsAreClosed() such that checkNoLoneRingAtoms() is not needed anymore
- Added a test for slow runtimes with CompleteRingsOnly
- Setting Timeout to 0 means no timeout, as it should be
- Removed unused `steps` variable from `MaximumCommonSubgraph::growSeeds`
- Storing both Atom and Bond pointers and their indices on Seed and MCS data structures is time-consuming and a potential source of inconsistencies; storing pointers is sufficient
- Promoted `MaximumCommonSubgraph::match` from `private` to `public`
- `NewBonds` was declared `mutable`, but `Seed::fillNewBonds()` was incorrectly declared as `non-const`, which caused the need for an ugly (and unnecessary) `const_cast`.
I have now removed the `const_cast` and correctly declared functions that alter `NewBonds` as `const`, since `NewBonds` is explicitly `mutable`
- Removed some useless random scoping that was peppering the MCS code
- Removed a significant amount of duplicate code from the Python wrappers by inheriting from a base `PyMCSWrapper` class
- Fixed #6082
- Fixed #5510
- Fixed #5457
- Fixed #5440
- Fixed #5411
- Fixed #3965
- Fixed #6578

Co-authored-by: ptosco <paolo.tosco@novartis.com>
2023-08-25 06:09:19 +02:00

172 lines
5.6 KiB
C++

//
// Copyright (C) 2014 Novartis Institutes for BioMedical Research
//
// @@ All Rights Reserved @@
// This file is part of the RDKit.
// The contents are covered by the terms of the BSD license
// which is included in the file license.txt, found at the root
// of the RDKit source tree.
//
#include <RDGeneral/export.h>
#pragma once
#include <list>
#include <map>
#include <numeric>
#include <stdexcept>
#include <string>
#include <vector>
#include "../RDKitBase.h"
#include "Graph.h"
#include "Seed.h"
#include "DebugTrace.h" // algorithm filter definitions
namespace RDKit {
namespace FMCS {
class RDKIT_FMCS_EXPORT SubstructureCache {
public:
#pragma pack(push, 1)
struct KeyNumericMetrics {
typedef unsigned long long TValue;
TValue Value{0};
public:
KeyNumericMetrics() {}
};
#pragma pack(pop)
struct HashKey {
KeyNumericMetrics NumericMetrics;
public:
void computeKey(const Seed& seed,
const std::vector<unsigned int>& queryAtomLabels,
const std::vector<unsigned int>& queryBondLabels) {
computeMorganCodeHash(seed, queryAtomLabels, queryBondLabels);
}
private:
void computeMorganCodeHash(
const Seed& seed, const std::vector<unsigned int>& queryAtomLabels,
const std::vector<unsigned int>& queryBondLabels) {
size_t nv = seed.getNumAtoms();
size_t ne = seed.getNumBonds();
std::vector<unsigned long> currCodes(nv);
std::vector<unsigned long> prevCodes(nv);
size_t nIterations = seed.getNumBonds();
if (nIterations > 5) {
nIterations = 5;
}
for (unsigned int seedAtomIdx = 0; seedAtomIdx < seed.getNumAtoms();
++seedAtomIdx) {
currCodes[seedAtomIdx] = queryAtomLabels.at(
seed.MoleculeFragment.Atoms.at(seedAtomIdx)->getIdx());
}
for (size_t iter = 0; iter < nIterations; ++iter) {
for (size_t i = 0; i < nv; ++i) {
prevCodes[i] = currCodes[i];
}
for (size_t seedBondIdx = 0; seedBondIdx < ne; ++seedBondIdx) {
const Bond* bond = seed.MoleculeFragment.Bonds[seedBondIdx];
unsigned int order = queryBondLabels.at(
seed.MoleculeFragment.Bonds.at(seedBondIdx)->getIdx());
unsigned int atom1 = seed.MoleculeFragment.SeedAtomIdxMap
.find(bond->getBeginAtomIdx())
->second;
unsigned int atom2 =
seed.MoleculeFragment.SeedAtomIdxMap.find(bond->getEndAtomIdx())
->second;
unsigned int v1 = prevCodes[atom1];
unsigned int v2 = prevCodes[atom2];
currCodes[atom1] += v2 * v2 + (v2 + 23) * (order + 1721);
currCodes[atom2] += v1 * v1 + (v1 + 23) * (order + 1721);
}
}
KeyNumericMetrics::TValue result = 0;
for (unsigned int seedAtomIdx = 0; seedAtomIdx < nv; ++seedAtomIdx) {
unsigned long code = currCodes[seedAtomIdx];
result += code * (code + 6849) + 29;
}
NumericMetrics.Value = result;
}
// not implemented yet:
/*
void computeFingerprint(const Seed& seed)
{
unsigned int radius = seed.getNumBonds();
if (radius > 5)
radius = 5;
ExplicitBitVect *mf =
RDKit::MorganFingerprints::getFingerprintAsBitVect(seed.GraphTopology,
radius); //SLOW !!!
// ...
delete mf;
NumericMetrics.Field.hasFingerprint = 1;
}
*/
};
typedef HashKey TKey;
typedef std::list<FMCS::Graph> TIndexEntry; // hash-key is not unique key
private:
std::vector<TIndexEntry> ValueStorage;
std::map<KeyNumericMetrics::TValue, size_t> NumericIndex; // TIndexEntry
public:
// returns computed key, and pointer to index entry with a set of subgraphs
// corresponding to the key if found.
// then caller must find exactly matched subgraph in the result set with own
// search algorithm,
// including a resolving of collisions of hash key
TIndexEntry* find(const Seed& seed,
const std::vector<unsigned int>& queryAtomLabels,
const std::vector<unsigned int>& queryBondLabels,
TKey& key) { // compute key and find entry
key.computeKey(seed, queryAtomLabels, queryBondLabels);
const auto entryit = NumericIndex.find(key.NumericMetrics.Value);
if (NumericIndex.end() != entryit) {
return &ValueStorage[entryit->second];
}
return nullptr; // not found
}
// if find() did not found any entry for this key of seed a new entry will be
// created
void add(const Seed& seed, TKey& key,
TIndexEntry* entry) { // "compute" value and store it in NEW entry
// if not found
if (!entry) {
try {
ValueStorage.emplace_back();
} catch (...) {
return; // not enough memory room to add the item, but it's just a
// cache
}
entry = &ValueStorage.back();
}
entry->push_back(seed.Topology);
if (!NumericIndex
.insert(std::make_pair(key.NumericMetrics.Value,
ValueStorage.size() - 1))
.second) {
return; // not enough memory room to add the item, but it is just cache
}
}
size_t keyssize() const { // for statistics only
return ValueStorage.size();
}
size_t fullsize() const { // for statistics only
return std::accumulate(
ValueStorage.begin(), ValueStorage.end(), 0,
[](const auto& acc, const auto& v) { return acc + v.size(); });
}
};
} // namespace FMCS
} // namespace RDKit