mirror of
https://github.com/rdkit/rdkit.git
synced 2026-06-03 21:44:30 +08:00
* synthon perf: replace O(N) haveEnoughHits scan with O(1) atomic counter processPartHitsFromDetails called haveEnoughHits after each verified hit, which scanned every slot of the pre-sized results vector (up to toTryChunkSize = 2.5M entries) to count non-null entries via std::accumulate. With ~3000 verified hits per search that is ~7.5B pointer reads per query. Replace with a std::atomic<int64_t> numHitsFound counter in makeHitsFromToTry, incremented via fetch_add on each verified hit. The early-exit condition becomes a single atomic read, O(1) per hit regardless of vector size. The atomic is local to makeHitsFromToTry so it resets correctly per chunk and is safe for the multi-threaded path without added synchronization. Measured on synthon_perf branch (42-rxn / 140B-product Freedom space, maxHits=3000, hitStart=1000, before boost::unordered_flat_set change): search-several (9 queries): ~30s → ~16.5s (~1.8x) search-one (benzene): ~3.5s → ~1.8s (~1.9x) All 4 synthon ctest cases pass. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> * style ++ * Update Code/GraphMol/SynthonSpaceSearch/SynthonSpaceSearcher.cpp Co-authored-by: Greg Landrum <greg.landrum@gmail.com> --------- Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com> Co-authored-by: Greg Landrum <greg.landrum@gmail.com>
146 lines
5.9 KiB
C++
146 lines
5.9 KiB
C++
//
|
|
// Copyright (C) David Cosgrove 2024.
|
|
//
|
|
// @@ All Rights Reserved @@
|
|
// This file is part of the RDKit.
|
|
// The contents are covered by the terms of the BSD license
|
|
// which is included in the file license.txt, found at the root
|
|
// of the RDKit source tree.
|
|
//
|
|
|
|
// This file declares an abstract base class for searching a synthon
|
|
// space. Concrete derived classes include SynthonSpaceSubstructureSearcher
|
|
// and SynthonSpaceFingerprintSearcher.
|
|
|
|
#ifndef SYNTHONSPACESEARCHER_H
|
|
#define SYNTHONSPACESEARCHER_H
|
|
|
|
#include <atomic>
|
|
#include <chrono>
|
|
#include <random>
|
|
|
|
#include <RDGeneral/export.h>
|
|
#include <GraphMol/SynthonSpaceSearch/SynthonSpace.h>
|
|
#include <GraphMol/SynthonSpaceSearch/SynthonSpaceHitSet.h>
|
|
#include <GraphMol/SynthonSpaceSearch/SearchResults.h>
|
|
#include <boost/spirit/home/support/common_terminals.hpp>
|
|
|
|
// Clock and time-point types used for the endTime arguments of the
// searcher's private methods. steady_clock is monotonic, so a deadline
// expressed as a TimePoint is unaffected by wall-clock adjustments.
using Clock = std::chrono::steady_clock;
using TimePoint = std::chrono::time_point<Clock>;
|
|
|
|
namespace RDKit {
|
|
class ROMol;
|
|
|
|
namespace SynthonSpaceSearch {
|
|
|
|
// Abstract base class for searching the SynthonSpace.
|
|
class SynthonSpaceSearcher {
|
|
public:
|
|
SynthonSpaceSearcher() = delete;
|
|
SynthonSpaceSearcher(const ROMol &query,
|
|
const SynthonSpaceSearchParams ¶ms,
|
|
SynthonSpace &space);
|
|
SynthonSpaceSearcher(const SynthonSpaceSearcher &other) = delete;
|
|
SynthonSpaceSearcher(SynthonSpaceSearcher &&other) = delete;
|
|
SynthonSpaceSearcher &operator=(const SynthonSpaceSearcher &other) = delete;
|
|
SynthonSpaceSearcher &operator=(SynthonSpaceSearcher &&other) = delete;
|
|
|
|
virtual ~SynthonSpaceSearcher() = default;
|
|
|
|
SearchResults search();
|
|
void search(const SearchResultCallback &cb);
|
|
|
|
SynthonSpace &getSpace() const { return d_space; }
|
|
const ROMol &getQuery() const { return d_query; }
|
|
const SynthonSpaceSearchParams &getParams() const { return d_params; }
|
|
|
|
// Do the search of this fragSet against the SynthonSet in the
|
|
// appropriate way, for example by substructure or fingerprint
|
|
// similarity.
|
|
virtual std::vector<std::unique_ptr<SynthonSpaceHitSet>> searchFragSet(
|
|
const std::vector<std::unique_ptr<ROMol>> &fragSet,
|
|
const SynthonSet &reaction) const = 0;
|
|
|
|
// Make the hit, constructed from a specific combination of
|
|
// synthons in the hitset, and verify that it matches the
|
|
// query in the appropriate way. There'll be 1 entry in synthNums
|
|
// for each synthon list in the hitset. Returns an empty pointer
|
|
// if the hit isn't accepted for whatever reason.
|
|
std::unique_ptr<ROMol> buildAndVerifyHit(
|
|
const SynthonSpaceHitSet *hitset,
|
|
const std::vector<size_t> &synthNums) const;
|
|
|
|
protected:
|
|
// Checks that the given molecule is definitely a hit according to
|
|
// the derived class' criteria. This function checks the chiralAtomCount
|
|
// if appropriate, which required a non-const ROMol.
|
|
virtual bool verifyHit(ROMol &mol) const;
|
|
|
|
// Do a check against number of heavy atoms etc. if options call for it
|
|
// which can be done without having to build the full molecule from the
|
|
// synthons. Some of the search methods (fingerprints, for example) can do
|
|
// additional quick checks on whether this set of synthons can match the query
|
|
// without building the full molecule.
|
|
virtual bool quickVerify(const SynthonSpaceHitSet *hitset,
|
|
const std::vector<size_t> &synthNums) const;
|
|
|
|
private:
|
|
std::unique_ptr<std::mt19937> d_randGen;
|
|
|
|
const ROMol &d_query;
|
|
const SynthonSpaceSearchParams &d_params;
|
|
SynthonSpace &d_space;
|
|
|
|
// Generally, the search needs the query fragmented into no more than
|
|
// the largest number synthon sets in any reaction. Substructure search
|
|
// needs more than that, sometimes.
|
|
virtual unsigned int getNumQueryFragmentsRequired();
|
|
// Some of the search methods might need extra setup of the fragment
|
|
// sets. The FingerprintSearcher, for example, needs fingerprints
|
|
// for all the fragments. The SubstructureSearcher needs connector
|
|
// regions and information about them.
|
|
virtual void extraSearchSetup(
|
|
[[maybe_unused]] std::vector<std::vector<std::unique_ptr<ROMol>>>
|
|
&fragSets) {}
|
|
|
|
std::vector<std::unique_ptr<SynthonSpaceHitSet>> assembleHitSets(
|
|
const TimePoint *endTime, bool &timedOut, std::uint64_t &totHits);
|
|
|
|
std::vector<std::unique_ptr<SynthonSpaceHitSet>> doTheSearch(
|
|
std::vector<std::vector<std::unique_ptr<ROMol>>> &fragSets,
|
|
const TimePoint *endTime, bool &timedOut, std::uint64_t &totHits);
|
|
|
|
// Build the molecules from the synthons identified in hitsets.
|
|
// Checks that all the results produced match the
|
|
// query. Duplicates by name are not returned,
|
|
// but duplicate SMILES from different reactions will be.
|
|
// Hitsets will be re-ordered on exit.
|
|
void buildHits(std::vector<std::unique_ptr<SynthonSpaceHitSet>> &hitsets,
|
|
const TimePoint *endTime, bool &timedOut,
|
|
std::vector<std::unique_ptr<ROMol>> &results) const;
|
|
void buildAllHits(
|
|
const std::vector<std::unique_ptr<SynthonSpaceHitSet>> &hitsets,
|
|
const TimePoint *endTime, bool &timedOut,
|
|
std::vector<std::unique_ptr<ROMol>> &results) const;
|
|
void makeHitsFromToTry(
|
|
const std::vector<
|
|
std::pair<const SynthonSpaceHitSet *, std::vector<size_t>>> &toTry,
|
|
const TimePoint *endTime, std::vector<std::unique_ptr<ROMol>> &results,
|
|
std::atomic<std::int64_t> &numHitsFound) const;
|
|
void processToTrySet(
|
|
std::vector<std::pair<const SynthonSpaceHitSet *, std::vector<size_t>>>
|
|
&toTry,
|
|
const TimePoint *endTime, std::vector<std::unique_ptr<ROMol>> &results,
|
|
std::atomic<std::int64_t> &numHitsFound) const;
|
|
|
|
// get the subset of synthons for the given reaction to use for this
|
|
// enumeration.
|
|
std::vector<std::vector<ROMol *>> getSynthonsToUse(
|
|
const std::vector<boost::dynamic_bitset<>> &synthonsToUse,
|
|
const std::string &reaction_id) const;
|
|
};
|
|
|
|
} // namespace SynthonSpaceSearch
|
|
} // namespace RDKit
|
|
#endif // SYNTHONSPACESEARCHER_H
|