mirror of
https://github.com/rdkit/rdkit.git
synced 2026-06-03 21:44:30 +08:00
sortAndUniquifyToTry previously built a parallel vector of (index, string) pairs, sorted by string, erased duplicates, then rebuilt the original vector — O(N log N) with one heap allocation per candidate product. Replace with an erase-remove over a boost::unordered_flat_set<size_t> keyed on buildProductHash (boost::hash_combine over synthon IDs + reaction ID). Dedup is now O(N) average with no string allocations on the hot path. Also switch SearchResults::d_molNames from std::unordered_set<std::string> to boost::unordered_flat_set<std::string> for the same open-addressing cache locality benefit during mergeResults. Perf (42-rxn / 140B-product Freedom space, maxHits=3000, hitStart=1000, 9 queries; vanilla.log → 2unordered_flat_set.log): Benzene: 6.92s → 5.64s (−19%) Tolueneish: 6.19s → 5.07s (−18%) Acetaminophen: 4.50s → 3.63s (−19%) Allopurinol: 4.41s → 3.94s (−11%) Theophylline: 4.39s → 3.90s (−11%) Nicotine: 4.87s → 3.97s (−18%) Ciprofloxacin: 6.82s → 6.09s (−11%) Aspirin: 4.51s → 3.42s (−24%) Metoprolol: 5.11s → 4.07s (−20%) Total: 48.40s → 40.33s (−17%) Hit counts and MaxNumResults unchanged across all queries. Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
91 lines
3.0 KiB
C++
91 lines
3.0 KiB
C++
//
|
|
// Copyright (C) David Cosgrove 2024.
|
|
//
|
|
// @@ All Rights Reserved @@
|
|
// This file is part of the RDKit.
|
|
// The contents are covered by the terms of the BSD license
|
|
// which is included in the file license.txt, found at the root
|
|
// of the RDKit source tree.
|
|
//
|
|
|
|
#ifndef RDKIT_SYNTHONSPACE_SEARCHRESULTS_H
|
|
#define RDKIT_SYNTHONSPACE_SEARCHRESULTS_H
|
|
|
|
#include <functional>
|
|
#include <boost/unordered/unordered_flat_set.hpp>
|
|
#include <RDGeneral/export.h>
|
|
#include <GraphMol/ROMol.h>
|
|
|
|
namespace RDKit::SynthonSpaceSearch {
|
|
|
|
// takes vector of search results; returns true if enough hits have been
|
|
// returned, false if the search should continue.
|
|
// Invoking the callback transfers ownership of the molecules to the
|
|
// callee, which avoids an extra copy of the molecule.
|
|
using SearchResultCallback =
|
|
std::function<bool(std::vector<std::unique_ptr<ROMol>> &)>;
|
|
|
|
// A class holding a set of results from a search. Contains the hit
|
|
// molecules and information about how the search progressed, whether
|
|
// it timed out etc.
|
|
class RDKIT_SYNTHONSPACESEARCH_EXPORT SearchResults {
|
|
public:
|
|
explicit SearchResults() : d_maxNumResults(0) {}
|
|
SearchResults(std::vector<std::unique_ptr<ROMol>> &&mols,
|
|
std::uint64_t maxNumRes, bool timedOut, bool cancelled);
|
|
SearchResults(const SearchResults &other);
|
|
SearchResults(SearchResults &&other) = default;
|
|
~SearchResults() = default;
|
|
|
|
SearchResults &operator=(const SearchResults &other);
|
|
SearchResults &operator=(SearchResults &&other) = default;
|
|
|
|
/*!
|
|
* Returns the upper bound on the number of results the search might return.
|
|
* There may be fewer than this in practice for several reasons such as
|
|
* duplicate reagent sets being removed or the final product not matching the
|
|
* query even though the synthons suggested it would.
|
|
*
|
|
* @return int
|
|
*/
|
|
std::uint64_t getMaxNumResults() const { return d_maxNumResults; }
|
|
/*!
|
|
* Returns the hit molecules from the search. Not necessarily all
|
|
* those possible, just the maximum number requested.
|
|
*
|
|
* @return std::vector<std::unique_ptr<ROMol>>
|
|
*/
|
|
const std::vector<std::unique_ptr<ROMol>> &getHitMolecules() const {
|
|
return d_hitMolecules;
|
|
}
|
|
|
|
/*!
|
|
* Returns whether the search timed out or not,
|
|
* @return bool
|
|
*/
|
|
bool getTimedOut() const { return d_timedOut; }
|
|
/*!
|
|
* Returns whether the search was cancelled or not,
|
|
* @return bool
|
|
*/
|
|
bool getCancelled() const { return d_cancelled; }
|
|
|
|
// Merge other into this, keeping only molecules with unique
|
|
// names and destroying contents of other on exit.
|
|
void mergeResults(SearchResults &other);
|
|
|
|
private:
|
|
std::vector<std::unique_ptr<ROMol>> d_hitMolecules;
|
|
// Only used when merging in another set, so will be
|
|
// filled in then if needed, empty otherwise.
|
|
boost::unordered_flat_set<std::string> d_molNames;
|
|
|
|
std::uint64_t d_maxNumResults;
|
|
bool d_timedOut{false};
|
|
bool d_cancelled{false};
|
|
};
|
|
|
|
} // namespace RDKit::SynthonSpaceSearch
|
|
|
|
#endif // RDKIT_SYNTHONSPACE_SEARCHRESULTS_H
|