mirror of
https://github.com/rdkit/rdkit.git
synced 2026-06-03 21:44:30 +08:00
synthon perf: replace sort+unique dedup with boost::unordered_flat_set (#9305)
sortAndUniquifyToTry previously built a parallel vector of (index, string) pairs, sorted by string, erased duplicates, then rebuilt the original vector — O(N log N) with one heap allocation per candidate product. Replace with an erase-remove over a boost::unordered_flat_set<size_t> keyed on buildProductHash (boost::hash_combine over synthon IDs + reaction ID). Dedup is now O(N) average with no string allocations on the hot path. Also switch SearchResults::d_molNames from std::unordered_set<std::string> to boost::unordered_flat_set<std::string> for the same open-addressing cache locality benefit during mergeResults. Perf (42-rxn / 140B-product Freedom space, maxHits=3000, hitStart=1000, 9 queries; vanilla.log → 2unordered_flat_set.log): Benzene: 6.92s → 5.64s (−19%) Tolueneish: 6.19s → 5.07s (−18%) Acetaminophen: 4.50s → 3.63s (−19%) Allopurinol: 4.41s → 3.94s (−11%) Theophylline: 4.39s → 3.90s (−11%) Nicotine: 4.87s → 3.97s (−18%) Ciprofloxacin: 6.82s → 6.09s (−11%) Aspirin: 4.51s → 3.42s (−24%) Metoprolol: 5.11s → 4.07s (−20%) Total: 48.40s → 40.33s (−17%) Hit counts and MaxNumResults unchanged across all queries. Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
committed by
greg landrum
parent
4e9e504079
commit
db53c39aed
@@ -12,6 +12,7 @@
|
||||
#define RDKIT_SYNTHONSPACE_SEARCHRESULTS_H
|
||||
|
||||
#include <functional>
|
||||
#include <boost/unordered/unordered_flat_set.hpp>
|
||||
#include <RDGeneral/export.h>
|
||||
#include <GraphMol/ROMol.h>
|
||||
|
||||
@@ -77,7 +78,7 @@ class RDKIT_SYNTHONSPACESEARCH_EXPORT SearchResults {
|
||||
std::vector<std::unique_ptr<ROMol>> d_hitMolecules;
|
||||
// Only used when merging in another set, so will be
|
||||
// filled in then if needed, empty otherwise.
|
||||
std::unordered_set<std::string> d_molNames;
|
||||
boost::unordered_flat_set<std::string> d_molNames;
|
||||
|
||||
std::uint64_t d_maxNumResults;
|
||||
bool d_timedOut{false};
|
||||
|
||||
@@ -17,6 +17,7 @@
|
||||
#include <vector>
|
||||
|
||||
#include <boost/dynamic_bitset.hpp>
|
||||
#include <boost/functional/hash.hpp>
|
||||
|
||||
#include <RDGeneral/ControlCHandler.h>
|
||||
#include <GraphMol/Chirality.h>
|
||||
@@ -825,6 +826,17 @@ std::string buildProductName(
|
||||
return prodName;
|
||||
}
|
||||
|
||||
std::size_t buildProductHash(
|
||||
const RDKit::SynthonSpaceSearch::SynthonSpaceHitSet *hitset,
|
||||
const std::vector<size_t> &fragNums) {
|
||||
std::size_t seed = 0;
|
||||
for (size_t i = 0; i < fragNums.size(); ++i) {
|
||||
boost::hash_combine(seed, hitset->synthonsToUse[i][fragNums[i]].first);
|
||||
}
|
||||
boost::hash_combine(seed, hitset->d_reaction->getId());
|
||||
return seed;
|
||||
}
|
||||
|
||||
std::unique_ptr<ROMol> buildProduct(
|
||||
const std::vector<const ROMol *> &synthons) {
|
||||
MolzipParams mzparams;
|
||||
|
||||
@@ -147,6 +147,11 @@ RDKIT_SYNTHONSPACESEARCH_EXPORT std::string buildProductName(
|
||||
RDKIT_SYNTHONSPACESEARCH_EXPORT std::string buildProductName(
|
||||
const RDKit::SynthonSpaceSearch::SynthonSpaceHitSet *hitset,
|
||||
const std::vector<size_t> &fragNums);
|
||||
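// Hash of the product identity: combines the synthon IDs and the reaction ID
|
||||
// (the same components as buildProductName) without allocating the
|
||||
// concatenated string.  Distinct products can share a hash value, so a match
|
||||
// means "probably the same product" unless identity is also checked.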
|
||||
RDKIT_SYNTHONSPACESEARCH_EXPORT std::size_t buildProductHash(
|
||||
const RDKit::SynthonSpaceSearch::SynthonSpaceHitSet *hitset,
|
||||
const std::vector<size_t> &fragNums);
|
||||
// Zip the fragments together to make a molecule. Assumes the connection
|
||||
// points are marking by isotope numbers on dummy atoms.
|
||||
RDKIT_SYNTHONSPACESEARCH_EXPORT std::unique_ptr<ROMol> buildProduct(
|
||||
|
||||
@@ -11,6 +11,7 @@
|
||||
#include <random>
|
||||
#include <thread>
|
||||
#include <boost/random/discrete_distribution.hpp>
|
||||
#include <boost/unordered/unordered_flat_set.hpp>
|
||||
|
||||
#include <GraphMol/Chirality.h>
|
||||
#include <GraphMol/MolOps.h>
|
||||
@@ -380,27 +381,19 @@ void sortHits(std::vector<std::unique_ptr<ROMol>> &hits) {
|
||||
void sortAndUniquifyToTry(
|
||||
std::vector<std::pair<const SynthonSpaceHitSet *, std::vector<size_t>>>
|
||||
&toTry) {
|
||||
std::vector<std::pair<size_t, std::string>> tmp;
|
||||
tmp.reserve(toTry.size());
|
||||
for (size_t i = 0; i < toTry.size(); i++) {
|
||||
tmp.emplace_back(
|
||||
i, details::buildProductName(toTry[i].first, toTry[i].second));
|
||||
}
|
||||
std::sort(tmp.begin(), tmp.end(),
|
||||
[](const auto &lhs, const auto &rhs) -> bool {
|
||||
return lhs.second < rhs.second;
|
||||
});
|
||||
tmp.erase(std::unique(tmp.begin(), tmp.end(),
|
||||
[](const auto &lhs, const auto &rhs) -> bool {
|
||||
return lhs.second == rhs.second;
|
||||
}),
|
||||
tmp.end());
|
||||
std::vector<std::pair<const SynthonSpaceHitSet *, std::vector<size_t>>>
|
||||
newToTry;
|
||||
newToTry.reserve(tmp.size());
|
||||
std::transform(tmp.begin(), tmp.end(), back_inserter(newToTry),
|
||||
[&](const auto &p) -> auto { return toTry[p.first]; });
|
||||
toTry = newToTry;
|
||||
// Two query fragmentations can map to the same (reaction, synthon-combo)
|
||||
// pair; deduplicate so we don't build the same product twice.
|
||||
boost::unordered_flat_set<std::size_t> seen;
|
||||
seen.reserve(toTry.size());
|
||||
toTry.erase(
|
||||
std::remove_if(toTry.begin(), toTry.end(),
|
||||
[&seen](const auto &entry) {
|
||||
return !seen
|
||||
.insert(details::buildProductHash(
|
||||
entry.first, entry.second))
|
||||
.second;
|
||||
}),
|
||||
toTry.end());
|
||||
}
|
||||
|
||||
bool haveEnoughHits(const std::vector<std::unique_ptr<ROMol>> &results,
|
||||
|
||||
Reference in New Issue
Block a user