synthon perf: replace sort+unique dedup with boost::unordered_flat_set (#9305)

sortAndUniquifyToTry previously built a parallel vector of (index, string)
pairs, sorted by string, erased duplicates, then rebuilt the original vector
— O(N log N) with one heap allocation per candidate product.

Replace with an erase-remove over a boost::unordered_flat_set<size_t> keyed
on buildProductHash (boost::hash_combine over synthon IDs + reaction ID).
Dedup is now O(N) average with no string allocations on the hot path.

Also switch SearchResults::d_molNames from std::unordered_set<std::string>
to boost::unordered_flat_set<std::string> for the same open-addressing cache
locality benefit during mergeResults.

Perf (42-rxn / 140B-product Freedom space, maxHits=3000, hitStart=1000,
9 queries; vanilla.log → 2unordered_flat_set.log):
  Benzene:       6.92s → 5.64s  (−19%)
  Tolueneish:    6.19s → 5.07s  (−18%)
  Acetaminophen: 4.50s → 3.63s  (−19%)
  Allopurinol:   4.41s → 3.94s  (−11%)
  Theophylline:  4.39s → 3.90s  (−11%)
  Nicotine:      4.87s → 3.97s  (−18%)
  Ciprofloxacin: 6.82s → 6.09s  (−11%)
  Aspirin:       4.51s → 3.42s  (−24%)
  Metoprolol:    5.11s → 4.07s  (−20%)
  Total:        48.40s → 40.33s (−17%)

Hit counts and MaxNumResults unchanged across all queries.

Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Dan Nealschneider
2026-05-28 08:13:03 -07:00
committed by greg landrum
parent 4e9e504079
commit db53c39aed
4 changed files with 33 additions and 22 deletions

View File

@@ -12,6 +12,7 @@
#define RDKIT_SYNTHONSPACE_SEARCHRESULTS_H
#include <functional>
#include <boost/unordered/unordered_flat_set.hpp>
#include <RDGeneral/export.h>
#include <GraphMol/ROMol.h>
@@ -77,7 +78,7 @@ class RDKIT_SYNTHONSPACESEARCH_EXPORT SearchResults {
std::vector<std::unique_ptr<ROMol>> d_hitMolecules;
// Only used when merging in another set, so will be
// filled in then if needed, empty otherwise.
std::unordered_set<std::string> d_molNames;
boost::unordered_flat_set<std::string> d_molNames;
std::uint64_t d_maxNumResults;
bool d_timedOut{false};

View File

@@ -17,6 +17,7 @@
#include <vector>
#include <boost/dynamic_bitset.hpp>
#include <boost/functional/hash.hpp>
#include <RDGeneral/ControlCHandler.h>
#include <GraphMol/Chirality.h>
@@ -825,6 +826,17 @@ std::string buildProductName(
return prodName;
}
// Computes a hash identifying the product that would be built from the given
// hit set and synthon selection.  fragNums[i] picks an entry from
// hitset->synthonsToUse[i]; the .first member of each picked entry is folded
// into the hash in order, followed by the ID of the hit set's reaction.
// Nothing is concatenated into a string.  As with any hash, different
// products may produce the same value.
std::size_t buildProductHash(
    const RDKit::SynthonSpaceSearch::SynthonSpaceHitSet *hitset,
    const std::vector<size_t> &fragNums) {
  std::size_t productHash = 0;
  size_t synthonSet = 0;
  for (const auto fragNum : fragNums) {
    const auto &chosen = hitset->synthonsToUse[synthonSet][fragNum];
    boost::hash_combine(productHash, chosen.first);
    ++synthonSet;
  }
  // The reaction ID goes in last so that the same synthon selection under a
  // different reaction hashes differently.
  boost::hash_combine(productHash, hitset->d_reaction->getId());
  return productHash;
}
std::unique_ptr<ROMol> buildProduct(
const std::vector<const ROMol *> &synthons) {
MolzipParams mzparams;

View File

@@ -147,6 +147,11 @@ RDKIT_SYNTHONSPACESEARCH_EXPORT std::string buildProductName(
RDKIT_SYNTHONSPACESEARCH_EXPORT std::string buildProductName(
const RDKit::SynthonSpaceSearch::SynthonSpaceHitSet *hitset,
const std::vector<size_t> &fragNums);
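// Hash of the product identity: boost::hash_combine over the selected synthon
// IDs followed by the reaction ID, computed without allocating a concatenated
// name string.  Distinct products can share a hash value, so equal hashes
// suggest, but do not prove, that two products are the same.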
RDKIT_SYNTHONSPACESEARCH_EXPORT std::size_t buildProductHash(
const RDKit::SynthonSpaceSearch::SynthonSpaceHitSet *hitset,
const std::vector<size_t> &fragNums);
// Zip the fragments together to make a molecule. Assumes the connection
// points are marking by isotope numbers on dummy atoms.
RDKIT_SYNTHONSPACESEARCH_EXPORT std::unique_ptr<ROMol> buildProduct(

View File

@@ -11,6 +11,7 @@
#include <random>
#include <thread>
#include <boost/random/discrete_distribution.hpp>
#include <boost/unordered/unordered_flat_set.hpp>
#include <GraphMol/Chirality.h>
#include <GraphMol/MolOps.h>
@@ -380,27 +381,19 @@ void sortHits(std::vector<std::unique_ptr<ROMol>> &hits) {
void sortAndUniquifyToTry(
std::vector<std::pair<const SynthonSpaceHitSet *, std::vector<size_t>>>
&toTry) {
std::vector<std::pair<size_t, std::string>> tmp;
tmp.reserve(toTry.size());
for (size_t i = 0; i < toTry.size(); i++) {
tmp.emplace_back(
i, details::buildProductName(toTry[i].first, toTry[i].second));
}
std::sort(tmp.begin(), tmp.end(),
[](const auto &lhs, const auto &rhs) -> bool {
return lhs.second < rhs.second;
});
tmp.erase(std::unique(tmp.begin(), tmp.end(),
[](const auto &lhs, const auto &rhs) -> bool {
return lhs.second == rhs.second;
}),
tmp.end());
std::vector<std::pair<const SynthonSpaceHitSet *, std::vector<size_t>>>
newToTry;
newToTry.reserve(tmp.size());
std::transform(tmp.begin(), tmp.end(), back_inserter(newToTry),
[&](const auto &p) -> auto { return toTry[p.first]; });
toTry = newToTry;
// Two query fragmentations can map to the same (reaction, synthon-combo)
// pair; deduplicate so we don't build the same product twice.
boost::unordered_flat_set<std::size_t> seen;
seen.reserve(toTry.size());
toTry.erase(
std::remove_if(toTry.begin(), toTry.end(),
[&seen](const auto &entry) {
return !seen
.insert(details::buildProductHash(
entry.first, entry.second))
.second;
}),
toTry.end());
}
bool haveEnoughHits(const std::vector<std::unique_ptr<ROMol>> &results,