diff --git a/Code/GraphMol/SynthonSpaceSearch/SearchResults.h b/Code/GraphMol/SynthonSpaceSearch/SearchResults.h index 72743f963..ffbc53a8b 100644 --- a/Code/GraphMol/SynthonSpaceSearch/SearchResults.h +++ b/Code/GraphMol/SynthonSpaceSearch/SearchResults.h @@ -12,6 +12,7 @@ #define RDKIT_SYNTHONSPACE_SEARCHRESULTS_H #include +#include #include #include @@ -77,7 +78,7 @@ class RDKIT_SYNTHONSPACESEARCH_EXPORT SearchResults { std::vector> d_hitMolecules; // Only used when merging in another set, so will be // filled in then if needed, empty otherwise. - std::unordered_set d_molNames; + boost::unordered_flat_set d_molNames; std::uint64_t d_maxNumResults; bool d_timedOut{false}; diff --git a/Code/GraphMol/SynthonSpaceSearch/SynthonSpaceSearch_details.cpp b/Code/GraphMol/SynthonSpaceSearch/SynthonSpaceSearch_details.cpp index 776508857..1f874327a 100644 --- a/Code/GraphMol/SynthonSpaceSearch/SynthonSpaceSearch_details.cpp +++ b/Code/GraphMol/SynthonSpaceSearch/SynthonSpaceSearch_details.cpp @@ -17,6 +17,7 @@ #include #include +#include #include #include @@ -825,6 +826,17 @@ std::string buildProductName( return prodName; } +std::size_t buildProductHash( + const RDKit::SynthonSpaceSearch::SynthonSpaceHitSet *hitset, + const std::vector &fragNums) { + std::size_t seed = 0; + for (size_t i = 0; i < fragNums.size(); ++i) { + boost::hash_combine(seed, hitset->synthonsToUse[i][fragNums[i]].first); + } + boost::hash_combine(seed, hitset->d_reaction->getId()); + return seed; +} + std::unique_ptr buildProduct( const std::vector &synthons) { MolzipParams mzparams; diff --git a/Code/GraphMol/SynthonSpaceSearch/SynthonSpaceSearch_details.h b/Code/GraphMol/SynthonSpaceSearch/SynthonSpaceSearch_details.h index 8cdab808c..09c2747df 100644 --- a/Code/GraphMol/SynthonSpaceSearch/SynthonSpaceSearch_details.h +++ b/Code/GraphMol/SynthonSpaceSearch/SynthonSpaceSearch_details.h @@ -147,6 +147,11 @@ RDKIT_SYNTHONSPACESEARCH_EXPORT std::string buildProductName( RDKIT_SYNTHONSPACESEARCH_EXPORT 
std::string buildProductName( const RDKit::SynthonSpaceSearch::SynthonSpaceHitSet *hitset, const std::vector &fragNums); +// Hash of the product identity: combines each chosen synthon's .first with +// the reaction id; no string is built. Collisions possible: not a unique key. +RDKIT_SYNTHONSPACESEARCH_EXPORT std::size_t buildProductHash( + const RDKit::SynthonSpaceSearch::SynthonSpaceHitSet *hitset, + const std::vector &fragNums); // Zip the fragments together to make a molecule. Assumes the connection // points are marking by isotope numbers on dummy atoms. RDKIT_SYNTHONSPACESEARCH_EXPORT std::unique_ptr buildProduct( diff --git a/Code/GraphMol/SynthonSpaceSearch/SynthonSpaceSearcher.cpp b/Code/GraphMol/SynthonSpaceSearch/SynthonSpaceSearcher.cpp index 725007dc8..0d5927a01 100644 --- a/Code/GraphMol/SynthonSpaceSearch/SynthonSpaceSearcher.cpp +++ b/Code/GraphMol/SynthonSpaceSearch/SynthonSpaceSearcher.cpp @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -380,27 +381,19 @@ void sortHits(std::vector> &hits) { void sortAndUniquifyToTry( std::vector>> &toTry) { - std::vector> tmp; - tmp.reserve(toTry.size()); - for (size_t i = 0; i < toTry.size(); i++) { - tmp.emplace_back( - i, details::buildProductName(toTry[i].first, toTry[i].second)); - } - std::sort(tmp.begin(), tmp.end(), - [](const auto &lhs, const auto &rhs) -> bool { - return lhs.second < rhs.second; - }); - tmp.erase(std::unique(tmp.begin(), tmp.end(), - [](const auto &lhs, const auto &rhs) -> bool { - return lhs.second == rhs.second; - }), - tmp.end()); - std::vector>> - newToTry; - newToTry.reserve(tmp.size()); - std::transform(tmp.begin(), tmp.end(), back_inserter(newToTry), - [&](const auto &p) -> auto { return toTry[p.first]; }); - toTry = newToTry; + // Two query fragmentations can map to the same (reaction, synthon-combo) + // pair; keep only the first of each (no sorting) so none is built twice. 
+ boost::unordered_flat_set seen; + seen.reserve(toTry.size()); + toTry.erase( + std::remove_if(toTry.begin(), toTry.end(), + [&seen](const auto &entry) { + return !seen + .insert(details::buildProductHash( + entry.first, entry.second)) + .second; + }), + toTry.end()); } bool haveEnoughHits(const std::vector> &results,