Optimisation of fingerprint Synthon Search (#8223)

* Change how synthonsToUse is stored in SynthonSpaceHitSet.

* Sort fragments by descending similarity.

* Sort fragments by ascending size.

* Use pair not tuple.

* Un-cringe Greg.

---------

Co-authored-by: David Cosgrove <david@cozchemix.co.uk>
This commit is contained in:
David Cosgrove
2025-01-30 03:59:19 +00:00
committed by greg landrum
parent b0a1d6fe50
commit 1ef3c4bbc6
6 changed files with 90 additions and 52 deletions

View File

@@ -102,8 +102,18 @@ struct RDKIT_SYNTHONSPACESEARCH_EXPORT SynthonSpaceSearchParams {
// Holds the information about a set of hits. The molecules can be built
// by making all combinations of synthons, one taken from each synthon set.
struct RDKIT_SYNTHONSPACESEARCH_EXPORT SynthonSpaceHitSet {
  SynthonSpaceHitSet() = delete;
  // Build a hit set for the reaction with the given id. stu holds, for each
  // synthon set of the reaction, the indices of the synthons to be used.
  // numHits is set to the number of synthon combinations, i.e. the product
  // of the sizes of the inner vectors (so it is 0 if any of them is empty).
  SynthonSpaceHitSet(const std::string &id,
                     const std::vector<std::vector<size_t>> &stu)
      : reactionId(id), synthonsToUse(stu) {
    // Keep the running product as a size_t all the way through. Taking it
    // as an int would narrow it on every step and give the wrong count once
    // the number of combinations no longer fits in an int.
    numHits = std::accumulate(
        synthonsToUse.begin(), synthonsToUse.end(), size_t(1),
        [](const size_t prevRes, const std::vector<size_t> &s2) -> size_t {
          return prevRes * s2.size();
        });
  }
  // Id of the reaction the synthons belong to.
  std::string reactionId;
  std::vector<boost::dynamic_bitset<>> synthonsToUse;
  // One vector per synthon set, holding the indices of the synthons to use.
  std::vector<std::vector<size_t>> synthonsToUse;
  // Number of synthon combinations described by synthonsToUse.
  size_t numHits{0};
};

View File

@@ -8,6 +8,8 @@
// of the RDKit source tree.
//
#include <algorithm>
#include <DataStructs/BitOps.h>
#include <GraphMol/MolOps.h>
#include <GraphMol/SmilesParse/SmilesWrite.h>
@@ -33,11 +35,15 @@ SynthonSpaceFingerprintSearcher::SynthonSpaceFingerprintSearcher(
namespace {
// Take the fragged mol fps and flag all those synthons that have a fragment as
// a similarity match.
std::vector<boost::dynamic_bitset<>> getHitSynthons(
std::vector<std::vector<size_t>> getHitSynthons(
const std::vector<std::unique_ptr<ExplicitBitVect>> &fragFPs,
const double similarityCutoff, const std::unique_ptr<SynthonSet> &reaction,
const std::vector<unsigned int> &synthonOrder) {
std::vector<boost::dynamic_bitset<>> synthonsToUse;
std::vector<std::vector<size_t>> retSynthons;
std::vector<std::vector<std::pair<size_t, double>>> fragSims(
reaction->getSynthons().size());
synthonsToUse.reserve(reaction->getSynthons().size());
for (const auto &synthonSet : reaction->getSynthons()) {
synthonsToUse.emplace_back(synthonSet.size());
@@ -49,21 +55,44 @@ std::vector<boost::dynamic_bitset<>> getHitSynthons(
if (const auto sim = TanimotoSimilarity(*fragFPs[i], *synthonFPs[j]);
sim >= similarityCutoff) {
synthonsToUse[synthonOrder[i]][j] = true;
fragSims[synthonOrder[i]].emplace_back(j, sim);
fragMatched = true;
}
}
if (!fragMatched) {
// No synthons matched this fragment, so the whole fragment set is a
// bust.
synthonsToUse.clear();
return synthonsToUse;
return retSynthons;
}
}
// If there were fewer fragments than synthon sets, some synthon sets will
// have had nothing matched against them. Flag all the synthons in those sets.
details::expandBitSet(synthonsToUse);
return synthonsToUse;
details::bitSetsToVectors(synthonsToUse, retSynthons);
// Now order the synthons in descending order of their similarity to
// the corresponding fragFP.
for (size_t i = 0; i < fragFPs.size(); i++) {
if (fragSims[i].empty()) {
// This one will have been filled in by expandBitSet so we need to use
// all the synthons and a dummy similarity.
fragSims[i].resize(synthonsToUse[i].size());
for (size_t j = 0; j < fragSims[i].size(); j++) {
fragSims[i][j] = std::make_pair(j, 0.0);
}
} else {
std::sort(
fragSims[i].begin(), fragSims[i].end(),
[](const auto &a, const auto &b) { return a.second > b.second; });
}
retSynthons[i].clear();
std::transform(
fragSims[i].begin(), fragSims[i].end(),
std::back_inserter(retSynthons[i]),
[](const std::pair<size_t, double> &fs) { return fs.first; });
}
return retSynthons;
}
} // namespace
@@ -135,14 +164,9 @@ std::vector<SynthonSpaceHitSet> SynthonSpaceFingerprintSearcher::searchFragSet(
getParams().similarityCutoff - getParams().fragSimilarityAdjuster,
reaction, synthonOrder);
if (!theseSynthons.empty()) {
const size_t numHits = std::accumulate(
theseSynthons.begin(), theseSynthons.end(), 1,
[](const int prevRes, const boost::dynamic_bitset<> &s2) {
return prevRes * s2.count();
});
if (numHits) {
results.push_back(
SynthonSpaceHitSet{reaction->getId(), theseSynthons, numHits});
SynthonSpaceHitSet hs{reaction->getId(), theseSynthons};
if (hs.numHits) {
results.push_back(hs);
}
}
}

View File

@@ -320,4 +320,17 @@ void expandBitSet(std::vector<boost::dynamic_bitset<>> &bitSets) {
}
}
// Convert each bitset in bitSets into the list of positions of its set bits,
// in ascending order. On return outVecs has one entry per bitset, with
// outVecs[i] holding the positions for bitSets[i]. Anything that was in
// outVecs beforehand is discarded.
void bitSetsToVectors(const std::vector<boost::dynamic_bitset<>> &bitSets,
                      std::vector<std::vector<size_t>> &outVecs) {
  outVecs.resize(bitSets.size());
  for (size_t i = 0; i < bitSets.size(); ++i) {
    const auto &bits = bitSets[i];
    auto &indices = outVecs[i];
    // resize() keeps existing elements, so empty each one explicitly or
    // stale indices from the caller would end up in front of the new ones.
    indices.clear();
    indices.reserve(bits.count());
    // Hop from set bit to set bit rather than testing every position.
    for (auto j = bits.find_first(); j != boost::dynamic_bitset<>::npos;
         j = bits.find_next(j)) {
      indices.push_back(j);
    }
  }
}
} // namespace RDKit::SynthonSpaceSearch::details

View File

@@ -77,6 +77,10 @@ getConnectorPermutations(const std::vector<std::unique_ptr<ROMol>> &molFrags,
RDKIT_SYNTHONSPACESEARCH_EXPORT void expandBitSet(
std::vector<boost::dynamic_bitset<>> &bitSets);
// Convert each bitset in bitSets into a vector of the positions of its set
// bits, in ascending order. outVecs is resized to bitSets.size() and
// outVecs[i] receives the positions for bitSets[i]. Pass in an empty outVecs.
RDKIT_SYNTHONSPACESEARCH_EXPORT void bitSetsToVectors(
const std::vector<boost::dynamic_bitset<>> &bitSets,
std::vector<std::vector<size_t>> &outVecs);
// class to step through all combinations of lists of different sizes.
// returns (0,0,0), (0,0,1), (0,1,0) etc.
struct RDKIT_SYNTHONSPACESEARCH_EXPORT Stepper {

View File

@@ -179,26 +179,17 @@ void SynthonSpaceSearcher::buildAllHits(
bool &timedOut, std::vector<std::unique_ptr<ROMol>> &results) const {
std::uint64_t numTries = 100;
for (const auto &[reactionId, synthonsToUse, numHits] : hitsets) {
std::vector<std::vector<size_t>> synthonNums;
synthonNums.reserve(synthonsToUse.size());
std::vector<size_t> numSynthons;
numSynthons.reserve(synthonsToUse.size());
for (auto &stu : synthonsToUse) {
numSynthons.push_back(stu.count());
synthonNums.emplace_back();
synthonNums.back().reserve(stu.count());
for (size_t j = 0; j < stu.size(); ++j) {
if (stu[j]) {
synthonNums.back().push_back(j);
}
}
numSynthons.push_back(stu.size());
}
const auto &reaction = getSpace().getReactions().find(reactionId)->second;
details::Stepper stepper(numSynthons);
std::vector<size_t> theseSynthNums(synthonNums.size(), 0);
std::vector<size_t> theseSynthNums(synthonsToUse.size(), 0);
while (stepper.d_currState[0] != numSynthons[0]) {
for (size_t i = 0; i < stepper.d_currState.size(); ++i) {
theseSynthNums[i] = synthonNums[i][stepper.d_currState[i]];
theseSynthNums[i] = synthonsToUse[i][stepper.d_currState[i]];
}
if (auto prod =
buildAndVerifyHit(reaction, theseSynthNums, resultsNames)) {
@@ -242,24 +233,13 @@ struct RandomHitSelector {
d_hitSetSel = boost::random::discrete_distribution<size_t>(
d_hitSetWeights.begin(), d_hitSetWeights.end());
d_synthSels.resize(hitsets.size());
d_synthons.resize(hitsets.size());
for (size_t hi = 0; hi < hitsets.size(); ++hi) {
const SynthonSpaceHitSet &hs = hitsets[hi];
d_synthons[hi] =
std::vector<std::vector<size_t>>(hs.synthonsToUse.size());
d_synthSels[hi] =
std::vector<boost::random::uniform_int_distribution<size_t>>(
hs.synthonsToUse.size());
d_synthons[hi].resize(hs.synthonsToUse.size());
for (size_t i = 0; i < hs.synthonsToUse.size(); ++i) {
d_synthons[hi][i].reserve(hs.synthonsToUse[i].count());
hitsets[hi].synthonsToUse.size());
for (size_t i = 0; i < hitsets[hi].synthonsToUse.size(); ++i) {
d_synthSels[hi][i] = boost::random::uniform_int_distribution<size_t>(
0, hs.synthonsToUse[i].count() - 1);
for (size_t j = 0; j < hs.synthonsToUse[i].size(); ++j) {
if (hs.synthonsToUse[i][j]) {
d_synthons[hi][i].push_back(j);
}
}
0, hitsets[hi].synthonsToUse[i].size() - 1);
}
}
}
@@ -270,7 +250,7 @@ struct RandomHitSelector {
const size_t hitSetNum = d_hitSetSel(randGen);
for (size_t i = 0; i < d_hitsets[hitSetNum].synthonsToUse.size(); ++i) {
const size_t synthNum = d_synthSels[hitSetNum][i](randGen);
synths.push_back(d_synthons[hitSetNum][i][synthNum]);
synths.push_back(d_hitsets[hitSetNum].synthonsToUse[i][synthNum]);
}
return std::make_pair(d_hitsets[hitSetNum].reactionId, synths);
}
@@ -280,7 +260,6 @@ struct RandomHitSelector {
std::vector<size_t> d_hitSetWeights;
boost::random::discrete_distribution<size_t> d_hitSetSel;
std::vector<std::vector<std::vector<size_t>>> d_synthons;
std::vector<std::vector<boost::random::uniform_int_distribution<size_t>>>
d_synthSels;
};

View File

@@ -149,13 +149,14 @@ std::vector<boost::dynamic_bitset<>> screenSynthonsWithFPs(
// Take the fragged mol and flag all those synthons that have a fragment as
// a substructure match. Only do this for those synthons that have already
// passed previous screening, and are flagged as such in passedScreens.
std::vector<boost::dynamic_bitset<>> getHitSynthons(
std::vector<std::vector<size_t>> getHitSynthons(
const std::vector<std::unique_ptr<ROMol>> &molFrags,
const std::vector<boost::dynamic_bitset<>> &passedScreens,
const std::unique_ptr<SynthonSet> &reaction,
const std::vector<unsigned int> &synthonOrder) {
MatchVectType dontCare;
std::vector<boost::dynamic_bitset<>> synthonsToUse;
std::vector<std::vector<size_t>> retSynthons;
for (const auto &synthonSet : reaction->getSynthons()) {
synthonsToUse.emplace_back(synthonSet.size());
}
@@ -182,13 +183,25 @@ std::vector<boost::dynamic_bitset<>> getHitSynthons(
// if the fragment didn't match anything, the whole thing's a bust.
if (!fragMatched) {
synthonsToUse.clear();
return synthonsToUse;
return retSynthons;
}
}
// Flag all the synthons in any synthon set that had nothing matched against it.
details::expandBitSet(synthonsToUse);
return synthonsToUse;
details::bitSetsToVectors(synthonsToUse, retSynthons);
// Now sort the selected synthons into ascending order of number of atoms,
// since smaller molecules are likely to be of more interest.
for (size_t i = 0; i < retSynthons.size(); ++i) {
const auto &synthonsi = reaction->getSynthons()[i];
std::sort(retSynthons[i].begin(), retSynthons[i].end(),
[&](const size_t a, const size_t b) {
return (synthonsi[a]->getOrigMol()->getNumAtoms() <
synthonsi[b]->getOrigMol()->getNumAtoms());
});
}
return retSynthons;
}
} // namespace
@@ -256,14 +269,9 @@ std::vector<SynthonSpaceHitSet> SynthonSpaceSubstructureSearcher::searchFragSet(
auto theseSynthons =
getHitSynthons(connComb, passedScreens, reaction, so);
if (!theseSynthons.empty()) {
const size_t numHits = std::accumulate(
theseSynthons.begin(), theseSynthons.end(), 1,
[](const int prevRes, const boost::dynamic_bitset<> &s2) {
return prevRes * s2.count();
});
if (numHits) {
results.push_back(
SynthonSpaceHitSet{reaction->getId(), theseSynthons, numHits});
SynthonSpaceHitSet hs{reaction->getId(), theseSynthons};
if (hs.numHits) {
results.push_back(hs);
}
}
}