mirror of
https://github.com/rdkit/rdkit.git
synced 2026-06-03 21:44:30 +08:00
Optimisation of fingerprint Synthon Search (#8223)
* Change how synthonsToUse is stored in SynthonSpaceHitSet.
* Sort fragments by descending similarity.
* Sort fragments by ascending size.
* Use pair not tuple.
* Un-cringe Greg.

---------

Co-authored-by: David Cosgrove <david@cozchemix.co.uk>
This commit is contained in:
committed by
greg landrum
parent
b0a1d6fe50
commit
1ef3c4bbc6
@@ -102,8 +102,18 @@ struct RDKIT_SYNTHONSPACESEARCH_EXPORT SynthonSpaceSearchParams {
|
||||
// Holds the information about a set of hits. The molecules can be built
|
||||
// by making all combinations of synthons, one taken from each synthon set.
|
||||
struct RDKIT_SYNTHONSPACESEARCH_EXPORT SynthonSpaceHitSet {
  SynthonSpaceHitSet() = delete;

  // Build a hit set for the reaction called id, using the synthon indices in
  // stu: stu[i] lists the synthons to take from the i-th synthon set.
  // numHits is set to the product of the sizes of the entries of stu, so it
  // is 0 if any entry is empty and 1 if stu itself is empty.
  SynthonSpaceHitSet(const std::string &id,
                     const std::vector<std::vector<size_t>> &stu)
      : reactionId(id), synthonsToUse(stu) {
    // The running product must stay a size_t all the way through: passing it
    // to the lambda as an int truncates it as soon as the partial product
    // exceeds INT_MAX, which large synthon spaces reach easily.
    numHits = std::accumulate(
        synthonsToUse.begin(), synthonsToUse.end(), size_t(1),
        [](const size_t prevRes, const std::vector<size_t> &s2) -> size_t {
          return prevRes * s2.size();
        });
  }

  // Identifier of the reaction the synthons belong to.
  std::string reactionId;
  // For each synthon set, the indices of the synthons to combine.
  std::vector<std::vector<size_t>> synthonsToUse;
  // Number of synthon combinations this hit set describes.
  size_t numHits{0};
};
|
||||
|
||||
|
||||
@@ -8,6 +8,8 @@
|
||||
// of the RDKit source tree.
|
||||
//
|
||||
|
||||
#include <algorithm>
|
||||
|
||||
#include <DataStructs/BitOps.h>
|
||||
#include <GraphMol/MolOps.h>
|
||||
#include <GraphMol/SmilesParse/SmilesWrite.h>
|
||||
@@ -33,11 +35,15 @@ SynthonSpaceFingerprintSearcher::SynthonSpaceFingerprintSearcher(
|
||||
namespace {
|
||||
// Take the fragged mol fps and flag all those synthons that have a fragment as
|
||||
// a similarity match.
|
||||
std::vector<boost::dynamic_bitset<>> getHitSynthons(
|
||||
std::vector<std::vector<size_t>> getHitSynthons(
|
||||
const std::vector<std::unique_ptr<ExplicitBitVect>> &fragFPs,
|
||||
const double similarityCutoff, const std::unique_ptr<SynthonSet> &reaction,
|
||||
const std::vector<unsigned int> &synthonOrder) {
|
||||
std::vector<boost::dynamic_bitset<>> synthonsToUse;
|
||||
std::vector<std::vector<size_t>> retSynthons;
|
||||
std::vector<std::vector<std::pair<size_t, double>>> fragSims(
|
||||
reaction->getSynthons().size());
|
||||
|
||||
synthonsToUse.reserve(reaction->getSynthons().size());
|
||||
for (const auto &synthonSet : reaction->getSynthons()) {
|
||||
synthonsToUse.emplace_back(synthonSet.size());
|
||||
@@ -49,21 +55,44 @@ std::vector<boost::dynamic_bitset<>> getHitSynthons(
|
||||
if (const auto sim = TanimotoSimilarity(*fragFPs[i], *synthonFPs[j]);
|
||||
sim >= similarityCutoff) {
|
||||
synthonsToUse[synthonOrder[i]][j] = true;
|
||||
fragSims[synthonOrder[i]].emplace_back(j, sim);
|
||||
fragMatched = true;
|
||||
}
|
||||
}
|
||||
if (!fragMatched) {
|
||||
// No synthons matched this fragment, so the whole fragment set is a
|
||||
// bust.
|
||||
synthonsToUse.clear();
|
||||
return synthonsToUse;
|
||||
return retSynthons;
|
||||
}
|
||||
}
|
||||
|
||||
// Fill in any synthons where they all didn't match because there were
|
||||
// fewer fragments than synthons.
|
||||
details::expandBitSet(synthonsToUse);
|
||||
return synthonsToUse;
|
||||
details::bitSetsToVectors(synthonsToUse, retSynthons);
|
||||
|
||||
// Now order the synthons in descending order of their similarity to
|
||||
// the corresponding fragFP.
|
||||
for (size_t i = 0; i < fragFPs.size(); i++) {
|
||||
if (fragSims[i].empty()) {
|
||||
// This one will have been filled in by expandBitSet so we need to use
|
||||
// all the synthons and a dummy similarity.
|
||||
fragSims[i].resize(synthonsToUse[i].size());
|
||||
for (size_t j = 0; j < fragSims[i].size(); j++) {
|
||||
fragSims[i][j] = std::make_pair(j, 0.0);
|
||||
}
|
||||
} else {
|
||||
std::sort(
|
||||
fragSims[i].begin(), fragSims[i].end(),
|
||||
[](const auto &a, const auto &b) { return a.second > b.second; });
|
||||
}
|
||||
retSynthons[i].clear();
|
||||
std::transform(
|
||||
fragSims[i].begin(), fragSims[i].end(),
|
||||
std::back_inserter(retSynthons[i]),
|
||||
[](const std::pair<size_t, double> &fs) { return fs.first; });
|
||||
}
|
||||
return retSynthons;
|
||||
}
|
||||
} // namespace
|
||||
|
||||
@@ -135,14 +164,9 @@ std::vector<SynthonSpaceHitSet> SynthonSpaceFingerprintSearcher::searchFragSet(
|
||||
getParams().similarityCutoff - getParams().fragSimilarityAdjuster,
|
||||
reaction, synthonOrder);
|
||||
if (!theseSynthons.empty()) {
|
||||
const size_t numHits = std::accumulate(
|
||||
theseSynthons.begin(), theseSynthons.end(), 1,
|
||||
[](const int prevRes, const boost::dynamic_bitset<> &s2) {
|
||||
return prevRes * s2.count();
|
||||
});
|
||||
if (numHits) {
|
||||
results.push_back(
|
||||
SynthonSpaceHitSet{reaction->getId(), theseSynthons, numHits});
|
||||
SynthonSpaceHitSet hs{reaction->getId(), theseSynthons};
|
||||
if (hs.numHits) {
|
||||
results.push_back(hs);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -320,4 +320,17 @@ void expandBitSet(std::vector<boost::dynamic_bitset<>> &bitSets) {
|
||||
}
|
||||
}
|
||||
|
||||
// Convert each bit set into the list of positions of its set bits, in
// ascending order. outVecs is resized to bitSets.size() and outVecs[i] is
// overwritten with the set positions of bitSets[i].
void bitSetsToVectors(const std::vector<boost::dynamic_bitset<>> &bitSets,
                      std::vector<std::vector<size_t>> &outVecs) {
  outVecs.resize(bitSets.size());
  for (size_t i = 0; i < bitSets.size(); ++i) {
    const auto &bits = bitSets[i];
    auto &indices = outVecs[i];
    // Discard anything the caller left in the output so the result depends
    // only on bitSets, rather than being appended to stale entries.
    indices.clear();
    indices.reserve(bits.count());
    // find_first/find_next jump straight from one set bit to the next, so
    // sparse bit sets are not scanned position by position.
    for (auto j = bits.find_first(); j != boost::dynamic_bitset<>::npos;
         j = bits.find_next(j)) {
      indices.push_back(j);
    }
  }
}
|
||||
|
||||
} // namespace RDKit::SynthonSpaceSearch::details
|
||||
|
||||
@@ -77,6 +77,10 @@ getConnectorPermutations(const std::vector<std::unique_ptr<ROMol>> &molFrags,
|
||||
RDKIT_SYNTHONSPACESEARCH_EXPORT void expandBitSet(
|
||||
std::vector<boost::dynamic_bitset<>> &bitSets);
|
||||
|
||||
RDKIT_SYNTHONSPACESEARCH_EXPORT void bitSetsToVectors(
|
||||
const std::vector<boost::dynamic_bitset<>> &bitSets,
|
||||
std::vector<std::vector<size_t>> &outVecs);
|
||||
|
||||
// class to step through all combinations of lists of different sizes.
|
||||
// returns (0,0,0), (0,0,1), (0,1,0) etc.
|
||||
struct RDKIT_SYNTHONSPACESEARCH_EXPORT Stepper {
|
||||
|
||||
@@ -179,26 +179,17 @@ void SynthonSpaceSearcher::buildAllHits(
|
||||
bool &timedOut, std::vector<std::unique_ptr<ROMol>> &results) const {
|
||||
std::uint64_t numTries = 100;
|
||||
for (const auto &[reactionId, synthonsToUse, numHits] : hitsets) {
|
||||
std::vector<std::vector<size_t>> synthonNums;
|
||||
synthonNums.reserve(synthonsToUse.size());
|
||||
std::vector<size_t> numSynthons;
|
||||
numSynthons.reserve(synthonsToUse.size());
|
||||
for (auto &stu : synthonsToUse) {
|
||||
numSynthons.push_back(stu.count());
|
||||
synthonNums.emplace_back();
|
||||
synthonNums.back().reserve(stu.count());
|
||||
for (size_t j = 0; j < stu.size(); ++j) {
|
||||
if (stu[j]) {
|
||||
synthonNums.back().push_back(j);
|
||||
}
|
||||
}
|
||||
numSynthons.push_back(stu.size());
|
||||
}
|
||||
const auto &reaction = getSpace().getReactions().find(reactionId)->second;
|
||||
details::Stepper stepper(numSynthons);
|
||||
std::vector<size_t> theseSynthNums(synthonNums.size(), 0);
|
||||
std::vector<size_t> theseSynthNums(synthonsToUse.size(), 0);
|
||||
while (stepper.d_currState[0] != numSynthons[0]) {
|
||||
for (size_t i = 0; i < stepper.d_currState.size(); ++i) {
|
||||
theseSynthNums[i] = synthonNums[i][stepper.d_currState[i]];
|
||||
theseSynthNums[i] = synthonsToUse[i][stepper.d_currState[i]];
|
||||
}
|
||||
if (auto prod =
|
||||
buildAndVerifyHit(reaction, theseSynthNums, resultsNames)) {
|
||||
@@ -242,24 +233,13 @@ struct RandomHitSelector {
|
||||
d_hitSetSel = boost::random::discrete_distribution<size_t>(
|
||||
d_hitSetWeights.begin(), d_hitSetWeights.end());
|
||||
d_synthSels.resize(hitsets.size());
|
||||
d_synthons.resize(hitsets.size());
|
||||
for (size_t hi = 0; hi < hitsets.size(); ++hi) {
|
||||
const SynthonSpaceHitSet &hs = hitsets[hi];
|
||||
d_synthons[hi] =
|
||||
std::vector<std::vector<size_t>>(hs.synthonsToUse.size());
|
||||
d_synthSels[hi] =
|
||||
std::vector<boost::random::uniform_int_distribution<size_t>>(
|
||||
hs.synthonsToUse.size());
|
||||
d_synthons[hi].resize(hs.synthonsToUse.size());
|
||||
for (size_t i = 0; i < hs.synthonsToUse.size(); ++i) {
|
||||
d_synthons[hi][i].reserve(hs.synthonsToUse[i].count());
|
||||
hitsets[hi].synthonsToUse.size());
|
||||
for (size_t i = 0; i < hitsets[hi].synthonsToUse.size(); ++i) {
|
||||
d_synthSels[hi][i] = boost::random::uniform_int_distribution<size_t>(
|
||||
0, hs.synthonsToUse[i].count() - 1);
|
||||
for (size_t j = 0; j < hs.synthonsToUse[i].size(); ++j) {
|
||||
if (hs.synthonsToUse[i][j]) {
|
||||
d_synthons[hi][i].push_back(j);
|
||||
}
|
||||
}
|
||||
0, hitsets[hi].synthonsToUse[i].size() - 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -270,7 +250,7 @@ struct RandomHitSelector {
|
||||
const size_t hitSetNum = d_hitSetSel(randGen);
|
||||
for (size_t i = 0; i < d_hitsets[hitSetNum].synthonsToUse.size(); ++i) {
|
||||
const size_t synthNum = d_synthSels[hitSetNum][i](randGen);
|
||||
synths.push_back(d_synthons[hitSetNum][i][synthNum]);
|
||||
synths.push_back(d_hitsets[hitSetNum].synthonsToUse[i][synthNum]);
|
||||
}
|
||||
return std::make_pair(d_hitsets[hitSetNum].reactionId, synths);
|
||||
}
|
||||
@@ -280,7 +260,6 @@ struct RandomHitSelector {
|
||||
|
||||
std::vector<size_t> d_hitSetWeights;
|
||||
boost::random::discrete_distribution<size_t> d_hitSetSel;
|
||||
std::vector<std::vector<std::vector<size_t>>> d_synthons;
|
||||
std::vector<std::vector<boost::random::uniform_int_distribution<size_t>>>
|
||||
d_synthSels;
|
||||
};
|
||||
|
||||
@@ -149,13 +149,14 @@ std::vector<boost::dynamic_bitset<>> screenSynthonsWithFPs(
|
||||
// Take the fragged mol and flag all those synthons that have a fragment as
|
||||
// a substructure match. Only do this for those synthons that have already
|
||||
// passed previous screening, and are flagged as such in passedScreens.
|
||||
std::vector<boost::dynamic_bitset<>> getHitSynthons(
|
||||
std::vector<std::vector<size_t>> getHitSynthons(
|
||||
const std::vector<std::unique_ptr<ROMol>> &molFrags,
|
||||
const std::vector<boost::dynamic_bitset<>> &passedScreens,
|
||||
const std::unique_ptr<SynthonSet> &reaction,
|
||||
const std::vector<unsigned int> &synthonOrder) {
|
||||
MatchVectType dontCare;
|
||||
std::vector<boost::dynamic_bitset<>> synthonsToUse;
|
||||
std::vector<std::vector<size_t>> retSynthons;
|
||||
for (const auto &synthonSet : reaction->getSynthons()) {
|
||||
synthonsToUse.emplace_back(synthonSet.size());
|
||||
}
|
||||
@@ -182,13 +183,25 @@ std::vector<boost::dynamic_bitset<>> getHitSynthons(
|
||||
// if the fragment didn't match anything, the whole thing's a bust.
|
||||
if (!fragMatched) {
|
||||
synthonsToUse.clear();
|
||||
return synthonsToUse;
|
||||
return retSynthons;
|
||||
}
|
||||
}
|
||||
|
||||
// Fill in any synthons where they all didn't match.
|
||||
details::expandBitSet(synthonsToUse);
|
||||
return synthonsToUse;
|
||||
details::bitSetsToVectors(synthonsToUse, retSynthons);
|
||||
|
||||
// Now sort the selected synthons into ascending order of number of atoms,
|
||||
// since smaller molecules are likely to be of more interest.
|
||||
for (size_t i = 0; i < retSynthons.size(); ++i) {
|
||||
const auto &synthonsi = reaction->getSynthons()[i];
|
||||
std::sort(retSynthons[i].begin(), retSynthons[i].end(),
|
||||
[&](const size_t a, const size_t b) {
|
||||
return (synthonsi[a]->getOrigMol()->getNumAtoms() <
|
||||
synthonsi[b]->getOrigMol()->getNumAtoms());
|
||||
});
|
||||
}
|
||||
return retSynthons;
|
||||
}
|
||||
|
||||
} // namespace
|
||||
@@ -256,14 +269,9 @@ std::vector<SynthonSpaceHitSet> SynthonSpaceSubstructureSearcher::searchFragSet(
|
||||
auto theseSynthons =
|
||||
getHitSynthons(connComb, passedScreens, reaction, so);
|
||||
if (!theseSynthons.empty()) {
|
||||
const size_t numHits = std::accumulate(
|
||||
theseSynthons.begin(), theseSynthons.end(), 1,
|
||||
[](const int prevRes, const boost::dynamic_bitset<> &s2) {
|
||||
return prevRes * s2.count();
|
||||
});
|
||||
if (numHits) {
|
||||
results.push_back(
|
||||
SynthonSpaceHitSet{reaction->getId(), theseSynthons, numHits});
|
||||
SynthonSpaceHitSet hs{reaction->getId(), theseSynthons};
|
||||
if (hs.numHits) {
|
||||
results.push_back(hs);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user