Optimisations to fingerprint search of Synthon Space (#8152)

* First pass at approximate FP check.

* Tidy and Python wrapper.

* More tidying.

* Add addFP and subtractFP to binary file.

* Minor tidy.

* In splits code, check for duplicate fragmentations.

* Update test results.

* Tidy.

* Set configurable limit on number of fragments generated from query.

* Stash prior to trying counts fps.

* Stash count fps.

* Back to bit fingerprints again.

* Extra comment.

---------

Co-authored-by: David Cosgrove <david@cozchemix.co.uk>
This commit is contained in:
David Cosgrove
2025-01-22 04:44:56 +00:00
committed by greg landrum
parent 8e9e1d9574
commit ecb0c31ba3
13 changed files with 373 additions and 42 deletions

View File

@@ -22,8 +22,12 @@
// for example, it uses a different fingerprint for the initial synthon
// screening.
#include <cmath>
#include <random>
#include <regex>
#include <boost/random/discrete_distribution.hpp>
#include <DataStructs/ExplicitBitVect.h>
#include <GraphMol/MolPickler.h>
#include <GraphMol/ChemTransforms/ChemTransforms.h>
@@ -42,12 +46,31 @@ const std::vector<std::shared_ptr<ROMol>> &SynthonSet::getConnectorRegions()
const std::unique_ptr<ExplicitBitVect> &SynthonSet::getConnRegFP() const {
return d_connRegFP;
}
// Returns the reaction's "add" fingerprint, which is ORed into the
// approximate product fingerprint to account for bits created by joining
// the synthons. The pointer is empty until buildAddAndSubtractFPs() has run
// or the fingerprint has been read from a binary file.
const std::unique_ptr<ExplicitBitVect> &SynthonSet::getAddFP() const {
return d_addFP;
}
// Returns the reaction's "subtract" fingerprint. As built by
// buildAddAndSubtractFPs() it is stored already complemented, so callers
// apply it with a plain AND. The pointer is empty until it has been built
// or read from a binary file.
const std::unique_ptr<ExplicitBitVect> &SynthonSet::getSubtractFP() const {
return d_subtractFP;
}
const std::vector<std::vector<std::unique_ptr<ExplicitBitVect>>> &
SynthonSet::getSynthonFPs() const {
return d_synthonFPs;
}
namespace {
// Serialises a bitset to the stream as its bit count followed by one bool
// per bit, in index order. readBitSet() is the matching reader.
void writeBitSet(std::ostream &os, const boost::dynamic_bitset<> &bitset) {
  const auto numBits = bitset.size();
  streamWrite(os, numBits);
  for (boost::dynamic_bitset<>::size_type bitNum = 0; bitNum < numBits;
       ++bitNum) {
    const bool isSet = bitset[bitNum];
    streamWrite(os, isSet);
  }
}
}  // namespace
void SynthonSet::writeToDBStream(std::ostream &os) const {
streamWrite(os, d_id);
streamWrite(os, getConnectorRegions().size());
@@ -56,14 +79,13 @@ void SynthonSet::writeToDBStream(std::ostream &os) const {
}
auto connRegFPstr = getConnRegFP()->toString();
streamWrite(os, connRegFPstr);
streamWrite(os, d_connectors.size());
for (size_t i = 0; i < d_connectors.size(); ++i) {
if (d_connectors[i]) {
streamWrite(os, true);
} else {
streamWrite(os, false);
}
writeBitSet(os, d_connectors);
streamWrite(os, d_synthConnPatts.size());
for (const auto &scp : d_synthConnPatts) {
writeBitSet(os, scp);
}
streamWrite(os, d_synthons.size());
for (const auto &rs : d_synthons) {
streamWrite(os, rs.size());
@@ -71,6 +93,15 @@ void SynthonSet::writeToDBStream(std::ostream &os) const {
r->writeToDBStream(os);
}
}
if (d_addFP) {
streamWrite(os, true);
streamWrite(os, d_addFP->toString());
streamWrite(os, d_subtractFP->toString());
} else {
streamWrite(os, false);
}
streamWrite(os, d_synthonFPs.size());
for (const auto &fpv : d_synthonFPs) {
streamWrite(os, fpv.size());
@@ -80,7 +111,20 @@ void SynthonSet::writeToDBStream(std::ostream &os) const {
}
}
void SynthonSet::readFromDBStream(std::istream &is, std::uint32_t) {
namespace {
// Reads a bitset in the layout produced by writeBitSet(): the bit count
// first, then one bool per bit. The bitset is resized to the stored count
// and every bit is overwritten.
void readBitSet(std::istream &is, boost::dynamic_bitset<> &bitset) {
  size_t numBits;
  streamRead(is, numBits);
  bitset.resize(numBits);
  for (size_t bitNum = 0; bitNum < numBits; ++bitNum) {
    bool isSet;
    streamRead(is, isSet);
    bitset[bitNum] = isSet;
  }
}
}  // namespace
void SynthonSet::readFromDBStream(std::istream &is, std::uint32_t version) {
streamRead(is, d_id, 0);
size_t numConnRegs;
streamRead(is, numConnRegs);
@@ -92,14 +136,18 @@ void SynthonSet::readFromDBStream(std::istream &is, std::uint32_t) {
std::string pickle;
streamRead(is, pickle, 0);
d_connRegFP = std::make_unique<ExplicitBitVect>(pickle);
size_t connSize;
streamRead(is, connSize);
d_connectors.resize(connSize);
bool s;
for (size_t i = 0; i < connSize; ++i) {
streamRead(is, s);
d_connectors[i] = s;
readBitSet(is, d_connectors);
if (version >= 2010) {
size_t numSynthConnPatts;
streamRead(is, numSynthConnPatts);
d_synthConnPatts.resize(numSynthConnPatts);
for (size_t i = 0; i < numSynthConnPatts; ++i) {
boost::dynamic_bitset<> synthConnPatt;
readBitSet(is, synthConnPatt);
d_synthConnPatts[i] = synthConnPatt;
}
}
size_t numRS;
streamRead(is, numRS);
d_synthons.clear();
@@ -113,6 +161,19 @@ void SynthonSet::readFromDBStream(std::istream &is, std::uint32_t) {
d_synthons[i][j]->readFromDBStream(is);
}
}
if (version >= 2010) {
bool haveAddFP;
streamRead(is, haveAddFP);
if (haveAddFP) {
std::string fString;
streamRead(is, fString, 0);
d_addFP = std::make_unique<ExplicitBitVect>(fString);
streamRead(is, fString, 0);
d_subtractFP = std::make_unique<ExplicitBitVect>(fString);
}
}
size_t numFS;
streamRead(is, numFS);
d_synthonFPs.clear();
@@ -127,10 +188,10 @@ void SynthonSet::readFromDBStream(std::istream &is, std::uint32_t) {
d_synthonFPs[i][j] = std::make_unique<ExplicitBitVect>(fString);
}
}
// So that d_synthConnPatts is filled in. Next time the binary file format
// is updated they can be put in it, but they're cheap enough to calculate
// so leave it for now.
assignConnectorsUsed();
// So that d_synthConnPatts is filled in.
if (version < 2010) {
assignConnectorsUsed();
}
}
void SynthonSet::enumerateToStream(std::ostream &os) const {
@@ -164,8 +225,8 @@ namespace {
// element of the other vectors.
std::vector<std::unique_ptr<ROMol>> buildSampleMolecules(
const std::vector<std::vector<ROMol *>> &synthons,
const size_t longVecNum, const SynthonSet &reaction) {
const std::vector<std::vector<ROMol *>> &synthons, const size_t longVecNum,
const SynthonSet &reaction) {
std::vector<std::unique_ptr<ROMol>> sampleMolecules;
sampleMolecules.reserve(synthons[longVecNum].size());
@@ -187,13 +248,16 @@ std::vector<std::unique_ptr<ROMol>> buildSampleMolecules(
sampleMolecules.push_back(std::move(sampleMol));
} catch (std::exception &e) {
const auto &synths = reaction.getSynthons();
std::string msg("Error:: in reaction " + reaction.getId() + " :: building molecule from synthons :");
std::string msg("Error:: in reaction " + reaction.getId() +
" :: building molecule from synthons :");
for (size_t j = 0; j < synthons.size(); ++j) {
std::string sep = j ? " and " : " ";
if (j == longVecNum) {
msg += sep + synths[j][i]->getId() + " (" + synths[j][i]->getSmiles() + ")";
msg += sep + synths[j][i]->getId() + " (" +
synths[j][i]->getSmiles() + ")";
} else {
msg += sep + synths[j].front()->getId() + " (" + synths[j].front()->getSmiles() + ")";
msg += sep + synths[j].front()->getId() + " (" +
synths[j].front()->getSmiles() + ")";
}
}
msg += "\n" + std::string(e.what()) + "\n";
@@ -250,7 +314,8 @@ void SynthonSet::transferProductBondsToSynthons() {
synthsToUse[j][0] = true;
}
}
auto sampleMols = buildSampleMolecules(synthonMolCopies, synthSetNum, *this);
auto sampleMols =
buildSampleMolecules(synthonMolCopies, synthSetNum, *this);
for (size_t j = 0; j < sampleMols.size(); ++j) {
auto synthCp =
std::make_unique<RWMol>(*d_synthons[synthSetNum][j]->getOrigMol());
@@ -363,12 +428,17 @@ const std::vector<int> &SynthonSet::getNumConnectors() const {
}
bool SynthonSet::hasFingerprints() const { return !d_synthonFPs.empty(); }
// True once the add fingerprint exists. The add and subtract fingerprints
// are always created and cleared together, so only one needs checking.
bool SynthonSet::hasAddAndSubtractFPs() const {
  return d_addFP != nullptr;
}
void SynthonSet::buildSynthonFingerprints(
const FingerprintGenerator<std::uint64_t> &fpGen) {
d_addFP.reset();
d_subtractFP.reset();
// The synthons should have had transferProductBondsToSynthons
// applied to them by now.
d_synthonFPs.clear();
d_synthonFPs.reserve(d_synthons.size());
@@ -382,6 +452,97 @@ void SynthonSet::buildSynthonFingerprints(
}
}
void SynthonSet::buildAddAndSubtractFPs(
const FingerprintGenerator<std::uint64_t> &fpGen) {
d_addFP.reset();
d_subtractFP.reset();
std::vector<std::vector<size_t>> synthonNums(d_synthons.size());
std::vector<size_t> numSynthons(d_synthons.size());
std::vector<int> naddbitcounts(fpGen.getOptions()->d_fpSize, 0);
std::vector<int> nsubbitcounts(fpGen.getOptions()->d_fpSize, 0);
size_t totSamples = 1;
// Sample the synthons evenly across their size ranges.
for (size_t i = 0; i < d_synthons.size(); ++i) {
std::vector<std::tuple<size_t, Synthon *>> sortedSynthons(
d_synthons[i].size());
for (size_t j = 0; j < d_synthons[i].size(); ++j) {
sortedSynthons[j] = std::make_tuple(j, d_synthons[i][j].get());
}
std::sort(sortedSynthons.begin(), sortedSynthons.end(),
[](const std::tuple<size_t, Synthon *> &a,
const std::tuple<size_t, Synthon *> &b) -> bool {
auto as = std::get<1>(a);
auto bs = std::get<1>(b);
if (as->getOrigMol()->getNumAtoms() ==
bs->getOrigMol()->getNumAtoms()) {
return as->getId() < bs->getId();
}
return as->getOrigMol()->getNumAtoms() <
bs->getOrigMol()->getNumAtoms();
});
size_t stride = d_synthons[i].size() / 40;
if (!stride) {
stride = 1;
}
for (size_t j = 0; j < d_synthons[i].size(); j += stride) {
synthonNums[i].push_back(j);
}
numSynthons[i] = synthonNums[i].size();
totSamples *= numSynthons[i];
}
details::Stepper stepper(numSynthons);
std::vector<size_t> theseSynthNums(synthonNums.size(), 0);
while (stepper.d_currState[0] != numSynthons[0]) {
for (size_t i = 0; i < stepper.d_currState.size(); ++i) {
theseSynthNums[i] = synthonNums[i][stepper.d_currState[i]];
}
auto prod = buildProduct(theseSynthNums);
std::unique_ptr<ExplicitBitVect> prodFP(fpGen.getFingerprint(*prod));
ExplicitBitVect approxFP(*d_synthonFPs[0][theseSynthNums[0]]);
for (size_t j = 1; j < d_synthonFPs.size(); ++j) {
approxFP |= *d_synthonFPs[j][theseSynthNums[j]];
}
// addFP is what's in the productFP and not in approxFP
// and subtractFP is vice versa. The former captures the bits of
// the molecule formed by the joining the fragments, the latter
// the bits connecting the dummy atoms.
std::unique_ptr<ExplicitBitVect> addFP(
new ExplicitBitVect(*prodFP & ~approxFP));
IntVect v;
addFP->getOnBits(v);
for (auto i : v) {
naddbitcounts[i]++;
}
std::unique_ptr<ExplicitBitVect> subtractFP(
new ExplicitBitVect(approxFP & ~(*prodFP)));
subtractFP->getOnBits(v);
for (auto i : v) {
nsubbitcounts[i]++;
}
stepper.step();
}
// This is the fraction of products that must set a bit for
// it to be included. Arrived at by empirical means.
double frac = 0.75;
d_addFP = std::make_unique<ExplicitBitVect>(fpGen.getOptions()->d_fpSize);
for (size_t i = 0; i < naddbitcounts.size(); ++i) {
if (naddbitcounts[i] > int(totSamples * frac)) {
d_addFP->setBit(i);
}
}
d_subtractFP =
std::make_unique<ExplicitBitVect>(fpGen.getOptions()->d_fpSize);
for (size_t i = 0; i < nsubbitcounts.size(); ++i) {
if (nsubbitcounts[i] > int(totSamples * frac)) {
d_subtractFP->setBit(i);
}
}
// Take the complement of the subtract FP so it can be used directly
*d_subtractFP = ~(*d_subtractFP);
}
std::string SynthonSet::buildProductName(
const std::vector<size_t> &synthNums) const {
std::string prodName = d_id;

View File

@@ -27,6 +27,7 @@ class ROMol;
namespace SynthonSpaceSearch {
class Synthon;
struct SynthonSpaceSearchParams;
// This class holds all the synthons for a particular reaction.
class RDKIT_SYNTHONSPACESEARCH_EXPORT SynthonSet {
@@ -49,11 +50,14 @@ class RDKIT_SYNTHONSPACESEARCH_EXPORT SynthonSet {
const std::vector<std::shared_ptr<ROMol>> &getConnectorRegions() const;
const std::unique_ptr<ExplicitBitVect> &getConnRegFP() const;
const std::unique_ptr<ExplicitBitVect> &getAddFP() const;
const std::unique_ptr<ExplicitBitVect> &getSubtractFP() const;
const std::vector<int> &getNumConnectors() const;
bool hasFingerprints() const;
bool hasAddAndSubtractFPs() const;
const std::vector<std::vector<std::unique_ptr<ExplicitBitVect>>> &
getSynthonFPs() const;
// Writes to/reads from a binary stream.
void writeToDBStream(std::ostream &os) const;
void readFromDBStream(std::istream &is, std::uint32_t version);
@@ -86,6 +90,7 @@ class RDKIT_SYNTHONSPACESEARCH_EXPORT SynthonSet {
void buildSynthonFingerprints(
const FingerprintGenerator<std::uint64_t> &fpGen);
void buildAddAndSubtractFPs(const FingerprintGenerator<std::uint64_t> &fpGen);
// Return the molecules for synthons for which the bits are true.
// Obviously requires that reqSynths is the same dimensions as
@@ -118,6 +123,14 @@ class RDKIT_SYNTHONSPACESEARCH_EXPORT SynthonSet {
// The fingerprint of the connector regions. Fingerprints for all
// connector regions are folded into the same fingerprint.
std::unique_ptr<ExplicitBitVect> d_connRegFP;
// When doing an approximate FP similarity by ORing together
// the synthonFPs, adding d_addFP and subtracting d_subtractFP
// accounts (a bit) for the joins and the dummy atoms
// respectively.
std::unique_ptr<ExplicitBitVect> d_addFP;
std::unique_ptr<ExplicitBitVect> d_subtractFP;
// The number of connectors in the synthons in each synthon set.
std::vector<int> d_numConnectors;

View File

@@ -32,7 +32,7 @@ namespace RDKit::SynthonSpaceSearch {
// used for serialization
constexpr int32_t versionMajor = 2;
constexpr int32_t versionMinor = 0;
constexpr int32_t versionMinor = 1;
constexpr int32_t endianId = 0xa100f;
std::int64_t SynthonSpace::getNumProducts() const {
@@ -185,8 +185,8 @@ void SynthonSpace::writeDBFile(const std::string &outFilename) const {
streamWrite(os, d_fpType);
}
streamWrite(os, d_reactions.size());
for (const auto &[fst, snd] : d_reactions) {
snd->writeToDBStream(os);
for (const auto &[reactionId, reaction] : d_reactions) {
reaction->writeToDBStream(os);
}
os.close();
}
@@ -277,10 +277,10 @@ bool SynthonSpace::hasFingerprints() const {
void SynthonSpace::buildSynthonFingerprints(
const FingerprintGenerator<std::uint64_t> &fpGen) {
BOOST_LOG(rdWarningLog) << "Building the fingerprints may take some time."
<< std::endl;
if (const auto fpType = fpGen.infoString();
fpType != d_fpType || !hasFingerprints()) {
BOOST_LOG(rdWarningLog)
<< "Building the fingerprints may take some time." << std::endl;
d_fpType = fpType;
for (const auto &[id, synthSet] : d_reactions) {
synthSet->buildSynthonFingerprints(fpGen);
@@ -288,4 +288,18 @@ void SynthonSpace::buildSynthonFingerprints(
}
}
// Reports whether the add/subtract fingerprints are present, judged from
// the first reaction only. An empty space has none.
bool SynthonSpace::hasAddAndSubstractFingerprints() const {
  const auto firstReaction = d_reactions.begin();
  return firstReaction != d_reactions.end() &&
         firstReaction->second->hasAddAndSubtractFPs();
}
// Builds the add and subtract fingerprints for every reaction in the
// space, using fpGen to fingerprint the sampled products.
void SynthonSpace::buildAddAndSubstractFingerprints(
    const FingerprintGenerator<std::uint64_t> &fpGen) {
  for (const auto &reactionEntry : d_reactions) {
    reactionEntry.second->buildAddAndSubtractFPs(fpGen);
  }
}
} // namespace RDKit::SynthonSpaceSearch

View File

@@ -50,8 +50,15 @@ struct RDKIT_SYNTHONSPACESEARCH_EXPORT SynthonSpaceSearchParams {
// than that will not matter as it will
// be reduced to 4. Likewise, values
// lower than 1 will be increased to 1.
std::int64_t maxHits{1000}; // The maximum number of hits to return. Use -1
// for no maximum.
std::uint64_t maxNumFrags{
100000}; // The maximum number of fragments the query can
// be broken into. Big molecules will create huge
// numbers of fragments that may cause excessive
// memory use. If the number of fragments hits this number,
// fragmentation stops and the search results will likely be
// incomplete.
std::int64_t maxHits{1000}; // The maximum number of hits to return. Use
// -1 for no maximum.
std::int64_t hitStart{0}; // Sequence number of hit to start from. So that
// you can return the next N hits of a search
// having already obtained N-1.
@@ -78,6 +85,16 @@ struct RDKIT_SYNTHONSPACESEARCH_EXPORT SynthonSpaceSearchParams {
// times, a lower number will give faster searches at the
// risk of missing some hits. The value you use should have
// a positive correlation with your FOMO.
double approxSimilarityAdjuster{
0.1}; // The fingerprint search uses an approximate similarity method
// before building a product and doing a final check. The
// similarityCutoff is reduced by this value for the approximate
// check. A lower value will give faster run times at the
// risk of missing some hits. The value you use should have a
// positive correlation with your FOMO. The default is
// appropriate for Morgan fingerprints. With RDKit fingerprints,
// 0.05 is adequate, and higher than that has been seen to
// produce long run times.
std::uint64_t timeOut{600}; // Maximum number of seconds to spend on a single
// search. 0 means no maximum.
};
@@ -206,10 +223,16 @@ class RDKIT_SYNTHONSPACESEARCH_EXPORT SynthonSpace {
bool hasFingerprints() const;
// Create the fingerprints for the synthons ready for fingerprint searches.
// Valid values of fpType as described by SynthonSpaceSearchParams.
// Will be done by the fingerprint search if not done ahead of time.
void buildSynthonFingerprints(
const FingerprintGenerator<std::uint64_t> &fpGen);
bool hasAddAndSubstractFingerprints() const;
// Create the add and subtract fingerprints for the SynthonSets.
// Will be done by the fingerprint search if not done ahead of time.
void buildAddAndSubstractFingerprints(
const FingerprintGenerator<std::uint64_t> &fpGen);
private:
std::string d_fileName;
std::map<std::string, std::unique_ptr<SynthonSet>> d_reactions;

View File

@@ -24,6 +24,9 @@ SynthonSpaceFingerprintSearcher::SynthonSpaceFingerprintSearcher(
getSpace().getSynthonFingerprintType() != fpGen.infoString()) {
getSpace().buildSynthonFingerprints(fpGen);
}
if (!getSpace().hasAddAndSubstractFingerprints()) {
getSpace().buildAddAndSubstractFingerprints(fpGen);
}
d_queryFP = std::unique_ptr<ExplicitBitVect>(d_fpGen.getFingerprint(query));
}
@@ -148,6 +151,27 @@ std::vector<SynthonSpaceHitSet> SynthonSpaceFingerprintSearcher::searchFragSet(
return results;
}
bool SynthonSpaceFingerprintSearcher::quickVerify(
const std::unique_ptr<SynthonSet> &reaction,
const std::vector<size_t> &synthNums) const {
// Make an approximate fingerprint by combining the FPs for
// these synthons, adding in the addFP and taking out the
// subtractFP.
const auto &synthFPs = reaction->getSynthonFPs();
ExplicitBitVect fullFP(*synthFPs[0][synthNums[0]]);
for (unsigned int i = 1; i < synthNums.size(); ++i) {
fullFP |= *synthFPs[i][synthNums[i]];
}
fullFP |= *(reaction->getAddFP());
// The subtract FP has already had its bits flipped, so just do a
// straight AND.
fullFP &= *(reaction->getSubtractFP());
double approxSim = TanimotoSimilarity(fullFP, *d_queryFP);
return approxSim >=
getParams().similarityCutoff - getParams().approxSimilarityAdjuster;
}
bool SynthonSpaceFingerprintSearcher::verifyHit(const ROMol &hit) const {
const std::unique_ptr<ExplicitBitVect> fp(d_fpGen.getFingerprint(hit));
if (const auto sim = TanimotoSimilarity(*fp, *d_queryFP);

View File

@@ -32,6 +32,8 @@ class SynthonSpaceFingerprintSearcher : public SynthonSpaceSearcher {
std::vector<SynthonSpaceHitSet> searchFragSet(
std::vector<std::unique_ptr<ROMol>> &fragSet) const override;
bool quickVerify(const std::unique_ptr<SynthonSet> &reaction,
const std::vector<size_t> &synthNums) const override;
bool verifyHit(const ROMol &hit) const override;
};
} // namespace RDKit::SynthonSpaceSearch

View File

@@ -115,7 +115,7 @@ std::vector<const Bond *> getContiguousAromaticBonds(const ROMol &mol,
}
std::vector<std::vector<std::unique_ptr<ROMol>>> splitMolecule(
const ROMol &query, unsigned int maxBondSplits) {
const ROMol &query, unsigned int maxBondSplits, std::uint64_t maxNumFrags) {
if (maxBondSplits < 1) {
maxBondSplits = 1;
}
@@ -139,7 +139,10 @@ std::vector<std::vector<std::unique_ptr<ROMol>>> splitMolecule(
fragments.emplace_back();
fragments.back().emplace_back(new ROMol(query));
// Now do the splits.
// Now do the splits. Symmetrical molecules can give rise to the same
// fragment set in different ways so keep track of what we've had to
// avoid duplicates.
std::set<std::string> fragSmis;
for (unsigned int i = 1; i <= maxBondSplits; ++i) {
auto combs = combMFromN(i, static_cast<int>(query.getNumBonds()));
std::vector<std::pair<unsigned int, unsigned int>> dummyLabels;
@@ -174,9 +177,21 @@ std::vector<std::vector<std::unique_ptr<ROMol>>> splitMolecule(
continue;
}
if (checkConnectorsInDifferentFrags(molFrags, i)) {
std::string fragSmi(MolToSmiles(*fragMol));
if (!fragSmis.insert(fragSmi).second) {
continue;
}
fragments.emplace_back(std::move(molFrags));
if (fragments.size() > maxNumFrags) {
BOOST_LOG(rdWarningLog)
<< "Maximum number of fragments reached." << std::endl;
break;
}
}
}
if (fragments.size() > maxNumFrags) {
break;
}
}
return fragments;
}

View File

@@ -37,7 +37,8 @@ RDKIT_SYNTHONSPACESEARCH_EXPORT void fixAromaticRingSplits(
// be altered. Also, you can't split a molecule on 3 bonds if it only contains
// 2.
RDKIT_SYNTHONSPACESEARCH_EXPORT std::vector<std::vector<std::unique_ptr<ROMol>>>
splitMolecule(const ROMol &query, unsigned int maxBondSplits);
splitMolecule(const ROMol &query, unsigned int maxBondSplits,
std::uint64_t maxNumFrags);
// Counts the number of [1*], [2*]...[4*] in the string.
RDKIT_SYNTHONSPACESEARCH_EXPORT int countConnections(const ROMol &frag);

View File

@@ -51,7 +51,7 @@ SearchResults SynthonSpaceSearcher::search() {
}
std::vector<std::unique_ptr<ROMol>> results;
auto fragments = details::splitMolecule(d_query, d_params.maxBondSplits);
auto fragments = details::splitMolecule(d_query, d_params.maxBondSplits, d_params.maxNumFrags);
std::vector<SynthonSpaceHitSet> allHits;
size_t totHits = 0;
TimePoint *endTime = nullptr;
@@ -94,6 +94,9 @@ std::unique_ptr<ROMol> SynthonSpaceSearcher::buildAndVerifyHit(
if (resultsNames.size() < static_cast<size_t>(d_params.hitStart)) {
return prod;
}
if (!quickVerify(reaction, synthNums)) {
return prod;
}
prod = reaction->buildProduct(synthNums);
// Do a final check of the whole thing. It can happen that the

View File

@@ -71,6 +71,15 @@ class SynthonSpaceSearcher {
const std::unique_ptr<SynthonSet> &reaction,
const std::vector<size_t> &synthNums,
std::set<std::string> &resultsNames) const;
// Hook for a cheap pre-check, made before the product is built, on whether
// this combination of synthons from the reaction can match the query.
// Some of the search methods (Rascal, for example) can do such a check
// without having to build the full molecule from the synthons; they
// override this function, which by default passes everything.
// reaction is the synthon set being searched and synthNums holds one
// synthon index per synthon set in it. Return false to reject the
// combination.
virtual bool quickVerify(
[[maybe_unused]] const std::unique_ptr<SynthonSet> &reaction,
[[maybe_unused]] const std::vector<size_t> &synthNums) const {
return true;
}
virtual bool verifyHit(const ROMol &mol) const = 0;
// Build the molecules from the synthons identified in reagentsToUse.

View File

@@ -105,6 +105,14 @@ BOOST_PYTHON_MODULE(rdSynthonSpaceSearch) {
&SynthonSpaceSearch::SynthonSpaceSearchParams::maxHits,
"The maximum number of hits to return. Default=1000."
"Use -1 for no maximum.")
.def_readwrite(
"maxNumFrags",
&SynthonSpaceSearch::SynthonSpaceSearchParams::maxNumFrags,
"The maximum number of fragments the query can be broken into."
" Big molecules will create huge numbers of fragments that may cause"
" excessive memory use. If the number of fragments hits this number,"
" fragmentation stops and the search results will likely be incomplete."
" Default=100000.")
.def_readwrite(
"hitStart", &SynthonSpaceSearch::SynthonSpaceSearchParams::hitStart,
"The sequence number of the hit to start from. So that you"
@@ -144,6 +152,19 @@ BOOST_PYTHON_MODULE(rdSynthonSpaceSearch) {
"Similarities of fragments are generally low due to low bit"
" densities. For the fragment matching, reduce the similarity cutoff"
" off by this amount. Default=0.1.")
.def_readwrite(
"approxSimilarityAdjuster",
&SynthonSpaceSearch::SynthonSpaceSearchParams::
approxSimilarityAdjuster,
"The fingerprint search uses an approximate similarity method"
" before building a product and doing a final check. The"
" similarityCutoff is reduced by this value for the approximate"
" check. A lower value will give faster run times at the"
" risk of missing some hits. The value you use should have a"
" positive correlation with your FOMO. The default of 0.1 is"
" appropriate for Morgan fingerprints. With RDKit fingerprints,"
" 0.05 is adequate, and higher than that has been seen to"
" produce long run times.")
.def_readwrite(
"timeOut", &SynthonSpaceSearch::SynthonSpaceSearchParams::timeOut,
"Time limit for search, in seconds. Default is 600s, 0 means no"

View File

@@ -8,11 +8,13 @@
// of the RDKit source tree.
#include <algorithm>
#include <chrono>
#include <fstream>
#include <GraphMol/SubstructLibrary/SubstructLibrary.h>
#include <GraphMol/FileParsers/MolSupplier.h>
#include <GraphMol/Fingerprints/MorganGenerator.h>
#include <GraphMol/Fingerprints/RDKitFPGenerator.h>
#include <GraphMol/SynthonSpaceSearch/SynthonSpace.h>
#include <GraphMol/SynthonSpaceSearch/SearchResults.h>
#include <GraphMol/SynthonSpaceSearch/SynthonSpaceSearch_details.h>
@@ -93,6 +95,8 @@ TEST_CASE("FP Small tests") {
synthonspace.readTextFile(libNames[i]);
SynthonSpaceSearchParams params;
params.maxBondSplits = 3;
params.randomSeed = 1;
params.approxSimilarityAdjuster = 0.2;
auto queryMol = v2::SmilesParse::MolFromSmiles(querySmis[i]);
std::unique_ptr<FingerprintGenerator<std::uint64_t>> fpGen(
MorganFingerprint::getMorganGenerator<std::uint64_t>(2));
@@ -147,8 +151,12 @@ TEST_CASE("FP Biggy") {
const std::vector<size_t> numRes{46, 2, 0, 123, 0, 0};
const std::vector<size_t> maxRes{2408, 197, 0, 833, 0, 4};
SynthonSpaceSearchParams params;
params.approxSimilarityAdjuster = 0.2;
params.maxHits = -1;
for (size_t i = 0; i < smis.size(); ++i) {
// Every query is searched; numRes is indexed by i in the check below.
auto queryMol = v2::SmilesParse::MolFromSmiles(smis[i]);
auto results = synthonspace.fingerprintSearch(*queryMol, *fpGen, params);
CHECK(results.getHitMolecules().size() == numRes[i]);
@@ -243,3 +251,40 @@ TEST_CASE("Timeout") {
auto results1 = synthonspace.fingerprintSearch(*queryMol, *fpGen, params);
CHECK(!results1.getTimedOut());
}
// Checks how approxSimilarityAdjuster trades hits for speed: the same query
// is searched with three different adjuster values and the number of hits
// returned is compared with known values.
TEST_CASE("FP Approx Similarity") {
REQUIRE(rdbase);
std::string fName(rdbase);
std::string libName =
fName + "/Code/GraphMol/SynthonSpaceSearch/data/Syntons_5567.csv";
SynthonSpace synthonspace;
synthonspace.readTextFile(libName);
SynthonSpaceSearchParams params;
// Use a fixed seed so that the hit counts are reproducible.
// NOTE(review): buildAddAndSubtractFPs samples the synthons with a fixed
// stride rather than at random, so the seed presumably matters for some
// other part of the search - confirm.
params.randomSeed = 1;
params.similarityCutoff = 0.5;
params.timeOut = 0;
params.maxHits = 1000;
std::unique_ptr<FingerprintGenerator<std::uint64_t>> fpGen(
RDKitFP::getRDKitFPGenerator<std::uint64_t>(3));
auto queryMol = "c12ccc(C)cc1[nH]nc2C(=O)NCc1cncs1"_smiles;
// With RDKit fingerprints, 0.05 gives a reasonable compromise
// between speed and hits missed.
params.approxSimilarityAdjuster = 0.05;
auto results = synthonspace.fingerprintSearch(*queryMol, *fpGen, params);
CHECK(results.getHitMolecules().size() == 482);
CHECK(results.getMaxNumResults() == 1466);
// A smaller adjuster makes the approximate check stricter, so more hits
// are missed.
params.approxSimilarityAdjuster = 0.01;
results = synthonspace.fingerprintSearch(*queryMol, *fpGen, params);
CHECK(results.getHitMolecules().size() == 124);
// With a generous adjuster the approximate check rejects nothing that
// would pass the final check; this is the actual number of hits achievable.
params.approxSimilarityAdjuster = 0.25;
results = synthonspace.fingerprintSearch(*queryMol, *fpGen, params);
CHECK(results.getHitMolecules().size() == 914);
}

View File

@@ -46,12 +46,12 @@ std::unique_ptr<SubstructLibrary> loadSubstructLibrary(
TEST_CASE("Test splits 1") {
const std::vector<std::string> smiles{"c1ccccc1CN1CCN(CC1)C(-O)c1ncc(F)cc1",
"CC(C)OCc1nnc(N2CC(C)CC2)n1C1CCCC1"};
std::vector<std::vector<size_t>> expCounts{{1, 51, 345, 20},
{1, 38, 298, 56}};
std::vector<std::vector<size_t>> expCounts{{1, 47, 345, 20},
{1, 37, 262, 41}};
for (size_t i = 0; i < smiles.size(); ++i) {
auto mol = v2::SmilesParse::MolFromSmiles(smiles[i]);
REQUIRE(mol);
auto fragments = splitMolecule(*mol, 3);
auto fragments = splitMolecule(*mol, 3, 100000);
CHECK(fragments.size() ==
std::accumulate(expCounts[i].begin(), expCounts[i].end(), size_t(0)));
// The first fragment set should just be the molecule itself.