mirror of
https://github.com/rdkit/rdkit.git
synced 2026-06-03 21:44:30 +08:00
Optimisations to fingerprint search of Synthon Space (#8152)
* First pass at approximate FP check. * Tidy and Python wrapper. * More tidying. * Add addFP and subtractFP to binary file. * Minor tidy. * In splits code, check for duplicate fragmentations. * Update test results. * Tidy. * Set configurable limit on number of fragments generated from query. * Stash prior to trying counts fps. * Stash count fps. * Back to bit fingerprints again. * Extra comment. --------- Co-authored-by: David Cosgrove <david@cozchemix.co.uk>
This commit is contained in:
committed by
greg landrum
parent
8e9e1d9574
commit
ecb0c31ba3
@@ -22,8 +22,12 @@
|
||||
// for example, it uses a different fingerprint for the initial synthon
|
||||
// screening.
|
||||
|
||||
#include <cmath>
|
||||
#include <random>
|
||||
#include <regex>
|
||||
|
||||
#include <boost/random/discrete_distribution.hpp>
|
||||
|
||||
#include <DataStructs/ExplicitBitVect.h>
|
||||
#include <GraphMol/MolPickler.h>
|
||||
#include <GraphMol/ChemTransforms/ChemTransforms.h>
|
||||
@@ -42,12 +46,31 @@ const std::vector<std::shared_ptr<ROMol>> &SynthonSet::getConnectorRegions()
|
||||
const std::unique_ptr<ExplicitBitVect> &SynthonSet::getConnRegFP() const {
|
||||
return d_connRegFP;
|
||||
}
|
||||
const std::unique_ptr<ExplicitBitVect> &SynthonSet::getAddFP() const {
|
||||
return d_addFP;
|
||||
}
|
||||
const std::unique_ptr<ExplicitBitVect> &SynthonSet::getSubtractFP() const {
|
||||
return d_subtractFP;
|
||||
}
|
||||
|
||||
const std::vector<std::vector<std::unique_ptr<ExplicitBitVect>>> &
|
||||
SynthonSet::getSynthonFPs() const {
|
||||
return d_synthonFPs;
|
||||
}
|
||||
|
||||
namespace {
|
||||
// Writes `bitset` to `os` as its size followed by one bool per bit, in
// index order.  This is the layout that readBitSet() consumes.
void writeBitSet(std::ostream &os, const boost::dynamic_bitset<> &bitset) {
  const auto numBits = bitset.size();
  streamWrite(os, numBits);
  // Index with the bitset's own size type so the counter cannot wrap before
  // reaching numBits.
  for (boost::dynamic_bitset<>::size_type i = 0; i < numBits; ++i) {
    const bool bit = bitset[i];
    streamWrite(os, bit);
  }
}
|
||||
} // namespace
|
||||
|
||||
void SynthonSet::writeToDBStream(std::ostream &os) const {
|
||||
streamWrite(os, d_id);
|
||||
streamWrite(os, getConnectorRegions().size());
|
||||
@@ -56,14 +79,13 @@ void SynthonSet::writeToDBStream(std::ostream &os) const {
|
||||
}
|
||||
auto connRegFPstr = getConnRegFP()->toString();
|
||||
streamWrite(os, connRegFPstr);
|
||||
streamWrite(os, d_connectors.size());
|
||||
for (size_t i = 0; i < d_connectors.size(); ++i) {
|
||||
if (d_connectors[i]) {
|
||||
streamWrite(os, true);
|
||||
} else {
|
||||
streamWrite(os, false);
|
||||
}
|
||||
|
||||
writeBitSet(os, d_connectors);
|
||||
streamWrite(os, d_synthConnPatts.size());
|
||||
for (const auto &scp : d_synthConnPatts) {
|
||||
writeBitSet(os, scp);
|
||||
}
|
||||
|
||||
streamWrite(os, d_synthons.size());
|
||||
for (const auto &rs : d_synthons) {
|
||||
streamWrite(os, rs.size());
|
||||
@@ -71,6 +93,15 @@ void SynthonSet::writeToDBStream(std::ostream &os) const {
|
||||
r->writeToDBStream(os);
|
||||
}
|
||||
}
|
||||
|
||||
if (d_addFP) {
|
||||
streamWrite(os, true);
|
||||
streamWrite(os, d_addFP->toString());
|
||||
streamWrite(os, d_subtractFP->toString());
|
||||
} else {
|
||||
streamWrite(os, false);
|
||||
}
|
||||
|
||||
streamWrite(os, d_synthonFPs.size());
|
||||
for (const auto &fpv : d_synthonFPs) {
|
||||
streamWrite(os, fpv.size());
|
||||
@@ -80,7 +111,20 @@ void SynthonSet::writeToDBStream(std::ostream &os) const {
|
||||
}
|
||||
}
|
||||
|
||||
void SynthonSet::readFromDBStream(std::istream &is, std::uint32_t) {
|
||||
namespace {
|
||||
// Fills `bitset` from `is`.  The expected layout is a size_t bit count
// followed by one bool per bit, in index order.  The bitset is resized to
// the stored count and every bit is overwritten.
void readBitSet(std::istream &is, boost::dynamic_bitset<> &bitset) {
  size_t numBits;
  streamRead(is, numBits);
  bitset.resize(numBits);
  for (size_t pos = 0; pos < numBits; ++pos) {
    bool bitValue;
    streamRead(is, bitValue);
    bitset[pos] = bitValue;
  }
}
|
||||
} // namespace
|
||||
|
||||
void SynthonSet::readFromDBStream(std::istream &is, std::uint32_t version) {
|
||||
streamRead(is, d_id, 0);
|
||||
size_t numConnRegs;
|
||||
streamRead(is, numConnRegs);
|
||||
@@ -92,14 +136,18 @@ void SynthonSet::readFromDBStream(std::istream &is, std::uint32_t) {
|
||||
std::string pickle;
|
||||
streamRead(is, pickle, 0);
|
||||
d_connRegFP = std::make_unique<ExplicitBitVect>(pickle);
|
||||
size_t connSize;
|
||||
streamRead(is, connSize);
|
||||
d_connectors.resize(connSize);
|
||||
bool s;
|
||||
for (size_t i = 0; i < connSize; ++i) {
|
||||
streamRead(is, s);
|
||||
d_connectors[i] = s;
|
||||
readBitSet(is, d_connectors);
|
||||
if (version >= 2010) {
|
||||
size_t numSynthConnPatts;
|
||||
streamRead(is, numSynthConnPatts);
|
||||
d_synthConnPatts.resize(numSynthConnPatts);
|
||||
for (size_t i = 0; i < numSynthConnPatts; ++i) {
|
||||
boost::dynamic_bitset<> synthConnPatt;
|
||||
readBitSet(is, synthConnPatt);
|
||||
d_synthConnPatts[i] = synthConnPatt;
|
||||
}
|
||||
}
|
||||
|
||||
size_t numRS;
|
||||
streamRead(is, numRS);
|
||||
d_synthons.clear();
|
||||
@@ -113,6 +161,19 @@ void SynthonSet::readFromDBStream(std::istream &is, std::uint32_t) {
|
||||
d_synthons[i][j]->readFromDBStream(is);
|
||||
}
|
||||
}
|
||||
|
||||
if (version >= 2010) {
|
||||
bool haveAddFP;
|
||||
streamRead(is, haveAddFP);
|
||||
if (haveAddFP) {
|
||||
std::string fString;
|
||||
streamRead(is, fString, 0);
|
||||
d_addFP = std::make_unique<ExplicitBitVect>(fString);
|
||||
streamRead(is, fString, 0);
|
||||
d_subtractFP = std::make_unique<ExplicitBitVect>(fString);
|
||||
}
|
||||
}
|
||||
|
||||
size_t numFS;
|
||||
streamRead(is, numFS);
|
||||
d_synthonFPs.clear();
|
||||
@@ -127,10 +188,10 @@ void SynthonSet::readFromDBStream(std::istream &is, std::uint32_t) {
|
||||
d_synthonFPs[i][j] = std::make_unique<ExplicitBitVect>(fString);
|
||||
}
|
||||
}
|
||||
// So that d_synthConnPatts is filled in. Next time the binary file format
|
||||
// is updated they can be put in it, but they're cheap enough to calculate
|
||||
// so leave it for now.
|
||||
assignConnectorsUsed();
|
||||
// So that d_synthConnPatts is filled in.
|
||||
if (version < 2010) {
|
||||
assignConnectorsUsed();
|
||||
}
|
||||
}
|
||||
|
||||
void SynthonSet::enumerateToStream(std::ostream &os) const {
|
||||
@@ -164,8 +225,8 @@ namespace {
|
||||
// element of the other vectors.
|
||||
|
||||
std::vector<std::unique_ptr<ROMol>> buildSampleMolecules(
|
||||
const std::vector<std::vector<ROMol *>> &synthons,
|
||||
const size_t longVecNum, const SynthonSet &reaction) {
|
||||
const std::vector<std::vector<ROMol *>> &synthons, const size_t longVecNum,
|
||||
const SynthonSet &reaction) {
|
||||
std::vector<std::unique_ptr<ROMol>> sampleMolecules;
|
||||
sampleMolecules.reserve(synthons[longVecNum].size());
|
||||
|
||||
@@ -187,13 +248,16 @@ std::vector<std::unique_ptr<ROMol>> buildSampleMolecules(
|
||||
sampleMolecules.push_back(std::move(sampleMol));
|
||||
} catch (std::exception &e) {
|
||||
const auto &synths = reaction.getSynthons();
|
||||
std::string msg("Error:: in reaction " + reaction.getId() + " :: building molecule from synthons :");
|
||||
std::string msg("Error:: in reaction " + reaction.getId() +
|
||||
" :: building molecule from synthons :");
|
||||
for (size_t j = 0; j < synthons.size(); ++j) {
|
||||
std::string sep = j ? " and " : " ";
|
||||
if (j == longVecNum) {
|
||||
msg += sep + synths[j][i]->getId() + " (" + synths[j][i]->getSmiles() + ")";
|
||||
msg += sep + synths[j][i]->getId() + " (" +
|
||||
synths[j][i]->getSmiles() + ")";
|
||||
} else {
|
||||
msg += sep + synths[j].front()->getId() + " (" + synths[j].front()->getSmiles() + ")";
|
||||
msg += sep + synths[j].front()->getId() + " (" +
|
||||
synths[j].front()->getSmiles() + ")";
|
||||
}
|
||||
}
|
||||
msg += "\n" + std::string(e.what()) + "\n";
|
||||
@@ -250,7 +314,8 @@ void SynthonSet::transferProductBondsToSynthons() {
|
||||
synthsToUse[j][0] = true;
|
||||
}
|
||||
}
|
||||
auto sampleMols = buildSampleMolecules(synthonMolCopies, synthSetNum, *this);
|
||||
auto sampleMols =
|
||||
buildSampleMolecules(synthonMolCopies, synthSetNum, *this);
|
||||
for (size_t j = 0; j < sampleMols.size(); ++j) {
|
||||
auto synthCp =
|
||||
std::make_unique<RWMol>(*d_synthons[synthSetNum][j]->getOrigMol());
|
||||
@@ -363,12 +428,17 @@ const std::vector<int> &SynthonSet::getNumConnectors() const {
|
||||
}
|
||||
|
||||
// True if any synthon fingerprints are currently stored in d_synthonFPs.
bool SynthonSet::hasFingerprints() const {
  return !d_synthonFPs.empty();
}
|
||||
bool SynthonSet::hasAddAndSubtractFPs() const {
|
||||
return static_cast<bool>(d_addFP);
|
||||
}
|
||||
|
||||
void SynthonSet::buildSynthonFingerprints(
|
||||
const FingerprintGenerator<std::uint64_t> &fpGen) {
|
||||
d_addFP.reset();
|
||||
d_subtractFP.reset();
|
||||
|
||||
// The synthons should have had transferProductBondsToSynthons
|
||||
// applied to them by now.
|
||||
|
||||
d_synthonFPs.clear();
|
||||
|
||||
d_synthonFPs.reserve(d_synthons.size());
|
||||
@@ -382,6 +452,97 @@ void SynthonSet::buildSynthonFingerprints(
|
||||
}
|
||||
}
|
||||
|
||||
void SynthonSet::buildAddAndSubtractFPs(
|
||||
const FingerprintGenerator<std::uint64_t> &fpGen) {
|
||||
d_addFP.reset();
|
||||
d_subtractFP.reset();
|
||||
std::vector<std::vector<size_t>> synthonNums(d_synthons.size());
|
||||
std::vector<size_t> numSynthons(d_synthons.size());
|
||||
std::vector<int> naddbitcounts(fpGen.getOptions()->d_fpSize, 0);
|
||||
std::vector<int> nsubbitcounts(fpGen.getOptions()->d_fpSize, 0);
|
||||
size_t totSamples = 1;
|
||||
// Sample the synthons evenly across their size ranges.
|
||||
for (size_t i = 0; i < d_synthons.size(); ++i) {
|
||||
std::vector<std::tuple<size_t, Synthon *>> sortedSynthons(
|
||||
d_synthons[i].size());
|
||||
for (size_t j = 0; j < d_synthons[i].size(); ++j) {
|
||||
sortedSynthons[j] = std::make_tuple(j, d_synthons[i][j].get());
|
||||
}
|
||||
std::sort(sortedSynthons.begin(), sortedSynthons.end(),
|
||||
[](const std::tuple<size_t, Synthon *> &a,
|
||||
const std::tuple<size_t, Synthon *> &b) -> bool {
|
||||
auto as = std::get<1>(a);
|
||||
auto bs = std::get<1>(b);
|
||||
if (as->getOrigMol()->getNumAtoms() ==
|
||||
bs->getOrigMol()->getNumAtoms()) {
|
||||
return as->getId() < bs->getId();
|
||||
}
|
||||
return as->getOrigMol()->getNumAtoms() <
|
||||
bs->getOrigMol()->getNumAtoms();
|
||||
});
|
||||
size_t stride = d_synthons[i].size() / 40;
|
||||
if (!stride) {
|
||||
stride = 1;
|
||||
}
|
||||
for (size_t j = 0; j < d_synthons[i].size(); j += stride) {
|
||||
synthonNums[i].push_back(j);
|
||||
}
|
||||
numSynthons[i] = synthonNums[i].size();
|
||||
totSamples *= numSynthons[i];
|
||||
}
|
||||
details::Stepper stepper(numSynthons);
|
||||
std::vector<size_t> theseSynthNums(synthonNums.size(), 0);
|
||||
while (stepper.d_currState[0] != numSynthons[0]) {
|
||||
for (size_t i = 0; i < stepper.d_currState.size(); ++i) {
|
||||
theseSynthNums[i] = synthonNums[i][stepper.d_currState[i]];
|
||||
}
|
||||
auto prod = buildProduct(theseSynthNums);
|
||||
std::unique_ptr<ExplicitBitVect> prodFP(fpGen.getFingerprint(*prod));
|
||||
ExplicitBitVect approxFP(*d_synthonFPs[0][theseSynthNums[0]]);
|
||||
for (size_t j = 1; j < d_synthonFPs.size(); ++j) {
|
||||
approxFP |= *d_synthonFPs[j][theseSynthNums[j]];
|
||||
}
|
||||
// addFP is what's in the productFP and not in approxFP
|
||||
// and subtractFP is vice versa. The former captures the bits of
|
||||
// the molecule formed by the joining the fragments, the latter
|
||||
// the bits connecting the dummy atoms.
|
||||
std::unique_ptr<ExplicitBitVect> addFP(
|
||||
new ExplicitBitVect(*prodFP & ~approxFP));
|
||||
IntVect v;
|
||||
addFP->getOnBits(v);
|
||||
for (auto i : v) {
|
||||
naddbitcounts[i]++;
|
||||
}
|
||||
std::unique_ptr<ExplicitBitVect> subtractFP(
|
||||
new ExplicitBitVect(approxFP & ~(*prodFP)));
|
||||
subtractFP->getOnBits(v);
|
||||
for (auto i : v) {
|
||||
nsubbitcounts[i]++;
|
||||
}
|
||||
stepper.step();
|
||||
}
|
||||
|
||||
// This is the fraction of products that must set a bit for
|
||||
// it to be included. Arrived at by empirical means.
|
||||
double frac = 0.75;
|
||||
d_addFP = std::make_unique<ExplicitBitVect>(fpGen.getOptions()->d_fpSize);
|
||||
for (size_t i = 0; i < naddbitcounts.size(); ++i) {
|
||||
if (naddbitcounts[i] > int(totSamples * frac)) {
|
||||
d_addFP->setBit(i);
|
||||
}
|
||||
}
|
||||
d_subtractFP =
|
||||
std::make_unique<ExplicitBitVect>(fpGen.getOptions()->d_fpSize);
|
||||
for (size_t i = 0; i < nsubbitcounts.size(); ++i) {
|
||||
if (nsubbitcounts[i] > int(totSamples * frac)) {
|
||||
d_subtractFP->setBit(i);
|
||||
}
|
||||
}
|
||||
|
||||
// Take the complement of the subtract FP so it can be used directly
|
||||
*d_subtractFP = ~(*d_subtractFP);
|
||||
}
|
||||
|
||||
std::string SynthonSet::buildProductName(
|
||||
const std::vector<size_t> &synthNums) const {
|
||||
std::string prodName = d_id;
|
||||
|
||||
@@ -27,6 +27,7 @@ class ROMol;
|
||||
|
||||
namespace SynthonSpaceSearch {
|
||||
class Synthon;
|
||||
struct SynthonSpaceSearchParams;
|
||||
|
||||
// This class holds all the synthons for a particular reaction.
|
||||
class RDKIT_SYNTHONSPACESEARCH_EXPORT SynthonSet {
|
||||
@@ -49,11 +50,14 @@ class RDKIT_SYNTHONSPACESEARCH_EXPORT SynthonSet {
|
||||
const std::vector<std::shared_ptr<ROMol>> &getConnectorRegions() const;
|
||||
|
||||
const std::unique_ptr<ExplicitBitVect> &getConnRegFP() const;
|
||||
const std::unique_ptr<ExplicitBitVect> &getAddFP() const;
|
||||
const std::unique_ptr<ExplicitBitVect> &getSubtractFP() const;
|
||||
const std::vector<int> &getNumConnectors() const;
|
||||
bool hasFingerprints() const;
|
||||
bool hasAddAndSubtractFPs() const;
|
||||
|
||||
const std::vector<std::vector<std::unique_ptr<ExplicitBitVect>>> &
|
||||
getSynthonFPs() const;
|
||||
|
||||
// Writes to/reads from a binary stream.
|
||||
void writeToDBStream(std::ostream &os) const;
|
||||
void readFromDBStream(std::istream &is, std::uint32_t version);
|
||||
@@ -86,6 +90,7 @@ class RDKIT_SYNTHONSPACESEARCH_EXPORT SynthonSet {
|
||||
|
||||
void buildSynthonFingerprints(
|
||||
const FingerprintGenerator<std::uint64_t> &fpGen);
|
||||
void buildAddAndSubtractFPs(const FingerprintGenerator<std::uint64_t> &fpGen);
|
||||
|
||||
// Return the molecules for synthons for which the bits are true.
|
||||
// Obviously requires that reqSynths is the same dimensions as
|
||||
@@ -118,6 +123,14 @@ class RDKIT_SYNTHONSPACESEARCH_EXPORT SynthonSet {
|
||||
// The fingerprint of the connector regions. Fingerprints for all
|
||||
// connector regions are folded into the same fingerprint.
|
||||
std::unique_ptr<ExplicitBitVect> d_connRegFP;
|
||||
|
||||
// When doing an approximate FP similarity by ORing together
|
||||
// the synthonFPs, adding d_addFP and subtracting d_subtractFP
|
||||
// accounts (a bit) for the joins and the dummy atoms
|
||||
// respectively.
|
||||
std::unique_ptr<ExplicitBitVect> d_addFP;
|
||||
std::unique_ptr<ExplicitBitVect> d_subtractFP;
|
||||
|
||||
// The number of connectors in the synthons in each synthon set.
|
||||
std::vector<int> d_numConnectors;
|
||||
|
||||
|
||||
@@ -32,7 +32,7 @@ namespace RDKit::SynthonSpaceSearch {
|
||||
|
||||
// used for serialization
|
||||
constexpr int32_t versionMajor = 2;
|
||||
constexpr int32_t versionMinor = 0;
|
||||
constexpr int32_t versionMinor = 1;
|
||||
constexpr int32_t endianId = 0xa100f;
|
||||
|
||||
std::int64_t SynthonSpace::getNumProducts() const {
|
||||
@@ -185,8 +185,8 @@ void SynthonSpace::writeDBFile(const std::string &outFilename) const {
|
||||
streamWrite(os, d_fpType);
|
||||
}
|
||||
streamWrite(os, d_reactions.size());
|
||||
for (const auto &[fst, snd] : d_reactions) {
|
||||
snd->writeToDBStream(os);
|
||||
for (const auto &[reactionId, reaction] : d_reactions) {
|
||||
reaction->writeToDBStream(os);
|
||||
}
|
||||
os.close();
|
||||
}
|
||||
@@ -277,10 +277,10 @@ bool SynthonSpace::hasFingerprints() const {
|
||||
|
||||
void SynthonSpace::buildSynthonFingerprints(
|
||||
const FingerprintGenerator<std::uint64_t> &fpGen) {
|
||||
BOOST_LOG(rdWarningLog) << "Building the fingerprints may take some time."
|
||||
<< std::endl;
|
||||
if (const auto fpType = fpGen.infoString();
|
||||
fpType != d_fpType || !hasFingerprints()) {
|
||||
BOOST_LOG(rdWarningLog)
|
||||
<< "Building the fingerprints may take some time." << std::endl;
|
||||
d_fpType = fpType;
|
||||
for (const auto &[id, synthSet] : d_reactions) {
|
||||
synthSet->buildSynthonFingerprints(fpGen);
|
||||
@@ -288,4 +288,18 @@ void SynthonSpace::buildSynthonFingerprints(
|
||||
}
|
||||
}
|
||||
|
||||
bool SynthonSpace::hasAddAndSubstractFingerprints() const {
|
||||
if (d_reactions.empty()) {
|
||||
return false;
|
||||
}
|
||||
return d_reactions.begin()->second->hasAddAndSubtractFPs();
|
||||
}
|
||||
|
||||
void SynthonSpace::buildAddAndSubstractFingerprints(
|
||||
const FingerprintGenerator<std::uint64_t> &fpGen) {
|
||||
for (const auto &[id, synthSet] : d_reactions) {
|
||||
synthSet->buildAddAndSubtractFPs(fpGen);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace RDKit::SynthonSpaceSearch
|
||||
|
||||
@@ -50,8 +50,15 @@ struct RDKIT_SYNTHONSPACESEARCH_EXPORT SynthonSpaceSearchParams {
|
||||
// than that will not matter as it will
|
||||
// be reduced to 4. Likewise, values
|
||||
// lower than 1 will be increased to 1.
|
||||
std::int64_t maxHits{1000}; // The maximum number of hits to return. Use -1
|
||||
// for no maximum.
|
||||
std::uint64_t maxNumFrags{
|
||||
100000}; // The maximum number of fragments the query can
|
||||
// be broken into. Big molecules will create huge
|
||||
// numbers of fragments that may cause excessive
|
||||
// memory use. If the number of fragments hits this number,
|
||||
// fragmentation stops and the search results will likely be
|
||||
// incomplete.
|
||||
std::int64_t maxHits{1000}; // The maximum number of hits to return. Use
|
||||
// -1 for no maximum.
|
||||
std::int64_t hitStart{0}; // Sequence number of hit to start from. So that
|
||||
// you can return the next N hits of a search
|
||||
// having already obtained N-1.
|
||||
@@ -78,6 +85,16 @@ struct RDKIT_SYNTHONSPACESEARCH_EXPORT SynthonSpaceSearchParams {
|
||||
// times, a lower number will give faster searches at the
|
||||
// risk of missing some hits. The value you use should have
|
||||
// a positive correlation with your FOMO.
|
||||
double approxSimilarityAdjuster{
|
||||
0.1}; // The fingerprint search uses an approximate similarity method
|
||||
// before building a product and doing a final check. The
|
||||
// similarityCutoff is reduced by this value for the approximate
|
||||
// check. A lower value will give faster run times at the
|
||||
// risk of missing some hits. The value you use should have a
|
||||
// positive correlation with your FOMO. The default is
|
||||
// appropriate for Morgan fingerprints. With RDKit fingerprints,
|
||||
// 0.05 is adequate, and higher than that has been seen to
|
||||
// produce long run times.
|
||||
std::uint64_t timeOut{600}; // Maximum number of seconds to spend on a single
|
||||
// search. 0 means no maximum.
|
||||
};
|
||||
@@ -206,10 +223,16 @@ class RDKIT_SYNTHONSPACESEARCH_EXPORT SynthonSpace {
|
||||
|
||||
bool hasFingerprints() const;
|
||||
// Create the fingerprints for the synthons ready for fingerprint searches.
|
||||
// Valid values of fpType as described by SynthonSpaceSearchParams.
|
||||
// Will be done by the fingerprint search if not done ahead of time.
|
||||
void buildSynthonFingerprints(
|
||||
const FingerprintGenerator<std::uint64_t> &fpGen);
|
||||
|
||||
bool hasAddAndSubstractFingerprints() const;
|
||||
// Create the add and subtract fingerprints for the SynthonSets.
|
||||
// Will be done by the fingerprint search if not done ahead of time.
|
||||
void buildAddAndSubstractFingerprints(
|
||||
const FingerprintGenerator<std::uint64_t> &fpGen);
|
||||
|
||||
private:
|
||||
std::string d_fileName;
|
||||
std::map<std::string, std::unique_ptr<SynthonSet>> d_reactions;
|
||||
|
||||
@@ -24,6 +24,9 @@ SynthonSpaceFingerprintSearcher::SynthonSpaceFingerprintSearcher(
|
||||
getSpace().getSynthonFingerprintType() != fpGen.infoString()) {
|
||||
getSpace().buildSynthonFingerprints(fpGen);
|
||||
}
|
||||
if (!getSpace().hasAddAndSubstractFingerprints()) {
|
||||
getSpace().buildAddAndSubstractFingerprints(fpGen);
|
||||
}
|
||||
d_queryFP = std::unique_ptr<ExplicitBitVect>(d_fpGen.getFingerprint(query));
|
||||
}
|
||||
|
||||
@@ -148,6 +151,27 @@ std::vector<SynthonSpaceHitSet> SynthonSpaceFingerprintSearcher::searchFragSet(
|
||||
return results;
|
||||
}
|
||||
|
||||
bool SynthonSpaceFingerprintSearcher::quickVerify(
|
||||
const std::unique_ptr<SynthonSet> &reaction,
|
||||
const std::vector<size_t> &synthNums) const {
|
||||
// Make an approximate fingerprint by combining the FPs for
|
||||
// these synthons, adding in the addFP and taking out the
|
||||
// subtractFP.
|
||||
const auto &synthFPs = reaction->getSynthonFPs();
|
||||
ExplicitBitVect fullFP(*synthFPs[0][synthNums[0]]);
|
||||
for (unsigned int i = 1; i < synthNums.size(); ++i) {
|
||||
fullFP |= *synthFPs[i][synthNums[i]];
|
||||
}
|
||||
fullFP |= *(reaction->getAddFP());
|
||||
// The subtract FP has already had its bits flipped, so just do a
|
||||
// straight AND.
|
||||
fullFP &= *(reaction->getSubtractFP());
|
||||
|
||||
double approxSim = TanimotoSimilarity(fullFP, *d_queryFP);
|
||||
return approxSim >=
|
||||
getParams().similarityCutoff - getParams().approxSimilarityAdjuster;
|
||||
}
|
||||
|
||||
bool SynthonSpaceFingerprintSearcher::verifyHit(const ROMol &hit) const {
|
||||
const std::unique_ptr<ExplicitBitVect> fp(d_fpGen.getFingerprint(hit));
|
||||
if (const auto sim = TanimotoSimilarity(*fp, *d_queryFP);
|
||||
|
||||
@@ -32,6 +32,8 @@ class SynthonSpaceFingerprintSearcher : public SynthonSpaceSearcher {
|
||||
|
||||
std::vector<SynthonSpaceHitSet> searchFragSet(
|
||||
std::vector<std::unique_ptr<ROMol>> &fragSet) const override;
|
||||
bool quickVerify(const std::unique_ptr<SynthonSet> &reaction,
|
||||
const std::vector<size_t> &synthNums) const override;
|
||||
bool verifyHit(const ROMol &hit) const override;
|
||||
};
|
||||
} // namespace RDKit::SynthonSpaceSearch
|
||||
|
||||
@@ -115,7 +115,7 @@ std::vector<const Bond *> getContiguousAromaticBonds(const ROMol &mol,
|
||||
}
|
||||
|
||||
std::vector<std::vector<std::unique_ptr<ROMol>>> splitMolecule(
|
||||
const ROMol &query, unsigned int maxBondSplits) {
|
||||
const ROMol &query, unsigned int maxBondSplits, std::uint64_t maxNumFrags) {
|
||||
if (maxBondSplits < 1) {
|
||||
maxBondSplits = 1;
|
||||
}
|
||||
@@ -139,7 +139,10 @@ std::vector<std::vector<std::unique_ptr<ROMol>>> splitMolecule(
|
||||
fragments.emplace_back();
|
||||
fragments.back().emplace_back(new ROMol(query));
|
||||
|
||||
// Now do the splits.
|
||||
// Now do the splits. Symmetrical molecules can give rise to the same
|
||||
// fragment set in different ways so keep track of what we've had to
|
||||
// avoid duplicates.
|
||||
std::set<std::string> fragSmis;
|
||||
for (unsigned int i = 1; i <= maxBondSplits; ++i) {
|
||||
auto combs = combMFromN(i, static_cast<int>(query.getNumBonds()));
|
||||
std::vector<std::pair<unsigned int, unsigned int>> dummyLabels;
|
||||
@@ -174,9 +177,21 @@ std::vector<std::vector<std::unique_ptr<ROMol>>> splitMolecule(
|
||||
continue;
|
||||
}
|
||||
if (checkConnectorsInDifferentFrags(molFrags, i)) {
|
||||
std::string fragSmi(MolToSmiles(*fragMol));
|
||||
if (!fragSmis.insert(fragSmi).second) {
|
||||
continue;
|
||||
}
|
||||
fragments.emplace_back(std::move(molFrags));
|
||||
if (fragments.size() > maxNumFrags) {
|
||||
BOOST_LOG(rdWarningLog)
|
||||
<< "Maximum number of fragments reached." << std::endl;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (fragments.size() > maxNumFrags) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
return fragments;
|
||||
}
|
||||
|
||||
@@ -37,7 +37,8 @@ RDKIT_SYNTHONSPACESEARCH_EXPORT void fixAromaticRingSplits(
|
||||
// be altered. Also, you can't split a molecule on 3 bonds if it only contains
|
||||
// 2.
|
||||
RDKIT_SYNTHONSPACESEARCH_EXPORT std::vector<std::vector<std::unique_ptr<ROMol>>>
|
||||
splitMolecule(const ROMol &query, unsigned int maxBondSplits);
|
||||
splitMolecule(const ROMol &query, unsigned int maxBondSplits,
|
||||
std::uint64_t maxNumFrags);
|
||||
// Counts the number of [1*], [2*]...[4*] in the string.
|
||||
RDKIT_SYNTHONSPACESEARCH_EXPORT int countConnections(const ROMol &frag);
|
||||
|
||||
|
||||
@@ -51,7 +51,7 @@ SearchResults SynthonSpaceSearcher::search() {
|
||||
}
|
||||
std::vector<std::unique_ptr<ROMol>> results;
|
||||
|
||||
auto fragments = details::splitMolecule(d_query, d_params.maxBondSplits);
|
||||
auto fragments = details::splitMolecule(d_query, d_params.maxBondSplits, d_params.maxNumFrags);
|
||||
std::vector<SynthonSpaceHitSet> allHits;
|
||||
size_t totHits = 0;
|
||||
TimePoint *endTime = nullptr;
|
||||
@@ -94,6 +94,9 @@ std::unique_ptr<ROMol> SynthonSpaceSearcher::buildAndVerifyHit(
|
||||
if (resultsNames.size() < static_cast<size_t>(d_params.hitStart)) {
|
||||
return prod;
|
||||
}
|
||||
if (!quickVerify(reaction, synthNums)) {
|
||||
return prod;
|
||||
}
|
||||
prod = reaction->buildProduct(synthNums);
|
||||
|
||||
// Do a final check of the whole thing. It can happen that the
|
||||
|
||||
@@ -71,6 +71,15 @@ class SynthonSpaceSearcher {
|
||||
const std::unique_ptr<SynthonSet> &reaction,
|
||||
const std::vector<size_t> &synthNums,
|
||||
std::set<std::string> &resultsNames) const;
|
||||
// Some of the search methods (Rascal, for example) can do a quick
|
||||
// check on whether this set of synthons can match the query without having to
|
||||
// build the full molecule from the synthons. They will override this
|
||||
// function which by default passes everything.
|
||||
virtual bool quickVerify(
|
||||
[[maybe_unused]] const std::unique_ptr<SynthonSet> &reaction,
|
||||
[[maybe_unused]] const std::vector<size_t> &synthNums) const {
|
||||
return true;
|
||||
}
|
||||
virtual bool verifyHit(const ROMol &mol) const = 0;
|
||||
|
||||
// Build the molecules from the synthons identified in reagentsToUse.
|
||||
|
||||
@@ -105,6 +105,14 @@ BOOST_PYTHON_MODULE(rdSynthonSpaceSearch) {
|
||||
&SynthonSpaceSearch::SynthonSpaceSearchParams::maxHits,
|
||||
"The maximum number of hits to return. Default=1000."
|
||||
"Use -1 for no maximum.")
|
||||
.def_readwrite(
|
||||
"maxNumFrags",
|
||||
&SynthonSpaceSearch::SynthonSpaceSearchParams::maxNumFrags,
|
||||
"The maximum number of fragments the query can be broken into."
|
||||
" Big molecules will create huge numbers of fragments that may cause"
|
||||
" excessive memory use. If the number of fragments hits this number,"
|
||||
" fragmentation stops and the search results will likely be incomplete."
|
||||
" Default=100000.")
|
||||
.def_readwrite(
|
||||
"hitStart", &SynthonSpaceSearch::SynthonSpaceSearchParams::hitStart,
|
||||
"The sequence number of the hit to start from. So that you"
|
||||
@@ -144,6 +152,19 @@ BOOST_PYTHON_MODULE(rdSynthonSpaceSearch) {
|
||||
"Similarities of fragments are generally low due to low bit"
|
||||
" densities. For the fragment matching, reduce the similarity cutoff"
|
||||
" off by this amount. Default=0.1.")
|
||||
.def_readwrite(
|
||||
"approxSimilarityAdjuster",
|
||||
&SynthonSpaceSearch::SynthonSpaceSearchParams::
|
||||
approxSimilarityAdjuster,
|
||||
"The fingerprint search uses an approximate similarity method"
|
||||
" before building a product and doing a final check. The"
|
||||
" similarityCutoff is reduced by this value for the approximate"
|
||||
" check. A lower value will give faster run times at the"
|
||||
" risk of missing some hits. The value you use should have a"
|
||||
" positive correlation with your FOMO. The default of 0.1 is"
|
||||
" appropriate for Morgan fingerprints. With RDKit fingerprints,"
|
||||
" 0.05 is adequate, and higher than that has been seen to"
|
||||
" produce long run times.")
|
||||
.def_readwrite(
|
||||
"timeOut", &SynthonSpaceSearch::SynthonSpaceSearchParams::timeOut,
|
||||
"Time limit for search, in seconds. Default is 600s, 0 means no"
|
||||
|
||||
@@ -8,11 +8,13 @@
|
||||
// of the RDKit source tree.
|
||||
|
||||
#include <algorithm>
|
||||
#include <chrono>
|
||||
#include <fstream>
|
||||
|
||||
#include <GraphMol/SubstructLibrary/SubstructLibrary.h>
|
||||
#include <GraphMol/FileParsers/MolSupplier.h>
|
||||
#include <GraphMol/Fingerprints/MorganGenerator.h>
|
||||
#include <GraphMol/Fingerprints/RDKitFPGenerator.h>
|
||||
#include <GraphMol/SynthonSpaceSearch/SynthonSpace.h>
|
||||
#include <GraphMol/SynthonSpaceSearch/SearchResults.h>
|
||||
#include <GraphMol/SynthonSpaceSearch/SynthonSpaceSearch_details.h>
|
||||
@@ -93,6 +95,8 @@ TEST_CASE("FP Small tests") {
|
||||
synthonspace.readTextFile(libNames[i]);
|
||||
SynthonSpaceSearchParams params;
|
||||
params.maxBondSplits = 3;
|
||||
params.randomSeed = 1;
|
||||
params.approxSimilarityAdjuster = 0.2;
|
||||
auto queryMol = v2::SmilesParse::MolFromSmiles(querySmis[i]);
|
||||
std::unique_ptr<FingerprintGenerator<std::uint64_t>> fpGen(
|
||||
MorganFingerprint::getMorganGenerator<std::uint64_t>(2));
|
||||
@@ -147,8 +151,12 @@ TEST_CASE("FP Biggy") {
|
||||
const std::vector<size_t> numRes{46, 2, 0, 123, 0, 0};
|
||||
const std::vector<size_t> maxRes{2408, 197, 0, 833, 0, 4};
|
||||
SynthonSpaceSearchParams params;
|
||||
params.approxSimilarityAdjuster = 0.2;
|
||||
params.maxHits = -1;
|
||||
for (size_t i = 0; i < smis.size(); ++i) {
|
||||
if (i != 4) {
|
||||
continue;
|
||||
}
|
||||
auto queryMol = v2::SmilesParse::MolFromSmiles(smis[i]);
|
||||
auto results = synthonspace.fingerprintSearch(*queryMol, *fpGen, params);
|
||||
CHECK(results.getHitMolecules().size() == numRes[i]);
|
||||
@@ -243,3 +251,40 @@ TEST_CASE("Timeout") {
|
||||
auto results1 = synthonspace.fingerprintSearch(*queryMol, *fpGen, params);
|
||||
CHECK(!results1.getTimedOut());
|
||||
}
|
||||
|
||||
// Checks how the number of hits from a fingerprint search varies with
// approxSimilarityAdjuster, using RDKit fingerprints.
TEST_CASE("FP Approx Similarity") {
  REQUIRE(rdbase);
  const std::string libName =
      std::string(rdbase) +
      "/Code/GraphMol/SynthonSpaceSearch/data/Syntons_5567.csv";
  SynthonSpace synthonspace;
  synthonspace.readTextFile(libName);

  // Use a fixed seed so that the hit counts are repeatable.
  SynthonSpaceSearchParams params;
  params.randomSeed = 1;
  params.similarityCutoff = 0.5;
  params.timeOut = 0;
  params.maxHits = 1000;

  std::unique_ptr<FingerprintGenerator<std::uint64_t>> fpGen(
      RDKitFP::getRDKitFPGenerator<std::uint64_t>(3));
  auto queryMol = "c12ccc(C)cc1[nH]nc2C(=O)NCc1cncs1"_smiles;

  // Runs the same search with a different approximate-similarity adjuster.
  auto searchWithAdjuster = [&](const double adjuster) {
    params.approxSimilarityAdjuster = adjuster;
    return synthonspace.fingerprintSearch(*queryMol, *fpGen, params);
  };

  // With RDKit fingerprints, 0.05 gives a reasonable compromise
  // between speed and hits missed.
  auto results = searchWithAdjuster(0.05);
  CHECK(results.getHitMolecules().size() == 482);
  CHECK(results.getMaxNumResults() == 1466);

  // A tighter adjuster misses more hits.
  results = searchWithAdjuster(0.01);
  CHECK(results.getHitMolecules().size() == 124);

  // This is the actual number of hits achievable.
  results = searchWithAdjuster(0.25);
  CHECK(results.getHitMolecules().size() == 914);
}
|
||||
|
||||
@@ -46,12 +46,12 @@ std::unique_ptr<SubstructLibrary> loadSubstructLibrary(
|
||||
TEST_CASE("Test splits 1") {
|
||||
const std::vector<std::string> smiles{"c1ccccc1CN1CCN(CC1)C(-O)c1ncc(F)cc1",
|
||||
"CC(C)OCc1nnc(N2CC(C)CC2)n1C1CCCC1"};
|
||||
std::vector<std::vector<size_t>> expCounts{{1, 51, 345, 20},
|
||||
{1, 38, 298, 56}};
|
||||
std::vector<std::vector<size_t>> expCounts{{1, 47, 345, 20},
|
||||
{1, 37, 262, 41}};
|
||||
for (size_t i = 0; i < smiles.size(); ++i) {
|
||||
auto mol = v2::SmilesParse::MolFromSmiles(smiles[i]);
|
||||
REQUIRE(mol);
|
||||
auto fragments = splitMolecule(*mol, 3);
|
||||
auto fragments = splitMolecule(*mol, 3, 100000);
|
||||
CHECK(fragments.size() ==
|
||||
std::accumulate(expCounts[i].begin(), expCounts[i].end(), size_t(0)));
|
||||
// The first fragment set should just be the molecule itself.
|
||||
|
||||
Reference in New Issue
Block a user