Optimisations to fingerprint search of Synthon Space (#8152)

* First pass at approximate FP check. * Tidy and Python wrapper. * More tidying. * Add addFP and subtractFP to binary file. * Minor tidy. * In splits code, check for duplicate fragmentations. * Update test results. * Tidy. * Set configurable limit on number of fragments generated from query. * Stash prior to trying counts fps. * Stash count fps. * Back to bit fingerprints again. * Extra comment. --------- Co-authored-by: David Cosgrove <david@cozchemix.co.uk>
2026-06-03 21:44:30 +08:00 · 2025-01-22 04:44:56 +00:00
parent 8e9e1d9574
commit ecb0c31ba3
13 changed files with 373 additions and 42 deletions
--- a/Code/GraphMol/SynthonSpaceSearch/SynthonSet.cpp
+++ b/Code/GraphMol/SynthonSpaceSearch/SynthonSet.cpp
@@ -22,8 +22,12 @@
 // for example, it uses a different fingerprint for the initial synthon
 // screening.

+#include <cmath>
+#include <random>
 #include <regex>

+#include <boost/random/discrete_distribution.hpp>
+
 #include <DataStructs/ExplicitBitVect.h>
 #include <GraphMol/MolPickler.h>
 #include <GraphMol/ChemTransforms/ChemTransforms.h>
@@ -42,12 +46,31 @@ const std::vector<std::shared_ptr<ROMol>> &SynthonSet::getConnectorRegions()
 const std::unique_ptr<ExplicitBitVect> &SynthonSet::getConnRegFP() const {
  return d_connRegFP;
 }
+// Returns the "add" fingerprint used to correct the approximate (ORed
+// synthon) fingerprint.  The pointer is null until buildAddAndSubtractFPs()
+// has been run or the fingerprint has been read from a DB stream.
+const std::unique_ptr<ExplicitBitVect> &SynthonSet::getAddFP() const {
+  return d_addFP;
+}
+// Returns the "subtract" fingerprint.  When produced by
+// buildAddAndSubtractFPs() it is stored already complemented, so callers
+// apply it with a plain AND.  The pointer is null until that function has
+// been run or the fingerprint has been read from a DB stream.
+const std::unique_ptr<ExplicitBitVect> &SynthonSet::getSubtractFP() const {
+  return d_subtractFP;
+}

 const std::vector<std::vector<std::unique_ptr<ExplicitBitVect>>> &
 SynthonSet::getSynthonFPs() const {
  return d_synthonFPs;
 }

+namespace {
+// Writes bitset to os as its size followed by one bool per bit, in index
+// order.  This is the layout that readBitSet() consumes.  The index is a
+// size_t so that it matches the type of the size that is written and read
+// back.
+void writeBitSet(std::ostream &os, const boost::dynamic_bitset<> &bitset) {
+  streamWrite(os, bitset.size());
+  for (size_t i = 0; i < bitset.size(); ++i) {
+    const bool isSet = bitset[i];
+    streamWrite(os, isSet);
+  }
+}
+}  // namespace
+
 void SynthonSet::writeToDBStream(std::ostream &os) const {
  streamWrite(os, d_id);
  streamWrite(os, getConnectorRegions().size());
@@ -56,14 +79,13 @@ void SynthonSet::writeToDBStream(std::ostream &os) const {
  }
  auto connRegFPstr = getConnRegFP()->toString();
  streamWrite(os, connRegFPstr);
-  streamWrite(os, d_connectors.size());
-  for (size_t i = 0; i < d_connectors.size(); ++i) {
-    if (d_connectors[i]) {
-      streamWrite(os, true);
-    } else {
-      streamWrite(os, false);
-    }
+
+  writeBitSet(os, d_connectors);
+  streamWrite(os, d_synthConnPatts.size());
+  for (const auto &scp : d_synthConnPatts) {
+    writeBitSet(os, scp);
  }
+
  streamWrite(os, d_synthons.size());
  for (const auto &rs : d_synthons) {
    streamWrite(os, rs.size());
@@ -71,6 +93,15 @@ void SynthonSet::writeToDBStream(std::ostream &os) const {
      r->writeToDBStream(os);
    }
  }
+
+  if (d_addFP) {
+    streamWrite(os, true);
+    streamWrite(os, d_addFP->toString());
+    streamWrite(os, d_subtractFP->toString());
+  } else {
+    streamWrite(os, false);
+  }
+
  streamWrite(os, d_synthonFPs.size());
  for (const auto &fpv : d_synthonFPs) {
    streamWrite(os, fpv.size());
@@ -80,7 +111,20 @@ void SynthonSet::writeToDBStream(std::ostream &os) const {
  }
 }

-void SynthonSet::readFromDBStream(std::istream &is, std::uint32_t) {
+namespace {
+// Fills bitset from is: first the number of bits, then one bool per bit in
+// index order.  The bitset is resized to the stored size and every position
+// is overwritten.
+void readBitSet(std::istream &is, boost::dynamic_bitset<> &bitset) {
+  size_t numBits;
+  streamRead(is, numBits);
+  bitset.resize(numBits);
+  for (size_t pos = 0; pos < numBits; ++pos) {
+    bool flag;
+    streamRead(is, flag);
+    bitset[pos] = flag;
+  }
+}
+}  // namespace
+
+void SynthonSet::readFromDBStream(std::istream &is, std::uint32_t version) {
  streamRead(is, d_id, 0);
  size_t numConnRegs;
  streamRead(is, numConnRegs);
@@ -92,14 +136,18 @@ void SynthonSet::readFromDBStream(std::istream &is, std::uint32_t) {
  std::string pickle;
  streamRead(is, pickle, 0);
  d_connRegFP = std::make_unique<ExplicitBitVect>(pickle);
-  size_t connSize;
-  streamRead(is, connSize);
-  d_connectors.resize(connSize);
-  bool s;
-  for (size_t i = 0; i < connSize; ++i) {
-    streamRead(is, s);
-    d_connectors[i] = s;
+  readBitSet(is, d_connectors);
+  if (version >= 2010) {
+    size_t numSynthConnPatts;
+    streamRead(is, numSynthConnPatts);
+    d_synthConnPatts.resize(numSynthConnPatts);
+    for (size_t i = 0; i < numSynthConnPatts; ++i) {
+      boost::dynamic_bitset<> synthConnPatt;
+      readBitSet(is, synthConnPatt);
+      d_synthConnPatts[i] = synthConnPatt;
+    }
  }
+
  size_t numRS;
  streamRead(is, numRS);
  d_synthons.clear();
@@ -113,6 +161,19 @@ void SynthonSet::readFromDBStream(std::istream &is, std::uint32_t) {
      d_synthons[i][j]->readFromDBStream(is);
    }
  }
+
+  if (version >= 2010) {
+    bool haveAddFP;
+    streamRead(is, haveAddFP);
+    if (haveAddFP) {
+      std::string fString;
+      streamRead(is, fString, 0);
+      d_addFP = std::make_unique<ExplicitBitVect>(fString);
+      streamRead(is, fString, 0);
+      d_subtractFP = std::make_unique<ExplicitBitVect>(fString);
+    }
+  }
+
  size_t numFS;
  streamRead(is, numFS);
  d_synthonFPs.clear();
@@ -127,10 +188,10 @@ void SynthonSet::readFromDBStream(std::istream &is, std::uint32_t) {
      d_synthonFPs[i][j] = std::make_unique<ExplicitBitVect>(fString);
    }
  }
-  // So that d_synthConnPatts is filled in. Next time the binary file format
-  // is updated they can be put in it, but they're cheap enough to calculate
-  // so leave it for now.
-  assignConnectorsUsed();
+  // So that d_synthConnPatts is filled in.
+  if (version < 2010) {
+    assignConnectorsUsed();
+  }
 }

 void SynthonSet::enumerateToStream(std::ostream &os) const {
@@ -164,8 +225,8 @@ namespace {
 // element of the other vectors.

 std::vector<std::unique_ptr<ROMol>> buildSampleMolecules(
-    const std::vector<std::vector<ROMol *>> &synthons,
-    const size_t longVecNum, const SynthonSet &reaction) {
+    const std::vector<std::vector<ROMol *>> &synthons, const size_t longVecNum,
+    const SynthonSet &reaction) {
  std::vector<std::unique_ptr<ROMol>> sampleMolecules;
  sampleMolecules.reserve(synthons[longVecNum].size());

@@ -187,13 +248,16 @@ std::vector<std::unique_ptr<ROMol>> buildSampleMolecules(
      sampleMolecules.push_back(std::move(sampleMol));
    } catch (std::exception &e) {
      const auto &synths = reaction.getSynthons();
-      std::string msg("Error:: in reaction " + reaction.getId() + " :: building molecule from synthons :");
+      std::string msg("Error:: in reaction " + reaction.getId() +
+                      " :: building molecule from synthons :");
      for (size_t j = 0; j < synthons.size(); ++j) {
        std::string sep = j ? " and " : " ";
        if (j == longVecNum) {
-          msg += sep + synths[j][i]->getId() + " (" + synths[j][i]->getSmiles() + ")";
+          msg += sep + synths[j][i]->getId() + " (" +
+                 synths[j][i]->getSmiles() + ")";
        } else {
-          msg +=  sep + synths[j].front()->getId() + " (" + synths[j].front()->getSmiles() + ")";
+          msg += sep + synths[j].front()->getId() + " (" +
+                 synths[j].front()->getSmiles() + ")";
        }
      }
      msg += "\n" + std::string(e.what()) + "\n";
@@ -250,7 +314,8 @@ void SynthonSet::transferProductBondsToSynthons() {
        synthsToUse[j][0] = true;
      }
    }
-    auto sampleMols = buildSampleMolecules(synthonMolCopies, synthSetNum, *this);
+    auto sampleMols =
+        buildSampleMolecules(synthonMolCopies, synthSetNum, *this);
    for (size_t j = 0; j < sampleMols.size(); ++j) {
      auto synthCp =
          std::make_unique<RWMol>(*d_synthons[synthSetNum][j]->getOrigMol());
@@ -363,12 +428,17 @@ const std::vector<int> &SynthonSet::getNumConnectors() const {
 }

 bool SynthonSet::hasFingerprints() const { return !d_synthonFPs.empty(); }
+// True when the add fingerprint is present.  Only d_addFP is inspected; the
+// subtract fingerprint is created and reset alongside it.
+bool SynthonSet::hasAddAndSubtractFPs() const { return d_addFP != nullptr; }

 void SynthonSet::buildSynthonFingerprints(
    const FingerprintGenerator<std::uint64_t> &fpGen) {
+  d_addFP.reset();
+  d_subtractFP.reset();
+
  // The synthons should have had transferProductBondsToSynthons
  // applied to them by now.
-
  d_synthonFPs.clear();

  d_synthonFPs.reserve(d_synthons.size());
@@ -382,6 +452,97 @@ void SynthonSet::buildSynthonFingerprints(
  }
 }

+// Builds d_addFP and d_subtractFP from a sample of this reaction's products.
+//
+// For each sampled product, the real product fingerprint is compared with
+// the approximate one made by ORing the fingerprints of the contributing
+// synthons.  Bits that differ in the same direction in more than 75% of the
+// samples are recorded.  d_subtractFP is stored complemented so that it can
+// be applied with a plain AND.
+//
+// Reads d_synthonFPs, so buildSynthonFingerprints() must have been run with
+// the same generator first.
+void SynthonSet::buildAddAndSubtractFPs(
+    const FingerprintGenerator<std::uint64_t> &fpGen) {
+  d_addFP.reset();
+  d_subtractFP.reset();
+  const auto fpSize = fpGen.getOptions()->d_fpSize;
+  std::vector<std::vector<size_t>> synthonNums(d_synthons.size());
+  std::vector<size_t> numSynthons(d_synthons.size());
+  std::vector<int> naddbitcounts(fpSize, 0);
+  std::vector<int> nsubbitcounts(fpSize, 0);
+  size_t totSamples = 1;
+  // Sample each synthon set at a regular stride (set size / 40, minimum 1)
+  // through the synthons in their stored order.
+  // NOTE(review): an earlier version built a copy of each set sorted by atom
+  // count and id but never read it, so sampling has always been in stored
+  // order.  If sampling across the size range is wanted, the sorted indices
+  // need to be used here and the test expectations regenerated.
+  for (size_t i = 0; i < d_synthons.size(); ++i) {
+    size_t stride = d_synthons[i].size() / 40;
+    if (!stride) {
+      stride = 1;
+    }
+    for (size_t j = 0; j < d_synthons[i].size(); j += stride) {
+      synthonNums[i].push_back(j);
+    }
+    numSynthons[i] = synthonNums[i].size();
+    totSamples *= numSynthons[i];
+  }
+  // Step through every combination of the sampled synthons.
+  details::Stepper stepper(numSynthons);
+  std::vector<size_t> theseSynthNums(synthonNums.size(), 0);
+  while (stepper.d_currState[0] != numSynthons[0]) {
+    for (size_t i = 0; i < stepper.d_currState.size(); ++i) {
+      theseSynthNums[i] = synthonNums[i][stepper.d_currState[i]];
+    }
+    auto prod = buildProduct(theseSynthNums);
+    std::unique_ptr<ExplicitBitVect> prodFP(fpGen.getFingerprint(*prod));
+    ExplicitBitVect approxFP(*d_synthonFPs[0][theseSynthNums[0]]);
+    for (size_t j = 1; j < d_synthonFPs.size(); ++j) {
+      approxFP |= *d_synthonFPs[j][theseSynthNums[j]];
+    }
+    // addFP is what's in the productFP and not in approxFP
+    // and subtractFP is vice versa.  The former captures the bits of
+    // the molecule formed by joining the fragments, the latter
+    // the bits connecting the dummy atoms.
+    ExplicitBitVect addFP(*prodFP & ~approxFP);
+    IntVect v;
+    addFP.getOnBits(v);
+    for (auto bit : v) {
+      ++naddbitcounts[bit];
+    }
+    ExplicitBitVect subtractFP(approxFP & ~(*prodFP));
+    subtractFP.getOnBits(v);
+    for (auto bit : v) {
+      ++nsubbitcounts[bit];
+    }
+    stepper.step();
+  }
+
+  // This is the fraction of products that must set a bit for
+  // it to be included.  Arrived at by empirical means.
+  const double frac = 0.75;
+  const int minCount = static_cast<int>(totSamples * frac);
+  d_addFP = std::make_unique<ExplicitBitVect>(fpSize);
+  for (size_t i = 0; i < naddbitcounts.size(); ++i) {
+    if (naddbitcounts[i] > minCount) {
+      d_addFP->setBit(i);
+    }
+  }
+  d_subtractFP = std::make_unique<ExplicitBitVect>(fpSize);
+  for (size_t i = 0; i < nsubbitcounts.size(); ++i) {
+    if (nsubbitcounts[i] > minCount) {
+      d_subtractFP->setBit(i);
+    }
+  }
+
+  // Take the complement of the subtract FP so it can be used directly
+  *d_subtractFP = ~(*d_subtractFP);
+}
+
 std::string SynthonSet::buildProductName(
    const std::vector<size_t> &synthNums) const {
  std::string prodName = d_id;
--- a/Code/GraphMol/SynthonSpaceSearch/SynthonSet.h
+++ b/Code/GraphMol/SynthonSpaceSearch/SynthonSet.h
@@ -27,6 +27,7 @@ class ROMol;

 namespace SynthonSpaceSearch {
 class Synthon;
+struct SynthonSpaceSearchParams;

 // This class holds all the synthons for a particular reaction.
 class RDKIT_SYNTHONSPACESEARCH_EXPORT SynthonSet {
@@ -49,11 +50,14 @@ class RDKIT_SYNTHONSPACESEARCH_EXPORT SynthonSet {
  const std::vector<std::shared_ptr<ROMol>> &getConnectorRegions() const;

  const std::unique_ptr<ExplicitBitVect> &getConnRegFP() const;
+  const std::unique_ptr<ExplicitBitVect> &getAddFP() const;
+  const std::unique_ptr<ExplicitBitVect> &getSubtractFP() const;
  const std::vector<int> &getNumConnectors() const;
  bool hasFingerprints() const;
+  bool hasAddAndSubtractFPs() const;
+
  const std::vector<std::vector<std::unique_ptr<ExplicitBitVect>>> &
  getSynthonFPs() const;
-
  // Writes to/reads from a binary stream.
  void writeToDBStream(std::ostream &os) const;
  void readFromDBStream(std::istream &is, std::uint32_t version);
@@ -86,6 +90,7 @@ class RDKIT_SYNTHONSPACESEARCH_EXPORT SynthonSet {

  void buildSynthonFingerprints(
      const FingerprintGenerator<std::uint64_t> &fpGen);
+  void buildAddAndSubtractFPs(const FingerprintGenerator<std::uint64_t> &fpGen);

  // Return the molecules for synthons for which the bits are true.
  // Obviously requires that reqSynths is the same dimensions as
@@ -118,6 +123,14 @@ class RDKIT_SYNTHONSPACESEARCH_EXPORT SynthonSet {
  // The fingerprint of the connector regions.  Fingerprints for all
  // connector regions are folded into the same fingerprint.
  std::unique_ptr<ExplicitBitVect> d_connRegFP;
+
+  // When doing an approximate FP similarity by ORing together
+  // the synthonFPs, adding d_addFP and subtracting d_subtractFP
+  // accounts (a bit) for the joins and the dummy atoms
+  // respectively.
+  std::unique_ptr<ExplicitBitVect> d_addFP;
+  std::unique_ptr<ExplicitBitVect> d_subtractFP;
+
  // The number of connectors in the synthons in each synthon set.
  std::vector<int> d_numConnectors;

--- a/Code/GraphMol/SynthonSpaceSearch/SynthonSpace.cpp
+++ b/Code/GraphMol/SynthonSpaceSearch/SynthonSpace.cpp
@@ -32,7 +32,7 @@ namespace RDKit::SynthonSpaceSearch {

 // used for serialization
 constexpr int32_t versionMajor = 2;
-constexpr int32_t versionMinor = 0;
+constexpr int32_t versionMinor = 1;
 constexpr int32_t endianId = 0xa100f;

 std::int64_t SynthonSpace::getNumProducts() const {
@@ -185,8 +185,8 @@ void SynthonSpace::writeDBFile(const std::string &outFilename) const {
    streamWrite(os, d_fpType);
  }
  streamWrite(os, d_reactions.size());
-  for (const auto &[fst, snd] : d_reactions) {
-    snd->writeToDBStream(os);
+  for (const auto &[reactionId, reaction] : d_reactions) {
+    reaction->writeToDBStream(os);
  }
  os.close();
 }
@@ -277,10 +277,10 @@ bool SynthonSpace::hasFingerprints() const {

 void SynthonSpace::buildSynthonFingerprints(
    const FingerprintGenerator<std::uint64_t> &fpGen) {
-  BOOST_LOG(rdWarningLog) << "Building the fingerprints may take some time."
-                          << std::endl;
  if (const auto fpType = fpGen.infoString();
      fpType != d_fpType || !hasFingerprints()) {
+    BOOST_LOG(rdWarningLog)
+        << "Building the fingerprints may take some time." << std::endl;
    d_fpType = fpType;
    for (const auto &[id, synthSet] : d_reactions) {
      synthSet->buildSynthonFingerprints(fpGen);
@@ -288,4 +288,18 @@ void SynthonSpace::buildSynthonFingerprints(
  }
 }

+// Reports whether the add/subtract fingerprints are available.  Only the
+// first reaction is inspected; an empty space reports false.
+bool SynthonSpace::hasAddAndSubstractFingerprints() const {
+  const auto firstReaction = d_reactions.begin();
+  return firstReaction != d_reactions.end() &&
+         firstReaction->second->hasAddAndSubtractFPs();
+}
+
+// Asks every reaction in the space to (re)build its add and subtract
+// fingerprints with the given generator.
+void SynthonSpace::buildAddAndSubstractFingerprints(
+    const FingerprintGenerator<std::uint64_t> &fpGen) {
+  for (const auto &idAndSet : d_reactions) {
+    idAndSet.second->buildAddAndSubtractFPs(fpGen);
+  }
+}
+
 }  // namespace RDKit::SynthonSpaceSearch
--- a/Code/GraphMol/SynthonSpaceSearch/SynthonSpace.h
+++ b/Code/GraphMol/SynthonSpaceSearch/SynthonSpace.h
@@ -50,8 +50,15 @@ struct RDKIT_SYNTHONSPACESEARCH_EXPORT SynthonSpaceSearchParams {
                                         // than that will not matter as it will
                                         // be reduced to 4.  Likewise, values
                                         // lower than 1 will be increased to 1.
-  std::int64_t maxHits{1000};  // The maximum number of hits to return.  Use -1
-                               // for no maximum.
+  std::uint64_t maxNumFrags{
+      100000};  // The maximum number of fragments the query can
+                // be broken into.  Big molecules will create huge
+                // numbers of fragments that may cause excessive
+                // memory use.  If the number of fragments hits this number,
+                // fragmentation stops and the search results will likely be
+                // incomplete.
+  std::int64_t maxHits{1000};  // The maximum number of hits to return.  Use
+                               // -1 for no maximum.
  std::int64_t hitStart{0};    // Sequence number of hit to start from.  So that
                               // you can return the next N hits of a search
                               // having already obtained N-1.
@@ -78,6 +85,16 @@ struct RDKIT_SYNTHONSPACESEARCH_EXPORT SynthonSpaceSearchParams {
             // times, a lower number will give faster searches at the
             // risk of missing some hits.  The value you use should have
             // a positive correlation with your FOMO.
+  double approxSimilarityAdjuster{
+      0.1};  // The fingerprint search uses an approximate similarity method
+             // before building a product and doing a final check.  The
+             // similarityCutoff is reduced by this value for the approximate
+             // check.  A lower value will give faster run times at the
+             // risk of missing some hits.  The value you use should have a
+             // positive correlation with your FOMO.  The default is
+             // appropriate for Morgan fingerprints.  With RDKit fingerprints,
+             // 0.05 is adequate, and higher than that has been seen to
+             // produce long run times.
  std::uint64_t timeOut{600};  // Maximum number of seconds to spend on a single
                               // search.  0 means no maximum.
 };
@@ -206,10 +223,16 @@ class RDKIT_SYNTHONSPACESEARCH_EXPORT SynthonSpace {

  bool hasFingerprints() const;
  // Create the fingerprints for the synthons ready for fingerprint searches.
-  // Valid values of fpType as described by SynthonSpaceSearchParams.
+  // Will be done by the fingerprint search if not done ahead of time.
  void buildSynthonFingerprints(
      const FingerprintGenerator<std::uint64_t> &fpGen);

+  bool hasAddAndSubstractFingerprints() const;
+  // Create the add and subtract fingerprints for the SynthonSets.
+  // Will be done by the fingerprint search if not done ahead of time.
+  void buildAddAndSubstractFingerprints(
+      const FingerprintGenerator<std::uint64_t> &fpGen);
+
 private:
  std::string d_fileName;
  std::map<std::string, std::unique_ptr<SynthonSet>> d_reactions;
--- a/Code/GraphMol/SynthonSpaceSearch/SynthonSpaceFingerprintSearcher.cpp
+++ b/Code/GraphMol/SynthonSpaceSearch/SynthonSpaceFingerprintSearcher.cpp
@@ -24,6 +24,9 @@ SynthonSpaceFingerprintSearcher::SynthonSpaceFingerprintSearcher(
      getSpace().getSynthonFingerprintType() != fpGen.infoString()) {
    getSpace().buildSynthonFingerprints(fpGen);
  }
+  if (!getSpace().hasAddAndSubstractFingerprints()) {
+    getSpace().buildAddAndSubstractFingerprints(fpGen);
+  }
  d_queryFP = std::unique_ptr<ExplicitBitVect>(d_fpGen.getFingerprint(query));
 }

@@ -148,6 +151,27 @@ std::vector<SynthonSpaceHitSet> SynthonSpaceFingerprintSearcher::searchFragSet(
  return results;
 }

+// Cheap pre-filter applied before a product is built.  An approximate
+// product fingerprint is assembled from the chosen synthons' fingerprints
+// and compared with the query fingerprint against a relaxed cutoff
+// (similarityCutoff - approxSimilarityAdjuster).
+bool SynthonSpaceFingerprintSearcher::quickVerify(
+    const std::unique_ptr<SynthonSet> &reaction,
+    const std::vector<size_t> &synthNums) const {
+  const auto &perSetFPs = reaction->getSynthonFPs();
+  // OR together the fingerprints of the selected synthon from each set.
+  ExplicitBitVect approxFP(*perSetFPs[0][synthNums[0]]);
+  for (size_t setNum = 1; setNum < synthNums.size(); ++setNum) {
+    approxFP |= *perSetFPs[setNum][synthNums[setNum]];
+  }
+  // Put in the bits that joining the synthons usually creates...
+  approxFP |= *(reaction->getAddFP());
+  // ...and take out the ones it usually removes.  The subtract FP is
+  // stored already complemented, hence the straight AND.
+  approxFP &= *(reaction->getSubtractFP());
+
+  const double threshold =
+      getParams().similarityCutoff - getParams().approxSimilarityAdjuster;
+  return TanimotoSimilarity(approxFP, *d_queryFP) >= threshold;
+}
+
 bool SynthonSpaceFingerprintSearcher::verifyHit(const ROMol &hit) const {
  const std::unique_ptr<ExplicitBitVect> fp(d_fpGen.getFingerprint(hit));
  if (const auto sim = TanimotoSimilarity(*fp, *d_queryFP);
--- a/Code/GraphMol/SynthonSpaceSearch/SynthonSpaceFingerprintSearcher.h
+++ b/Code/GraphMol/SynthonSpaceSearch/SynthonSpaceFingerprintSearcher.h
@@ -32,6 +32,8 @@ class SynthonSpaceFingerprintSearcher : public SynthonSpaceSearcher {

  std::vector<SynthonSpaceHitSet> searchFragSet(
      std::vector<std::unique_ptr<ROMol>> &fragSet) const override;
+  bool quickVerify(const std::unique_ptr<SynthonSet> &reaction,
+                   const std::vector<size_t> &synthNums) const override;
  bool verifyHit(const ROMol &hit) const override;
 };
 }  // namespace RDKit::SynthonSpaceSearch
--- a/Code/GraphMol/SynthonSpaceSearch/SynthonSpaceSearch_details.cpp
+++ b/Code/GraphMol/SynthonSpaceSearch/SynthonSpaceSearch_details.cpp
@@ -115,7 +115,7 @@ std::vector<const Bond *> getContiguousAromaticBonds(const ROMol &mol,
 }

 std::vector<std::vector<std::unique_ptr<ROMol>>> splitMolecule(
-    const ROMol &query, unsigned int maxBondSplits) {
+    const ROMol &query, unsigned int maxBondSplits, std::uint64_t maxNumFrags) {
  if (maxBondSplits < 1) {
    maxBondSplits = 1;
  }
@@ -139,7 +139,10 @@ std::vector<std::vector<std::unique_ptr<ROMol>>> splitMolecule(
  fragments.emplace_back();
  fragments.back().emplace_back(new ROMol(query));

-  // Now do the splits.
+  // Now do the splits.  Symmetrical molecules can give rise to the same
+  // fragment set in different ways so keep track of what we've had to
+  // avoid duplicates.
+  std::set<std::string> fragSmis;
  for (unsigned int i = 1; i <= maxBondSplits; ++i) {
    auto combs = combMFromN(i, static_cast<int>(query.getNumBonds()));
    std::vector<std::pair<unsigned int, unsigned int>> dummyLabels;
@@ -174,9 +177,21 @@ std::vector<std::vector<std::unique_ptr<ROMol>>> splitMolecule(
        continue;
      }
      if (checkConnectorsInDifferentFrags(molFrags, i)) {
+        std::string fragSmi(MolToSmiles(*fragMol));
+        if (!fragSmis.insert(fragSmi).second) {
+          continue;
+        }
        fragments.emplace_back(std::move(molFrags));
+        if (fragments.size() > maxNumFrags) {
+          BOOST_LOG(rdWarningLog)
+              << "Maximum number of fragments reached." << std::endl;
+          break;
+        }
      }
    }
+    if (fragments.size() > maxNumFrags) {
+      break;
+    }
  }
  return fragments;
 }
--- a/Code/GraphMol/SynthonSpaceSearch/SynthonSpaceSearch_details.h
+++ b/Code/GraphMol/SynthonSpaceSearch/SynthonSpaceSearch_details.h
@@ -37,7 +37,8 @@ RDKIT_SYNTHONSPACESEARCH_EXPORT void fixAromaticRingSplits(
 // be altered.  Also, you can't split a molecule on 3 bonds if it only contains
 // 2.
 RDKIT_SYNTHONSPACESEARCH_EXPORT std::vector<std::vector<std::unique_ptr<ROMol>>>
-splitMolecule(const ROMol &query, unsigned int maxBondSplits);
+splitMolecule(const ROMol &query, unsigned int maxBondSplits,
+              std::uint64_t maxNumFrags);
 // Counts the number of [1*], [2*]...[4*] in the string.
 RDKIT_SYNTHONSPACESEARCH_EXPORT int countConnections(const ROMol &frag);

--- a/Code/GraphMol/SynthonSpaceSearch/SynthonSpaceSearcher.cpp
+++ b/Code/GraphMol/SynthonSpaceSearch/SynthonSpaceSearcher.cpp
@@ -51,7 +51,7 @@ SearchResults SynthonSpaceSearcher::search() {
  }
  std::vector<std::unique_ptr<ROMol>> results;

-  auto fragments = details::splitMolecule(d_query, d_params.maxBondSplits);
+  auto fragments = details::splitMolecule(d_query, d_params.maxBondSplits, d_params.maxNumFrags);
  std::vector<SynthonSpaceHitSet> allHits;
  size_t totHits = 0;
  TimePoint *endTime = nullptr;
@@ -94,6 +94,9 @@ std::unique_ptr<ROMol> SynthonSpaceSearcher::buildAndVerifyHit(
    if (resultsNames.size() < static_cast<size_t>(d_params.hitStart)) {
      return prod;
    }
+    if (!quickVerify(reaction, synthNums)) {
+      return prod;
+    }
    prod = reaction->buildProduct(synthNums);

    // Do a final check of the whole thing.  It can happen that the
--- a/Code/GraphMol/SynthonSpaceSearch/SynthonSpaceSearcher.h
+++ b/Code/GraphMol/SynthonSpaceSearch/SynthonSpaceSearcher.h
@@ -71,6 +71,15 @@ class SynthonSpaceSearcher {
      const std::unique_ptr<SynthonSet> &reaction,
      const std::vector<size_t> &synthNums,
      std::set<std::string> &resultsNames) const;
+  // Some of the search methods (Rascal, for example) can do a quick
+  // check on whether this set of synthons can match the query without having to
+  // build the full molecule from the synthons.  They will over-ride this
+  // function which by default passes everything.
+  // reaction is the synthon set the candidate comes from and synthNums holds
+  // one synthon index per synthon set of that reaction.  Returning false
+  // makes buildAndVerifyHit() skip building the product.
+  virtual bool quickVerify(
+      [[maybe_unused]] const std::unique_ptr<SynthonSet> &reaction,
+      [[maybe_unused]] const std::vector<size_t> &synthNums) const {
+    return true;
+  }
  virtual bool verifyHit(const ROMol &mol) const = 0;

  // Build the molecules from the synthons identified in reagentsToUse.
--- a/Code/GraphMol/SynthonSpaceSearch/Wrap/rdSynthonSpaceSearch.cpp
+++ b/Code/GraphMol/SynthonSpaceSearch/Wrap/rdSynthonSpaceSearch.cpp
@@ -105,6 +105,14 @@ BOOST_PYTHON_MODULE(rdSynthonSpaceSearch) {
                     &SynthonSpaceSearch::SynthonSpaceSearchParams::maxHits,
                     "The maximum number of hits to return.  Default=1000."
                     "Use -1 for no maximum.")
+      .def_readwrite(
+          "maxNumFrags",
+          &SynthonSpaceSearch::SynthonSpaceSearchParams::maxNumFrags,
+          "The maximum number of fragments the query can be broken into."
+          "  Big molecules will create huge numbers of fragments that may cause"
+          " excessive memory use.  If the number of fragments hits this number,"
+          " fragmentation stops and the search results will likely be incomplete."
+          "  Default=100000.")
      .def_readwrite(
          "hitStart", &SynthonSpaceSearch::SynthonSpaceSearchParams::hitStart,
          "The sequence number of the hit to start from.  So that you"
@@ -144,6 +152,19 @@ BOOST_PYTHON_MODULE(rdSynthonSpaceSearch) {
          "Similarities of fragments are generally low due to low bit"
          " densities.  For the fragment matching, reduce the similarity cutoff"
          " off by this amount.  Default=0.1.")
+      .def_readwrite(
+          "approxSimilarityAdjuster",
+          &SynthonSpaceSearch::SynthonSpaceSearchParams::
+              approxSimilarityAdjuster,
+          "The fingerprint search uses an approximate similarity method"
+          " before building a product and doing a final check.  The"
+          " similarityCutoff is reduced by this value for the approximate"
+          " check.  A lower value will give faster run times at the"
+          " risk of missing some hits.  The value you use should have a"
+          " positive correlation with your FOMO.  The default of 0.1 is"
+          " appropriate for Morgan fingerprints.  With RDKit fingerprints,"
+          " 0.05 is adequate, and higher than that has been seen to"
+          " produce long run times.")
      .def_readwrite(
          "timeOut", &SynthonSpaceSearch::SynthonSpaceSearchParams::timeOut,
          "Time limit for search, in seconds.  Default is 600s, 0 means no"
--- a/Code/GraphMol/SynthonSpaceSearch/fingerprint_search_catch_tests.cpp
+++ b/Code/GraphMol/SynthonSpaceSearch/fingerprint_search_catch_tests.cpp
@@ -8,11 +8,13 @@
 //  of the RDKit source tree.

 #include <algorithm>
+#include <chrono>
 #include <fstream>

 #include <GraphMol/SubstructLibrary/SubstructLibrary.h>
 #include <GraphMol/FileParsers/MolSupplier.h>
 #include <GraphMol/Fingerprints/MorganGenerator.h>
+#include <GraphMol/Fingerprints/RDKitFPGenerator.h>
 #include <GraphMol/SynthonSpaceSearch/SynthonSpace.h>
 #include <GraphMol/SynthonSpaceSearch/SearchResults.h>
 #include <GraphMol/SynthonSpaceSearch/SynthonSpaceSearch_details.h>
@@ -93,6 +95,8 @@ TEST_CASE("FP Small tests") {
    synthonspace.readTextFile(libNames[i]);
    SynthonSpaceSearchParams params;
    params.maxBondSplits = 3;
+    params.randomSeed = 1;
+    params.approxSimilarityAdjuster = 0.2;
    auto queryMol = v2::SmilesParse::MolFromSmiles(querySmis[i]);
    std::unique_ptr<FingerprintGenerator<std::uint64_t>> fpGen(
        MorganFingerprint::getMorganGenerator<std::uint64_t>(2));
@@ -147,8 +151,12 @@ TEST_CASE("FP Biggy") {
  const std::vector<size_t> numRes{46, 2, 0, 123, 0, 0};
  const std::vector<size_t> maxRes{2408, 197, 0, 833, 0, 4};
  SynthonSpaceSearchParams params;
+  params.approxSimilarityAdjuster = 0.2;
  params.maxHits = -1;
  for (size_t i = 0; i < smis.size(); ++i) {
+    // NOTE(review): this looks like a debugging filter left in by mistake.
+    // Only the query at index 4 is searched, so the other entries of numRes
+    // and maxRes are never checked.  Remove it once the expected counts for
+    // the remaining queries have been confirmed.
+    if (i != 4) {
+      continue;
+    }
    auto queryMol = v2::SmilesParse::MolFromSmiles(smis[i]);
    auto results = synthonspace.fingerprintSearch(*queryMol, *fpGen, params);
    CHECK(results.getHitMolecules().size() == numRes[i]);
@@ -243,3 +251,40 @@ TEST_CASE("Timeout") {
  auto results1 = synthonspace.fingerprintSearch(*queryMol, *fpGen, params);
  CHECK(!results1.getTimedOut());
 }
+
+// Checks how the number of hits from a fingerprint search varies with
+// approxSimilarityAdjuster, using RDKit fingerprints on the 5567 library.
+TEST_CASE("FP Approx Similarity") {
+  REQUIRE(rdbase);
+  std::string fName(rdbase);
+  std::string libName =
+      fName + "/Code/GraphMol/SynthonSpaceSearch/data/Syntons_5567.csv";
+  SynthonSpace synthonspace;
+  synthonspace.readTextFile(libName);
+  SynthonSpaceSearchParams params;
+  // Use a fixed seed so that the hit counts checked below are reproducible.
+  // NOTE(review): SynthonSet::buildAddAndSubtractFPs samples the synthons at
+  // a fixed stride rather than randomly, so the addFP and subtractFP should
+  // not depend on the seed - TODO confirm what randomSeed affects here.
+  params.randomSeed = 1;
+  params.similarityCutoff = 0.5;
+  params.timeOut = 0;
+  params.maxHits = 1000;
+
+  std::unique_ptr<FingerprintGenerator<std::uint64_t>> fpGen(
+      RDKitFP::getRDKitFPGenerator<std::uint64_t>(3));
+  auto queryMol = "c12ccc(C)cc1[nH]nc2C(=O)NCc1cncs1"_smiles;
+
+  // With RDKit fingerprints, 0.05 gives a reasonable compromise
+  // between speed and hits missed.
+  params.approxSimilarityAdjuster = 0.05;
+  auto results = synthonspace.fingerprintSearch(*queryMol, *fpGen, params);
+  CHECK(results.getHitMolecules().size() == 482);
+  CHECK(results.getMaxNumResults() == 1466);
+
+  // A smaller adjuster makes the approximate check stricter, so more of
+  // the real hits are rejected before the product is built.
+  params.approxSimilarityAdjuster = 0.01;
+  results = synthonspace.fingerprintSearch(*queryMol, *fpGen, params);
+  CHECK(results.getHitMolecules().size() == 124);
+
+  // This is the actual number of hits achievable.
+  params.approxSimilarityAdjuster = 0.25;
+  results = synthonspace.fingerprintSearch(*queryMol, *fpGen, params);
+  CHECK(results.getHitMolecules().size() == 914);
+}
--- a/Code/GraphMol/SynthonSpaceSearch/substructure_search_catch_tests.cpp
+++ b/Code/GraphMol/SynthonSpaceSearch/substructure_search_catch_tests.cpp
@@ -46,12 +46,12 @@ std::unique_ptr<SubstructLibrary> loadSubstructLibrary(
 TEST_CASE("Test splits 1") {
  const std::vector<std::string> smiles{"c1ccccc1CN1CCN(CC1)C(-O)c1ncc(F)cc1",
                                        "CC(C)OCc1nnc(N2CC(C)CC2)n1C1CCCC1"};
-  std::vector<std::vector<size_t>> expCounts{{1, 51, 345, 20},
-                                             {1, 38, 298, 56}};
+  std::vector<std::vector<size_t>> expCounts{{1, 47, 345, 20},
+                                             {1, 37, 262, 41}};
  for (size_t i = 0; i < smiles.size(); ++i) {
    auto mol = v2::SmilesParse::MolFromSmiles(smiles[i]);
    REQUIRE(mol);
-    auto fragments = splitMolecule(*mol, 3);
+    auto fragments = splitMolecule(*mol, 3, 100000);
    CHECK(fragments.size() ==
          std::accumulate(expCounts[i].begin(), expCounts[i].end(), size_t(0)));
    // The first fragment set should just be the molecule itself.