SSSR performance improvements to support larger systems (#1131)

* findSSSR performance improvements for fragments without rings This makes Chem.SanitizeMol significantly faster when dealing with molecules with lots of disconnected fragments (like a box of water). The following is the runtime of Chem.SanitizeMol while adding 10,000 waters with explicit hydrogens when running Chem.SanitizeMol on every 1,000th water added. Before: 0 add_water = 0.00007s 0 Chem.SanitizeMol = 0.01991s 1000 add_water = 0.00009s 1000 Chem.SanitizeMol = 0.99659s 2000 add_water = 0.00013s 2000 Chem.SanitizeMol = 3.94565s 3000 add_water = 0.00018s 3000 Chem.SanitizeMol = 8.94760s 4000 add_water = 0.00023s 4000 Chem.SanitizeMol = 15.75187s 5000 add_water = 0.00035s 5000 Chem.SanitizeMol = 24.59318s 6000 add_water = 0.00048s 6000 Chem.SanitizeMol = 37.23530s 7000 add_water = 0.00042s 7000 Chem.SanitizeMol = 47.70860s 8000 add_water = 0.00105s 8000 Chem.SanitizeMol = 62.21912s 9000 add_water = 0.00056s 9000 Chem.SanitizeMol = 80.08511s After: 0 add_water = 0.00003s 0 Chem.SanitizeMol = 0.01219s 1000 add_water = 0.00004s 1000 Chem.SanitizeMol = 0.01004s 2000 add_water = 0.00012s 2000 Chem.SanitizeMol = 0.01058s 3000 add_water = 0.00018s 3000 Chem.SanitizeMol = 0.01158s 4000 add_water = 0.00018s 4000 Chem.SanitizeMol = 0.01530s 5000 add_water = 0.00022s 5000 Chem.SanitizeMol = 0.02010s 6000 add_water = 0.00036s 6000 Chem.SanitizeMol = 0.02397s 7000 add_water = 0.00033s 7000 Chem.SanitizeMol = 0.02978s 8000 add_water = 0.00037s 8000 Chem.SanitizeMol = 0.04446s 9000 add_water = 0.00040s 9000 Chem.SanitizeMol = 0.04419s * Refactor new_timings.py script a bit to be able to run only the first (reading molecules) test. * Removing O(N^2) behavior of finding the number of bonds in the fragment during SSSR. This only improves the case when there are long chains and a small number of rings in the fragment. Many ring systems are still dominated by the rest of the SSSR algorithm, and fragments with no ring systems don't reach this part of the code. For a test case with a single cyclicpropane and adding carbons while calling Chem.SanitizeMol every 10,000 carbons added yield the following improvement in performance: before: 0 add_carbon = 0.00001s 0 Chem.SanitizeMol = 0.01237s 10000 add_carbon = 0.00017s 10000 Chem.SanitizeMol = 0.04453s 20000 add_carbon = 0.00017s 20000 Chem.SanitizeMol = 0.13038s 30000 add_carbon = 0.00029s 30000 Chem.SanitizeMol = 0.27671s 40000 add_carbon = 0.00063s 40000 Chem.SanitizeMol = 0.44774s 50000 add_carbon = 0.00106s 50000 Chem.SanitizeMol = 0.69433s 60000 add_carbon = 0.00181s 60000 Chem.SanitizeMol = 1.00577s after: 0 add_carbon = 0.00001s 0 Chem.SanitizeMol = 0.01264s 10000 add_carbon = 0.00013s 10000 Chem.SanitizeMol = 0.01349s 20000 add_carbon = 0.00022s 20000 Chem.SanitizeMol = 0.02724s 30000 add_carbon = 0.00040s 30000 Chem.SanitizeMol = 0.04292s 40000 add_carbon = 0.00076s 40000 Chem.SanitizeMol = 0.06172s 50000 add_carbon = 0.00193s 50000 Chem.SanitizeMol = 0.07658s 60000 add_carbon = 0.00147s 60000 Chem.SanitizeMol = 0.08625s Note, couldn't actually test a higher number of carbons as it led to a stack overflow due to recursion in findSSSR.
2026-06-03 21:44:30 +08:00 · 2016-10-29 04:38:14 +02:00
parent 3b41772851
commit 893fa41e98
2 changed files with 52 additions and 35 deletions
--- a/Code/GraphMol/FindRings.cpp
+++ b/Code/GraphMol/FindRings.cpp
@@ -860,9 +860,12 @@ int findSSSR(const ROMol &mol, VECT_INT_VECT &res) {
  boost::dynamic_bitset<> ringAtoms(nats);

  INT_VECT atomDegrees(nats);
+  INT_VECT atomDegreesWithZeroOrderBonds(nats);
  for (unsigned int i = 0; i < nats; ++i) {
    const Atom *atom = mol.getAtomWithIdx(i);
-    atomDegrees[i] = atom->getDegree();
+    int deg = atom->getDegree();
+    atomDegrees[i] = deg;
+    atomDegreesWithZeroOrderBonds[i] = deg;
    ROMol::OEDGE_ITER beg, end;
    boost::tie(beg, end) = mol.getAtomBonds(atom);
    while (beg != end) {
@@ -881,18 +884,38 @@ int findSSSR(const ROMol &mol, VECT_INT_VECT &res) {
    VECT_INT_VECT fragRes;
    curFrag = frags[fi];

+    if (curFrag.size() < 3)
+      continue;
+
    // the following is the list of atoms that are useful in the next round of
    // trimming
    // basically atoms that become degree 0 or 1 because of bond removals
    // initialized with atoms of degrees 0 and 1
    INT_SET changed;
+    int bndcnt_with_zero_order_bonds = 0;
+    unsigned int nbnds = 0;
    for (INT_VECT_CI aidi = curFrag.begin(); aidi != curFrag.end(); aidi++) {
-      unsigned int deg = atomDegrees[*aidi];
+      int atom_idx = *aidi;
+      bndcnt_with_zero_order_bonds += atomDegreesWithZeroOrderBonds[atom_idx];
+
+      int deg = atomDegrees[atom_idx];
+
+      nbnds += deg;
      if (deg < 2) {
        changed.insert((*aidi));
      }
    }

+    // check to see if this fragment can even have a possible ring
+    CHECK_INVARIANT(bndcnt_with_zero_order_bonds % 2 == 0, "fragment graph has a dangling degree");
+    bndcnt_with_zero_order_bonds = bndcnt_with_zero_order_bonds / 2;
+    int num_possible_rings = bndcnt_with_zero_order_bonds - curFrag.size() + 1;
+    if (num_possible_rings < 1)
+      continue;
+
+    CHECK_INVARIANT(nbnds % 2 == 0, "fragment graph problem when including zero-order bonds");
+    nbnds = nbnds / 2;
+
    boost::dynamic_bitset<> doneAts(nats);
    unsigned int nAtomsDone = 0;
    while (nAtomsDone < curFrag.size()) {
@@ -968,19 +991,6 @@ int findSSSR(const ROMol &mol, VECT_INT_VECT &res) {
      }  // done with degree 3 node
    }    // done finding rings in this fragement

-    // calculate the cyclomatic number for the fragment:
-    unsigned int nbnds = 0;
-    for (ROMol::ConstBondIterator bndIt = mol.beginBonds();
-         bndIt != mol.endBonds(); ++bndIt) {
-      if (std::find(curFrag.begin(), curFrag.end(),
-                    (*bndIt)->getBeginAtomIdx()) != curFrag.end() &&
-          std::find(curFrag.begin(), curFrag.end(),
-                    (*bndIt)->getEndAtomIdx()) != curFrag.end() &&
-          (*bndIt)->getBondType() != Bond::ZERO) {
-        ++nbnds;
-      }
-    }
-
 #if 0
        std::cerr<<"\n\nFOUND:\n";
        for(VECT_INT_VECT::const_iterator iter=fragRes.begin();
@@ -990,6 +1000,7 @@ int findSSSR(const ROMol &mol, VECT_INT_VECT &res) {
          std::cerr<<std::endl;
        }
 #endif
+    // calculate the cyclomatic number for the fragment:
    int nexpt = rdcast<int>((nbnds - curFrag.size() + 1));
    int ssiz = rdcast<int>(fragRes.size());

--- a/Regress/Scripts/new_timings.py
+++ b/Regress/Scripts/new_timings.py
@@ -3,34 +3,40 @@ import time, gzip, random, os, sys
 from rdkit import Chem
 from rdkit.Chem import AllChem
 from rdkit.RDLogger import logger
+
+dname = os.path.dirname(__file__)
+def data(fname):
+  return os.path.join(dname, '..', 'Data', fname)
+
 logger = logger()

 tests = [1] * 1001
 if len(sys.argv) > 1:
  tests = [0] * 1001
-  tests[1] = 1
  for x in sys.argv[1:]:
    x = int(x)
    tests[x] = 1
 ts = []
 mols = []
-lines = gzip.open('../Data/znp.50k.smi.gz', 'rt').readlines()
-logger.info('mols from smiles')
-nMols = 0
-nBad = 0
-t1 = time.time()
-for line in lines:
-  line = line.strip().split(' ')
-  m = Chem.MolFromSmiles(line[0])
-  if m:
-    nMols += 1
-    mols.append(m)
-  else:
-    nBad += 1

-t2 = time.time()
-logger.info('Results1: %.2f seconds, %d passed, %d failed' % (t2 - t1, nMols, nBad))
-ts.append(t2 - t1)
+if tests[0]:
+  lines = gzip.open(data('znp.50k.smi.gz'), 'rt').readlines()
+  logger.info('mols from smiles')
+  nMols = 0
+  nBad = 0
+  t1 = time.time()
+  for line in lines:
+    line = line.strip().split(' ')
+    m = Chem.MolFromSmiles(line[0])
+    if m:
+      nMols += 1
+      mols.append(m)
+    else:
+      nBad += 1
+
+  t2 = time.time()
+  logger.info('Results1: %.2f seconds, %d passed, %d failed' % (t2 - t1, nMols, nBad))
+  ts.append(t2 - t1)

 if tests[1]:
  logger.info('Writing: Canonical SMILES')
@@ -42,7 +48,7 @@ if tests[1]:
  ts.append(t2 - t1)

 if tests[2]:
-  sdData = gzip.open('../Data/mols.1000.sdf.gz', 'rb').read()
+  sdData = gzip.open(data('mols.1000.sdf.gz'), 'rb').read()
  logger.info('mols from sdf')
  suppl = Chem.SDMolSupplier()
  suppl.SetData(sdData)
@@ -61,7 +67,7 @@ if tests[2]:
  ts.append(t2 - t1)

 if tests[3] or tests[4] or tests[5]:
-  pattData = gzip.open('../Data/queries.txt.gz', 'rt').readlines()
+  pattData = gzip.open(data('queries.txt.gz'), 'rt').readlines()
  pattData = [x.strip().replace('[H]', '').replace('()', '') for x in pattData]
  logger.info('patterns from smiles')
  patts = []
@@ -104,7 +110,7 @@ if tests[6] or tests[7] or tests[8]:
  logger.info('reading SMARTS')
  patts = []
  t1 = time.time()
-  for line in open('../Data/RLewis_smarts.txt'):
+  for line in open(data('RLewis_smarts.txt')):
    line = line.strip()
    if line == '' or line[0] == '#':
      continue