NP Likeness with confidence value (#1608)

* NP Likeness with confidence value: added a scoring variant with an additional confidence value between 0..1 that describes how many fragments of the tested compound were found in the model data set.
* fixed indentation
* added some documentation
* namedtuple, no rounding
* other changes to module as requested by reviewer
* replaced occurrences of sys.stderr.write by print
2026-06-03 21:44:30 +08:00 · 2017-10-26 17:39:46 +02:00
parent 908ad5932d
commit 4c26511cb6
1 changed files with 31 additions and 9 deletions
--- a/Contrib/NP_Score/npscorer.py
+++ b/Contrib/NP_Score/npscorer.py
@@ -18,37 +18,59 @@ from rdkit import Chem
 from rdkit.Chem import rdMolDescriptors
 import sys, math, gzip, pickle
 import os.path
+from collections import namedtuple


 def readNPModel(filename=os.path.join(os.path.dirname(__file__), 'publicnp.model.gz')):
-  sys.stderr.write("reading NP model ...\n")
+  """Reads and returns the scoring model,
+  which has to be passed to the scoring functions."""
+  print("reading NP model ...", file=sys.stderr)
  fscore = pickle.load(gzip.open(filename))
-  sys.stderr.write("model in\n")
+  print("model in", file=sys.stderr)
  return fscore


-def scoreMol(mol, fscore):
+def scoreMolWConfidence(mol, fscore):
+  """Next to the NP Likeness Score, this function outputs a confidence value
+  between 0..1 that describes how many fragments of the tested molecule
+  were found in the model data set (1: all fragments were found).
+
+  Returns namedtuple NPLikeness(nplikeness, confidence)"""
+
  if mol is None:
    raise ValueError('invalid molecule')
  fp = rdMolDescriptors.GetMorganFingerprint(mol, 2)
  bits = fp.GetNonzeroElements()

  # calculating the score
-  score = 0.
+  score = 0.0
+  bits_found = 0
  for bit in bits:
-    score += fscore.get(bit, 0)
+    if bit in fscore:
+      bits_found += 1
+      score += fscore[bit]
+
  score /= float(mol.GetNumAtoms())
+  confidence = float(bits_found / len(bits))

  # preventing score explosion for exotic molecules
  if score > 4:
    score = 4. + math.log10(score - 4. + 1.)
-  if score < -4:
+  elif score < -4:
    score = -4. - math.log10(-4. - score + 1.)
-  return score
+  NPLikeness = namedtuple("NPLikeness", "nplikeness,confidence")
+  return NPLikeness(score, confidence)
+
+
+def scoreMol(mol, fscore):
+  """Calculates the Natural Product Likeness of a molecule.
+
+  Returns the score as float in the range -5..5."""
+  return scoreMolWConfidence(mol, fscore).nplikeness


 def processMols(fscore, suppl):
-  sys.stderr.write("calculating ...\n")
+  print("calculating ...", file=sys.stderr)
  count = {}
  n = 0
  for i, m in enumerate(suppl):
@@ -62,7 +84,7 @@ def processMols(fscore, suppl):
    name = m.GetProp('_Name')
    print(smiles + "\t" + name + "\t" + score)

-  sys.stderr.write("finished, " + str(n) + " molecules processed\n")
+  print("finished, " + str(n) + " molecules processed", file=sys.stderr)


 if __name__ == '__main__':