NP Likeness with confidence value (#1608)

* NP Likeness with confidence value
Added a scoring variant that additionally returns a confidence value between 0 and 1,
describing what fraction of the tested compound's fragments was found in the model data set.

* fixed indentation

* added some documentation

* namedtuple, no rounding

* other changes to module as requested by reviewer
* replaced occurrences of sys.stderr.write with print(..., file=sys.stderr)
This commit is contained in:
Axel Pahl
2017-10-26 17:39:46 +02:00
committed by Greg Landrum
parent 908ad5932d
commit 4c26511cb6

View File

@@ -18,37 +18,59 @@ from rdkit import Chem
from rdkit.Chem import rdMolDescriptors
import sys, math, gzip, pickle
import os.path
from collections import namedtuple
def readNPModel(filename=os.path.join(os.path.dirname(__file__), 'publicnp.model.gz')):
    """Read and return the scoring model.

    The returned object has to be passed to the scoring functions.

    Args:
        filename: Path to a gzip-compressed pickle file holding the model.
            Defaults to ``publicnp.model.gz`` in this module's directory.

    Returns:
        The unpickled model object.
    """
    print("reading NP model ...", file=sys.stderr)
    # NOTE: unpickling can execute arbitrary code; only load model files
    # that come from a trusted source.
    # The context manager closes the gzip handle even if unpickling fails.
    with gzip.open(filename) as model_file:
        fscore = pickle.load(model_file)
    print("model in", file=sys.stderr)
    return fscore
def scoreMol(mol, fscore):
# Result type of scoreMolWConfidence. Created once at module level so that all
# results share one class (isinstance checks and pickling work).
NPLikeness = namedtuple("NPLikeness", "nplikeness,confidence")


def scoreMolWConfidence(mol, fscore):
    """Calculate the NP Likeness Score together with a confidence value.

    Next to the NP Likeness Score, this function outputs a confidence value
    between 0..1 that describes how many fragments of the tested molecule
    were found in the model data set (1: all fragments were found).

    Args:
        mol: The molecule to score; must not be None.
        fscore: The scoring model, a mapping from fingerprint bit to its
            score contribution (as returned by readNPModel).

    Returns:
        namedtuple NPLikeness(nplikeness, confidence).

    Raises:
        ValueError: If ``mol`` is None.
        ZeroDivisionError: If the molecule has no atoms, because the summed
            score is normalized by the atom count.
    """
    if mol is None:
        raise ValueError('invalid molecule')
    fp = rdMolDescriptors.GetMorganFingerprint(mol, 2)
    bits = fp.GetNonzeroElements()

    # Sum the contributions of all fingerprint bits known to the model and
    # count how many of the molecule's bits the model knows about.
    score = 0.0
    bits_found = 0
    for bit in bits:
        if bit in fscore:
            bits_found += 1
            score += fscore[bit]
    score /= float(mol.GetNumAtoms())
    # Convert the divisor before dividing so the ratio is never truncated
    # by integer division.
    confidence = bits_found / float(len(bits))

    # preventing score explosion for exotic molecules
    if score > 4:
        score = 4. + math.log10(score - 4. + 1.)
    elif score < -4:
        score = -4. - math.log10(-4. - score + 1.)
    return NPLikeness(score, confidence)
def scoreMol(mol, fscore):
    """Return the Natural Product Likeness score of a molecule as a float.

    Thin wrapper around scoreMolWConfidence that keeps only the
    ``nplikeness`` field of its result and drops the confidence value.
    """
    result = scoreMolWConfidence(mol, fscore)
    return result.nplikeness
def processMols(fscore, suppl):
sys.stderr.write("calculating ...\n")
print("calculating ...", file=sys.stderr)
count = {}
n = 0
for i, m in enumerate(suppl):
@@ -62,7 +84,7 @@ def processMols(fscore, suppl):
name = m.GetProp('_Name')
print(smiles + "\t" + name + "\t" + score)
sys.stderr.write("finished, " + str(n) + " molecules processed\n")
print("finished, " + str(n) + " molecules processed", file=sys.stderr)
if __name__ == '__main__':