mirror of
https://github.com/rdkit/rdkit.git
synced 2026-06-03 21:44:30 +08:00
NP Likeness with confidence value (#1608)
* NP Likeness with confidence value added a scoring variant with an additional confidence value between 0..1 that describes how many fragments of the tested compound were found in the model data set. * fixed indentation * added some documentation * namedtuple, no rounding * other changes to module as requested by reviewer * replaced occurrences of sys.stderr.write by print
This commit is contained in:
@@ -18,37 +18,59 @@ from rdkit import Chem
|
||||
from rdkit.Chem import rdMolDescriptors
|
||||
import sys, math, gzip, pickle
|
||||
import os.path
|
||||
from collections import namedtuple
|
||||
|
||||
|
||||
def readNPModel(filename=os.path.join(os.path.dirname(__file__), 'publicnp.model.gz')):
|
||||
sys.stderr.write("reading NP model ...\n")
|
||||
"""Reads and returns the scoring model,
|
||||
which has to be passed to the scoring functions."""
|
||||
print("reading NP model ...", file=sys.stderr)
|
||||
fscore = pickle.load(gzip.open(filename))
|
||||
sys.stderr.write("model in\n")
|
||||
print("model in", file=sys.stderr)
|
||||
return fscore
|
||||
|
||||
|
||||
def scoreMol(mol, fscore):
|
||||
def scoreMolWConfidence(mol, fscore):
|
||||
"""Next to the NP Likeness Score, this function outputs a confidence value
|
||||
between 0..1 that descibes how many fragments of the tested molecule
|
||||
were found in the model data set (1: all fragments were found).
|
||||
|
||||
Returns namedtuple NPLikeness(nplikeness, confidence)"""
|
||||
|
||||
if mol is None:
|
||||
raise ValueError('invalid molecule')
|
||||
fp = rdMolDescriptors.GetMorganFingerprint(mol, 2)
|
||||
bits = fp.GetNonzeroElements()
|
||||
|
||||
# calculating the score
|
||||
score = 0.
|
||||
score = 0.0
|
||||
bits_found = 0
|
||||
for bit in bits:
|
||||
score += fscore.get(bit, 0)
|
||||
if bit in fscore:
|
||||
bits_found += 1
|
||||
score += fscore[bit]
|
||||
|
||||
score /= float(mol.GetNumAtoms())
|
||||
confidence = float(bits_found / len(bits))
|
||||
|
||||
# preventing score explosion for exotic molecules
|
||||
if score > 4:
|
||||
score = 4. + math.log10(score - 4. + 1.)
|
||||
if score < -4:
|
||||
elif score < -4:
|
||||
score = -4. - math.log10(-4. - score + 1.)
|
||||
return score
|
||||
NPLikeness = namedtuple("NPLikeness", "nplikeness,confidence")
|
||||
return NPLikeness(score, confidence)
|
||||
|
||||
|
||||
def scoreMol(mol, fscore):
|
||||
"""Calculates the Natural Product Likeness of a molecule.
|
||||
|
||||
Returns the score as float in the range -5..5."""
|
||||
return scoreMolWConfidence(mol, fscore).nplikeness
|
||||
|
||||
|
||||
def processMols(fscore, suppl):
|
||||
sys.stderr.write("calculating ...\n")
|
||||
print("calculating ...", file=sys.stderr)
|
||||
count = {}
|
||||
n = 0
|
||||
for i, m in enumerate(suppl):
|
||||
@@ -62,7 +84,7 @@ def processMols(fscore, suppl):
|
||||
name = m.GetProp('_Name')
|
||||
print(smiles + "\t" + name + "\t" + score)
|
||||
|
||||
sys.stderr.write("finished, " + str(n) + " molecules processed\n")
|
||||
print("finished, " + str(n) + " molecules processed", file=sys.stderr)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
Reference in New Issue
Block a user