mirror of
https://github.com/rdkit/rdkit.git
synced 2026-06-04 21:54:27 +08:00
184 lines
7.0 KiB
Python
184 lines
7.0 KiB
Python
#
|
|
# Copyright (c) 2009, Novartis Institutes for BioMedical Research Inc.
|
|
# All rights reserved.
|
|
#
|
|
# Redistribution and use in source and binary forms, with or without
|
|
# modification, are permitted provided that the following conditions are
|
|
# met:
|
|
#
|
|
# * Redistributions of source code must retain the above copyright
|
|
# notice, this list of conditions and the following disclaimer.
|
|
# * Redistributions in binary form must reproduce the above
|
|
# copyright notice, this list of conditions and the following
|
|
# disclaimer in the documentation and/or other materials provided
|
|
# with the distribution.
|
|
# * Neither the name of Novartis Institutes for BioMedical Research Inc.
|
|
# nor the names of its contributors may be used to endorse or promote
|
|
# products derived from this software without specific prior written permission.
|
|
#
|
|
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
|
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
|
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
|
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
|
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
|
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
|
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
|
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
|
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
#
|
|
# Created by Greg Landrum and Anna Vulpetti, March 2009
|
|
from CreateFps import GetMolFingerprint
|
|
|
|
from rdkit import Chem, DataStructs
|
|
from rdkit.ML.KNN.KNNRegressionModel import KNNRegressionModel
|
|
from rdkit.RDLogger import logger
|
|
|
|
logger = logger()
|
|
import sys
|
|
|
|
# nameField is the name of the property (from the SD file) that has molecule
|
|
# names...If the molecules have names in the first row of the file, use "_Name"
|
|
nameField = 'Compound_orig'
|
|
#nameField = '_Name'
|
|
|
|
# propField is the name of the property (from the SD file) you want to generate
|
|
# predictions for
|
|
propField = 'chemical_shift_1'
|
|
|
|
weightedAverage = True
|
|
import copy
|
|
import types
|
|
from optparse import Option, OptionParser, OptionValueError
|
|
|
|
|
|
def check_floatlist(option, opt, value):
|
|
try:
|
|
v = eval(value)
|
|
if type(v) not in (types.ListType, types.TupleType):
|
|
raise ValueError
|
|
v = [float(x) for x in v]
|
|
except ValueError:
|
|
raise OptionValueError("option %s : invalid float list value: %r" % (opt, value))
|
|
return v
|
|
|
|
|
|
class MyOption(Option):
|
|
TYPES = Option.TYPES + ("floatlist", )
|
|
TYPE_CHECKER = copy.copy(Option.TYPE_CHECKER)
|
|
TYPE_CHECKER["floatlist"] = check_floatlist
|
|
|
|
|
|
parser = OptionParser("distance predict", version='%prog', option_class=MyOption)
|
|
parser.add_option('--maxPathLength', '--max', default=8, type=int,
|
|
help='maximum length path for the fingerprint')
|
|
parser.add_option('--similarityThreshold', '--sim', default=[0.9], type='floatlist',
|
|
help='threshold for similarity')
|
|
parser.add_option('--numNeighbors', '--num', '-n', '-k', default=50, type=int,
|
|
help='number of neighbors to consider')
|
|
parser.add_option('--neighborsFile', '--nbrs', default='',
|
|
help='name of an output file to hold the neighbor lists')
|
|
parser.add_option('--scan', default=False, action="store_true")
|
|
|
|
if __name__ == '__main__':
|
|
options, args = parser.parse_args()
|
|
outF = file(args[-1], 'w+')
|
|
|
|
logger.info('reading training molecules and generating fingerprints')
|
|
suppl = Chem.SDMolSupplier(args[0])
|
|
train = []
|
|
for i, mol in enumerate(suppl):
|
|
if not mol:
|
|
continue
|
|
smi = Chem.MolToSmiles(mol, True)
|
|
nm = mol.GetProp(nameField)
|
|
property = float(mol.GetProp(propField))
|
|
fp = GetMolFingerprint(mol, options.maxPathLength)
|
|
train.append((nm, smi, fp, property))
|
|
logger.info(' got %d molecules' % len(train))
|
|
|
|
if len(args) > 2:
|
|
suppl = Chem.SDMolSupplier(args[1])
|
|
haveTest = True
|
|
logger.info('reading testing molecules and generating fingerprints')
|
|
test = []
|
|
for i, mol in enumerate(suppl):
|
|
if not mol:
|
|
continue
|
|
smi = Chem.MolToSmiles(mol, True)
|
|
nm = mol.GetProp(nameField)
|
|
if mol.HasProp(propField):
|
|
property = float(mol.GetProp(propField))
|
|
else:
|
|
property = 0
|
|
fp = GetMolFingerprint(mol, options.maxPathLength)
|
|
test.append((nm, smi, fp, property))
|
|
logger.info(' got %d molecules' % len(test))
|
|
else:
|
|
haveTest = False
|
|
test = train
|
|
|
|
results = [None] * len(test)
|
|
for i in range(len(test)):
|
|
results[i] = [None] * len(options.similarityThreshold)
|
|
if options.neighborsFile:
|
|
nbrFile = file(options.neighborsFile, 'w+')
|
|
print >> nbrFile, 'ID|CompoundName|CompoundSmiles|NeighborName|NeighborSmiles|NeighborShift|Similarity'
|
|
id = 1
|
|
else:
|
|
nbrFile = None
|
|
for j, thresh in enumerate(options.similarityThreshold):
|
|
if not haveTest:
|
|
logger.info('Doing cross validation with threshold %.2f' % thresh)
|
|
else:
|
|
logger.info('Doing prediction with threshold %.2f' % thresh)
|
|
for i in range(len(test)):
|
|
if not haveTest:
|
|
localTrain = [train[x] for x in range(len(train)) if x != i]
|
|
else:
|
|
localTrain = train
|
|
localTest = test[i]
|
|
mdl = KNNRegressionModel(options.numNeighbors, [],
|
|
lambda x, y, *args: 1 - DataStructs.DiceSimilarity(x[-2], y[-2]),
|
|
radius=1. - thresh)
|
|
mdl.SetTrainingExamples(localTrain)
|
|
nbrs = []
|
|
pred = mdl.PredictExample(localTest, weightedAverage=weightedAverage, neighborList=nbrs)
|
|
nm, smi, fp, prop = test[i]
|
|
|
|
if nbrFile:
|
|
for dist, data in nbrs:
|
|
if data is None:
|
|
continue
|
|
nnm, nsmi, nfp, nproperty = data
|
|
outRow = [str(id), nm, smi, nnm, nsmi, str(nproperty), str(dist - 1.)]
|
|
id += 1
|
|
print >> nbrFile, '|'.join(outRow)
|
|
nbrs = [x for x in nbrs if x[1] is not None]
|
|
results[i][j] = (nm, smi, prop, pred, len(nbrs))
|
|
if not (i + 1) % 100:
|
|
logger.info('Done %d molecules' % (i + 1))
|
|
logger.info(' done')
|
|
numNeighbors = options.numNeighbors
|
|
maxPathLength = options.maxPathLength - 1
|
|
logger.info('creating output file')
|
|
headers = ['name', 'smiles', 'shift']
|
|
for thresh in options.similarityThreshold:
|
|
headers.append('predShift_%(maxPathLength)d_%(numNeighbors)d_%(thresh).2f' % locals())
|
|
headers.append('dPred_%(maxPathLength)d_%(numNeighbors)d_%(thresh).2f' % locals())
|
|
headers.append('nbrs_%(maxPathLength)d_%(numNeighbors)d_%(thresh).2f' % locals())
|
|
print >> outF, '|'.join(headers)
|
|
for i in range(len(test)):
|
|
nm = results[i][0][0]
|
|
smi = results[i][0][1]
|
|
prop = results[i][0][2]
|
|
row = [nm, smi, str(prop)]
|
|
for j in range(len(options.similarityThreshold)):
|
|
nbrs = results[i][j][4]
|
|
pred = results[i][j][3]
|
|
row.append(str(pred))
|
|
row.append(str(abs(prop - pred)))
|
|
row.append(str(nbrs))
|
|
print >> outF, '|'.join(row)
|