rdkit/Contrib/LEF/DistancePredict.py

#
#  Copyright (c) 2009, Novartis Institutes for BioMedical Research Inc.
#  All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
#     * Redistributions of source code must retain the above copyright
#       notice, this list of conditions and the following disclaimer.
#     * Redistributions in binary form must reproduce the above
#       copyright notice, this list of conditions and the following
#       disclaimer in the documentation and/or other materials provided
#       with the distribution.
#     * Neither the name of Novartis Institutes for BioMedical Research Inc.
#       nor the names of its contributors may be used to endorse or promote
#       products derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# Created by Greg Landrum and Anna Vulpetti, March 2009
from rdkit import Chem
from rdkit import DataStructs
from CreateFps import GetMolFingerprint
from rdkit.ML.KNN.KNNRegressionModel import KNNRegressionModel
from rdkit.RDLogger import logger
logger=logger()
import sys

# nameField is the name of the property (from the SD file) that has molecule
# names...If the molecules have names in the first row of the file, use "_Name"
nameField = 'Compound_orig'
#nameField = '_Name'

# propField is the name of the property (from the SD file) you want to generate
# predictions for
propField='chemical_shift_1'

weightedAverage=True
import types,copy
from optparse import OptionParser,Option,OptionValueError

def check_floatlist(option,opt,value):
    try:
        v = eval(value)
        if type(v) not in (types.ListType,types.TupleType):
            raise ValueError
        v = [float(x) for x in v]
    except ValueError:
        raise OptionValueError("option %s : invalid float list value: %r"%( opt,value))
    return v
class MyOption(Option):
    TYPES=Option.TYPES+("floatlist",)
    TYPE_CHECKER = copy.copy(Option.TYPE_CHECKER)
    TYPE_CHECKER["floatlist"] = check_floatlist
parser=OptionParser("distance predict",version='%prog',option_class=MyOption)
parser.add_option('--maxPathLength','--max',default=8,type=int,
                  help='maximum length path for the fingerprint')
parser.add_option('--similarityThreshold','--sim',default=[0.9],type='floatlist',
                  help='threshold for similarity')
parser.add_option('--numNeighbors','--num','-n','-k',default=50,type=int,
                  help='number of neighbors to consider')
parser.add_option('--neighborsFile','--nbrs',default='',
                  help='name of an output file to hold the neighbor lists')
parser.add_option('--scan',default=False,action="store_true")


if __name__=='__main__':
    options,args = parser.parse_args()
    outF = file(args[-1],'w+')

    logger.info('reading training molecules and generating fingerprints')
    suppl = Chem.SDMolSupplier(args[0])
    train=[]
    for i,mol in enumerate(suppl):
        if not mol:
            continue
        smi = Chem.MolToSmiles(mol,True)
        nm = mol.GetProp(nameField)
        property = float(mol.GetProp(propField))
        fp = GetMolFingerprint(mol,options.maxPathLength)
        train.append((nm,smi,fp,property))
    logger.info('  got %d molecules'%len(train))

    if len(args)>2:
        suppl = Chem.SDMolSupplier(args[1])
        haveTest=True
        logger.info('reading testing molecules and generating fingerprints')
        test=[]
        for i,mol in enumerate(suppl):
            if not mol:
                continue
            smi = Chem.MolToSmiles(mol,True)
            nm = mol.GetProp(nameField)
            if mol.HasProp(propField):
                property = float(mol.GetProp(propField))
            else:
                property=0
            fp = GetMolFingerprint(mol,options.maxPathLength)
            test.append((nm,smi,fp,property))
        logger.info('  got %d molecules'%len(test))
    else:
        haveTest=False
        test=train

    results=[None]*len(test)
    for i in range(len(test)):
        results[i] = [None]*len(options.similarityThreshold)
    if options.neighborsFile:
        nbrFile=file(options.neighborsFile,'w+')
        print >>nbrFile,'ID|CompoundName|CompoundSmiles|NeighborName|NeighborSmiles|NeighborShift|Similarity'
        id=1
    else:
        nbrFile=None
    for j,thresh in enumerate(options.similarityThreshold):
        if not haveTest:
            logger.info('Doing cross validation with threshold %.2f'%thresh)
        else:
            logger.info('Doing prediction with threshold %.2f'%thresh)
        for i in range(len(test)):
            if not haveTest:
                localTrain=[train[x] for x in range(len(train)) if x!=i]
            else:
                localTrain=train
            localTest=test[i]
            mdl = KNNRegressionModel(options.numNeighbors,[],
                                     lambda x,y,*args: 1-DataStructs.DiceSimilarity(x[-2],y[-2]),
                                     radius=1.-thresh)
            mdl.SetTrainingExamples(localTrain)
            nbrs=[]
            pred = mdl.PredictExample(localTest,weightedAverage=weightedAverage,
                                      neighborList=nbrs)
            nm,smi,fp,prop = test[i]

            if nbrFile:
                for dist,data in nbrs:
                    if data is None: continue
                    nnm,nsmi,nfp,nproperty = data
                    outRow=[str(id),nm,smi,nnm,nsmi,str(nproperty),str(dist-1.)]
                    id+=1
                    print >>nbrFile,'|'.join(outRow)
            nbrs = [x for x in nbrs if x[1] is not None]
            results[i][j]=(nm,smi,prop,pred,len(nbrs))
            if not (i+1)%100:
                logger.info('Done %d molecules'%(i+1))
        logger.info('  done')
    numNeighbors = options.numNeighbors
    maxPathLength = options.maxPathLength-1
    logger.info('creating output file')
    headers=['name','smiles','shift']
    for thresh in options.similarityThreshold:
        headers.append('predShift_%(maxPathLength)d_%(numNeighbors)d_%(thresh).2f'%locals())
        headers.append('dPred_%(maxPathLength)d_%(numNeighbors)d_%(thresh).2f'%locals())
        headers.append('nbrs_%(maxPathLength)d_%(numNeighbors)d_%(thresh).2f'%locals())
    print >>outF,'|'.join(headers)
    for i in range(len(test)):
        nm=results[i][0][0]
        smi=results[i][0][1]
        prop=results[i][0][2]
        row = [nm,smi,str(prop)]
        for j in range(len(options.similarityThreshold)):
            nbrs=results[i][j][4]
            pred=results[i][j][3]
            row.append(str(pred))
            row.append(str(abs(prop-pred)))
            row.append(str(nbrs))
        print >>outF,'|'.join(row)