mirror of
https://github.com/rdkit/rdkit.git
synced 2026-06-06 22:39:55 +08:00
356 lines
9.6 KiB
Python
Executable File
356 lines
9.6 KiB
Python
Executable File
# $Id$
|
|
#
|
|
# Copyright (C) 2002-2006 greg Landrum and Rational Discovery LLC
|
|
#
|
|
# @@ All Rights Reserved @@
|
|
#
|
|
""" command line utility to report on the contributions of descriptors to
|
|
tree-based composite models
|
|
|
|
Usage: AnalyzeComposite [optional args] <models>
|
|
|
|
<models>: file name(s) of pickled composite model(s)
|
|
(this is the name of the db table if using a database)
|
|
|
|
Optional Arguments:
|
|
|
|
-n number: the number of levels of each model to consider
|
|
|
|
-d dbname: the database from which to read the models
|
|
|
|
-N Note: the note string to search for to pull models from the database
|
|
|
|
-X: Send the results to Excel. Note: will alter the current
|
|
worksheet (by adding data to the end) and only works on
|
|
systems with Excel installed. It *is* safe to call this
|
|
multiple times with a single worksheet.
|
|
|
|
-v: be verbose whilst screening
|
|
"""
|
|
|
|
from Numeric import *
|
|
import sys,cPickle
|
|
from ML.DecTree import TreeUtils,Tree
|
|
from ML.Data import Stats
|
|
from Dbase.DbConnection import DbConnect
|
|
from ML import ScreenComposite
|
|
try:
|
|
from Excel.ExcelWrapper import ExcelWrapper as Excel
|
|
except ImportError:
|
|
Excel = None
|
|
|
|
__VERSION_STRING="2.2.0"
|
|
|
|
def ProcessIt(composites,nToConsider=3,verbose=0,reportToExcel=0):
  """ reports how often each descriptor is used by the tree models of a
  set of composites

  **Arguments**

    - composites: a sequence of composite models.  Each one must support
      len(), GetModel(idx) and GetDescriptorNames()

    - nToConsider: length of the per-descriptor result vector; it is also
      passed through to the TreeUtils collection functions

    - verbose: >0 prints a report for every composite, >=0 prints the
      averaged report, <0 prints nothing

    - reportToExcel: if true (and the Excel wrapper could be imported) the
      averaged results are also appended to the current worksheet

  **Returns**

    a list with one entry per descriptor found:
      [name, value_0, ..., value_(nToConsider-1), sum of the values]
    The list is empty if _composites_ is empty or if the first composite
    has no more than two descriptor names.

  **Notes**

    - only models that are Tree.TreeNode instances contribute

    - each model adds 1/nModels to its composite's vector and each
      composite adds 1/nComposites of its vector to the global result

  """
  # an empty sequence used to die with an IndexError on composites[0]
  if not composites:
    return []
  nComposites = len(composites)
  ns = composites[0].GetDescriptorNames()
  if len(ns) <= 2:
    return []

  globalRes = {}
  descNames = {}
  nDone = 1
  for composite in composites:
    if verbose > 0:
      print('#------------------------------------')
      # two spaces: this is what "print 'Doing: ',nDone" emitted
      print('Doing:  %s'%(nDone,))
    nModels = len(composite)
    nDone += 1

    # per-composite results: descriptor id -> vector of length nToConsider
    res = {}
    for i in range(nModels):
      model = composite.GetModel(i)
      if isinstance(model,Tree.TreeNode):
        levels = TreeUtils.CollectLabelLevels(model,{},0,nToConsider)
        # descNames is filled in place with id -> name entries
        TreeUtils.CollectDescriptorNames(model,descNames,0,nToConsider)
        for descId in levels.keys():
          vect = res.get(descId)
          if vect is None:
            vect = zeros(nToConsider,Float)
            res[descId] = vect
          vect[levels[descId]] += 1./nModels

    # fold this composite into the running average
    for descId in res.keys():
      vect = globalRes.get(descId)
      if vect is None:
        vect = zeros(nToConsider,Float)
        globalRes[descId] = vect
      vect += res[descId]/nComposites

    # the per-composite report is written once, after the accumulation
    # is finished, and uses its own loop variable
    if verbose > 0:
      for descId in res.keys():
        name = descNames[descId]
        strRes = ', '.join(['%4.2f'%x for x in res[descId]])
        print('%s,%s,%5.4f'%(name,strRes,sum(res[descId])))
      print('')

  if verbose >= 0:
    print('# Average Descriptor Positions')

  xl = None
  if reportToExcel and Excel is not None:
    xl = Excel()
    # start below whatever is already in the first column of the sheet
    xlRow = xl.FindLastRow(1,1) + 1
    xl[xlRow,1] = ' '.join(sys.argv)
    xlRow += 1

  retVal = []
  for descId in globalRes.keys():
    name = descNames[descId]
    vals = globalRes[descId]
    total = sum(vals)
    if verbose >= 0:
      strRes = ', '.join(['%4.2f'%x for x in vals])
      print('%s,%s,%5.4f'%(name,strRes,total))
    if xl:
      xlCol = 1
      xl[xlRow,xlCol] = name
      xlCol += 1
      for v in vals:
        xl[xlRow,xlCol] = v
        xlCol += 1
      xl[xlRow,xlCol] = total
      xlRow += 1
    row = [name]
    row.extend(vals)
    row.append(total)
    retVal.append(row)
  if verbose >= 0:
    print('')
  return retVal
|
|
|
|
|
|
def _SampleDev(deviations):
  """ returns the sample standard deviation for an array that already holds
  deviations from the mean

  0.0 is returned for fewer than two points, where the n-1 denominator
  would be zero.

  """
  nPts = len(deviations)
  if nPts < 2:
    return 0.0
  return sqrt(sum(deviations**2)/(nPts-1))


def ErrorStats(conn,where,enrich=1):
  """ collects error statistics for the runs that _conn_ returns

  **Arguments**

    - conn: an object with a GetData(fields=,where=) method returning one
      row per run

    - where: the where clause handed to conn.GetData()

    - enrich: the target value passed to ScreenComposite.CalcEnrichment();
      a negative value turns the enrichment calculation off

  **Returns**

    None if the query fails or returns no rows, otherwise a dictionary
    with the keys:
      oAvg, oDev, oCorrectConf, oIncorrectConf, oResultMat, oBestIdx,
      oBestErr (plus oAvgEnrich, oDevEnrich when enrich>=0)
    and, when at least one row has a holdout error, the same set of keys
    with an 'h' prefix.

  **Notes**

    - averages, deviations, confidences and best errors are multiplied
      by 100; the result matrices are averaged but not scaled

    - holdout values are averaged over the rows that have a holdout error;
      hBestIdx is the index of the row in the full result set

    - oResultMat/hResultMat are None if no row provided that matrix

    - NOTE(review): the result matrices are stored as text and rebuilt
      with eval(), so this must only be pointed at trusted databases

  """
  fields = 'overall_error,holdout_error,overall_result_matrix,holdout_result_matrix,overall_correct_conf,overall_incorrect_conf,holdout_correct_conf,holdout_incorrect_conf'
  try:
    data = conn.GetData(fields=fields,where=where)
  except Exception:
    import traceback
    traceback.print_exc()
    return None
  nPts = len(data)
  if not nPts:
    sys.stderr.write('no runs found\n')
    return None

  # rows with holdout data are tracked explicitly instead of trusting
  # whatever the last row of the result set happens to contain
  holdoutRows = [i for i in range(nPts) if data[i][1] is not None]
  nHoldout = len(holdoutRows)

  overall = zeros(nPts,Float)
  overallEnrich = zeros(nPts,Float)
  oCorConf = 0.0
  oInCorConf = 0.0
  holdout = zeros(nHoldout,Float)
  holdoutEnrich = zeros(nHoldout,Float)
  hCorConf = 0.0
  hInCorConf = 0.0
  overallMatrix = None
  holdoutMatrix = None

  hIdx = 0  # position in the holdout-only arrays
  for i in range(nPts):
    row = data[i]
    if row[0] is not None:
      overall[i] = row[0]
      oCorConf += row[4]
      oInCorConf += row[5]

    # the None tests come before eval(), which cannot handle None
    if row[2] is not None:
      tmpOverall = 1.*eval(row[2])
      if enrich >=0:
        overallEnrich[i] = ScreenComposite.CalcEnrichment(tmpOverall,tgt=enrich)
      if overallMatrix is None:
        overallMatrix = tmpOverall
      else:
        overallMatrix += tmpOverall

    if row[1] is not None:
      holdout[hIdx] = row[1]
      if row[3] is not None:
        tmpHoldout = 1.*eval(row[3])
        if enrich >=0:
          holdoutEnrich[hIdx] = ScreenComposite.CalcEnrichment(tmpHoldout,tgt=enrich)
        # the first holdout matrix may show up on any row, not just row 0
        if holdoutMatrix is None:
          holdoutMatrix = tmpHoldout
        else:
          holdoutMatrix += tmpHoldout
      hCorConf += row[6]
      hInCorConf += row[7]
      hIdx += 1

  avgOverall = sum(overall)/nPts
  oCorConf /= nPts
  oInCorConf /= nPts
  if overallMatrix is not None:
    overallMatrix /= nPts
  oSort = argsort(overall)
  oMin = overall[oSort[0]]
  overall -= avgOverall
  devOverall = _SampleDev(overall)

  res = {}
  res['oAvg'] = 100*avgOverall
  res['oDev'] = 100*devOverall
  res['oCorrectConf'] = 100*oCorConf
  res['oIncorrectConf'] = 100*oInCorConf
  res['oResultMat']=overallMatrix
  res['oBestIdx']=oSort[0]
  res['oBestErr']=100*oMin
  if enrich>=0:
    mean,dev = Stats.MeanAndDev(overallEnrich)
    res['oAvgEnrich'] = mean
    res['oDevEnrich'] = dev

  if nHoldout:
    avgHoldout = sum(holdout)/nHoldout
    hCorConf /= nHoldout
    hInCorConf /= nHoldout
    if holdoutMatrix is not None:
      holdoutMatrix /= nHoldout
    hSort = argsort(holdout)
    hMin = holdout[hSort[0]]
    holdout -= avgHoldout
    devHoldout = _SampleDev(holdout)
    res['hAvg'] = 100*avgHoldout
    res['hDev'] = 100*devHoldout
    res['hCorrectConf'] = 100*hCorConf
    res['hIncorrectConf'] = 100*hInCorConf
    res['hResultMat']=holdoutMatrix
    # map the position in the holdout-only array back to the row index
    res['hBestIdx']=holdoutRows[hSort[0]]
    res['hBestErr']=100*hMin
    if enrich>=0:
      mean,dev = Stats.MeanAndDev(holdoutEnrich)
      res['hAvgEnrich'] = mean
      res['hDevEnrich'] = dev
  return res
|
|
|
|
def _ResultMatrixLines(resultMat):
  """ returns the text lines used to display one results matrix

  The matrix is transposed for display.  Each row is followed by
  100*diagonal/row sum; a dashed separator and a final line holding
  100*diagonal/column sum close the table.  Sums that are zero are
  replaced with 1 to avoid dividing by zero.

  An empty list is returned if _resultMat_ is None.

  """
  if resultMat is None:
    return []
  tmp = transpose(resultMat)
  # the axis is spelled out rather than relying on the default
  colCounts = sum(tmp,0)
  rowCounts = sum(tmp,1)
  nRows = len(tmp)
  lines = []
  for i in range(nRows):
    if rowCounts[i]==0: rowCounts[i]=1
    cells = ['% 6.2f'%x for x in tmp[i]]
    cells.append('\t| % 4.2f'%(100.*tmp[i,i]/rowCounts[i]))
    lines.append('\t\t'+' '.join(cells))
  lines.append('\t\t'+' '.join(['------']*nRows))
  cells = []
  for i in range(nRows):
    if colCounts[i]==0: colCounts[i]=1
    cells.append('% 6.2f'%(100.*tmp[i,i]/colCounts[i]))
  lines.append('\t\t'+' '.join(cells))
  return lines


def ShowStats(statD,enrich=1):
  """ prints the contents of a statistics dictionary to stdout

  **Arguments**

    - statD: a dictionary with the keys oAvg, oDev, oCorrectConf,
      oIncorrectConf, oBestIdx, oBestErr and oResultMat.  The matching
      'h' (holdout) keys and the oAvgEnrich/oDevEnrich and
      hAvgEnrich/hDevEnrich pairs are displayed if they are present.

    - enrich: the value shown in the enrichment label; enrichment lines
      are only printed when this is greater than -1

  **Notes**

    - _statD_ is not modified: the best-model indices are shifted to be
      1-based in a copy

  """
  statD = statD.copy()
  statD['oBestIdx'] = statD['oBestIdx']+1
  txt="""
# Error Statistics:
\tOverall: %(oAvg)6.3f%% (%(oDev)6.3f) %(oCorrectConf)4.1f/%(oIncorrectConf)4.1f
\t\tBest: %(oBestIdx)d %(oBestErr)6.3f%%"""%(statD)
  if 'hAvg' in statD:
    statD['hBestIdx'] = statD['hBestIdx']+1
    txt += """
\tHoldout: %(hAvg)6.3f%% (%(hDev)6.3f) %(hCorrectConf)4.1f/%(hIncorrectConf)4.1f
\t\tBest: %(hBestIdx)d %(hBestErr)6.3f%%
"""%(statD)
  print(txt)
  print('')
  print('# Results matrices:')

  print('\tOverall:')
  for line in _ResultMatrixLines(statD['oResultMat']):
    print(line)
  if enrich>-1 and 'oAvgEnrich' in statD:
    print('\t\tEnrich(%d): %.3f (%.3f)'%(enrich,statD['oAvgEnrich'],statD['oDevEnrich']))

  if 'hResultMat' in statD:
    print('\tHoldout:')
    for line in _ResultMatrixLines(statD['hResultMat']):
      print(line)
    if enrich>-1 and 'hAvgEnrich' in statD:
      print('\t\tEnrich(%d): %.3f (%.3f)'%(enrich,statD['hAvgEnrich'],statD['hDevEnrich']))
  return
|
|
|
|
|
|
def Usage():
  """ displays the module documentation (the command line help) and
  terminates the program with an exit status of -1

  """
  helpText = __doc__
  print(helpText)
  sys.exit(-1)
|
|
|
|
if __name__ == "__main__":
  # command line driver: collects the composites from pickle files or from
  # a database, reports descriptor usage and, for database runs, prints
  # the error statistics
  import getopt
  try:
    args,extras = getopt.getopt(sys.argv[1:],'n:d:N:vX',('skip',
                                                         'enrich=',
                                                         ))
  except getopt.GetoptError:
    Usage()

  count = 3
  db = None
  note = ''
  verbose = 0
  skip = 0
  enrich = 1
  reportToExcel=0
  for arg,val in args:
    if arg == '-n':
      count = int(val)+1
    elif arg == '-d':
      db = val
    elif arg == '-N':
      note = val
    elif arg == '-v':
      verbose = 1
    elif arg == '-X':
      if Excel is not None:
        reportToExcel = 1
      else:
        ScreenComposite.message('NOTE: Excel support not enabled, -X option ignored.')
    elif arg == '--skip':
      skip = 1
    elif arg == '--enrich':
      enrich = int(val)

  # NOTE(review): the models are unpickled, so only trusted files and
  # databases should be used as input
  composites = []
  if db is None:
    for arg in extras:
      inF = open(arg,'rb')
      try:
        composite = cPickle.load(inF)
      finally:
        # the file used to be left open until garbage collection
        inF.close()
      composites.append(composite)
  else:
    if not extras:
      # a table name is required along with -d
      Usage()
    tbl = extras[0]
    conn = DbConnect(db,tbl)
    if note:
      # double any single quotes so that the note cannot end the SQL
      # string literal early
      where="where note='%s'"%(note.replace("'","''"))
    else:
      where = ''
    if not skip:
      pkls = conn.GetData(fields='model',where=where)
      composites = []
      for pkl in pkls:
        pkl = str(pkl[0])
        comp = cPickle.loads(pkl)
        composites.append(comp)

  if composites:
    ProcessIt(composites,count,verbose=verbose,reportToExcel=reportToExcel)
  elif not skip:
    print('ERROR: no composite models found')
    sys.exit(-1)

  if db:
    res = ErrorStats(conn,where,enrich=enrich)
    if res:
      # forward --enrich so that the label and the on/off switch agree
      # with what ErrorStats calculated
      ShowStats(res,enrich=enrich)
|
|
|
|
|