mirror of
https://github.com/rdkit/rdkit.git
synced 2026-06-05 22:04:27 +08:00
1765 lines
57 KiB
Python
Executable File
1765 lines
57 KiB
Python
Executable File
# $Id$
|
|
#
|
|
# Copyright (C) 2000-2006 greg Landrum and Rational Discovery LLC
|
|
#
|
|
# @@ All Rights Reserved @@
|
|
#
|
|
""" command line utility for screening composite models
|
|
|
|
**Usage**
|
|
|
|
_ScreenComposite [optional args] modelfile(s) datafile_
|
|
|
|
Unless indicated otherwise (via command line arguments), _modelfile_ is
|
|
a file containing a pickled composite model and _datafile_ is a QDAT file.
|
|
|
|
**Command Line Arguments**
|
|
|
|
- -t *threshold value(s)*: use high-confidence predictions for the final
|
|
analysis of the hold-out data. The threshold value can be either a single
|
|
float or a list/tuple of floats. All thresholds should be between
|
|
0.0 and 1.0
|
|
|
|
- -D: do a detailed screen.
|
|
|
|
- -d *database name*: instead of reading the data from a QDAT file,
|
|
pull it from a database. In this case, the _datafile_ argument
|
|
provides the name of the database table containing the data set.
|
|
|
|
- -N *note*: use all models from the database which have this note.
|
|
The modelfile argument should contain the name of the table
|
|
with the models.
|
|
|
|
- -H: screen only the hold out set (works only if a version of
|
|
BuildComposite more recent than 1.2.2 was used).
|
|
|
|
- -T: screen only the training set (works only if a version of
|
|
BuildComposite more recent than 1.2.2 was used).
|
|
|
|
- -E: do a detailed Error analysis. This shows each misclassified
|
|
point and the number of times it was missed across all screened
|
|
composites. If the --enrich argument is also provided, only compounds
|
|
that have true activity value equal to the enrichment value will be
|
|
used.
|
|
|
|
- --enrich *enrichVal*: target "active" value to be used in calculating
|
|
enrichments.
|
|
|
|
- -A: show All predictions.
|
|
|
|
- -S: shuffle activity values before screening
|
|
|
|
- -R: randomize activity values before screening
|
|
|
|
- -F *filter frac*: filters the data before training to change the
|
|
distribution of activity values in the training set. *filter frac*
|
|
is the fraction of the training set that should have the target value.
|
|
**See note in BuildComposite help about data filtering**
|
|
|
|
- -v *filter value*: filters the data before training to change the
|
|
distribution of activity values in the training set. *filter value*
|
|
is the target value to use in filtering.
|
|
**See note in BuildComposite help about data filtering**
|
|
|
|
- -V: be verbose when screening multiple models
|
|
|
|
- -h: show this message and exit
|
|
|
|
- -X: send a summary of the results to Excel (NOTE: this will alter the
|
|
contents of the currently active workbook)
|
|
|
|
- --OOB: Do an "out-of-bag" generalization error estimate. This only
|
|
makes sense when applied to the original data set.
|
|
|
|
- --pickleCol *colId*: index of the column containing a pickled value
|
|
(used primarily for cases where fingerprints are used as descriptors)
|
|
|
|
*** Options for making Prediction (Hanneke) Plots ***
|
|
|
|
- --predPlot=<fileName>: triggers the generation of a Hanneke plot and
|
|
sets the name of the .txt file which will hold the output data.
|
|
A Gnuplot control file, <fileName>.gnu, will also be generated.
|
|
|
|
- --predActTable=<name> (optional): name of the database table
|
|
containing activity values. If this is not provided, activities
|
|
will be read from the same table containing the screening data
|
|
|
|
- --predActCol=<name> (optional): name of the activity column. If not
|
|
provided, the name of the last column in the activity table will
|
|
be used.
|
|
|
|
- --predLogScale (optional): If provided, the x axis of the
|
|
prediction plot (the activity axis) will be plotted using a log
|
|
scale
|
|
|
|
- --predShow: launch a gnuplot instance and display the prediction
|
|
plot (the plot will still be written to disk).
|
|
|
|
*** The following options are likely obsolete ***
|
|
|
|
- -P: read pickled data. The datafile argument should contain
|
|
a pickled data set. *relevant only to qdat files*
|
|
|
|
- -q: data are not quantized (the composite should take care of
|
|
quantization itself if it requires quantized data). *relevant only to
|
|
qdat files*
|
|
|
|
|
|
|
|
"""
|
|
import RDConfig
import DataStructs
import sys,cPickle,types,copy
from Numeric import *

# PIL is optional: _hasPil_ records whether it could be imported and is
# checked by GetScreenImage() before any image is drawn
try:
  from PIL import Image,ImageDraw
except ImportError:
  hasPil=0
else:
  hasPil=1

from ML.Data import DataUtils,SplitData
from ML import CompositeRun
from Dbase.DbConnection import DbConnect
from Dbase import DbModule
# module-level CompositeRun instance
# NOTE(review): presumably holds the command-line run parameters; it is not
# referenced by the functions in this part of the file -- confirm against
# the rest of the module
_details = CompositeRun.CompositeRun()
# Excel support is optional: _Excel_ is None when the wrapper is unavailable
try:
  from Excel.ExcelWrapper import ExcelWrapper as Excel
except ImportError:
  Excel = None

__VERSION_STRING="3.2.8"
|
|
|
|
def message(msg,noRet=0):
  """ emits messages to _sys.stdout_
    override this in modules which import this one to redirect output

    **Arguments**

      - msg: the object to be displayed. It is converted using _%s_
        formatting, so strings, numbers and tuples are all accepted.

      - noRet: (optional) if this is nonzero the message is followed by a
        single space instead of a newline

  """
  # msg is wrapped in a 1-tuple so that a tuple-valued msg is displayed as a
  # whole rather than being unpacked as the arguments of the format string
  if noRet:
    sys.stdout.write('%s '%(msg,))
  else:
    sys.stdout.write('%s\n'%(msg,))
|
|
def error(msg):
  """ emits messages to _sys.stderr_
    override this in modules which import this one to redirect output

    **Arguments**

      - msg: the object to be displayed. It is converted using _%s_
        formatting and prefixed with "ERROR: ".

  """
  # msg is wrapped in a 1-tuple so that a tuple-valued msg is displayed as a
  # whole rather than being unpacked as the arguments of the format string
  sys.stderr.write('ERROR: %s\n'%(msg,))
|
|
|
|
def CalcEnrichment(mat,tgt=1):
  """ calculates the enrichment of a target value from a vote table

    The enrichment is the fraction of the points predicted to be _tgt_
    which really are _tgt_, divided by the fraction of all points which
    really are _tgt_.

    **Arguments**

      - mat: a square 2D array of counts indexed as [actual,predicted]
        (the layout of the tables built by _ShowVoteResults()_)

      - tgt: (optional) the index of the target ("active") value

    **Returns**

      the enrichment as a float.  0.0 is returned when _tgt_ is not a
      valid index into _mat_, when nothing was predicted to be _tgt_ or
      when no point really is _tgt_ (the ratio is undefined in those cases).

  """
  if tgt<0 or tgt>=mat.shape[0]:
    return 0.0
  nPts = float(sum(sum(mat)))
  # column tgt: everything predicted to be tgt
  nTgtPred = float(sum(mat[:,tgt]))
  # row tgt: everything which really is tgt
  nTgtReal = float(sum(mat[tgt,:]))
  # either count being zero would lead to a division by zero below
  if not nTgtPred or not nTgtReal:
    return 0.0
  pctCorrect = mat[tgt,tgt]/nTgtPred
  pctOverall = nTgtReal/nPts
  return pctCorrect/pctOverall
|
|
|
|
|
|
def CollectResults(indices,dataSet,composite,callback=None,appendExamples=0,
                   errorEstimate=0):
  """ screens a set of examples through a composite and returns the
  results

    **Arguments**

      - indices: a sequence of indices into _dataSet_ selecting the
        examples to be screened

      - dataSet: the data set holding the examples (indexable by the
        values in _indices_).  It's assumed that the last element in each
        example is its "value"

      - composite: the composite model to be used

      - callback: (optional) if provided, this should be a function
        taking a single argument that is called after each example is
        screened with the zero-based position of that example in
        _indices_ as the argument.

      - appendExamples: (optional) this value is passed on to the
        composite's _ClassifyExample()_ method.

      - errorEstimate: (optional) calculate the "out of bag" error
        estimate for the composite using Breiman's definition.  This
        only makes sense when screening the original data set!
        [L. Breiman "Out-of-bag Estimation", UC Berkeley Dept of
        Statistics Technical Report (1996)]

    **Returns**

      a list of 3-tuples _len(indices)_ long:

        1) answer: the value from the example (quantized by the composite
           if it has activity quantization bounds)

        2) pred: the composite model's prediction

        3) conf: the confidence of the composite

  """
  trainSets = None
  if errorEstimate:
    # gather each model's training indices once, instead of re-fetching
    # every model and scanning its index list for every example.
    # Models without _trainIndices get None and are always used.
    trainSets = []
    for j in range(len(composite)):
      mdl = composite.GetModel(j)
      if hasattr(mdl,'_trainIndices'):
        trainSets.append(set(mdl._trainIndices))
      else:
        trainSets.append(None)

  res = []
  for i,idx in enumerate(indices):
    example = dataSet[idx]
    if trainSets is not None:
      # out-of-bag: only let models vote which were not trained on this point
      use = [j for j,trained in enumerate(trainSets)
             if trained is None or idx not in trained]
    else:
      use = None
    pred,conf = composite.ClassifyExample(example,appendExample=appendExamples,
                                          onlyModels=use)
    if composite.GetActivityQuantBounds():
      answer = composite.QuantizeActivity(example)[-1]
    else:
      answer = example[-1]
    res.append((answer,pred,conf))
    if callback:
      callback(i)
  return res
|
|
|
|
def DetailedScreen(indices,data,composite,threshold=0,screenResults=None,
                   goodVotes=None,badVotes=None,noVotes=None,callback=None,
                   appendExamples=0,errorEstimate=0):
  """ screens a set of examples across a composite and sorts the
    predictions into *correct*, *incorrect* and *unclassified* sets.

    **Arguments**

      - indices: a sequence of indices into _data_ selecting the examples
        to be screened

      - data: the data set holding the examples.  It's assumed that the
        last element in each example is its "value"

      - composite: the composite model to be used

      - threshold: (optional) a prediction is only kept if its
        confidence is strictly greater than this value

      - screenResults: (optional) the results of a previous screen
        (a sequence of 3-tuples in the format returned by
        _CollectResults()_).  If this is provided, the examples will not
        be screened again.

      - goodVotes,badVotes,noVotes: (optional) if provided these should
        be lists (or anything supporting an _append()_ method) which
        will be used to pass the screening results back.  Each entry
        added is a 4-tuple: (answer,prediction,confidence,position in
        the screening results)

      - callback, appendExamples, errorEstimate: (optional) passed
        through unchanged to _CollectResults()_

    **Notes**

      - since this function doesn't return anything, if one or more of
        the arguments _goodVotes_, _badVotes_, and _noVotes_ is not
        provided, there's not much reason to call it

  """
  if goodVotes is None:
    goodVotes = []
  if badVotes is None:
    badVotes = []
  if noVotes is None:
    noVotes = []
  if screenResults is None:
    screenResults = CollectResults(indices,data,composite,callback=callback,
                                   appendExamples=appendExamples,
                                   errorEstimate=errorEstimate)

  for pos,(answer,pred,conf) in enumerate(screenResults):
    vote = (answer,pred,conf,pos)
    if not conf > threshold:
      # not confident enough: the prediction is rejected
      noVotes.append(vote)
    elif pred != answer:
      badVotes.append(vote)
    else:
      goodVotes.append(vote)
|
|
|
|
def ShowVoteResults(indices,data,composite,nResultCodes,threshold,verbose=1,
                    screenResults=None,callback=None,appendExamples=0,
                    goodVotes=None,badVotes=None,noVotes=None,
                    errorEstimate=0):
  """ screens the results and shows a detailed workup

    The work of doing the screening and processing the results is
    handled by _DetailedScreen()_

    **Arguments**

      - indices: a sequence of indices into _data_ selecting the examples
        to be screened

      - data: the data set holding the examples.  It's assumed that the
        last element in each example is its "value"

      - composite: the composite model to be used

      - nResultCodes: the number of possible results the composite can
        return

      - threshold: the threshold to be used to decide whether or not a
        given prediction should be kept

      - verbose: (optional) if nonzero a summary and the results table
        are written out using _message()_

      - screenResults: (optional) the results of a previous screen
        (a sequence of 3-tuples in the format returned by
        _CollectResults()_).  If this is provided, the examples will not
        be screened again.

      - callback: (optional) passed on to _DetailedScreen()_

      - appendExamples: (optional) this value is passed on to the
        composite's _ClassifyExample()_ method.

      - goodVotes,badVotes,noVotes: (optional) if provided these should
        be lists (or anything supporting an _append()_ method) which
        will be used to pass the screening results back.

      - errorEstimate: (optional) calculate the "out of bag" error
        estimate for the composite using Breiman's definition.  This
        only makes sense when screening the original data set!
        [L. Breiman "Out-of-bag Estimation", UC Berkeley Dept of
        Statistics Technical Report (1996)]

    **Returns**

      a 7-tuple:

        1) the number of good (correct) predictions

        2) the number of bad (incorrect) predictions

        3) the number of predictions skipped due to the _threshold_

        4) the average confidence in the good predictions

        5) the average confidence in the bad predictions

        6) the average confidence in the skipped predictions

        7) the results table: an integer array of counts indexed as
           [answer,prediction]

  """
  def _pct(num,denom):
    # percentage which reports 0 instead of failing on an empty denominator
    if not denom:
      return 0.0
    return 100.*float(num)/denom

  nExamples = len(indices)
  if goodVotes is None:
    goodVotes = []
  if badVotes is None:
    badVotes = []
  if noVotes is None:
    noVotes = []
  DetailedScreen(indices,data,composite,threshold,screenResults=screenResults,
                 goodVotes=goodVotes,badVotes=badVotes,noVotes=noVotes,callback=callback,
                 appendExamples=appendExamples,errorEstimate=errorEstimate)
  nBad = len(badVotes)
  nGood = len(goodVotes)
  nSkip = len(noVotes)
  nClassified = nGood + nBad

  if verbose:
    message('\n\t*** Vote Results ***')
    message('misclassified: %d/%d (%%%4.2f)\t%d/%d (%%%4.2f)'%(nBad,nExamples,
                                                            _pct(nBad,nExamples),
                                                            nBad,nClassified,
                                                            _pct(nBad,nClassified)))
    if nSkip > 0:
      message('skipped: %d/%d (%%% 4.2f)'%(nSkip,nExamples,_pct(nSkip,nExamples)))

  # average confidences; each vote is (answer,prediction,confidence,index)
  if nSkip > 0:
    avgSkip = sum(array([vote[2] for vote in noVotes]))/float(nSkip)
  else:
    avgSkip = 0.

  if nBad > 0:
    avgBad = sum(array([vote[2] for vote in badVotes]))/float(nBad)
  else:
    avgBad = 0.

  if nGood > 0:
    avgGood = sum(array([vote[2] for vote in goodVotes]))/float(nGood)
  else:
    avgGood = 0.

  if verbose:
    message('')
    message('average correct confidence: % 6.4f'%avgGood)
    message('average incorrect confidence: % 6.4f'%avgBad)

  # rows of the table are the true values, columns are the predictions
  voteTab = zeros((nResultCodes,nResultCodes),Int)
  for ans,res,conf,idx in goodVotes:
    voteTab[res,res] += 1
  for ans,res,conf,idx in badVotes:
    voteTab[ans,res] += 1

  if verbose:
    message('')
    message('\tResults Table:')
    # the table is displayed transposed: one row per predicted value
    vTab=transpose(voteTab)
    colCounts = sum(vTab)
    rowCounts = sum(vTab,1)
    message('')
    for i in range(nResultCodes):
      # avoid dividing by zero for values which never occur
      if rowCounts[i]==0:
        rowCounts[i]=1
      row = vTab[i]
      message(' ',noRet=1)
      for j in range(nResultCodes):
        message(' % 6d'%row[j],noRet=1)
      message(' | % 4.2f'%(100.*vTab[i,i]/rowCounts[i]))
    message(' ',noRet=1)
    for i in range(nResultCodes):
      message('-------',noRet=1)
    message('')
    message(' ',noRet=1)
    for i in range(nResultCodes):
      if colCounts[i]==0:
        colCounts[i]=1
      message(' % 6.2f'%(100.*vTab[i,i]/colCounts[i]),noRet=1)
    message('')

  return nGood,nBad,nSkip,avgGood,avgBad,avgSkip,voteTab
|
|
|
|
def ScreenIt(composite,indices,data,partialVote=0,voteTol=0.0,verbose=1,screenResults=None,
             goodVotes=None,badVotes=None,noVotes=None):
  """ screens a set of data using a composite model and prints out
             statistics about the screen.

    The work of doing the screening and processing the results is
    handled by _DetailedScreen()_

    **Arguments**

      - composite: the composite model to be used

      - indices: a sequence of indices into _data_ selecting the examples
        to be screened

      - data: the examples to be screened (a sequence of sequences)
        it's assumed that the last element in each example is its "value"

      - partialVote: (optional) toggles use of the threshold value in
        the screening.  When this is zero _voteTol_ is ignored and a
        threshold of 0.0 is used.

      - voteTol: (optional) the threshold to be used to decide whether or not a
        given prediction should be kept

      - verbose: (optional) if nonzero, statistics about the screen are
        written out using _message()_

      - screenResults: (optional) the results of a previous screen
        (a sequence of 3-tuples in the format returned by
        _CollectResults()_).  If this is provided, the examples will not
        be screened again.

      - goodVotes,badVotes,noVotes: (optional) if provided these should
        be lists (or anything supporting an _append()_ method) which
        will be used to pass the screening results back.

    **Returns**

      a 7-tuple:

        1) the number of good (correct) predictions

        2) the number of bad (incorrect) predictions

        3) the number of predictions skipped due to the _threshold_

        4) the average confidence in the good predictions

        5) the average confidence in the bad predictions

        6) the average confidence in the skipped predictions

        7) None

      Averages over an empty set of predictions are reported as 0.

  """
  def _pct(num,denom):
    # percentage which reports 0 instead of failing on an empty denominator
    if not denom:
      return 0.0
    return 100.*float(num)/denom

  if goodVotes is None:
    goodVotes = []
  if badVotes is None:
    badVotes = []
  if noVotes is None:
    noVotes = []

  if not partialVote:
    voteTol = 0.0

  DetailedScreen(indices,data,composite,voteTol,screenResults=screenResults,
                 goodVotes=goodVotes,badVotes=badVotes,noVotes=noVotes)

  nGood = len(goodVotes)
  goodAccum = 0.
  for res,pred,conf,idx in goodVotes:
    goodAccum += conf

  misCount = len(badVotes)
  badAccum = 0.
  for res,pred,conf,idx in badVotes:
    badAccum += conf

  nSkipped = len(noVotes)
  goodSkipped = 0
  badSkipped = 0
  skipAccum = 0.
  for ans,pred,conf,idx in noVotes:
    skipAccum += conf
    if ans != pred:
      badSkipped += 1
    else:
      goodSkipped += 1

  # the averages are calculated before anything is displayed so that the
  # verbose output can use the same zero-safe values that are returned
  avgGood=0
  avgBad=0
  avgSkip=0
  if nGood:
    avgGood = goodAccum/nGood
  if misCount:
    avgBad = badAccum/misCount
  if nSkipped:
    avgSkip = skipAccum/nSkipped

  nData = nGood + misCount + nSkipped
  if verbose:
    message('Total N Points: %s'%(nData,))
    if partialVote:
      nCounted = nData-nSkipped
      message('Misclassifications: %d (%%%4.2f)'%(misCount,_pct(misCount,nCounted)))
      message('N Skipped: %d (%%%4.2f)'%(nSkipped,_pct(nSkipped,nData)))
      message('\tGood Votes Skipped: %d (%%%4.2f)'%(goodSkipped,_pct(goodSkipped,nSkipped)))
      message('\tBad Votes Skipped: %d (%%%4.2f)'%(badSkipped,_pct(badSkipped,nSkipped)))
    else:
      message('Misclassifications: %d (%%%4.2f)'%(misCount,_pct(misCount,nData)))
      message('Average Correct Vote Confidence: % 6.4f'%avgGood)
      message('Average InCorrect Vote Confidence: % 6.4f'%avgBad)

  return nGood,misCount,nSkipped,avgGood,avgBad,avgSkip,None
|
|
|
|
def _processVoteList(votes,data):
  """ *Internal Use Only*

    replaces the index stored in each vote with the data point it refers
    to, i.e. (answer,prediction,confidence,idx) becomes
    (answer,prediction,confidence,data[idx])

    **Arguments**

      - votes: a list of 4 tuples: (answer, prediction, confidence,
        index)

      - data: the data set the indices refer to (anything that can be
        indexed with them)

    **Note**: alterations are done in place in the _votes_ list

  """
  for pos,vote in enumerate(votes):
    answer,prediction,confidence,dataIdx = vote
    votes[pos] = (answer,prediction,confidence,data[dataIdx])
|
|
|
|
def PrepareDataFromDetails(model,details,data,verbose=0):
  """ picks the indices of the points to be screened based on the options
    set in _details_

    **Arguments**

      - model: the composite model to be screened.  Its _splitFrac_
        attribute is required when only the hold-out or training set is
        to be screened.

      - details: an object carrying the (optional) attributes _doHoldout_,
        _doTraining_, _errorEstimate_, _filterFrac_ and _filterVal_

      - data: the data set; it must provide a _GetNPts()_ method

      - verbose: (optional) if nonzero a short progress marker is
        written with _message()_ for the split ('s') and filter ('f')
        steps

    **Returns**

      a 2-tuple of index lists: (trainIdx,testIdx).  _testIdx_ holds
      the indices to be screened.  If _details.doTraining_ is set the two
      lists are swapped before being returned.

    **Raises**

      - ValueError: if hold-out or training-set screening is requested
        for a model that has no _splitFrac_ attribute (previously this
        case failed later with an unbound local variable)

  """
  doHoldout = hasattr(details,'doHoldout') and details.doHoldout
  doTraining = hasattr(details,'doTraining') and details.doTraining
  errorEstimate = hasattr(details,'errorEstimate') and details.errorEstimate
  doFilter = hasattr(details,'filterFrac') and details.filterFrac != 0.0

  if doHoldout or doTraining:
    try:
      splitF = model._splitFrac
    except AttributeError:
      # without the split fraction there is no way to rebuild the split
      raise ValueError('hold-out/training-set screening requires a model '
                       'with a _splitFrac attribute')
    if verbose:
      message('s',noRet=1)

    if errorEstimate and doHoldout:
      message('*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*')
      message('****** WARNING: OOB screening should not be combined with doHoldout option.')
      message('*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*')
    trainIdx,testIdx = SplitData.SplitIndices(data.GetNPts(),splitF,silent=1)

    if doFilter:
      if verbose:
        message('f',noRet=1)
      # NOTE(review): FilterData presumably returns (kept,rejected) index
      # lists; the points rejected from the training set join the test set
      trainFilt,temp = DataUtils.FilterData(data,details.filterVal,
                                            details.filterFrac,-1,
                                            indicesToUse=trainIdx,
                                            indicesOnly=1)
      testIdx += temp
      trainIdx = trainFilt
  elif errorEstimate:
    # the OOB screening works by checking to see if a given index
    # is in the training set of each model (see CollectResults()), so
    # every point of the data set ends up in the test set here
    if doFilter:
      if verbose:
        message('f',noRet=1)
      testIdx,trainIdx = DataUtils.FilterData(data,details.filterVal,
                                              details.filterFrac,-1,
                                              indicesToUse=list(range(data.GetNPts())),
                                              indicesOnly=1)
      testIdx.extend(trainIdx)
    else:
      testIdx = list(range(data.GetNPts()))
      trainIdx = []
  else:
    testIdx = list(range(data.GetNPts()))
    trainIdx = []
  if doTraining:
    testIdx,trainIdx = trainIdx,testIdx
  return trainIdx,testIdx
|
|
|
|
def ScreenFromDetails(models,details,callback=None,setup=None,appendExamples=0,
                      goodVotes=None,badVotes=None,noVotes=None,data=None,
                      enrichments=None):
  """  Screens a set of data using a _CompositeRun.CompositeRun_
    instance to provide parameters

    Unless _data_ is provided, the data to be used are obtained from
    _details.GetDataSet()_

    Aside from dataset construction, _ShowVoteResults()_ does most of
    the heavy lifting here.

    **Arguments**

      - models: a composite model or a list/tuple of composite models

      - details: a _CompositeRun.CompositeRun_ object containing details
        (options, parameters, etc.) about the run

      - callback: (optional) if provided, this should be a function
        taking a single argument that is called after each example is
        screened.  The argument is the position of the example offset by
        (model number)*(number of points in the data set).

      - setup: (optional) a function taking a single argument which is
        called at the start of screening with the number of points to
        be screened as the argument.

      - appendExamples: (optional) this value is passed on to the
        composite's _ClassifyExample()_ method.

      - goodVotes,badVotes,noVotes: (optional) if provided these should
        be lists (or anything supporting an _append()_ method) which
        will be used to pass the screening results back.  They are only
        filled when a single model is screened; with several models
        fresh lists are used for each model.

      - data: (optional) the data set to be screened

      - enrichments: (optional) a list at least as long as the number of
        models.  If _details.enrichTgt_ is set and >= 0, the enrichment of
        each model is stored in the corresponding position.

    **Returns**

      a 7-tuple:

        1) the number of good (correct) predictions

        2) the number of bad (incorrect) predictions

        3) the number of predictions skipped due to the _threshold_

        4) the average confidence in the good predictions

        5) the average confidence in the bad predictions

        6) the average confidence in the skipped predictions

        7) the results table

      When more than one model is screened, each of the first six entries
      is an (average,deviation) 2-tuple calculated across the models and
      the results table is the average of the per-model tables.

  """
  if data is None:
    if hasattr(details,'pickleCol'):
      data = details.GetDataSet(pickleCol=details.pickleCol,
                                pickleClass=DataStructs.ExplicitBitVect)
    else:
      data = details.GetDataSet()

  # a single model is handled as a 1-element collection
  if not isinstance(models,(list,tuple)):
    models = (models,)

  nModels = len(models)

  def _avgAndDev(vals):
    # mean and sample standard deviation of a per-model array
    avg = sum(vals)/nModels
    dev = sqrt(sum((vals-avg)**2)/(nModels-1))
    return avg,dev

  if setup is not None:
    setup(nModels*data.GetNPts())

  nGood = zeros(nModels,Float)
  nBad = zeros(nModels,Float)
  nSkip = zeros(nModels,Float)
  confGood = zeros(nModels,Float)
  confBad = zeros(nModels,Float)
  confSkip = zeros(nModels,Float)
  voteTab = None
  if goodVotes is None:
    goodVotes = []
  if badVotes is None:
    badVotes = []
  if noVotes is None:
    noVotes = []
  if enrichments is None:
    enrichments = [0.0]*nModels
  # NOTE(review): these per-point miss counts are filled in below when
  # details.errorAnalysis is set, but nothing in this function reads them
  badVoteDict = {}
  noVoteDict = {}

  for i in range(nModels):
    if nModels>1:
      # the vote lists are per-model, start each model with empty ones
      goodVotes = []
      badVotes = []
      noVotes = []
    model = models[i]

    try:
      seed = model._randomSeed
    except AttributeError:
      pass
    else:
      DataUtils.InitRandomNumbers(seed)

    doShuffle = hasattr(details,'shuffleActivities') and details.shuffleActivities
    doRandom = hasattr(details,'randomActivities') and details.randomActivities
    if doShuffle or doRandom:
      if doShuffle:
        shuffle = 1
      else:
        shuffle = 0
      randomize = 1
      DataUtils.RandomizeActivities(data,shuffle=shuffle,
                                    runDetails=details)
    else:
      randomize = 0
      shuffle = 0

    if hasattr(model,'_shuffleActivities') and \
       model._shuffleActivities and \
       not shuffle:
      message('*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*')
      message('****** WARNING: Shuffled model being screened with unshuffled data.')
      message('*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*')
    if hasattr(model,'_randomizeActivities') and \
       model._randomizeActivities and \
       not randomize:
      message('*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*')
      message('****** WARNING: Random model being screened with non-random data.')
      message('*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*')

    trainIdx,testIdx = PrepareDataFromDetails(model,details,data)

    nPossible = model.GetQuantBounds()[1]
    if callback:
      # offset the progress value by the points used for earlier models
      cb = lambda x,y=callback,z=i*data.GetNPts():y(x+z)
    else:
      cb = None
    if not hasattr(details,'errorEstimate') or not details.errorEstimate:
      errorEstimate = 0
    else:
      errorEstimate = 1
    g,b,s,aG,aB,aS,vT = ShowVoteResults(testIdx,data,model,nPossible[-1],
                                        details.threshold,verbose=0,
                                        callback=cb,appendExamples=appendExamples,
                                        goodVotes=goodVotes,badVotes=badVotes,
                                        noVotes=noVotes,
                                        errorEstimate=errorEstimate)
    if voteTab is None:
      voteTab = zeros(vT.shape,Float)

    useEnrich = hasattr(details,'enrichTgt') and details.enrichTgt >=0
    if hasattr(details,'errorAnalysis') and details.errorAnalysis:
      # count how often each point was missed or skipped; when an
      # enrichment target is set only points with that true value count
      for votes,tally in ((badVotes,badVoteDict),(noVotes,noVoteDict)):
        for a,p,c,idx in votes:
          label = testIdx[idx]
          if not useEnrich or a==details.enrichTgt:
            tally[label] = tally.get(label,0)+1

    voteTab += vT
    nGood[i] = g
    nBad[i] = b
    nSkip[i] = s
    confGood[i] = aG
    confBad[i] = aB
    confSkip[i] = aS

    if useEnrich:
      enrichments[i] = CalcEnrichment(vT,tgt=details.enrichTgt)

  if nModels == 1:
    return g,b,s,aG,aB,aS,vT

  voteTab /= nModels
  return _avgAndDev(nGood),_avgAndDev(nBad),_avgAndDev(nSkip),\
         _avgAndDev(confGood),_avgAndDev(confBad),_avgAndDev(confSkip),\
         voteTab
|
|
|
|
|
|
def GetScreenImage(nGood,nBad,nRej,size=None):
  """ draws a pie chart showing the breakdown of the screening results

    **Arguments**

      - nGood,nBad,nRej: the number of correct, incorrect and rejected
        predictions.  If these cannot be converted to floats, the first
        element of each is used instead (e.g. the (average,deviation)
        pairs produced when several models are screened)

      - size: (optional) the (width,height) of the image.  The default
        is (200,200) if _RDConfig.doingDemo_ is set and (100,100)
        otherwise.

    **Returns**

      a PIL image, or None if PIL is not available or there are no
      predictions at all

  """
  if not hasPil:
    return None
  try:
    nTot = float(nGood)+float(nBad)+float(nRej)
  except TypeError:
    nGood,nBad,nRej = nGood[0],nBad[0],nRej[0]
    nTot = float(nGood)+float(nBad)+float(nRej)

  if not nTot:
    return None

  if size is None:
    size = (100,100)
    if RDConfig.doingDemo:
      size = (200,200)

  img = Image.new('RGB',size,(255,255,255))
  draw = ImageDraw.Draw(img)
  box = (0,0,size[0]-1,size[1]-1)

  # (fraction of the pie, fill color) in drawing order: good, bad, rejected
  wedges = ((float(nGood)/nTot,(100,100,255)),
            (float(nBad)/nTot,(255,100,100)),
            (float(nRej)/nTot,(255,255,100)))
  # each wedge starts at the angle where the previous one ended
  angle = -90
  for frac,color in wedges:
    nextAngle = int(angle + frac*360)
    draw.pieslice(box,angle,nextAngle,fill=color)
    angle = nextAngle

  return img
|
|
|
|
|
|
def ScreenToHtml(nGood,nBad,nRej,avgGood,avgBad,avgSkip,voteTable,imgDir='.',
                 fullPage=1,skipImg=0,includeDefs=1):
  """ returns the text of a web page showing the screening details

    **Arguments**

      - nGood: number of correct predictions

      - nBad: number of incorrect predictions

      - nRej: number of rejected predictions

      - avgGood: average correct confidence

      - avgBad: average incorrect confidence

      - avgSkip: average rejected confidence

      - voteTable: vote table

      - imgDir: (optional) the directory to be used to hold the vote
        image (if constructed)

      - fullPage: (optional) if nonzero the text is wrapped in
        html/body tags and given a heading

      - skipImg: (optional) if nonzero no vote image is generated

      - includeDefs: (optional) if nonzero the definitions of "% Pure"
        and "% Accurate" are appended

    **Notes**

      - if _nGood_ is a tuple, the first six arguments are all treated as
        (average,deviation) pairs from screening multiple models

      - unless _skipImg_ is set, the vote image is written to the file
        votes.png in _imgDir_ as a side effect

    **Returns**

      a string containing HTML

  """
  multModels = isinstance(nGood,tuple)

  if fullPage:
    outTxt = ["""<html><body>"""]
    outTxt.append('<center><h2>VOTE DETAILS</h2></center>')
  else:
    outTxt = []

  if RDConfig.doingDemo:
    outTxt.append('<font size="+2">')
  else:
    outTxt.append('<font>')

  # Get the image
  if not skipImg:
    img = GetScreenImage(nGood,nBad,nRej)
    if img:
      if imgDir:
        imgFileName = '/'.join((imgDir,'votes.png'))
      else:
        imgFileName = 'votes.png'
      img.save(imgFileName)
      outTxt.append('<center><img src="%s"></center>'%(imgFileName))

  nPoss = len(voteTable)
  pureCounts = sum(voteTable,1)
  accCounts = sum(voteTable,0)
  pureVect = zeros(nPoss,Float)
  accVect = zeros(nPoss,Float)
  for i in range(nPoss):
    if pureCounts[i]:
      pureVect[i] = float(voteTable[i,i])/pureCounts[i]
    if accCounts[i]:
      accVect[i] = float(voteTable[i,i])/accCounts[i]

  # averaged (multi-model) tables hold floats, single-model tables counts
  if multModels:
    cellFmt = '%.2f'
  else:
    cellFmt = '%d'

  outTxt.append('<center><table border=1>')
  outTxt.append('<tr><td></td>')
  for i in range(nPoss):
    outTxt.append('<th>%d</th>'%i)
  outTxt.append('<th>% Accurate</th>')
  outTxt.append('</tr>')
  for i in range(nPoss):
    outTxt.append('<tr><th>%d</th>'%(i))
    for j in range(nPoss):
      cellTxt = cellFmt%(voteTable[j,i])
      if i == j:
        outTxt.append('<td bgcolor="#A0A0FF">%s</td>'%(cellTxt))
      else:
        outTxt.append('<td>%s</td>'%(cellTxt))
    outTxt.append('<td>%4.2f</td>'%(100.0*accVect[i]))
    # the row is closed exactly once, after the optional "Predicted" label
    if i == 0:
      outTxt.append('<th rowspan=%d>Predicted</th></tr>'%(nPoss))
    else:
      outTxt.append('</tr>')
  outTxt.append('<tr><th>% Pure</th>')
  for i in range(nPoss):
    outTxt.append('<td>%4.2f</td>'%(100.0*pureVect[i]))
  outTxt.append('</tr>')
  outTxt.append('<tr><td></td><th colspan=%d>Original</th></tr>'%(nPoss))
  outTxt.append('</table></center>')

  if not multModels:
    nTotal = nBad+nGood+nRej
    nClass = nBad+nGood
    if nClass:
      pctErr = 100.*float(nBad)/nClass
    else:
      pctErr = 0.0

    outTxt.append('<p>%d of %d examples were misclassified (%%%4.2f)'%(nBad,nGood+nBad,pctErr))
    if nRej > 0:
      pctErr = 100.*float(nBad)/(nGood+nBad+nRej)
      outTxt.append('<p> %d of %d overall: (%%%4.2f)'%(nBad,nTotal,pctErr))
      pctRej = 100.*float(nRej)/nTotal
      outTxt.append('<p>%d of %d examples were rejected (%%%4.2f)'%(nRej,nTotal,pctRej))
    if nGood != 0:
      outTxt.append('<p>The correctly classified examples had an average confidence of %6.4f'%avgGood)

    if nBad != 0:
      outTxt.append('<p>The incorrectly classified examples had an average confidence of %6.4f'%avgBad)
    if nRej != 0:
      outTxt.append('<p>The rejected examples had an average confidence of %6.4f'%avgSkip)
  else:
    # every quantity is an (average,deviation) pair here; the decisions
    # below are made using the averages
    nTotal = nBad[0]+nGood[0]+nRej[0]
    nClass = nBad[0]+nGood[0]
    devClass = nBad[1]+nGood[1]
    if nClass:
      pctErr = 100.*float(nBad[0])/nClass
      devPctErr = 100.*float(nBad[1])/nClass
    else:
      pctErr = 0.0
      devPctErr = 0.0

    outTxt.append('<p>%.2f(%.2f) of %.2f(%.2f) examples were misclassified (%%%4.2f(%4.2f))'%\
                  (nBad[0],nBad[1],nClass,devClass,pctErr,devPctErr))
    if nRej[0] > 0:
      pctErr = 100.*float(nBad[0])/nTotal
      devPctErr = 100.*float(nBad[1])/nTotal
      outTxt.append('<p> %.2f(%.2f) of %d overall: (%%%4.2f(%4.2f))'%\
                    (nBad[0],nBad[1],nTotal,pctErr,devPctErr))
      pctRej = 100.*float(nRej[0])/nTotal
      devPctRej = 100.*float(nRej[1])/nTotal
      outTxt.append('<p>%.2f(%.2f) of %d examples were rejected (%%%4.2f(%4.2f))'%\
                    (nRej[0],nRej[1],nTotal,pctRej,devPctRej))
    if nGood[0] != 0:
      outTxt.append('<p>The correctly classified examples had an average confidence of %6.4f(%.4f)'%avgGood)

    if nBad[0] != 0:
      outTxt.append('<p>The incorrectly classified examples had an average confidence of %6.4f(%.4f)'%avgBad)
    if nRej[0] != 0:
      outTxt.append('<p>The rejected examples had an average confidence of %6.4f(%.4f)'%avgSkip)

  outTxt.append('</font>')
  if includeDefs:
    txt = """
<p><b>Definitions:</b>
<ul>
<li> <i>% Pure:</i> The percentage of, for example, known positives predicted to be positive.
<li> <i>% Accurate:</i> The percentage of, for example, predicted positives that actually
are positive.
</ul>
"""
    outTxt.append(txt)

  if fullPage:
    outTxt.append("""</body></html>""")
  return '\n'.join(outTxt)
|
|
|
|
|
|
def _ActivityAsFloat(act):
  """ converts an activity value pulled from the database to a number

    Missing values (either a real None or the string 'None') are mapped to 0;
    everything else is passed through float().

  """
  if act is None or act == 'None':
    return 0
  return float(act)


def MakePredPlot(details,indices,data,goodVotes,badVotes,nRes,idCol=0,verbose=0):
  """ writes a gnuplot data file and script plotting prediction confidence
    against the continuous activity values stored in the database

    Two files are produced: _details.predPlot_ (the data) and
    _details.predPlot_.gnu (the gnuplot script).  Nothing is done if
    _details_ has no (or an empty) _predPlot_ attribute.

    **Arguments**

      - details: a CompositeRun.RunDetails object

      - indices: a sequence of integer indices into _data_

      - data: the data set in question.  We assume that the ids for
        the data points are in the _idCol_ column

      - goodVotes/badVotes: predictions where the model was correct/incorrect.
        These are sequences of 4-tuples:
          (answer,prediction,confidence,index into _indices_)

      - nRes: the number of prediction codes; one plot series is generated
        for each value in range(nRes)

      - idCol: the column holding the point ids, both in _data_ and in the
        database table

      - verbose: if nonzero, progress messages are emitted

    **Notes**

      - activity values that are missing from the database are written as 0

      - if _details.predShow_ is set, the script is loaded into Gnuplot and
        the user is prompted before continuing

  """
  if not hasattr(details,'predPlot') or not details.predPlot:
    return

  if verbose: message('\n-> Constructing Prediction (Hanneke) Plot')
  # first get the ids of the data points we screened:
  ptIds = [data[x][idCol] for x in indices]

  # get a connection to the database we'll use to grab the continuous
  # activity values:
  origConn = DbConnect(details.dbName,details.tableName,
                       user=details.dbUser,password=details.dbPassword)
  idName = origConn.GetColumnNames()[idCol]
  if not hasattr(details,'predActTable') or \
     not details.predActTable or \
     details.predActTable==details.tableName:
    actConn = origConn
  else:
    actConn = DbConnect(details.dbName,details.predActTable,
                        user=details.dbUser,password=details.dbPassword)
  if verbose: message('\t-> Pulling Activity Data')

  # the ids are passed to the query as strings
  if type(ptIds[0]) not in [type(''),type(u'')]:
    ptIds = [str(x) for x in ptIds]
  whereL = [DbModule.placeHolder]*len(ptIds)
  if hasattr(details,'predActCol') and details.predActCol:
    actColName = details.predActCol
  else:
    actColName = actConn.GetColumnNames()[-1]

  whereTxt = "%s in (%s)"%(idName,','.join(whereL))
  rawD = actConn.GetData(fields='%s,%s'%(idName,actColName),
                         where=whereTxt,extras=ptIds)

  # order the data returned so that acts[i] belongs to ptIds[i]; the lookup
  # table maps each id to the first position at which it occurs
  if verbose: message('\t-> Creating Plot')
  idToPos = {}
  for pos,ptId in enumerate(ptIds):
    idToPos.setdefault(ptId,pos)
  acts = [None]*len(ptIds)
  for ptId,act in rawD:
    acts[idToPos[ptId]] = act

  # the data file: correct predictions first, then the incorrect ones
  outF = open(details.predPlot,'w+')
  try:
    outF.write('#ID Pred Conf %s\n'%(actColName))
    for votes in (goodVotes,badVotes):
      for ans,pred,conf,idx in votes:
        outF.write('%s %d %.4f %f\n'%(ptIds[idx],pred,conf,
                                      _ActivityAsFloat(acts[idx])))
  finally:
    outF.close()

  useLog = hasattr(details,'predLogScale') and details.predLogScale
  if not useLog:
    actLabel = actColName
  else:
    actLabel = 'log(%s)'%(actColName)
  actLabel = actLabel.replace('_',' ')
  gnuHdr="""# Generated by ScreenComposite.py version: %s
set size square 0.7
set yrange [:1]
set data styl points
set ylab 'confidence'
set xlab '%s'
set grid
set nokey
set term postscript enh color solid "Helvetica" 16
set term win
"""%(__VERSION_STRING,actLabel)
  gnuTail="""
# EOF
"""
  # one series per prediction code: points whose prediction (column 2)
  # differs from the code evaluate to 0/0 in the gnuplot expression
  if not useLog:
    plotFmt = "'%s' us 4:($2==%d?$3:0/0)"
  else:
    plotFmt = "'%s' us (log10($4)):($2==%d?$3:0/0)"
  plots = [plotFmt%(details.predPlot,i) for i in range(nRes)]

  gnuF = open('%s.gnu'%details.predPlot,'w+')
  try:
    gnuF.write(gnuHdr)
    gnuF.write("plot %s\n"%(','.join(plots)))
    gnuF.write(gnuTail)
  finally:
    gnuF.close()

  if hasattr(details,'predShow') and details.predShow:
    # displaying the plot is best effort: a missing or broken Gnuplot
    # installation is reported but does not abort the run
    try:
      import os
      from Gnuplot import Gnuplot
      p = Gnuplot()
      p('cd "%s"'%(os.getcwd()))
      p('load "%s.gnu"'%(details.predPlot))
      raw_input('press return to continue...\n')
    except Exception:
      import traceback
      traceback.print_exc()
|
|
|
|
|
|
|
|
def Go(details):
  """ placeholder: accepts a run-details object and does nothing with it

    **Returns**

      None

  """
  return None
|
|
|
|
def SetDefaults(details=None):
  """ initializes a run-details object with the screening defaults

    **Arguments**

      - details: the object to be initialized.  If this is None, the
        module-level _\_details_ object is used.

    **Returns**

      the initialized details object

    **Notes**

      - CompositeRun.SetDefaults() is applied first, then the screening
        specific attributes are reset.

  """
  global _details
  target = details
  if target is None:
    target = _details
  CompositeRun.SetDefaults(target)
  # built on every call so that each details object gets its own list
  screeningDefaults = (
    ('screenVoteTol',[0.]),
    ('detailedScreen',0),
    ('doHoldout',0),
    ('doTraining',0),
    ('errorAnalysis',0),
    ('verbose',0),
    ('partialVote',0),
  )
  for attrName,attrVal in screeningDefaults:
    setattr(target,attrName,attrVal)
  return target
|
|
|
|
def Usage():
  """ displays the module documentation (the list of command line
    arguments) and then terminates with exit status -1

  """
  print(__doc__)
  raise SystemExit(-1)
|
|
|
|
def ShowVersion(includeArgs=0):
  """ displays the version of the program

    **Arguments**

      - includeArgs: if nonzero, the command line used to launch the
        program is displayed as well

  """
  print('This is ScreenComposite.py version %s'%(__VERSION_STRING))
  if not includeArgs:
    return
  import sys
  print('command line was:')
  print(' '.join(sys.argv))
|
|
|
|
def ParseArgs(details):
  """ parses the command line (sys.argv) and stores the settings in _details_

    **Arguments**

      - details: the run-details object; it is modified in place

    **Returns**

      the list of positional (non-option) arguments

    **Notes**

      - Usage() is called, which exits, if an option is not recognized or
        if no positional arguments are provided

      - a voting threshold outside of [0,1] is reported with error() and
        the program exits with status -2

  """
  import getopt
  try:
    args,extras = getopt.getopt(sys.argv[1:],'EDd:t:VN:HThSRF:v:AX',
                                ['predPlot=','predActCol=','predActTable=',
                                 'predLogScale','predShow',
                                 'OOB','pickleCol=','enrich=',
                                 ])
  except getopt.GetoptError:
    import traceback
    traceback.print_exc()
    Usage()

  # defaults for the settings which are only controlled from the command line
  details.reportToExcel=0
  details.predPlot=''
  details.predActCol=''
  details.predActTable=''
  details.predLogScale=''
  details.predShow=0
  details.errorEstimate=0
  details.pickleCol=-1
  details.enrichTgt=-1
  for arg,val in args:
    if arg == '-d':
      details.dbName = val
    elif arg == '-D':
      details.detailedScreen = 1
    elif arg == '-t':
      details.partialVote = 1
      # NOTE(review): the threshold text is handed to eval() so that both a
      # single float and a list/tuple of floats are accepted; this executes
      # arbitrary expressions taken from the command line.
      voteTol = eval(val)
      if not isinstance(voteTol,(list,tuple)):
        voteTol = [voteTol]
      for tol in voteTol:
        if tol > 1 or tol < 0:
          error('Voting threshold must be between 0 and 1')
          sys.exit(-2)
      details.screenVoteTol=voteTol
    elif arg == '-N':
      details.note=val
    elif arg == '-H':
      # hold-out and training screens are mutually exclusive
      details.doTraining=0
      details.doHoldout=1
    elif arg == '-T':
      details.doHoldout=0
      details.doTraining=1
    elif arg == '-E':
      # the error analysis needs the per-point votes of a detailed screen
      details.errorAnalysis=1
      details.detailedScreen=1
    elif arg == '-A':
      details.showAll=1
      details.detailedScreen=1
    elif arg == '-S':
      details.shuffleActivities=1
    elif arg == '-R':
      details.randomActivities=1
    elif arg == '-h':
      Usage()
    elif arg == '-F':
      details.filterFrac=float(val)
    elif arg == '-v':
      details.filterVal=float(val)
    elif arg == '-V':
      # stored on the details object, which is where the screening code
      # reads the verbosity from
      details.verbose=1
    elif arg == '-X':
      if Excel is not None:
        details.reportToExcel = 1
        details.detailedScreen=1
      else:
        message('NOTE: Excel support not enabled, -X option ignored.')
    elif arg == '--predPlot':
      details.detailedScreen=1
      details.predPlot=val
    elif arg == '--predActCol':
      details.predActCol=val
    elif arg == '--predActTable':
      details.predActTable=val
    elif arg == '--predLogScale':
      details.predLogScale=1
    elif arg == '--predShow':
      details.predShow=1
    elif arg == '--OOB':
      details.errorEstimate=1
    elif arg == '--pickleCol':
      # the column is 1-based on the command line and 0-based internally
      details.pickleCol=int(val)-1
    elif arg == '--enrich':
      details.enrichTgt=int(val)
    else:
      Usage()

  if len(extras) < 1:
    Usage()
  return extras
|
|
|
|
|
|
if __name__ == '__main__':
  # Command line driver:
  #   1) parse the command line
  #   2) load the composite model(s), either from a database table or from a
  #      pickle file
  #   3) screen every model against each remaining positional argument (a
  #      data file or, when a database is being used, a table name)
  #   4) for each voting tolerance report the statistics to stdout and,
  #      optionally, to Excel
  details = SetDefaults()
  extras = ParseArgs(details)
  ShowVersion(includeArgs=1)

  models = []
  if details.note and details.dbName:
    # the first positional argument names the table holding the models
    tblName = extras[0]
    message('-> Retrieving models from database')
    conn = DbConnect(details.dbName,tblName)
    # NOTE(review): the note comes straight from the command line and is
    # interpolated into the SQL text, and the blobs pulled back are
    # unpickled.  Only use this with trusted input and trusted databases.
    blobs = conn.GetData(fields='model',where="where note='%s'"%(details.note))
    for blob in blobs:
      blob = blob[0]
      try:
        models.append(cPickle.loads(str(blob)))
      except Exception:
        # best effort: report the failure and move on to the next model
        import traceback
        traceback.print_exc()
        message('Model load failed')

  else:
    message('-> Loading model')
    modelFile = open(extras[0],'rb')
    try:
      models.append(cPickle.load(modelFile))
    finally:
      modelFile.close()
  if not len(models):
    error('No composite models found')
    sys.exit(-1)
  else:
    message('-> Working with %d models.'%len(models))

  # everything after the model argument is a data set to be screened
  extras = extras[1:]

  for fName in extras:
    if details.dbName != '':
      details.tableName = fName
      data = details.GetDataSet(pickleCol=details.pickleCol,
                                pickleClass=DataStructs.ExplicitBitVect)
    else:
      data = DataUtils.BuildDataSet(fName)
    descNames = data.GetVarNames()
    nModels = len(models)
    screenResults = [None]*nModels
    dataSets = [None]*nModels
    message('-> Constructing and screening data sets')
    testIdx = range(data.GetNPts())
    trainIdx = testIdx

    for modelIdx in range(nModels):
      # all models work on the same data set object (no copy is made)
      tmpD = data
      model = models[modelIdx]
      message('.',noRet=1)

      try:
        seed = model._randomSeed
      except AttributeError:
        pass
      else:
        DataUtils.InitRandomNumbers(seed)

      if details.shuffleActivities or details.randomActivities:
        shuffle = details.shuffleActivities
        randomize = 1
        DataUtils.RandomizeActivities(tmpD,shuffle=details.shuffleActivities,
                                      runDetails=details)
      else:
        randomize = 0
        shuffle = 0

      if hasattr(model,'_shuffleActivities') and \
         model._shuffleActivities and \
         not shuffle:
        message('*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*')
        message('****** WARNING: Shuffled model being screened with unshuffled data.')
        message('*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*')
      if hasattr(model,'_randomizeActivities') and \
         model._randomizeActivities and \
         not randomize:
        message('*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*')
        message('****** WARNING: Random model being screened with non-random data.')
        message('*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*!*')

      trainIdx,testIdx = PrepareDataFromDetails(model,details,tmpD,verbose=1)
      screenResults[modelIdx] = CollectResults(testIdx,tmpD,model,
                                               errorEstimate=details.errorEstimate)
      dataSets[modelIdx] = testIdx

    if details.reportToExcel and Excel is not None:
      # write the column headers below whatever is already in the sheet
      xl = Excel()
      xlCol = 1
      xlRow = xl.FindLastRow(1,xlCol)
      if xl[xlRow,xlCol] is not None and str(xl[xlRow,xlCol]):
        xlRow += 1
      heads=['Tolerance']
      if details.note:
        heads.append('Note')
      if nModels > 1:
        heads += [
          'Mean(MisClass)','Dev(MisClass)',
          'Mean(Correct Conf)','Dev(Correct Conf)',
          'Mean(Incorrect Conf)','Dev(Incorrect Conf)',
          ]
      else:
        heads += [
          'MisClass',
          'Correct Conf',
          'Incorrect Conf',
          ]
      if models[0].GetActivityQuantBounds():
        nRes = len(models[0].GetActivityQuantBounds())+1
      else:
        nRes = models[0].GetQuantBounds()[1][-1]
      if nModels>1:
        for i in range(nRes):
          heads.append('Mean(Class %d %% pure)'%(i))
        for i in range(nRes):
          heads.append('Mean(Class %d %% correct)'%(i))
      else:
        for i in range(nRes):
          heads.append('Class %d %% pure'%(i))
        for i in range(nRes):
          heads.append('Class %d %% correct'%(i))

      if nModels > 1:
        heads += [
          'Best(MisClass)',
          'Best(Correct Conf)',
          'Best(Incorrect Conf)',
          ]
      for i,head in enumerate(heads):
        xl[xlRow,xlCol+i] = head
        xl.Columns(xlCol+i).AutoFit()
    else:
      xl = None

    for tol in details.screenVoteTol:
      if len(details.screenVoteTol)>1:
        message('\n*****-----*****-----*****-----*****-----*****-----*****-----*****\n')
        message('Tolerance: %f'%tol)
      if xl:
        # each tolerance gets its own row in the spreadsheet
        xlRow+=1
        xlCol = 1
        xl[xlRow,xlCol]=tol
        xlCol += 1
        if details.note:
          xl[xlRow,xlCol]=details.note
          xlCol += 1
      # per-model counts and average confidences
      nGood = zeros(nModels,Float)
      nBad = zeros(nModels,Float)
      nSkip = zeros(nModels,Float)
      confGood = zeros(nModels,Float)
      confBad = zeros(nModels,Float)
      confSkip = zeros(nModels,Float)
      if details.enrichTgt >= 0:
        enrichments = zeros(nModels,Float)
      # data-point index -> number of models putting the point in that category
      goodVoteDict = {}
      badVoteDict = {}
      noVoteDict = {}
      voteTab = None
      for modelIdx in range(nModels):
        model = models[modelIdx]
        model.SetInputOrder(descNames)
        testIdx = dataSets[modelIdx]
        screenRes = screenResults[modelIdx]
        if not details.detailedScreen:
          g,b,s,aG,aB,aS,vT = ScreenIt(model,testIdx,tmpD,details.partialVote,tol,
                                       verbose=details.verbose,screenResults=screenRes)
        else:
          if model.GetActivityQuantBounds():
            nRes = len(model.GetActivityQuantBounds())+1
          else:
            nRes = model.GetQuantBounds()[1][-1]
          badVotes = []
          noVotes = []
          # the correct predictions are only collected when something
          # is going to use them
          if (hasattr(details,'showAll') and details.showAll) or \
             (hasattr(details,'predPlot') and details.predPlot):
            goodVotes = []
          else:
            goodVotes = None
          g,b,s,aG,aB,aS,vT = ShowVoteResults(testIdx,tmpD,model,nRes,tol,
                                              verbose=details.verbose,
                                              screenResults=screenRes,
                                              badVotes=badVotes,noVotes=noVotes,
                                              goodVotes=goodVotes,
                                              errorEstimate=details.errorEstimate)
          if voteTab is None:
            voteTab = zeros(vT.shape,Float)
          if details.errorAnalysis:
            # when an enrichment target is set, only points whose true
            # activity equals the target are counted
            for a,p,c,idx in badVotes:
              label = testIdx[idx]
              if hasattr(details,'enrichTgt') and details.enrichTgt >=0:
                if a==details.enrichTgt:
                  badVoteDict[label] = badVoteDict.get(label,0)+1
              else:
                badVoteDict[label] = badVoteDict.get(label,0)+1
            for a,p,c,idx in noVotes:
              label = testIdx[idx]
              if hasattr(details,'enrichTgt') and details.enrichTgt >=0:
                if a==details.enrichTgt:
                  noVoteDict[label] = noVoteDict.get(label,0)+1
              else:
                noVoteDict[label] = noVoteDict.get(label,0)+1

          if hasattr(details,'showAll') and details.showAll:
            for a,p,c,idx in goodVotes:
              label = testIdx[idx]
              if details.enrichTgt >=0:
                if a==details.enrichTgt:
                  goodVoteDict[label] = goodVoteDict.get(label,0)+1
              else:
                goodVoteDict[label] = goodVoteDict.get(label,0)+1

          if details.enrichTgt>-1:
            enrichments[modelIdx] = CalcEnrichment(vT,tgt=details.enrichTgt)

          voteTab += vT
        if details.detailedScreen and hasattr(details,'predPlot') and details.predPlot:
          MakePredPlot(details,testIdx,tmpD,goodVotes,badVotes,nRes,verbose=1)

        if hasattr(details,'showAll') and details.showAll:
          print('-v-v-v-v-v-v-v- All Votes -v-v-v-v-v-v-v-')
          print('id, prediction, confidence, flag(-1=skipped,0=wrong,1=correct)')
          for votes,fmt in ((goodVotes,'%s, %d, %.4f, 1'),
                            (badVotes,'%s, %d, %.4f, 0'),
                            (noVotes,'%s, %d, %.4f, -1')):
            for ans,pred,conf,idx in votes:
              pt = tmpD[testIdx[idx]]
              assert model.GetActivityQuantBounds() or pt[-1]==ans,\
                     'bad point?: %s != %s'%(str(pt[-1]),str(ans))
              print(fmt%(str(pt[0]),pred,conf))
          print('-^-^-^-^-^-^-^- -^-^-^-^-^-^-^-')

        nGood[modelIdx] = g
        nBad[modelIdx] = b
        nSkip[modelIdx] = s
        confGood[modelIdx] = aG
        confBad[modelIdx] = aB
        confSkip[modelIdx] = aS
      print('')

      if nModels > 1:
        print('-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*')
        print('AVERAGES:')

        # means and sample standard deviations over the models
        avgNBad = sum(nBad)/nModels
        devNBad = sqrt(sum((nBad-avgNBad)**2)/(nModels-1))

        # the best model is the one with the fewest misclassifications
        bestIdx = argsort(nBad)[0]

        avgNGood = sum(nGood)/nModels
        devNGood = sqrt(sum((nGood-avgNGood)**2)/(nModels-1))

        avgNSkip = sum(nSkip)/nModels
        devNSkip = sqrt(sum((nSkip-avgNSkip)**2)/(nModels-1))

        avgConfBad = sum(confBad)/nModels
        devConfBad = sqrt(sum((confBad-avgConfBad)**2)/(nModels-1))

        avgConfGood = sum(confGood)/nModels
        devConfGood = sqrt(sum((confGood-avgConfGood)**2)/(nModels-1))

        avgConfSkip = sum(confSkip)/nModels
        devConfSkip = sqrt(sum((confSkip-avgConfSkip)**2)/(nModels-1))

        nClassified = avgNGood + avgNBad
        nExamples = nClassified + avgNSkip
        print('Misclassifications: \t%%%5.2f(%%%5.2f) %4.1f(%4.1f) / %d'%(100*avgNBad/nExamples,
                                                                          100*devNBad/nExamples,
                                                                          avgNBad,devNBad,
                                                                          nExamples))
        if avgNSkip>0:
          print('\tthreshold: \t%%%5.2f(%%%5.2f) %4.1f(%4.1f) / %d'%(100*avgNBad/nClassified,
                                                                     100*devNBad/nClassified,
                                                                     avgNBad,devNBad,
                                                                     nClassified))
          print('')
          print('Number Skipped: %%%4.2f(%%%4.2f) %4.2f(%4.2f)'%(100*avgNSkip/nExamples,
                                                                 100*devNSkip/nExamples,
                                                                 avgNSkip,devNSkip))

        print('')
        print('Confidences:')
        print('\tCorrect: \t%4.2f(%4.2f)'%(100*avgConfGood,100*devConfGood))
        print('\tIncorrect: \t%4.2f(%4.2f)'%(100*avgConfBad,100*devConfBad))
        if avgNSkip>0:
          print('\tSkipped: \t%4.2f(%4.2f)'%(100*avgConfSkip,100*devConfSkip))

        if xl:
          xl[xlRow,xlCol]=100.*avgNBad/nExamples
          xlCol+=1
          xl[xlRow,xlCol]=100.*devNBad/nExamples
          xlCol+=1
          xl[xlRow,xlCol]=100.*avgConfGood
          xlCol+=1
          xl[xlRow,xlCol]=100.*devConfGood
          xlCol += 1
          xl[xlRow,xlCol]=100.*avgConfBad
          xlCol+=1
          xl[xlRow,xlCol]=100.*devConfBad
          xlCol += 1

        if details.detailedScreen:
          message('Results Table:')
          # voteTab holds the sum over all models, so report the average
          voteTab = transpose(voteTab)/nModels
          nResultCodes = len(voteTab)
          colCounts = sum(voteTab)
          rowCounts = sum(voteTab,1)
          print('')
          for i in range(nResultCodes):
            # avoid dividing by zero for empty rows
            if rowCounts[i]==0: rowCounts[i]=1
            row = voteTab[i]
            message(' ',noRet=1)
            for j in range(nResultCodes):
              entry = row[j]
              message(' % 6.2f'%entry,noRet=1)
            message(' | % 4.2f'%(100.*voteTab[i,i]/rowCounts[i]))
          message(' ',noRet=1)
          for i in range(nResultCodes):
            message('-------',noRet=1)
          message('')
          message(' ',noRet=1)
          for i in range(nResultCodes):
            # avoid dividing by zero for empty columns
            if colCounts[i]==0: colCounts[i]=1
            message(' % 6.2f'%(100.*voteTab[i,i]/colCounts[i]),noRet=1)
          message('')
          if xl:
            for i in range(nResultCodes):
              xl[xlRow,xlCol]=100.*voteTab[i,i]/rowCounts[i]
              xlCol += 1
            for i in range(nResultCodes):
              xl[xlRow,xlCol]=100.*voteTab[i,i]/colCounts[i]
              xlCol += 1

          if details.enrichTgt >-1:
            mean = sum(enrichments)/nModels
            enrichments -= mean
            # sample standard deviation, computed the same way as the
            # other deviations above
            dev = sqrt(sum(enrichments*enrichments)/(nModels-1))
            message(' Enrichment of value %d: %.4f (%.4f)'%(details.enrichTgt,mean,dev))
      else:
        bestIdx=0
      print('------------------------------------------------')
      # models are numbered from 1 in the report
      print(' '.join(['Best Model: ',str(bestIdx+1)]))
      bestBad = nBad[bestIdx]
      bestGood = nGood[bestIdx]
      bestSkip = nSkip[bestIdx]
      nClassified = bestGood + bestBad
      nExamples = nClassified + bestSkip
      print('Misclassifications: \t%%%5.2f %d / %d'%(100*bestBad/nExamples,
                                                     bestBad,nExamples))
      if bestSkip>0:
        print('\tthreshold: \t%%%5.2f %d / %d'%(100*bestBad/nClassified,
                                                bestBad,nClassified))
        print('')
        print('Number Skipped: %%%4.2f %d'%(100*bestSkip/nExamples,
                                            bestSkip))

      print('')
      print('Confidences:')
      print('\tCorrect: \t%4.2f'%(100*confGood[bestIdx]))
      print('\tIncorrect: \t%4.2f'%(100*confBad[bestIdx]))
      if bestSkip>0:
        print('\tSkipped: \t%4.2f'%(100*confSkip[bestIdx]))
      if xl:
        xl[xlRow,xlCol]=100.*bestBad/nExamples
        xlCol+=1
        xl[xlRow,xlCol]=100.*confGood[bestIdx]
        xlCol+=1
        xl[xlRow,xlCol]=100.*confBad[bestIdx]
        xlCol+=1

      if nModels == 1 and details.detailedScreen:
        message('')
        message('Results Table:')
        voteTab = transpose(vT)
        nResultCodes = len(vT)
        colCounts = sum(voteTab)
        rowCounts = sum(voteTab,1)
        message('')
        for i in range(nResultCodes):
          # avoid dividing by zero for empty rows
          if rowCounts[i]==0: rowCounts[i]=1
          row = voteTab[i]
          message(' ',noRet=1)
          for j in range(nResultCodes):
            entry = row[j]
            message(' % 6.2f'%entry,noRet=1)
          message(' | % 4.2f'%(100.*voteTab[i,i]/rowCounts[i]))
        message(' ',noRet=1)
        for i in range(nResultCodes):
          message('-------',noRet=1)
        message('')
        message(' ',noRet=1)
        for i in range(nResultCodes):
          # avoid dividing by zero for empty columns
          if colCounts[i]==0: colCounts[i]=1
          message(' % 6.2f'%(100.*voteTab[i,i]/colCounts[i]),noRet=1)
        message('')
        if xl:
          for i in range(nResultCodes):
            xl[xlRow,xlCol]=100.*voteTab[i,i]/rowCounts[i]
            xlCol += 1
          for i in range(nResultCodes):
            xl[xlRow,xlCol]=100.*voteTab[i,i]/colCounts[i]
            xlCol += 1
      if details.errorAnalysis:
        message('\n*-*-*-*-*-*-*-*- ERROR ANALYSIS -*-*-*-*-*-*-*-*\n')
        ks = badVoteDict.keys()
        if len(ks):
          message(' ---> Bad Vote Counts')
          if xl:
            xlRow += 1
            xl[xlRow,1] = 'Misclassification Counts:'
            xlRow += 1
            xl[xlRow,1] = 'ID'
            xl[xlRow,2] = 'Num_Misses'
            xlRow += 1
        for k in ks:
          pt = data[k]
          message('%s,%d'%(str(pt[0]),badVoteDict[k]))
          if xl:
            # the id is written with a leading single quote
            xl[xlRow,1] = "'%s"%str(pt[0])
            xl[xlRow,2] = badVoteDict[k]
            xlRow += 1

        ks = noVoteDict.keys()
        if len(ks):
          message(' ---> Skipped Compound Counts')
          if xl:
            xl[xlRow,1] = 'Skipped Compound Counts:'
            xlRow += 1
        for k in ks:
          pt = data[k]
          message('%s,%d'%(str(pt[0]),noVoteDict[k]))
          if xl:
            xl[xlRow,1] = "'%s"%str(pt[0])
            xl[xlRow,2] = noVoteDict[k]
            xlRow += 1

      if hasattr(details,'showAll') and details.showAll:
        ks = goodVoteDict.keys()
        if len(ks):
          message(' ---> Good Vote Counts')
          if xl:
            xlRow += 1
            xl[xlRow,1] = 'Correct Classification Counts:'
            xlRow += 1
            xl[xlRow,1] = 'ID'
            xl[xlRow,2] = 'Num_Picks'
            xlRow += 1
        for k in ks:
          pt = data[k]
          message('%s,%d'%(str(pt[0]),goodVoteDict[k]))
          if xl:
            xl[xlRow,1] = "'%s"%str(pt[0])
            xl[xlRow,2] = goodVoteDict[k]
            xlRow += 1
|
|
|