mirror of
https://github.com/rdkit/rdkit.git
synced 2026-06-05 22:04:27 +08:00
1010 lines
35 KiB
Python
Executable File
1010 lines
35 KiB
Python
Executable File
# $Id$
|
|
#
|
|
# Copyright (C) 2000-2006 greg Landrum and Rational Discovery LLC
|
|
#
|
|
# @@ All Rights Reserved @@
|
|
#
|
|
""" command line utility for building composite models
|
|
|
|
#DOC
|
|
|
|
**Usage**
|
|
|
|
BuildComposite [optional args] filename
|
|
|
|
Unless indicated otherwise (via command line arguments), _filename_ is
|
|
a QDAT file.
|
|
|
|
**Command Line Arguments**
|
|
|
|
- -o *filename*: name of the output file for the pickled composite
|
|
|
|
- -n *num*: number of separate models to add to the composite
|
|
|
|
- -p *tablename*: store persistence data in the database
|
|
in table *tablename*
|
|
|
|
- -N *note*: attach some arbitrary text to the persistence data
|
|
|
|
- -b *filename*: name of the text file to hold examples from the
|
|
holdout set which are misclassified
|
|
|
|
- -s: split the data into training and hold-out sets before building
|
|
the composite
|
|
|
|
- -f *frac*: the fraction of data to use in the training set when the
|
|
data is split
|
|
|
|
- -r: randomize the activities (for testing purposes). This ignores
|
|
the initial distribution of activity values and produces each
|
|
possible activity value with equal likelihood.
|
|
|
|
- -S: shuffle the activities (for testing purposes) This produces
|
|
a permutation of the input activity values.
|
|
|
|
- -l: locks the random number generator to give consistent sets
|
|
of training and hold-out data. This is primarily intended
|
|
for testing purposes.
|
|
|
|
- -B: use a so-called Bayesian composite model.
|
|
|
|
- -d *database name*: instead of reading the data from a QDAT file,
|
|
pull it from a database. In this case, the _filename_ argument
|
|
provides the name of the database table containing the data set.
|
|
|
|
- -D: show a detailed breakdown of the composite model performance
|
|
across the training and, when appropriate, hold-out sets.
|
|
|
|
- -P *pickle file name*: write out the pickled data set to the file
|
|
|
|
- -F *filter frac*: filters the data before training to change the
|
|
distribution of activity values in the training set. *filter
|
|
frac* is the fraction of the training set that should have the
|
|
target value. **See note below on data filtering.**
|
|
|
|
- -v *filter value*: filters the data before training to change the
|
|
distribution of activity values in the training set. *filter
|
|
value* is the target value to use in filtering. **See note below
|
|
on data filtering.**
|
|
|
|
- --modelFiltFrac *model filter frac*: Similar to filter frac above,
|
|
in this case the data is filtered for each model in the composite
|
|
rather than a single overall filter for a composite. *model
|
|
filter frac* is the fraction of the training set for each model
|
|
that should have the target value (*model filter value*).
|
|
|
|
- --modelFiltVal *model filter value*: target value to use for
|
|
filtering data before training each model in the composite.
|
|
|
|
- -t *threshold value*: use high-confidence predictions for the
|
|
final analysis of the hold-out data.
|
|
|
|
- -Q *list string*: the values of quantization bounds for the
|
|
activity value. See the _-q_ argument for the format of *list
|
|
string*.
|
|
|
|
- --nRuns *count*: build *count* composite models
|
|
|
|
- --prune: prune any models built
|
|
|
|
- -h: print a usage message and exit.
|
|
|
|
- -V: print the version number and exit
|
|
|
|
*-*-*-*-*-*-*-*- Tree-Related Options -*-*-*-*-*-*-*-*
|
|
|
|
- -g: be less greedy when training the models.
|
|
|
|
- -G *number*: force trees to be rooted at descriptor *number*.
|
|
|
|
- -L *limit*: provide an (integer) limit on individual model
|
|
complexity
|
|
|
|
- -q *list string*: Add QuantTrees to the composite and use the list
|
|
specified in *list string* as the number of target quantization
|
|
bounds for each descriptor. Don't forget to include 0's at the
|
|
beginning and end of *list string* for the name and value fields.
|
|
For example, if there are 4 descriptors and you want 2 quant
|
|
bounds apiece, you would use _-q "[0,2,2,2,2,0]"_.
|
|
Two special cases:
|
|
1) If you would like to ignore a descriptor in the model
|
|
building, use '-1' for its number of quant bounds.
|
|
2) If you have integer valued data that should not be quantized
|
|
further, enter 0 for that descriptor.
|
|
|
|
- --recycle: allow descriptors to be used more than once in a tree
|
|
|
|
- --randomDescriptors=val: toggles growing random forests with val
|
|
randomly-selected descriptors available at each node.
|
|
|
|
|
|
*-*-*-*-*-*-*-*- KNN-Related Options -*-*-*-*-*-*-*-*
|
|
|
|
- --doKnn: use K-Nearest Neighbors models
|
|
|
|
- --knnK=*value*: the value of K to use in the KNN models
|
|
|
|
- --knnTanimoto: use the Tanimoto metric in KNN models
|
|
|
|
- --knnEuclid: use a Euclidean metric in KNN models
|
|
|
|
*-*-*-*-*-*-*- Naive Bayes Classifier Options -*-*-*-*-*-*-*-*
|
|
- --doNaiveBayes : use Naive Bayes classifiers
|
|
|
|
- --mEstimateVal : the value to be used in the m-estimate formula
|
|
If this is greater than 0.0, we use it to compute the conditional
|
|
probabilities by the m-estimate
|
|
|
|
*-*-*-*-*-*-*-*- SVM-Related Options -*-*-*-*-*-*-*-*
|
|
|
|
**** NOTE: THESE ARE DISABLED ****
|
|
|
|
## - --doSVM: use Support-vector machines
|
|
|
|
## - --svmKernel=*kernel*: choose the type of kernel to be used for
|
|
## the SVMs. Options are:
|
|
## The default is:
|
|
|
|
## - --svmType=*type*: choose the type of support-vector machine
|
|
## to be used. Options are:
|
|
## The default is:
|
|
|
|
## - --svmGamma=*gamma*: provide the gamma value for the SVMs. If this
|
|
## is not provided, a grid search will be carried out to determine an
|
|
## optimal *gamma* value for each SVM.
|
|
|
|
## - --svmCost=*cost*: provide the cost value for the SVMs. If this is
|
|
## not provided, a grid search will be carried out to determine an
|
|
## optimal *cost* value for each SVM.
|
|
|
|
## - --svmWeights=*weights*: provide the weight values for the
|
|
## activities. If provided this should be a sequence of (label,
|
|
## weight) 2-tuples *nActs* long. If not provided, a weight of 1
|
|
## will be used for each activity.
|
|
|
|
## - --svmEps=*epsilon*: provide the epsilon value used to determine
|
|
## when the SVM has converged. Defaults to 0.001
|
|
|
|
## - --svmDegree=*degree*: provide the degree of the kernel (when
|
|
## sensible) Defaults to 3
|
|
|
|
## - --svmCoeff=*coeff*: provide the coefficient for the kernel (when
|
|
## sensible) Defaults to 0
|
|
|
|
## - --svmNu=*nu*: provide the nu value for the kernel (when sensible)
|
|
## Defaults to 0.5
|
|
|
|
## - --svmDataType=*float*: if the data contains only 1s and 0s, specify by
|
|
## using binary. Defaults to float
|
|
|
|
## - --svmCache=*cache*: provide the size of the memory cache (in MB)
|
|
## to be used while building the SVM. Defaults to 40
|
|
|
|
**Notes**
|
|
|
|
- *Data filtering*: When there is a large disparity between the
|
|
numbers of points with various activity levels present in the
|
|
training set it is sometimes desirable to train on a more
|
|
homogeneous data set. This can be accomplished using filtering.
|
|
The filtering process works by selecting a particular target
|
|
fraction and target value. For example, in a case where 95% of
|
|
the original training set has activity 0 and only 5% activity 1, we
|
|
could filter (by randomly removing points with activity 0) so that
|
|
30% of the data set used to build the composite has activity 1.
|
|
|
|
|
|
"""
|
|
import RDConfig
|
|
from utils import listutils
|
|
from ML.Composite import Composite,BayesComposite
|
|
#from ML.SVM import SVMClassificationModel as SVM
|
|
from Numeric import *
|
|
from ML.Data import DataUtils,SplitData
|
|
from ML import ScreenComposite
|
|
from Dbase import DbModule
|
|
from Dbase.DbConnection import DbConnect
|
|
from ML import CompositeRun
|
|
import sys,cPickle,time
|
|
import DataStructs
|
|
|
|
# module-level run record: the default target of SetDefaults() and the object
# on which RunOnData() stores its error statistics
_runDetails = CompositeRun.CompositeRun()

# reported by ShowVersion()
__VERSION_STRING = "3.2.3"

# nonzero: message() writes to stdout and model building is not silenced
_verbose = 1
|
|
def message(msg):
  """ writes a message, followed by a newline, to _sys.stdout_

    nothing is written when the module-level _verbose flag is zero.
    Override this in modules which import this one to redirect output.

    **Arguments**

      - msg: the string to be displayed

  """
  if not _verbose:
    return
  sys.stdout.write('%s\n'%(msg))
|
|
|
|
|
|
def testall(composite,examples,badExamples=[]):
|
|
""" screens a number of examples past a composite
|
|
|
|
**Arguments**
|
|
|
|
- composite: a composite model
|
|
|
|
- examples: a list of examples (with results) to be screened
|
|
|
|
- badExamples: a list to which misclassified examples are appended
|
|
|
|
**Returns**
|
|
|
|
a list of 2-tuples containing:
|
|
|
|
1) a vote
|
|
|
|
2) a confidence
|
|
|
|
these are the votes and confidence levels for **misclassified** examples
|
|
|
|
"""
|
|
wrong = []
|
|
for example in examples:
|
|
if composite.GetActivityQuantBounds():
|
|
answer = composite.QuantizeActivity(example)[-1]
|
|
else:
|
|
answer = example[-1]
|
|
res,conf = composite.ClassifyExample(example)
|
|
if res != answer:
|
|
wrong.append((res,conf))
|
|
badExamples.append(example)
|
|
|
|
return wrong
|
|
|
|
def GetCommandLine(details):
  """ reconstructs a BuildComposite command line from a details object

    **Arguments**

      - details: an object carrying the run options as attributes (the
        attribute names are the ones set by _ParseArgs()_)

    **Returns**

      a string: 'BuildComposite' followed by the options implied by
      _details_, joined with single spaces.  _details.tableName_, when set,
      is always the last argument.

    **Notes**

      - the SVM options (--doSVM, --svm*) are disabled and never emitted

  """
  args = ['BuildComposite']
  args.append('-n %d'%(details.nModels))
  if details.filterFrac != 0.0:
    args.append('-F %.3f -v %d'%(details.filterFrac,details.filterVal))
  if details.modelFilterFrac != 0.0:
    args.append('--modelFiltFrac=%.3f --modelFiltVal=%d'%(details.modelFilterFrac,
                                                         details.modelFilterVal))
  if details.splitRun: args.append('-s -f %.3f'%(details.splitFrac))
  if details.shuffleActivities: args.append('-S')
  if details.randomActivities: args.append('-r')
  if details.threshold > 0.0: args.append('-t %.3f'%(details.threshold))
  if details.activityBounds: args.append('-Q "%s"'%(details.activityBoundsVals))
  if details.dbName: args.append('-d %s'%(details.dbName))
  if details.detailedRes: args.append('-D')
  if hasattr(details,'noScreen') and details.noScreen: args.append('--noScreen')
  # persistence data can only be stored when a database is in use
  if details.persistTblName and details.dbName:
    args.append('-p %s'%(details.persistTblName))
  if details.note:
    args.append('-N %s'%(details.note))

  if details.useTrees:
    if details.limitDepth>0: args.append('-L %d'%(details.limitDepth))
    if details.lessGreedy: args.append('-g')
    if details.qBounds:
      # the compact form keeps long bounds lists readable on the command line
      shortBounds = listutils.CompactListRepr(details.qBounds)
      args.append('-q "%s"'%(shortBounds))

    if details.pruneIt: args.append('--prune')
    if details.startAt: args.append('-G %d'%details.startAt)
    if details.recycleVars: args.append('--recycle')
    if details.randomDescriptors:
      args.append('--randomDescriptors=%d'%details.randomDescriptors)

  if details.useSigTrees:
    args.append('--doSigTree')
    if details.limitDepth>0: args.append('-L %d'%(details.limitDepth))
    if details.randomDescriptors:
      args.append('--randomDescriptors=%d'%details.randomDescriptors)

  if details.useKNN:
    args.append('--doKnn --knnK %d'%(details.knnNeighs))
    if details.knnDistFunc=='Tanimoto':
      args.append('--knnTanimoto')
    else:
      args.append('--knnEuclid')

  if details.useNaiveBayes:
    args.append('--doNaiveBayes')
    if details.mEstimateVal >= 0.0 :
      args.append('--mEstimateVal=%.3f'%details.mEstimateVal)

  if details.replacementSelection: args.append('--replacementSelection')

  # this should always be last:
  if details.tableName: args.append(details.tableName)

  return ' '.join(args)
|
|
|
|
def RunOnData(details,data,progressCallback=None,saveIt=1,setDescNames=0):
  """ builds a composite model from a data set, screens it and saves it

    **Arguments**

      - details: a _CompositeRun.CompositeRun_ object containing details
        (options, parameters, etc.) about the run

      - data: the data set to be used; it is accessed through its
        _GetNamedData()_, _GetNVars()_, _GetNPossibleVals()_,
        _GetVarNames()_ and _GetNPts()_ methods

      - progressCallback: (optional) passed on to _composite.Grow()_ when
        tree or signature-tree models are built

      - saveIt: (optional) if this is nonzero, the resulting model will be
        pickled and dumped to the filename specified in _details.outName_

      - setDescNames: (optional) if nonzero, the composite's
        _SetInputOrder()_ method is called with the data set's variable
        names and _SetDescriptorNames()_ gets _details._descNames_.
        Otherwise (the default), _SetDescriptorNames()_ gets the data set's
        variable names.

    **Returns**

      the composite model constructed

    **Notes**

      - the error statistics are stored on the module-level _runDetails
        object, which is only the same object as _details_ when the caller
        passes it in (as the command line entry point does)

      - _details.model_ is set to a binary holder wrapping the pickled
        composite, whether or not _saveIt_ is set

  """
  if details.lockRandom:
    seed = details.randomSeed
  else:
    import random
    seed = (random.randint(0,1000000),random.randint(0,1000000))
  DataUtils.InitRandomNumbers(seed)

  if details.shuffleActivities == 1:
    DataUtils.RandomizeActivities(data,shuffle=1,runDetails=details)
  elif details.randomActivities == 1:
    DataUtils.RandomizeActivities(data,shuffle=0,runDetails=details)

  namedExamples = data.GetNamedData()
  if details.splitRun == 1:
    trainIdx,testIdx = SplitData.SplitIndices(len(namedExamples),details.splitFrac,
                                              silent=not _verbose)
    trainExamples = [namedExamples[x] for x in trainIdx]
    testExamples = [namedExamples[x] for x in testIdx]
  else:
    testExamples = []
    trainExamples = namedExamples

  if details.filterFrac != 0.0:
    # if we're doing quantization on the fly, we need to handle that here:
    if hasattr(details,'activityBounds') and details.activityBounds:
      tExamples = []
      bounds = details.activityBounds
      for pt in trainExamples:
        # work on a copy so that the training examples keep their raw activity
        pt = pt[:]
        act = pt[-1]
        placed=0
        bound=0
        while not placed and bound < len(bounds):
          if act < bounds[bound]:
            pt[-1] = bound
            placed = 1
          else:
            bound += 1
        if not placed:
          # at or above the last bound: the point lands in the top bin
          pt[-1] = bound
        tExamples.append(pt)
    else:
      bounds = None
      tExamples = trainExamples
    trainIdx,temp = DataUtils.FilterData(tExamples,details.filterVal,
                                         details.filterFrac,-1,
                                         indicesOnly=1)
    # points rejected by the filter are moved to the test set
    tmp = [trainExamples[x] for x in trainIdx]
    testExamples += [trainExamples[x] for x in temp]
    trainExamples = tmp

    for label,examples in (('training',trainExamples),('test',testExamples)):
      counts = DataUtils.CountResults(examples,bounds=bounds)
      message('Result Counts in %s set:'%label)
      for k in sorted(counts.keys()):
        message(str((k, counts[k])))
  nExamples = len(trainExamples)
  message('Training with %d examples'%(nExamples))

  nVars = data.GetNVars()
  # list() so that entries can be removed regardless of what range() returns
  attrs = list(range(1,nVars+1))
  nPossibleVals = data.GetNPossibleVals()
  # descriptors flagged with -1 possible values are left out of the models
  for i in range(1,len(nPossibleVals)):
    if nPossibleVals[i-1] == -1:
      attrs.remove(i)

  if details.pickleDataFileName != '':
    pickleDataFile = open(details.pickleDataFileName,'wb+')
    try:
      cPickle.dump(trainExamples,pickleDataFile)
      cPickle.dump(testExamples,pickleDataFile)
    finally:
      # the handle is released even if one of the dumps fails
      pickleDataFile.close()

  if details.bayesModel:
    composite = BayesComposite.BayesComposite()
  else:
    composite = Composite.Composite()

  composite._randomSeed = seed
  composite._splitFrac = details.splitFrac
  composite._shuffleActivities = details.shuffleActivities
  composite._randomizeActivities = details.randomActivities

  if hasattr(details,'filterFrac'):
    composite._filterFrac = details.filterFrac
  if hasattr(details,'filterVal'):
    composite._filterVal = details.filterVal

  composite.SetModelFilterData(details.modelFilterFrac, details.modelFilterVal)

  composite.SetActivityQuantBounds(details.activityBounds)
  nPossibleVals = data.GetNPossibleVals()
  if details.activityBounds:
    # n bounds split the activity into n+1 bins
    nPossibleVals[-1] = len(details.activityBounds)+1

  if setDescNames:
    composite.SetInputOrder(data.GetVarNames())
    composite.SetDescriptorNames(details._descNames)
  else:
    composite.SetDescriptorNames(data.GetVarNames())
  composite.SetActivityQuantBounds(details.activityBounds)
  if details.nModels==1:
    details.internalHoldoutFrac=0.0

  if details.useTrees:
    from ML.DecTree import CrossValidate,PruneTree
    if details.qBounds != []:
      from ML.DecTree import BuildQuantTree
      builder = BuildQuantTree.QuantTreeBoot
    else:
      from ML.DecTree import ID3
      builder = ID3.ID3Boot
    driver = CrossValidate.CrossValidationDriver
    pruner = PruneTree.PruneTree

    composite.SetQuantBounds(details.qBounds)
    nPossibleVals = data.GetNPossibleVals()
    if details.activityBounds:
      nPossibleVals[-1] = len(details.activityBounds)+1
    composite.Grow(trainExamples,attrs,nPossibleVals=[0]+nPossibleVals,
                   buildDriver=driver,
                   pruner=pruner,
                   nTries=details.nModels,pruneIt=details.pruneIt,
                   lessGreedy=details.lessGreedy,needsQuantization=0,
                   treeBuilder=builder,nQuantBounds=details.qBounds,
                   startAt=details.startAt,
                   maxDepth=details.limitDepth,
                   progressCallback=progressCallback,
                   holdOutFrac=details.internalHoldoutFrac,
                   replacementSelection=details.replacementSelection,
                   recycleVars=details.recycleVars,
                   randomDescriptors=details.randomDescriptors,
                   silent=not _verbose)

  elif details.useSigTrees:
    from ML.DecTree import CrossValidate
    from ML.DecTree import BuildSigTree
    builder = BuildSigTree.SigTreeBuilder
    driver = CrossValidate.CrossValidationDriver
    nPossibleVals = data.GetNPossibleVals()
    if details.activityBounds:
      nPossibleVals[-1] = len(details.activityBounds)+1
    # these options are not present on every details object
    biasList = getattr(details,'sigTreeBiasList',None)
    useCMIM = getattr(details,'useCMIM',0)
    allowCollections = getattr(details,'allowCollections',False)
    composite.Grow(trainExamples,attrs,nPossibleVals=[0]+nPossibleVals,
                   buildDriver=driver,
                   nTries=details.nModels,
                   needsQuantization=0,
                   treeBuilder=builder,
                   maxDepth=details.limitDepth,
                   progressCallback=progressCallback,
                   holdOutFrac=details.internalHoldoutFrac,
                   replacementSelection=details.replacementSelection,
                   recycleVars=details.recycleVars,
                   randomDescriptors=details.randomDescriptors,
                   biasList=biasList,
                   useCMIM=useCMIM,
                   allowCollection=allowCollections,
                   silent=not _verbose)

  elif details.useKNN:
    from ML.KNN import CrossValidate
    from ML.KNN import DistFunctions

    driver = CrossValidate.CrossValidationDriver
    dfunc = ''
    if (details.knnDistFunc == "Euclidean") :
      dfunc = DistFunctions.EuclideanDist
    elif (details.knnDistFunc == "Tanimoto"):
      dfunc = DistFunctions.TanimotoDist
    else:
      assert 0,"Bad KNN distance metric value"

    composite.Grow(trainExamples, attrs, nPossibleVals=[0]+nPossibleVals,
                   buildDriver=driver, nTries=details.nModels,
                   needsQuantization=0,
                   numNeigh=details.knnNeighs,
                   holdOutFrac=details.internalHoldoutFrac,
                   distFunc=dfunc)

  elif details.useNaiveBayes or details.useSigBayes:
    from ML.NaiveBayes import CrossValidate
    driver = CrossValidate.CrossValidationDriver
    if not (hasattr(details,'useSigBayes') and details.useSigBayes):
      composite.Grow(trainExamples, attrs, nPossibleVals=[0]+nPossibleVals,
                     buildDriver=driver, nTries=details.nModels,
                     needsQuantization=0, nQuantBounds=details.qBounds,
                     holdOutFrac=details.internalHoldoutFrac,
                     replacementSelection=details.replacementSelection,
                     mEstimateVal=details.mEstimateVal,
                     silent=not _verbose)
    else:
      useCMIM = getattr(details,'useCMIM',0)

      composite.Grow(trainExamples, attrs, nPossibleVals=[0]+nPossibleVals,
                     buildDriver=driver, nTries=details.nModels,
                     needsQuantization=0, nQuantBounds=details.qBounds,
                     mEstimateVal=details.mEstimateVal,
                     useSigs=True,useCMIM=useCMIM,
                     holdOutFrac=details.internalHoldoutFrac,
                     replacementSelection=details.replacementSelection,
                     silent=not _verbose)

  # NOTE: SVM models (details.useSVM) are disabled

  else:
    from ML.Neural import CrossValidate
    driver = CrossValidate.CrossValidationDriver
    composite.Grow(trainExamples,attrs,[0]+nPossibleVals,nTries=details.nModels,
                   buildDriver=driver,needsQuantization=0)

  composite.AverageErrors()
  composite.SortModels()
  modelList,counts,avgErrs = composite.GetAllData()
  counts = array(counts)
  avgErrs = array(avgErrs)
  composite._varNames = data.GetVarNames()

  for model in modelList:
    model.NameModel(composite._varNames)

  # do final statistics
  weightedErrs = counts*avgErrs
  averageErr = sum(weightedErrs)/sum(counts)
  devs = (avgErrs - averageErr)
  devs = devs * counts
  # sqrt of the square: the absolute value of each weighted deviation
  devs = sqrt(devs*devs)
  avgDev = sum(devs)/sum(counts)
  message('# Overall Average Error: %%% 5.2f, Average Deviation: %%% 6.2f'%(100.*averageErr,100.*avgDev))

  if details.bayesModel:
    composite.Train(trainExamples,verbose=0)

  # blow out the saved examples and then save the composite:
  composite.ClearModelExamples()
  if saveIt:
    composite.Pickle(details.outName)
  details.model = DbModule.binaryHolder(cPickle.dumps(composite))

  badExamples = []
  # stays empty (like badExamples) when the simple screen below is skipped
  wrong = []
  if not details.detailedRes and (not hasattr(details,'noScreen') or not details.noScreen):
    if details.splitRun:
      message('Testing all hold-out examples')
      wrong = testall(composite,testExamples,badExamples)
      message('%d examples (%% %5.2f) were misclassified'%(len(wrong),
                                                          100.*float(len(wrong))/float(len(testExamples))))
      _runDetails.holdout_error = float(len(wrong))/len(testExamples)
    else:
      message('Testing all examples')
      wrong = testall(composite,namedExamples,badExamples)
      message('%d examples (%% %5.2f) were misclassified'%(len(wrong),
                                                          100.*float(len(wrong))/float(len(namedExamples))))
      _runDetails.overall_error = float(len(wrong))/len(namedExamples)

  if details.detailedRes:
    message('\nEntire data set:')
    resTup = ScreenComposite.ShowVoteResults(range(data.GetNPts()),data,composite,
                                             nPossibleVals[-1],details.threshold)
    nGood,nBad,_,avgGood,avgBad,_,voteTab = resTup
    nPts = len(namedExamples)
    nClass = nGood+nBad
    _runDetails.overall_error = float(nBad) / nClass
    _runDetails.overall_correct_conf = avgGood
    _runDetails.overall_incorrect_conf = avgBad
    _runDetails.overall_result_matrix = repr(voteTab)
    # NOTE(review): if nGood and nBad are counts of classified points this
    # difference can never be positive (nPts-nClass may be intended) - confirm
    nRej = nClass-nPts
    if nRej > 0:
      _runDetails.overall_fraction_dropped = float(nRej)/nPts

    if details.splitRun:
      message('\nHold-out data:')
      resTup = ScreenComposite.ShowVoteResults(range(len(testExamples)),testExamples,
                                               composite,
                                               nPossibleVals[-1],details.threshold)
      nGood,nBad,_,avgGood,avgBad,_,voteTab = resTup
      nPts = len(testExamples)
      nClass = nGood+nBad
      _runDetails.holdout_error = float(nBad) / nClass
      _runDetails.holdout_correct_conf = avgGood
      _runDetails.holdout_incorrect_conf = avgBad
      _runDetails.holdout_result_matrix = repr(voteTab)
      # NOTE(review): same sign question as for the entire data set above
      nRej = nClass-nPts
      if nRej > 0:
        _runDetails.holdout_fraction_dropped = float(nRej)/nPts

  if details.persistTblName and details.dbName:
    message('Updating results table %s:%s'%(details.dbName,details.persistTblName))
    details.Store(db=details.dbName,table=details.persistTblName)

  if details.badName != '':
    badFile = open(details.badName,'w+')
    try:
      # testall() appends to both lists together, so they pair up one to one
      for ex,vote in zip(badExamples,wrong):
        outStr = '%s\t%s\n'%(ex,vote)
        badFile.write(outStr)
    finally:
      badFile.close()

  composite.ClearModelExamples()
  return composite
|
|
|
|
def RunIt(details,progressCallback=None,saveIt=1,setDescNames=0):
  """ loads the data set described by a details object and builds a
    composite model from it

    **Arguments**

      - details: a _CompositeRun.CompositeRun_ object containing details
        (options, parameters, etc.) about the run

      - progressCallback: (optional) a function which is called with a single
        argument (the number of models built so far) after each model is built.

      - saveIt: (optional) if this is nonzero, the resulting model will be pickled
        and dumped to the filename specified in _details.outName_

      - setDescNames: (optional) if nonzero, the composite's _SetInputOrder()_ method
        will be called using the results of the data set's _GetVarNames()_ method;
        it is assumed that the details object has a _descNames attribute which
        is passed to the composites _SetDescriptorNames()_ method.  Otherwise
        (the default), _SetDescriptorNames()_ gets the results of _GetVarNames()_.

    **Returns**

      the composite model constructed

    **Notes**

      - _details.rundate_ is set to the current time, and an empty
        _details.outName_ is replaced by the stripped table name plus '.pkl'

  """
  details.rundate = time.asctime()

  srcName = details.tableName.strip()
  if details.outName == '':
    details.outName = srcName + '.pkl'

  if details.dbName:
    # the data lives in a database table
    if details.useSigTrees or details.useSigBayes:
      details.tableName = srcName
      data = details.GetDataSet(pickleCol=0,pickleClass=DataStructs.ExplicitBitVect)
    elif details.qBounds != [] or not details.useTrees:
      details.tableName = srcName
      data = details.GetDataSet()
    else:
      data = DataUtils.DBToQuantData(details.dbName,srcName,quantName=details.qTableName,
                                     user=details.dbUser,password=details.dbPassword)
  else:
    # the data lives in a file
    if details.qBounds != []:
      data = DataUtils.TextFileToData(srcName)
    else:
      data = DataUtils.BuildQuantDataSet(srcName)

  return RunOnData(details,data,progressCallback=progressCallback,
                   saveIt=saveIt,setDescNames=setDescNames)
|
|
|
|
|
|
def ShowVersion(includeArgs=0):
  """ prints the version number

    **Arguments**

      - includeArgs: (optional) if nonzero, the command line (_sys.argv_
        joined with spaces) is printed as well

  """
  # the single-argument call form prints the same text whether print is a
  # statement or a function
  print('This is BuildComposite.py version %s'%(__VERSION_STRING))
  if includeArgs:
    import sys
    print('command line was:')
    print(' '.join(sys.argv))
|
|
|
|
def Usage():
  """ prints the module documentation (the list of command line arguments)
    and exits with status -1

  """
  import sys
  # the single-argument call form prints the same text whether print is a
  # statement or a function
  print(__doc__)
  sys.exit(-1)
|
|
|
|
def SetDefaults(runDetails=None):
  """ initializes a details object with default values

    **Arguments**

      - runDetails: (optional) a _CompositeRun.CompositeRun_ object.
        If this is not provided, the global _runDetails will be used.

    **Returns**

      whatever _CompositeRun.SetDefaults()_ returns for that object

  """
  target = runDetails
  if target is None:
    target = _runDetails
  return CompositeRun.SetDefaults(target)
|
|
|
|
def ParseArgs(runDetails):
  """ parses command line arguments and updates _runDetails_

    **Arguments**

      - runDetails: a _CompositeRun.CompositeRun_ object.

    **Notes**

      - the arguments are read from _sys.argv[1:]_; the first positional
        argument is stored as _runDetails.tableName_

      - _runDetails.profileIt_ is always reset to 0 before the options are
        processed

      - _-h_, an option without a handler, or a missing positional argument
        ends in _Usage()_, which exits; _-V_ prints the version and exits.

      - the values of _-Q_, _-q_ and _--seed_ are evaluated with _eval()_:
        never pass untrusted text through these options.

      - the SVM options (--doSVM, --svm*) are disabled

  """
  import getopt
  args,extra = getopt.getopt(sys.argv[1:],'P:o:n:p:b:sf:F:v:hlgd:rSTt:BQ:q:DVG:N:L:',
                             ['nRuns=','prune','profile',
                              'seed=','noScreen',

                              'modelFiltFrac=', 'modelFiltVal=',

                              'recycle','randomDescriptors=',

                              'doKnn','knnK=','knnTanimoto','knnEuclid',

                              'doSigTree','doCMIM=','allowCollections',

                              'doNaiveBayes', 'mEstimateVal=',
                              'doSigBayes',

                              'replacementSelection',

                              ])
  runDetails.profileIt=0
  for arg,val in args:
    if arg == '-n':
      runDetails.nModels = int(val)
    elif arg == '-N':
      runDetails.note=val
    elif arg == '-o':
      runDetails.outName = val
    elif arg == '-Q':
      # FIX: dangerous - eval() executes whatever text was passed
      qBounds = eval(val)
      assert type(qBounds) in [type([]),type(())],'bad argument type for -Q, specify a list as a string'
      runDetails.activityBounds=qBounds
      runDetails.activityBoundsVals=val
    elif arg == '-p':
      runDetails.persistTblName=val
    elif arg == '-P':
      runDetails.pickleDataFileName= val
    elif arg == '-r':
      runDetails.randomActivities = 1
    elif arg == '-S':
      runDetails.shuffleActivities = 1
    elif arg == '-b':
      runDetails.badName = val
    elif arg == '-B':
      # RunOnData() reads bayesModel (singular); only setting the plural
      # name left -B without any effect
      runDetails.bayesModel=1
      # kept for anything that still looks at the old attribute name
      runDetails.bayesModels=1
    elif arg == '-s':
      runDetails.splitRun = 1
    elif arg == '-f':
      runDetails.splitFrac=float(val)
    elif arg == '-F':
      runDetails.filterFrac=float(val)
    elif arg == '-v':
      runDetails.filterVal=float(val)
    elif arg == '-l':
      runDetails.lockRandom = 1
    elif arg == '-g':
      runDetails.lessGreedy=1
    elif arg == '-G':
      runDetails.startAt = int(val)
    elif arg == '-d':
      runDetails.dbName=val
    elif arg == '-T':
      runDetails.useTrees = 0
    elif arg == '-t':
      runDetails.threshold=float(val)
    elif arg == '-D':
      runDetails.detailedRes = 1
    elif arg == '-L':
      runDetails.limitDepth = int(val)
    elif arg == '-q':
      # FIX: dangerous - eval() executes whatever text was passed.  It is
      # kept because GetCommandLine() writes -q values in a compacted list
      # form, which may be an expression rather than a plain literal.
      qBounds = eval(val)
      assert type(qBounds) in [type([]),type(())],'bad argument type for -q, specify a list as a string'
      runDetails.qBoundCount=val
      runDetails.qBounds = qBounds
    elif arg == '-V':
      ShowVersion()
      sys.exit(0)
    elif arg == '--nRuns':
      runDetails.nRuns = int(val)
    elif arg == '--modelFiltFrac':
      runDetails.modelFilterFrac=float(val)
    elif arg == '--modelFiltVal':
      runDetails.modelFilterVal=float(val)
    elif arg == '--prune':
      runDetails.pruneIt=1
    elif arg == '--profile':
      runDetails.profileIt=1

    elif arg == '--recycle':
      runDetails.recycleVars=1
    elif arg == '--randomDescriptors':
      runDetails.randomDescriptors=int(val)

    # each --do* option switches the competing model types off
    elif arg == '--doKnn':
      runDetails.useKNN=1
      runDetails.useTrees=0
      runDetails.useNaiveBayes=0
    elif arg == '--knnK':
      runDetails.knnNeighs = int(val)
    elif arg == '--knnTanimoto':
      runDetails.knnDistFunc="Tanimoto"
    elif arg == '--knnEuclid':
      runDetails.knnDistFunc="Euclidean"

    elif arg == '--doSigTree':
      runDetails.useKNN=0
      runDetails.useTrees=0
      runDetails.useNaiveBayes=0
      runDetails.useSigTrees=1
    elif arg == '--doCMIM':
      runDetails.useCMIM=int(val)
    elif arg == '--allowCollections':
      runDetails.allowCollections=True

    elif arg == '--doNaiveBayes':
      runDetails.useNaiveBayes=1
      runDetails.useKNN=0
      runDetails.useTrees=0
      runDetails.useSigBayes=0
    elif arg == '--doSigBayes':
      runDetails.useSigBayes=1
      runDetails.useNaiveBayes=0
      runDetails.useKNN=0
      runDetails.useTrees=0
    elif arg == '--mEstimateVal':
      runDetails.mEstimateVal=float(val)

    elif arg== '--seed':
      # FIX: dangerous
      runDetails.randomSeed = eval(val)

    elif arg== '--noScreen':
      runDetails.noScreen=1

    elif arg== '--replacementSelection':
      runDetails.replacementSelection = 1

    else:
      # -h, or an option which getopt accepted but which has no handler
      Usage()

  if not extra:
    # no table or file name was given: show the usage message (which exits)
    # rather than failing with an IndexError
    Usage()
  runDetails.tableName=extra[0]
|
|
|
|
if __name__ == '__main__':
  # command line entry point: parse the options into the module-level run
  # record, report the version and command line, then build the model(s)
  if len(sys.argv) < 2:
    Usage()

  _runDetails.cmd = ' '.join(sys.argv)
  SetDefaults(_runDetails)
  ParseArgs(_runDetails)

  ShowVersion(includeArgs=1)

  if _runDetails.nRuns > 1:
    # several independent runs; profiling is not available in this mode
    for runIdx in range(_runDetails.nRuns):
      sys.stderr.write('---------------------------------\n\tDoing %d of %d\n---------------------------------\n'%(runIdx+1,_runDetails.nRuns))
      RunIt(_runDetails)
  elif _runDetails.profileIt:
    # a single run under the profiler; the 30 most expensive entries
    # (sorted by time, then calls) are printed
    import hotshot,hotshot.stats
    prof=hotshot.Profile('prof.dat')
    prof.runcall(RunIt,_runDetails)
    stats = hotshot.stats.load('prof.dat')
    stats.strip_dirs()
    stats.sort_stats('time','calls')
    stats.print_stats(30)
  else:
    RunIt(_runDetails)
|
|
|