rdkit/Python/ML/GrowComposite.py

# $Id$
#
#  Copyright (C) 2003-2006  greg Landrum and Rational Discovery LLC
#
#   @@ All Rights Reserved  @@
#

""" command line utility for growing composite models

**Usage**

  _GrowComposite [optional args] filename_

**Command Line Arguments**

  - -n *count*: number of new models to build

  - -C *pickle file name*:  name of file containing composite upon which to build.

  - --inNote *note*: note to be used in loading composite models from the database
      for growing

  - --balTable *table name*:  table from which to take the original data set
     (for balancing)

  - --balWeight *weight*: (between 0 and 1) weighting factor for the new data
     (for balancing). OR, *weight* can be a list of weights

  - --balCnt *count*: number of individual models in the balanced composite
     (for balancing)

  - --balH: use only the holdout set from the original data set in the balancing
     (for balancing)

  - --balT: use only the training set from the original data set in the balancing
     (for balancing)

  - -S: shuffle the original data set
     (for balancing)

  - -r: randomize the activities of the original data set
     (for balancing)

  - -N *note*: note to be attached to the grown composite when it's saved in the
     database

  - --outNote *note*: equivalent to -N

  - -o *filename*: name of an output file to hold the pickled composite after
     it has been grown.
     If multiple balance weights are used, the weights will be added to
     the filenames.

  - -L *limit*: provide an (integer) limit on individual model complexity

  - -d *database name*: instead of reading the data from a QDAT file,
     pull it from a database.  In this case, the _filename_ argument
     provides the name of the database table containing the data set.

  - -p *tablename*: store persistence data in the database
     in table *tablename*

  - -l: locks the random number generator to give consistent sets
     of training and hold-out data.  This is primarily intended
     for testing purposes.

  - -g: be less greedy when training the models.

  - -G *number*: force trees to be rooted at descriptor *number*.

  - -D: show a detailed breakdown of the composite model performance
     across the training and, when appropriate, hold-out sets.

  - -t *threshold value*: use high-confidence predictions for the final
     analysis of the hold-out data.

  - -q *list string*:  Add QuantTrees to the composite and use the list
     specified in *list string* as the number of target quantization
     bounds for each descriptor.  Don't forget to include 0's at the
     beginning and end of *list string* for the name and value fields.
     For example, if there are 4 descriptors and you want 2 quant bounds
     apiece, you would use _-q "[0,2,2,2,2,0]"_.
     Two special cases:
       1) If you would like to ignore a descriptor in the model building,
          use '-1' for its number of quant bounds.
       2) If you have integer valued data that should not be quantized
          further, enter 0 for that descriptor.

  - -V: print the version number and exit

"""
import RDConfig
from Numeric import *
from ML.Data import DataUtils,SplitData
from ML import ScreenComposite,BuildComposite
from ML.Composite import AdjustComposite
from Dbase.DbConnection import DbConnect
from ML import CompositeRun
import sys,cPickle,time,types

_runDetails = CompositeRun.CompositeRun()

__VERSION_STRING="0.5.0"

_verbose = 1
def message(msg):
  """ emits messages to _sys.stdout_
    override this in modules which import this one to redirect output

    **Arguments**

      - msg: the string to be displayed

  """
  if _verbose: sys.stdout.write('%s\n'%(msg))

def GrowIt(details,composite,progressCallback=None,
           saveIt=1,setDescNames=0,data=None):
  """ does the actual work of building a composite model

    **Arguments**

      - details:  a _CompositeRun.CompositeRun_ object containing details
        (options, parameters, etc.) about the run

      - composite: the composite model to grow

      - progressCallback: (optional) a function which is called with a single
        argument (the number of models built so far) after each model is built.

      - saveIt: (optional) if this is nonzero, the resulting model will be pickled
        and dumped to the filename specified in _details.outName_

      - setDescNames: (optional) if nonzero, the composite's _SetInputOrder()_ method
        will be called using the results of the data set's _GetVarNames()_ method;
        it is assumed that the details object has a _descNames attribute which
        is passed to the composites _SetDescriptorNames()_ method.  Otherwise
        (the default), _SetDescriptorNames()_ gets the results of _GetVarNames()_.

      - data: (optional) the data set to be used.  If this is not provided, the
        data set described in details will be used.

    **Returns**

      the enlarged composite model


  """
  details.rundate = time.asctime()

  if data is None:
    fName = details.tableName.strip()
    if details.outName == '':
      details.outName = fName + '.pkl'
    if details.dbName == '':
      data = DataUtils.BuildQuantDataSet(fName)
    elif details.qBounds != []:
      details.tableName = fName
      data = details.GetDataSet()
    else:
      data = DataUtils.DBToQuantData(details.dbName,fName,quantName=details.qTableName,
                                     user=details.dbUser,password=details.dbPassword)

  nExamples = data.GetNPts()
  seed = composite._randomSeed
  DataUtils.InitRandomNumbers(seed)
  testExamples = []
  if details.shuffleActivities == 1:
    DataUtils.RandomizeActivities(data,shuffle=1,runDetails=details)
  elif details.randomActivities == 1:
    DataUtils.RandomizeActivities(data,shuffle=0,runDetails=details)

  namedExamples = data.GetNamedData()
  trainExamples = namedExamples
  nExamples = len(trainExamples)
  message('Training with %d examples'%(nExamples))
  message('\t%d descriptors'%(len(trainExamples[0])-2))
  nVars = data.GetNVars()
  nPossibleVals = composite.nPossibleVals
  attrs = range(1,nVars+1)

  if details.useTrees:
    from ML.DecTree import CrossValidate,PruneTree
    if details.qBounds != []:
      from ML.DecTree import BuildQuantTree
      builder = BuildQuantTree.QuantTreeBoot
    else:
      from ML.DecTree import ID3
      builder = ID3.ID3Boot
    driver = CrossValidate.CrossValidationDriver
    pruner = PruneTree.PruneTree

    if setDescNames:
      composite.SetInputOrder(data.GetVarNames())
    composite.Grow(trainExamples,attrs,[0]+nPossibleVals,
                   buildDriver=driver,
                   pruner=pruner,
                   nTries=details.nModels,pruneIt=details.pruneIt,
                   lessGreedy=details.lessGreedy,needsQuantization=0,
                   treeBuilder=builder,nQuantBounds=details.qBounds,
                   startAt=details.startAt,
                   maxDepth=details.limitDepth,
                   progressCallback=progressCallback,
                   silent=not _verbose)


  else:
    from ML.Neural import CrossValidate
    driver = CrossValidate.CrossValidationDriver
    composite.Grow(trainExamples,attrs,[0]+nPossibleVals,nTries=details.nModels,
                   buildDriver=driver,needsQuantization=0)

  composite.AverageErrors()
  composite.SortModels()
  modelList,counts,avgErrs = composite.GetAllData()
  counts = array(counts)
  avgErrs = array(avgErrs)
  composite._varNames = data.GetVarNames()

  for i in xrange(len(modelList)):
    modelList[i].NameModel(composite._varNames)

  # do final statistics
  weightedErrs = counts*avgErrs
  averageErr = sum(weightedErrs)/sum(counts)
  devs = (avgErrs - averageErr)
  devs = devs * counts
  devs = sqrt(devs*devs)
  avgDev = sum(devs)/sum(counts)
  if _verbose:
    message('# Overall Average Error: %%% 5.2f, Average Deviation: %%% 6.2f'%(100.*averageErr,100.*avgDev))

  if details.bayesModel:
    composite.Train(trainExamples,verbose=0)

  badExamples = []
  if not details.detailedRes:
    if _verbose:
      message('Testing all examples')
    wrong = BuildComposite.testall(composite,namedExamples,badExamples)
    if _verbose:
      message('%d examples (%% %5.2f) were misclassified'%(len(wrong),100.*float(len(wrong))/float(len(namedExamples))))
    _runDetails.overall_error = float(len(wrong))/len(namedExamples)

  if details.detailedRes:
    if _verbose:
      message('\nEntire data set:')
    resTup = ScreenComposite.ShowVoteResults(range(data.GetNPts()),data,composite,
                                             nPossibleVals[-1],details.threshold)
    nGood,nBad,nSkip,avgGood,avgBad,avgSkip,voteTab = resTup
    nPts = len(namedExamples)
    nClass = nGood+nBad
    _runDetails.overall_error = float(nBad) / nClass
    _runDetails.overall_correct_conf = avgGood
    _runDetails.overall_incorrect_conf = avgBad
    _runDetails.overall_result_matrix = repr(voteTab)
    nRej = nClass-nPts
    if nRej > 0:
      _runDetails.overall_fraction_dropped = float(nRej)/nPts

  return composite

def GetComposites(details):
  res = []
  if details.persistTblName and details.inNote:
    conn = DbConnect(details.dbName,details.persistTblName)
    mdls = conn.GetData(fields='MODEL',where="where note='%s'"%(details.inNote))
    for row in mdls:
      rawD = row[0]
      res.append(cPickle.loads(str(rawD)))
  elif details.composFileName:
    res.append(cPickle.load(open(details.composFileName,'rb')))
  return res

def BalanceComposite(details,composite,data1=None,data2=None):
  """ balances the composite using the parameters provided in details

   **Arguments**

     - details a _CompositeRun.RunDetails_ object

     - composite: the composite model to be balanced

     - data1: (optional) if provided, this should be the
       data set used to construct the original models

     - data2: (optional) if provided, this should be the
       data set used to construct the new individual models

  """
  if not details.balCnt or details.balCnt > len(composite):
    return composite
  message("Balancing Composite")

  #
  # start by getting data set 1: which is the data set used to build the
  #  original models
  #
  if data1 is None:
    message("\tReading First Data Set")
    fName = details.balTable.strip()
    tmp = details.tableName
    details.tableName = fName
    dbName = details.dbName
    details.dbName = details.balDb
    data1 = details.GetDataSet()
    details.tableName = tmp
    details.dbName = dbName
  if data1 is None:
    return composite
  details.splitFrac = composite._splitFrac
  details.randomSeed = composite._randomSeed
  DataUtils.InitRandomNumbers(details.randomSeed)
  if details.shuffleActivities == 1:
    DataUtils.RandomizeActivities(data1,shuffle=1,runDetails=details)
  elif details.randomActivities == 1:
    DataUtils.RandomizeActivities(data1,shuffle=0,runDetails=details)
  namedExamples = data1.GetNamedData()
  if details.balDoHoldout or details.balDoTrain:
    trainIdx,testIdx = SplitData.SplitIndices(len(namedExamples),details.splitFrac,
                                              silent=1)
    trainExamples = [namedExamples[x] for x in trainIdx]
    testExamples = [namedExamples[x] for x in testIdx]
    if details.filterFrac != 0.0:
      trainIdx,temp = DataUtils.FilterData(trainExamples,details.filterVal,
                                           details.filterFrac,-1,
                                           indicesOnly=1)
      tmp = [trainExamples[x] for x in trainIdx]
      testExamples += [trainExamples[x] for x in temp]
      trainExamples = tmp
    if details.balDoHoldout:
      testExamples,trainExamples = trainExamples,testExamples
  else:
    trainExamples = namedExamples
  dataSet1 = trainExamples
  cols1 = [x.upper() for x in data1.GetVarNames()]
  data1 = None

  #
  # now grab data set 2: the data used to build the new individual models
  #
  if data2 is None:
    message("\tReading Second Data Set")
    data2 = details.GetDataSet()
  if data2 is None:
    return composite
  details.splitFrac = composite._splitFrac
  details.randomSeed = composite._randomSeed
  DataUtils.InitRandomNumbers(details.randomSeed)
  if details.shuffleActivities == 1:
    DataUtils.RandomizeActivities(data2,shuffle=1,runDetails=details)
  elif details.randomActivities == 1:
    DataUtils.RandomizeActivities(data2,shuffle=0,runDetails=details)
  dataSet2 = data2.GetNamedData()
  cols2 = [x.upper() for x in data2.GetVarNames()]
  data2 = None

  # and balance it:
  res = []
  weights = details.balWeight
  if type(weights) not in (types.TupleType,types.ListType):
    weights = (weights,)
  for weight in weights:
    message("\tBalancing with Weight: %.4f"%(weight))
    res.append(AdjustComposite.BalanceComposite(composite,dataSet1,dataSet2,
                                                weight,
                                                details.balCnt,
                                                names1=cols1,names2=cols2))
  return res

def ShowVersion(includeArgs=0):
  """ prints the version number

  """
  print 'This is GrowComposite.py version %s'%(__VERSION_STRING)
  if includeArgs:
    import sys
    print 'command line was:'
    print ' '.join(sys.argv)

def Usage():
  """ provides a list of arguments for when this is used from the command line

  """
  import sys
  print __doc__
  sys.exit(-1)

def SetDefaults(runDetails=None):
  """  initializes a details object with default values

      **Arguments**

        - details:  (optional) a _CompositeRun.CompositeRun_ object.
          If this is not provided, the global _runDetails will be used.

      **Returns**

        the initialized _CompositeRun_ object.


  """
  if runDetails is None: runDetails = _runDetails
  return CompositeRun.SetDefaults(runDetails)

def ParseArgs(runDetails):
  """ parses command line arguments and updates _runDetails_

      **Arguments**

        - runDetails:  a _CompositeRun.CompositeRun_ object.

  """
  import getopt
  args,extra = getopt.getopt(sys.argv[1:],'P:o:n:p:b:sf:F:v:hlgd:rSTt:Q:q:DVG:L:C:N:',
                             ['inNote=','outNote=','balTable=','balWeight=','balCnt=',
                              'balH','balT','balDb=',])
  runDetails.inNote=''
  runDetails.composFileName=''
  runDetails.balTable=''
  runDetails.balWeight=(0.5,)
  runDetails.balCnt=0
  runDetails.balDoHoldout=0
  runDetails.balDoTrain=0
  runDetails.balDb=''
  for arg,val in args:
    if arg == '-n':
      runDetails.nModels = int(val)
    elif arg == '-C':
      runDetails.composFileName=val
    elif arg=='--balTable':
      runDetails.balTable=val
    elif arg=='--balWeight':
      runDetails.balWeight=eval(val)
      if type(runDetails.balWeight) not in (types.TupleType,types.ListType):
        runDetails.balWeight=(runDetails.balWeight,)
    elif arg=='--balCnt':
      runDetails.balCnt=int(val)
    elif arg=='--balH':
      runDetails.balDoHoldout=1
    elif arg=='--balT':
      runDetails.balDoTrain=1
    elif arg=='--balDb':
      runDetails.balDb=val
    elif arg == '--inNote':
      runDetails.inNote=val
    elif arg == '-N' or arg=='--outNote':
      runDetails.note=val
    elif arg == '-o':
      runDetails.outName = val
    elif arg == '-p':
      runDetails.persistTblName=val
    elif arg == '-r':
      runDetails.randomActivities = 1
    elif arg == '-S':
      runDetails.shuffleActivities = 1
    elif arg == '-h':
      Usage()
    elif arg == '-l':
      runDetails.lockRandom = 1
    elif arg == '-g':
      runDetails.lessGreedy=1
    elif arg == '-G':
      runDetails.startAt = int(val)
    elif arg == '-d':
      runDetails.dbName=val
    elif arg == '-T':
      runDetails.useTrees = 0
    elif arg == '-t':
      runDetails.threshold=float(val)
    elif arg == '-D':
      runDetails.detailedRes = 1
    elif arg == '-L':
      runDetails.limitDepth = int(val)
    elif arg == '-q':
      qBounds = eval(val)
      assert type(qBounds) in (types.TupleType,types.ListType),'bad argument type for -q, specify a list as a string'
      runDetails.qBoundCount=val
      runDetails.qBounds = qBounds
    elif arg == '-Q':
      qBounds = eval(val)
      assert type(qBounds) in [type([]),type(())],'bad argument type for -Q, specify a list as a string'
      runDetails.activityBounds=qBounds
      runDetails.activityBoundsVals=val
    elif arg == '-V':
      ShowVersion()
      sys.exit(0)
    else:
      print >>sys.stderr,'bad argument:',arg
      Usage()
  runDetails.tableName=extra[0]
  if not runDetails.balDb:
    runDetails.balDb=runDetails.dbName
if __name__ == '__main__':
  if len(sys.argv) < 2:
    Usage()

  _runDetails.cmd = ' '.join(sys.argv)
  SetDefaults(_runDetails)
  ParseArgs(_runDetails)

  ShowVersion(includeArgs=1)

  initModels = GetComposites(_runDetails)
  nModels = len(initModels)
  if nModels>1:
    for i in range(nModels):
      sys.stderr.write('---------------------------------\n\tDoing %d of %d\n---------------------------------\n'%(i+1,nModels))
      composite = GrowIt(_runDetails,initModels[i],setDescNames=1)
      if _runDetails.balTable and _runDetails.balCnt:
        composites = BalanceComposite(_runDetails,composite)
      else:
        composites=[composite]
      for mdl in composites:
        mdl.ClearModelExamples()
      if _runDetails.outName:
        nWeights = len(_runDetails.balWeight)
        if nWeights==1:
          outName = _runDetails.outName
          composites[0].Pickle(outName)
        else:
          for i in range(nWeights):
            weight = int(100*_runDetails.balWeight[i])
            model = composites[i]
            outName = '%s.%d.pkl'%(_runDetails.outName.split('.pkl')[0],weight)
            model.Pickle(outName)
      if _runDetails.persistTblName and _runDetails.dbName:
        message('Updating results table %s:%s'%(_runDetails.dbName,_runDetails.persistTblName))
        if(len(_runDetails.balWeight))>1:
          message('WARNING: updating results table with models having different weights')
        # save the composite
        for i in range(len(composites)):
          _runDetails.model = cPickle.dumps(composites[i])
          _runDetails.Store(db=_runDetails.dbName,table=_runDetails.persistTblName)
  elif nModels==1:
    composite = GrowIt(_runDetails,initModels[0],setDescNames=1)
    if _runDetails.balTable and _runDetails.balCnt:
      composites = BalanceComposite(_runDetails,composite)
    else:
      composites=[composite]
    for mdl in composites:
      mdl.ClearModelExamples()
    if _runDetails.outName:
      nWeights = len(_runDetails.balWeight)
      if nWeights==1:
        outName = _runDetails.outName
        composites[0].Pickle(outName)
      else:
        for i in range(nWeights):
          weight = int(100*_runDetails.balWeight[i])
          model = composites[i]
          outName = '%s.%d.pkl'%(_runDetails.outName.split('.pkl')[0],weight)
          model.Pickle(outName)
    if _runDetails.persistTblName and _runDetails.dbName:
      message('Updating results table %s:%s'%(_runDetails.dbName,_runDetails.persistTblName))
      if(len(composites))>1:
        message('WARNING: updating results table with models having different weights')
      for i in range(len(composites)):
        _runDetails.model = cPickle.dumps(composites[i])
        _runDetails.Store(db=_runDetails.dbName,table=_runDetails.persistTblName)
  else:
    message("No models found")