rdkit/Python/ML/Data/Quantize.py

# $Id$
#
#  Copyright (C) 2001,2002,2003  Greg Landrum and Rational Discovery LLC
#   All Rights Reserved
#

""" Automatic search for quantization bounds

This uses the expected informational gain to determine where quantization bounds should
lie.

**Notes**:

  - bounds are less than, so if the bounds are [1.,2.],
    [0.9,1.,1.1,2.,2.2] -> [0,1,1,2,2]

"""
from Numeric import *
from ML.InfoTheory import entropy
# cQuantize is an optional compiled accelerator.  Only a failed import
#  should select the pure Python fallbacks; anything else (including
#  KeyboardInterrupt) is allowed to propagate.
try:
  import cQuantize
except ImportError:
  hascQuantize = 0
else:
  hascQuantize = 1

_float_tol = 1e-8
def feq(v1,v2,tol=_float_tol):
  """ floating point equality with a tolerance factor

    **Arguments**

      - v1: a float

      - v2: a float

      - tol: the tolerance for comparison

    **Returns**

      0 or 1

  """
  return abs(v1-v2) < tol

def FindVarQuantBound(vals,results,nPossibleRes):
  """ Finds a single quantization bound for a variable

   Kept for historic reasons; the work is done by FindVarMultQuantBounds
   called with a request for one bound.

   **Returns**

     a 2-tuple containing the first bound found and the information gain

  """
  boundList,infoGain = FindVarMultQuantBounds(vals,1,results,nPossibleRes)
  firstBound = boundList[0]
  return (firstBound,infoGain)


def _GenVarTable(vals,cuts,starts,results,nPossibleRes):
  """ Primarily intended for internal use

   constructs a variable table for the data passed in
   The table for a given variable records the number of times each possible value
    of that variable appears for each possible result of the function.

   **Arguments**

     - vals: a 1D Numeric array with the values of the variables

     - cuts: a list with the indices of the quantization bounds
       (indices are into _starts_ )

     - starts: a list of potential starting points for quantization bounds

     - results: a 1D Numeric array of integer result codes

     - nPossibleRes: an integer with the number of possible result codes

   **Returns**

     the varTable, a 2D Numeric array which is nVarValues x nPossibleRes

   **Notes**

     - _vals_ should be sorted!

  """
  nVals = len(cuts)+1
  varTable = zeros((nVals,nPossibleRes),Int)
  idx = 0
  for i in xrange(nVals-1):
    cut = cuts[i]
    while idx < starts[cut]:
      varTable[i,results[idx]] += 1
      idx += 1
  while idx < len(vals):
    varTable[-1,results[idx]] += 1
    idx += 1
  return varTable

def _PyRecurseOnBounds(vals,cuts,which,starts,results,nPossibleRes,varTable=None):
  """ Primarily intended for internal use

   Exhaustive recursive search for the best quantization boundaries.
   The variable table is rebuilt from scratch for every cut position
   that is examined.

   **Arguments**

     - vals: a 1D Numeric array with the values of the variables,
       this should be sorted

     - cuts: a list with the indices of the quantization bounds
       (indices are into _starts_ ), this is modified in place

     - which: an integer indicating which bound is being adjusted here
       (an index into _cuts_ )

     - starts: a list of potential starting points for quantization bounds

     - results: a 1D Numeric array of integer result codes

     - nPossibleRes: an integer with the number of possible result codes

     - varTable: (optional) a variable table, one is generated if this
       is None

   **Returns**

     - a 2-tuple containing:

       1) the best information gain found so far

       2) a list of the quantization bound indices ( _cuts_ for the best case)

   **Notes**

    - this is not even remotely efficient, which is why a C replacement
      was written

    - the recursion goes through the module-level _RecurseOnBounds name,
      not directly through this function

  """
  nCuts = len(cuts)
  bestGain = -1e6
  bestCuts = None
  # the largest index this cut can take while still leaving one distinct
  #  start point for each of the cuts that follow it
  lastAllowed = len(starts) - nCuts + which
  if varTable is None:
    varTable = _GenVarTable(vals,cuts,starts,results,nPossibleRes)
  while cuts[which] <= lastAllowed:
    varTable = _GenVarTable(vals,cuts,starts,results,nPossibleRes)
    gain = entropy.InfoGain(varTable)
    if gain > bestGain:
      bestGain = gain
      bestCuts = cuts[:]
    # let the cuts after this one explore their own positions
    if nCuts-1 > which:
      gain,subCuts = _RecurseOnBounds(vals,cuts[:],which+1,starts,results,
                                      nPossibleRes,varTable = varTable)
      if gain > bestGain:
        bestGain = gain
        bestCuts = subCuts
    # advance this cut and push along any later cut it has run into
    cuts[which] += 1
    for later in range(which+1,nCuts):
      if cuts[later] == cuts[later-1]:
        cuts[later] += 1

  return bestGain,bestCuts

def _NewPyRecurseOnBounds(vals,cuts,which,starts,results,nPossibleRes,varTable=None):
  """ Primarily intended for internal use

   Recursively finds the best quantization boundaries

   Unlike _PyRecurseOnBounds, the variable table is not regenerated for
   every position of the cut: it is built once (if not supplied) and then
   adjusted in place each time the cut being handled here is advanced.

   **Arguments**

     - vals: a 1D Numeric array with the values of the variables,
       this should be sorted

     - cuts: a list with the indices of the quantization bounds
       (indices are into _starts_ ), this is modified in place

     - which: an integer indicating which bound is being adjusted here
       (and index into _cuts_ )

     - starts: a list of potential starting points for quantization bounds

     - results: a 1D Numeric array of integer result codes

     - nPossibleRes: an integer with the number of possible result codes

     - varTable: (optional) the variable table matching _cuts_, if this is
       None it is generated with _GenVarTable.  A table passed in is
       modified in place.

   **Returns**

     - a 2-tuple containing:

       1) the best information gain found so far

       2) a list of the quantization bound indices ( _cuts_ for the best case)

   **Notes**

    - this is not even remotely efficient, which is why a C replacement
      was written

    - the recursion goes through the module-level _RecurseOnBounds name,
      which is bound to either the C implementation or this function

  """
  nBounds = len(cuts)
  maxGain = -1e6
  bestCuts = None
  # the largest index this cut can take while still leaving one distinct
  #  start point for each of the (nBounds-which-1) cuts that follow it
  highestCutHere = len(starts) - nBounds + which
  if varTable is None:
    varTable = _GenVarTable(vals,cuts,starts,results,nPossibleRes)
  while cuts[which] <= highestCutHere:
    gainHere = entropy.InfoGain(varTable)
    if gainHere > maxGain:
      maxGain = gainHere
      bestCuts = cuts[:]
    # recurse on the next vars if needed
    if which < nBounds-1:
      # a copy of cuts is handed down so the callee's changes stay local;
      #  no table is passed, so this implementation builds a fresh one there
      gainHere,cutsHere=_RecurseOnBounds(vals,cuts[:],which+1,starts,results,nPossibleRes,
                                         varTable = None)
      if gainHere > maxGain:
        maxGain = gainHere
        bestCuts = cutsHere
    # update this cut
    oldCut = cuts[which]
    cuts[which] += 1
    # the data points in [bot,top) lie between the old and new positions of
    #  this cut; they move from bin which+1 into bin which
    bot = starts[oldCut]
    if oldCut+1 < len(starts):
      top = starts[oldCut+1]
    else:
      # oldCut is the last entry of starts here, so top == bot and no
      #  points are moved
      top = starts[-1]
    for i in range(bot,top):
      v = results[i]
      varTable[which,v] += 1
      varTable[which+1,v] -= 1
    # push along any later cut that this one has just run into
    # NOTE(review): varTable is not adjusted for these pushed cuts; it looks
    #  like the recursive call above (which rebuilds its own table) is what
    #  evaluates those configurations correctly - confirm
    for i in range(which+1,nBounds):
      if cuts[i] == cuts[i-1]:
        cuts[i] += 1


  return maxGain,bestCuts


  # --------------------------------
  #
  # find all possible dividing points
  #
  #  There are a couple requirements for a dividing point:
  #    1) the dependent variable (descriptor) must change across it,
  #    2) the result score must change across it
  #
  #  So, in the list [(0,0),(1,0),(1,1),(2,1)]:
  #    - we cannot divide before (1,1) (same descriptor value as (1,0))
  #    - we can divide before (1,0) and before (2,1) (the descriptor value
  #      changes at both positions); these are the points the code below finds
  #
  # --------------------------------
def _NewPyFindStartPoints(sortVals,sortResults,nData):
  startNext = []
  tol = 1e-8
  i = 0
  start=0
  actHomog=1
  valHomog=1
  while i<nData:
    # we don't need to use abs (the list is ordered)
    if sortVals[i]-sortVals[start]>tol:
      valHomog=0
    if sortResults[i]!=sortResults[start]:
      actHomog=0
    if not actHomog and not valHomog:
      # we have a switch, now we just need to figure out where the
      #  switch is.
      if( sortVals[i]-sortVals[i-1]<tol ):
        # we're in a block with constant descriptor value, find its beginning:
        while i>1 and sortVals[i]-sortVals[i-1]<tol:
          i-=1
        # i is now just upstream of the transition, which is exactly where
        #  we want the cut point
      else:
        # we don't need to touch i, the dividing line goes right before it
        pass
      startNext.append(i)
      start = i
      actHomog = 1
      valHomog = 1
    i += 1
  return startNext

def FindVarMultQuantBounds(vals,nBounds,results,nPossibleRes):
  """ finds multiple quantization bounds for a single variable

   The data are sorted by variable value, the candidate positions for
   bounds are located, and the combination of candidates giving the
   largest information gain is converted into bound values.

   **Arguments**

     - vals: sequence of variable values (assumed to be floats)

     - nBounds: the number of quantization bounds to find, this is
       reduced if there are fewer candidate positions than requested

     - results: a list of result codes (should be integers)

     - nPossibleRes: an integer with the number of possible values of the
       result variable

   **Returns**

     - a 2-tuple containing:

       1) a list of the quantization bounds (floats)

       2) the information gain associated with this quantization

   **Notes**

     - empty input gives ([],-1e8)

     - data with no candidate positions give ([0],0.0)

     - a bound falling between two data points is placed midway between
       their values

  """
  assert len(vals) == len(results), 'vals/results length mismatch'

  nPts = len(vals)
  if nPts == 0:
    return [],-1e8

  # order the data by variable value; multiarray.argsort is called
  #  directly to bypass the type checking of a normal argsort call
  order = multiarray.argsort(vals)
  orderedVals = array(take(vals,order),Float)
  orderedRes = array(take(results,order),Int)

  candidates = _FindStartPoints(orderedVals,orderedRes,nPts)
  nCandidates = len(candidates)
  if not nCandidates:
    return [0],0.0
  if nCandidates < nBounds:
    nBounds = nCandidates-1
  if nBounds == 0:
    nBounds = 1

  # the search starts with the cuts on the first nBounds candidates
  bestGain,bestCuts = _RecurseOnBounds(orderedVals,range(nBounds),0,candidates,
                                       orderedRes,nPossibleRes)

  # convert the winning candidate indices into bound values
  nOrdered = len(orderedVals)
  bounds = []
  for cut in bestCuts:
    pos = candidates[cut]
    if pos == nOrdered:
      bound = orderedVals[-1]
    elif pos == 0:
      bound = orderedVals[pos]
    else:
      bound = (orderedVals[pos]+orderedVals[pos-1])/2.
    bounds.append(bound)

  return bounds,bestGain

# pick the implementations used by FindVarMultQuantBounds: the pure Python
#  versions unless the compiled cQuantize module was imported
if not hascQuantize:
  _FindStartPoints = _NewPyFindStartPoints
  _RecurseOnBounds = _NewPyRecurseOnBounds
else:
  _FindStartPoints = cQuantize._FindStartPoints
  _RecurseOnBounds = cQuantize._RecurseOnBounds

if __name__ == '__main__':
  # small demonstration of FindVarMultQuantBounds on hand-built data
  import sys

  def _split(pairs):
    """ separates (value,result code) pairs into two parallel lists """
    return [pair[0] for pair in pairs],[pair[1] for pair in pairs]

  # the (descriptor value, result code) pairs used by both branches
  d = [(1.,0),
       (1.1,0),
       (1.2,0),
       (1.4,1),
       (1.4,0),
       (1.6,1),
       (2.,1),
       (2.1,0),
       (2.1,0),
       (2.1,0),
       (2.2,1),
       (2.3,0)]
  if 1:
    # two bounds on the full data set
    varValues,resCodes = _split(d)
    nPossibleRes = 2
    res = FindVarMultQuantBounds(varValues,2,resCodes,nPossibleRes)
    print('RES: ' + str(res))
    # the result expected for this data (not checked automatically)
    target = ([1.3, 2.05],.34707 )
  else:
    # one bound on the full data set
    varValues,resCodes = _split(d)
    nPossibleRes =2
    res = FindVarMultQuantBounds(varValues,1,resCodes,nPossibleRes)
    print(str(res))
    #sys.exit(1)

    # a single descriptor value carrying both result codes
    d = [(1.4,1),
         (1.4,0)]
    varValues,resCodes = _split(d)
    nPossibleRes =2
    res = FindVarMultQuantBounds(varValues,1,resCodes,nPossibleRes)
    print(str(res))

    # more bounds requested than the data can support
    d = [(1.4,0),
         (1.4,0),(1.6,1)]
    varValues,resCodes = _split(d)
    nPossibleRes =2
    res = FindVarMultQuantBounds(varValues,2,resCodes,nPossibleRes)
    print(str(res))