rdkit/Python/ML/InfoTheory/BitRank.py

#
#  Copyright (C) 2001,2002,2003  greg Landrum and Rational Discovery LLC
#
""" Functionality for ranking bits using info gains

 **Definitions used in this module**

    - *sequence*: an object capable of containing other objects which supports
      __getitem__() and __len__().  Examples of these include lists, tuples, and
      Numeric arrays.

    - *IntVector*: an object containing integers which supports __getitem__() and
       __len__(). Examples include lists, tuples, Numeric Arrays, and BitVects.


 **NOTE**: Neither *sequences* nor *IntVectors* need to support item assignment.
   It is perfectly acceptable for them to be read-only, so long as they are
   random-access.

"""
from Numeric import *
from ML.InfoTheory import entropy

def FormCounts(bitVects,actVals,whichBit,nPossibleActs,nPossibleBitVals=2):
  """ generates the counts matrix for a particular bit

  **Arguments**

    - bitVects: a *sequence* containing *IntVectors*

    - actVals: a *sequence* of integer activity values, one per entry in
      _bitVects_.  Each value is used directly as a column index into the
      result.

    - whichBit: an integer, the bit number to use.

    - nPossibleActs: the (integer) number of possible activity values; this
      is the number of columns in the result.

    - nPossibleBitVals: (optional) the (integer) number of distinct values the
      (increasingly inaccurately named) bits in _bitVects_ can take; this is
      the number of rows in the result.

  **Returns**

    a Numeric integer array with shape (nPossibleBitVals,nPossibleActs).
    Element [v,a] is the number of points for which bit _whichBit_ has the
    value v and the activity is a.

  **Raises**

    ValueError if _bitVects_ and _actVals_ have different lengths.

  **Notes**

    This is really intended for internal use.

  """
  if len(bitVects) != len(actVals):
    raise ValueError('var and activity lists should be the same length')
  res = zeros((nPossibleBitVals,nPossibleActs),Int)
  # Index-based access is deliberate: the inputs are only required to support
  # __getitem__() and __len__(), not iteration.
  for i in range(len(bitVects)):
    res[bitVects[i][whichBit],actVals[i]] += 1
  return res

def CalcInfoGains(bitVects,actVals,nPossibleActs,nPossibleBitVals=2):
  """  Calculates the information gain for a set of points and activity values

  **Arguments**

    - bitVects: a *sequence* containing *IntVectors*.  The number of bits is
      taken from the length of the first entry.

    - actVals: a *sequence* of integer activity values, one per entry in
      _bitVects_.

    - nPossibleActs: the (integer) number of possible activity values.

    - nPossibleBitVals: (optional) the (integer) number of distinct values the
      (increasingly inaccurately named) bits in _bitVects_ can take.

   **Returns**

     a Numeric array of floats with one entry per bit: the value returned by
     entropy.InfoGain() for that bit's counts matrix.  The array is empty if
     _bitVects_ is empty.

   **Raises**

     ValueError if _bitVects_ and _actVals_ have different lengths.

  """
  if len(bitVects) != len(actVals):
    raise ValueError('var and activity lists should be the same length')
  if len(bitVects) == 0:
    # With no points there is no first vector to take the bit count from,
    # so there is nothing to score.
    return zeros(0,Float)
  nBits = len(bitVects[0])
  res = zeros(nBits,Float)

  for bit in range(nBits):
    counts = FormCounts(bitVects,actVals,bit,nPossibleActs,
                        nPossibleBitVals=nPossibleBitVals)
    res[bit] = entropy.InfoGain(counts)
  return res

def RankBits(bitVects,actVals,nPossibleBitVals=2,
             metricFunc=CalcInfoGains):
  """ Orders the bits of a data set from best to worst score

  **Arguments**

    - bitVects: a *sequence* containing *IntVectors*

    - actVals: a *sequence* of integer activity values.  The number of
      possible activities is taken to be max(actVals)+1.

    - nPossibleBitVals: (optional) integer passed through unchanged to
      _metricFunc_ as the keyword argument of the same name.

    - metricFunc: (optional) the scoring function.  It is called as
      metricFunc(bitVects,actVals,nPossibleActs,nPossibleBitVals=...) and
      must return one score per bit; _CalcInfoGains()_ is the default.

   **Returns**

     A 2-tuple containing:

       - a list of bit indices sorted by decreasing score

       - the scores exactly as returned by _metricFunc_

  """
  nActivities = max(actVals)+1
  scores = metricFunc(bitVects,actVals,nActivities,
                      nPossibleBitVals=nPossibleBitVals)
  # argsort() is ascending; flip it so the highest score comes first
  ranking = list(argsort(scores))[::-1]
  return ranking,scores


def AnalyzeSparseVects(bitVects,actVals):
  """ Calculates the information gain of every bit which is set at least once

  **Arguments**

    - bitVects: a *sequence* containing SBVs.  Each one must provide
      GetSize() and GetOnBits(); the number of bits is taken from the first.

    - actVals: a *sequence* of activity values, one per entry in _bitVects_.
      Any true value counts as active, anything else as inactive.

   **Returns**

     A 2-tuple containing:

       - a list of (bit, gain, nAct, nInact) tuples, in order of increasing
         bit number, where nAct and nInact are the numbers of active and
         inactive points which have the bit set.  Bits which are not set in
         any vector are skipped.

       - a list holding just the gains, in the same order.

     Both lists are empty if _bitVects_ is empty.

   **Raises**

     ValueError if _bitVects_ and _actVals_ have different lengths.

   **Notes**

      - these need to be bit vects and binary activities

  """
  nPts = len(bitVects)
  if nPts != len(actVals):
    raise ValueError('var and activity lists should be the same length')
  if nPts == 0:
    # no first vector to ask for its size, and nothing to analyze anyway
    return [],[]
  nBits = bitVects[0].GetSize()

  # per-bit tallies of how many active/inactive points have the bit set
  actives = zeros(nBits,Int)
  inactives = zeros(nBits,Int)
  nActives,nInactives = 0,0
  for i in range(nPts):
    sig,act = bitVects[i],actVals[i]
    onBitList = sig.GetOnBits()
    if act:
      for bit in onBitList:
        actives[bit] += 1
      nActives += 1
    else:
      for bit in onBitList:
        inactives[bit] += 1
      nInactives += 1

  # 2x2 contingency table, reused for every bit:
  #   rows    -> bit on / bit off
  #   columns -> active / inactive
  resTbl = zeros((2,2),Int)
  res = []
  gains = []
  for bit in range(nBits):
    nAct,nInact = actives[bit],inactives[bit]
    if nAct or nInact:
      resTbl[0,0] = nAct
      # the "bit off" counts are taken relative to the size of each activity
      # class (not the total number of points) so that the table sums to nPts
      resTbl[1,0] = nActives - nAct
      resTbl[0,1] = nInact
      resTbl[1,1] = nInactives - nInact
      gain = entropy.InfoGain(resTbl)
      gains.append(gain)
      res.append((bit,gain,nAct,nInact))
  return res,gains

def SparseRankBits(bitVects,actVals,metricFunc=AnalyzeSparseVects):
  """ Orders the results of a sparse metric function from best to worst

  **Arguments**

    - bitVects: a *sequence* containing SBVs

    - actVals: a *sequence*

    - metricFunc: (optional) the metric function to be used.  It is called as
      metricFunc(bitVects,actVals) and must return a 2-tuple (info,metrics)
      where _metrics_ is a sequence of numbers.  _AnalyzeSparseVects()_ is
      the default.

   **Returns**

     A 2-tuple containing:

       - a list of ints: the positions within _metrics_ sorted by decreasing
         metric value

       - the _info_ object exactly as returned by _metricFunc_

    **Notes**

      - these need to be bit vects and binary activities

  """
  info,scores = metricFunc(bitVects,actVals)
  # argsort() is ascending; flip it so the highest score comes first
  ranking = list(argsort(scores))[::-1]
  return ranking,info