Files
rdkit/Python/ML/InfoTheory/BitRank.py
Greg Landrum 75a79b6327 initial import
2006-05-06 22:20:08 +00:00

199 lines
5.3 KiB
Python
Executable File

#
# Copyright (C) 2001,2002,2003 greg Landrum and Rational Discovery LLC
#
""" Functionality for ranking bits using info gains
**Definitions used in this module**
- *sequence*: an object capable of containing other objects which supports
__getitem__() and __len__(). Examples of these include lists, tuples, and
Numeric arrays.
- *IntVector*: an object containing integers which supports __getitem__() and
__len__(). Examples include lists, tuples, Numeric Arrays, and BitVects.
**NOTE**: Neither *sequences* nor *IntVectors* need to support item assignment.
It is perfectly acceptable for them to be read-only, so long as they are
random-access.
"""
from Numeric import *
from ML.InfoTheory import entropy
def FormCounts(bitVects, actVals, whichBit, nPossibleActs, nPossibleBitVals=2):
    """ generates the counts matrix for a particular bit

    **Arguments**

      - bitVects: a *sequence* containing *IntVectors*

      - actVals: a *sequence* holding one activity value per entry of
        _bitVects_

      - whichBit: an integer, the bit number to use.

      - nPossibleActs: the (integer) number of possible activity values.

      - nPossibleBitVals: (optional) the (integer) number of distinct values
        the (increasingly inaccurately named) bits in _bitVects_ can take.
        Bit values are used directly as row indices of the result.

    **Returns**

      a Numeric array with the counts.  It has _nPossibleBitVals_ rows and
      _nPossibleActs_ columns; element [v,a] is the number of points whose
      selected bit has value v and whose activity is a.

    **Raises**

      ValueError if _bitVects_ and _actVals_ differ in length.

    **Notes**

      This is really intended for internal use.

    """
    nPts = len(bitVects)
    if nPts != len(actVals):
        raise ValueError('var and activity lists should be the same length')
    res = zeros((nPossibleBitVals, nPossibleActs), Int)
    # index explicitly: the inputs are only required to support
    # __getitem__() and __len__()
    for i in range(nPts):
        res[bitVects[i][whichBit], actVals[i]] += 1
    return res
def CalcInfoGains(bitVects, actVals, nPossibleActs, nPossibleBitVals=2):
    """ Calculates the information gain for a set of points and activity values

    **Arguments**

      - bitVects: a *sequence* containing *IntVectors*.  It must not be
        empty: the number of bits is taken from the length of its first
        element.

      - actVals: a *sequence* holding one activity value per entry of
        _bitVects_

      - nPossibleActs: the (integer) number of possible activity values.

      - nPossibleBitVals: (optional) the (integer) number of distinct values
        the (increasingly inaccurately named) bits in _bitVects_ can take.

    **Returns**

      a Numeric array of floats with one entry per bit: the value returned
      by _entropy.InfoGain()_ for that bit's counts matrix (see
      _FormCounts()_).

    **Raises**

      ValueError if _bitVects_ and _actVals_ differ in length.

    """
    if len(bitVects) != len(actVals):
        raise ValueError('var and activity lists should be the same length')
    nBits = len(bitVects[0])
    res = zeros(nBits, Float)
    for bit in range(nBits):
        counts = FormCounts(bitVects, actVals, bit, nPossibleActs,
                            nPossibleBitVals=nPossibleBitVals)
        res[bit] = entropy.InfoGain(counts)
    return res
def RankBits(bitVects, actVals, nPossibleBitVals=2,
             metricFunc=CalcInfoGains):
    """ Rank a set of bits according to a metric function

    **Arguments**

      - bitVects: a *sequence* containing *IntVectors*

      - actVals: a *sequence* of integer activity values; the number of
        possible activities is taken to be its largest entry plus one.

      - nPossibleBitVals: (optional) passed through unchanged to
        _metricFunc_ as the keyword argument of the same name.

      - metricFunc: (optional) the metric function to be used.  It is called
        as metricFunc(bitVects,actVals,nPossibleActs,nPossibleBitVals=...);
        see _CalcInfoGains()_ for a description of the signature.

    **Returns**

      A 2-tuple containing:

        - the bit indices ordered from highest metric to lowest (a list)

        - the metric values exactly as returned by _metricFunc_

    """
    activityCount = max(actVals) + 1
    metrics = metricFunc(bitVects, actVals, activityCount,
                         nPossibleBitVals=nPossibleBitVals)
    # argsort is ascending; flip it so the best-scoring bit comes first
    ranking = list(argsort(metrics))[::-1]
    return ranking, metrics
def AnalyzeSparseVects(bitVects, actVals):
    """ Calculates the info gain of every bit that is set in at least one vector

    **Arguments**

      - bitVects: a *sequence* containing SBVs (objects providing
        GetSize() and GetOnBits())

      - actVals: a *sequence* holding one activity value per entry of
        _bitVects_

    **Returns**

      A 2-tuple containing:

        - a list of (bit, gain, nAct, nInact) tuples in increasing bit
          order, where nAct/nInact are the numbers of active/inactive
          points that have the bit set.  Bits that are never set are
          skipped.

        - a list with just the gains (floats), in the same order

    **Raises**

      ValueError if _bitVects_ and _actVals_ differ in length.

    **Notes**

      - these need to be bit vects and binary activities: any activity
        value that evaluates as true counts as active, anything else as
        inactive.

      - the size of the first vector is used as the number of bits, so
        _bitVects_ must not be empty.

    """
    nPts = len(bitVects)
    if nPts != len(actVals):
        raise ValueError('var and activity lists should be the same length')
    nBits = bitVects[0].GetSize()

    # per-bit tallies of how many active / inactive points set each bit,
    # plus the overall class sizes
    actives = zeros(nBits, Int)
    inactives = zeros(nBits, Int)
    nActives, nInactives = 0, 0
    for i in range(nPts):
        sig, act = bitVects[i], actVals[i]
        onBitList = sig.GetOnBits()
        if act:
            for bit in onBitList:
                actives[bit] += 1
            nActives += 1
        else:
            for bit in onBitList:
                inactives[bit] += 1
            nInactives += 1

    # 2x2 table handed to entropy.InfoGain(), reused for every bit:
    #   rows:    bit on / bit off
    #   columns: active / inactive
    resTbl = zeros((2, 2), Int)
    res = []
    gains = []
    for bit in range(nBits):
        nAct, nInact = actives[bit], inactives[bit]
        if nAct or nInact:
            resTbl[0, 0] = nAct
            resTbl[0, 1] = nInact
            # the "bit off" counts are taken relative to each class size so
            # that the columns sum to the number of actives and inactives
            # (using nPts here would make both columns sum to nPts)
            resTbl[1, 0] = nActives - nAct
            resTbl[1, 1] = nInactives - nInact
            gain = entropy.InfoGain(resTbl)
            gains.append(gain)
            res.append((bit, gain, nAct, nInact))
    return res, gains
def SparseRankBits(bitVects, actVals, metricFunc=AnalyzeSparseVects):
    """ Rank a set of bits according to a metric function

    **Arguments**

      - bitVects: a *sequence* containing SBVs

      - actVals: a *sequence*

      - metricFunc: (optional) the metric function to be used.  It is called
        as metricFunc(bitVects,actVals) and must return a 2-tuple of
        (info, metrics); see _AnalyzeSparseVects()_ for a description of
        the signature of this function.

    **Returns**

      A 2-tuple containing:

        - the ordering of the entries of the metric list, best first (a
          list of ints).  These are positions in the list returned by
          _metricFunc_, not necessarily bit numbers.

        - the info object returned by _metricFunc_

    **Notes**

      - these need to be bit vects and binary activities

    """
    info, metrics = metricFunc(bitVects, actVals)
    # argsort is ascending; flip it so the best-scoring entry comes first
    ranking = list(argsort(metrics))[::-1]
    return ranking, info