mirror of
https://github.com/rdkit/rdkit.git
synced 2026-06-07 22:44:25 +08:00
199 lines
5.3 KiB
Python
Executable File
199 lines
5.3 KiB
Python
Executable File
#
|
|
# Copyright (C) 2001,2002,2003 greg Landrum and Rational Discovery LLC
|
|
#
|
|
""" Functionality for ranking bits using info gains
|
|
|
|
**Definitions used in this module**
|
|
|
|
- *sequence*: an object capable of containing other objects which supports
|
|
__getitem__() and __len__(). Examples of these include lists, tuples, and
|
|
Numeric arrays.
|
|
|
|
- *IntVector*: an object containing integers which supports __getitem__() and
|
|
__len__(). Examples include lists, tuples, Numeric Arrays, and BitVects.
|
|
|
|
|
|
**NOTE**: Neither *sequences* nor *IntVectors* need to support item assignment.
|
|
It is perfectly acceptable for them to be read-only, so long as they are
|
|
random-access.
|
|
|
|
"""
|
|
from Numeric import *
|
|
from ML.InfoTheory import entropy
|
|
|
|
def FormCounts(bitVects,actVals,whichBit,nPossibleActs,nPossibleBitVals=2):
|
|
""" generates the counts matrix for a particular bit
|
|
|
|
**Arguments**
|
|
|
|
- bitVects: a *sequence* containing *IntVectors*
|
|
|
|
- actVals: a *sequence*
|
|
|
|
- whichBit: an integer, the bit number to use.
|
|
|
|
- nPossibleActs: the (integer) number of possible activity values.
|
|
|
|
- nPossibleBitVals: (optional) if specified, this integer provides the maximum
|
|
value attainable by the (increasingly inaccurately named) bits in _bitVects_.
|
|
|
|
**Returns**
|
|
|
|
a Numeric array with the counts
|
|
|
|
**Notes**
|
|
|
|
This is really intended for internal use.
|
|
|
|
"""
|
|
if len(bitVects) != len(actVals): raise ValueError,'var and activity lists should be the same length'
|
|
res = zeros((nPossibleBitVals,nPossibleActs),Int)
|
|
for i in xrange(len(bitVects)):
|
|
res[bitVects[i][whichBit],actVals[i]] += 1
|
|
return res
|
|
|
|
def CalcInfoGains(bitVects,actVals,nPossibleActs,nPossibleBitVals=2):
|
|
""" Calculates the information gain for a set of points and activity values
|
|
|
|
**Arguments**
|
|
|
|
- bitVects: a *sequence* containing *IntVectors*
|
|
|
|
- actVals: a *sequence*
|
|
|
|
- nPossibleActs: the (integer) number of possible activity values.
|
|
|
|
- nPossibleBitVals: (optional) if specified, this integer provides the maximum
|
|
value attainable by the (increasingly inaccurately named) bits in _bitVects_.
|
|
|
|
**Returns**
|
|
|
|
a list of floats
|
|
|
|
"""
|
|
if len(bitVects) != len(actVals): raise ValueError,'var and activity lists should be the same length'
|
|
nBits = len(bitVects[0])
|
|
res = zeros(nBits,Float)
|
|
|
|
for bit in xrange(nBits):
|
|
counts = FormCounts(bitVects,actVals,bit,nPossibleActs,
|
|
nPossibleBitVals=nPossibleBitVals)
|
|
res[bit] = entropy.InfoGain(counts)
|
|
return res
|
|
|
|
def RankBits(bitVects,actVals,nPossibleBitVals=2,
|
|
metricFunc=CalcInfoGains):
|
|
""" Rank a set of bits according to a metric function
|
|
|
|
**Arguments**
|
|
|
|
- bitVects: a *sequence* containing *IntVectors*
|
|
|
|
- actVals: a *sequence*
|
|
|
|
- nPossibleBitVals: (optional) if specified, this integer provides the maximum
|
|
value attainable by the (increasingly inaccurately named) bits in _bitVects_.
|
|
|
|
- metricFunc: (optional) the metric function to be used. See _CalcInfoGains()_
|
|
for a description of the signature of this function.
|
|
|
|
**Returns**
|
|
|
|
A 2-tuple containing:
|
|
|
|
- the relative order of the bits (a list of ints)
|
|
|
|
- the metric calculated for each bit (a list of floats)
|
|
|
|
"""
|
|
nPossibleActs = max(actVals)+1
|
|
metrics = metricFunc(bitVects,actVals,nPossibleActs,
|
|
nPossibleBitVals=nPossibleBitVals)
|
|
bitOrder = list(argsort(metrics))
|
|
bitOrder.reverse()
|
|
return bitOrder,metrics
|
|
|
|
|
|
def AnalyzeSparseVects(bitVects,actVals):
|
|
""" #DOC
|
|
|
|
**Arguments**
|
|
|
|
- bitVects: a *sequence* containing SBVs
|
|
|
|
- actVals: a *sequence*
|
|
|
|
**Returns**
|
|
|
|
a list of floats
|
|
|
|
**Notes**
|
|
|
|
- these need to be bit vects and binary activities
|
|
|
|
"""
|
|
nPts = len(bitVects)
|
|
if nPts != len(actVals): raise ValueError,'var and activity lists should be the same length'
|
|
nBits = bitVects[0].GetSize()
|
|
|
|
actives = zeros(nBits,Int)
|
|
inactives = zeros(nBits,Int)
|
|
nActives,nInactives = 0,0
|
|
for i in range(nPts):
|
|
sig,act = bitVects[i],actVals[i]
|
|
onBitList = sig.GetOnBits()
|
|
if act:
|
|
for bit in onBitList:
|
|
actives[bit] += 1
|
|
nActives += 1
|
|
else:
|
|
for bit in onBitList:
|
|
inactives[bit] += 1
|
|
nInactives += 1
|
|
resTbl = zeros((2,2),Int)
|
|
res = []
|
|
gains = []
|
|
counts = []
|
|
for bit in xrange(nBits):
|
|
nAct,nInact = actives[bit],inactives[bit]
|
|
if nAct or nInact:
|
|
resTbl[0,0] = nAct
|
|
resTbl[1,0] = nPts - nAct
|
|
resTbl[0,1] = nInact
|
|
resTbl[1,1] = nPts - nInact
|
|
gain = entropy.InfoGain(resTbl)
|
|
gains.append(gain)
|
|
res.append((bit,gain,nAct,nInact))
|
|
return res,gains
|
|
|
|
def SparseRankBits(bitVects,actVals,metricFunc=AnalyzeSparseVects):
|
|
""" Rank a set of bits according to a metric function
|
|
|
|
**Arguments**
|
|
|
|
- bitVects: a *sequence* containing SBVs
|
|
|
|
- actVals: a *sequence*
|
|
|
|
- metricFunc: (optional) the metric function to be used. See _SparseCalcInfoGains()_
|
|
for a description of the signature of this function.
|
|
|
|
**Returns**
|
|
|
|
A 2-tuple containing:
|
|
|
|
- the relative order of the bits (a list of ints)
|
|
|
|
- the metric calculated for each bit (a list of floats)
|
|
|
|
**Notes**
|
|
|
|
- these need to be bit vects and binary activities
|
|
|
|
"""
|
|
info,metrics = metricFunc(bitVects,actVals)
|
|
bitOrder = list(argsort(metrics))
|
|
bitOrder.reverse()
|
|
return bitOrder,info
|
|
|