mirror of
https://github.com/rdkit/rdkit.git
synced 2026-06-03 21:44:30 +08:00
first pass, using google style
This commit is contained in:
@@ -20,11 +20,9 @@ namespace python = boost::python;
|
||||
|
||||
typedef double real;
|
||||
|
||||
extern "C"
|
||||
void distdriver_(boost::int64_t *n,boost::int64_t *len,
|
||||
real *dists,
|
||||
boost::int64_t *toggle,
|
||||
boost::int64_t *ia,boost::int64_t *ib,real *crit);
|
||||
extern "C" void distdriver_(boost::int64_t *n, boost::int64_t *len, real *dists,
|
||||
boost::int64_t *toggle, boost::int64_t *ia,
|
||||
boost::int64_t *ib, real *crit);
|
||||
|
||||
//
|
||||
// Rather than deal with any nonsense like trying to get
|
||||
@@ -32,54 +30,53 @@ void distdriver_(boost::int64_t *n,boost::int64_t *len,
|
||||
// (thus drowning in the waves of f2c hate), we'll generate
|
||||
// the distance matrix on our own here and then call distdriver_
|
||||
//
|
||||
void clusterit(real *dataP,boost::int64_t n,boost::int64_t m,boost::int64_t iopt,
|
||||
boost::int64_t *ia,boost::int64_t *ib,real *crit){
|
||||
void clusterit(real *dataP, boost::int64_t n, boost::int64_t m,
|
||||
boost::int64_t iopt, boost::int64_t *ia, boost::int64_t *ib,
|
||||
real *crit) {
|
||||
real *dists;
|
||||
boost::int64_t len;
|
||||
boost::int64_t pos = 0;
|
||||
boost::int64_t i,j,k,iTab,jTab;
|
||||
boost::int64_t i, j, k, iTab, jTab;
|
||||
double tmp;
|
||||
len = (n*(n-1))/2;
|
||||
dists = (real *)calloc(len,sizeof(real));
|
||||
for(i=1;i<n;i++){
|
||||
iTab = i*m;
|
||||
for(j=0;j<i;j++){
|
||||
jTab = j*m;
|
||||
for(k=0;k<m;k++){
|
||||
tmp = dataP[iTab+k]-dataP[jTab+k];
|
||||
dists[pos] += tmp*tmp;
|
||||
len = (n * (n - 1)) / 2;
|
||||
dists = (real *)calloc(len, sizeof(real));
|
||||
for (i = 1; i < n; i++) {
|
||||
iTab = i * m;
|
||||
for (j = 0; j < i; j++) {
|
||||
jTab = j * m;
|
||||
for (k = 0; k < m; k++) {
|
||||
tmp = dataP[iTab + k] - dataP[jTab + k];
|
||||
dists[pos] += tmp * tmp;
|
||||
}
|
||||
pos++;
|
||||
}
|
||||
}
|
||||
distdriver_(&n,&len,dists,&iopt,ia,ib,crit);
|
||||
distdriver_(&n, &len, dists, &iopt, ia, ib, crit);
|
||||
free(dists);
|
||||
};
|
||||
|
||||
static PyObject *
|
||||
Clustering_MurtaghCluster(python::object data, int nPts, int sz, int option)
|
||||
{
|
||||
static PyObject *Clustering_MurtaghCluster(python::object data, int nPts,
|
||||
int sz, int option) {
|
||||
PyArrayObject *dataContig;
|
||||
boost::int64_t *ia,*ib;
|
||||
boost::int64_t *ia, *ib;
|
||||
real *crit;
|
||||
PyObject *res;
|
||||
PyObject *tmp;
|
||||
npy_intp dims[2];
|
||||
|
||||
if (PyArray_Check(data.ptr())) {
|
||||
dataContig
|
||||
= reinterpret_cast<PyArrayObject *>(PyArray_ContiguousFromObject(data.ptr(),PyArray_DOUBLE,2,2));
|
||||
}
|
||||
else {
|
||||
dataContig = reinterpret_cast<PyArrayObject *>(
|
||||
PyArray_ContiguousFromObject(data.ptr(), PyArray_DOUBLE, 2, 2));
|
||||
} else {
|
||||
throw_value_error("PyArray_Type expected as input");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
ia = (boost::int64_t *)calloc(nPts,sizeof(boost::int64_t));
|
||||
ib = (boost::int64_t *)calloc(nPts,sizeof(boost::int64_t));
|
||||
crit = (real *)calloc(nPts,sizeof(real));
|
||||
ia = (boost::int64_t *)calloc(nPts, sizeof(boost::int64_t));
|
||||
ib = (boost::int64_t *)calloc(nPts, sizeof(boost::int64_t));
|
||||
crit = (real *)calloc(nPts, sizeof(real));
|
||||
|
||||
clusterit((real *)dataContig->data,nPts,sz,option,ia,ib,crit);
|
||||
clusterit((real *)dataContig->data, nPts, sz, option, ia, ib, crit);
|
||||
|
||||
dims[0] = nPts;
|
||||
res = PyTuple_New(3);
|
||||
@@ -88,52 +85,47 @@ Clustering_MurtaghCluster(python::object data, int nPts, int sz, int option)
|
||||
// that's why it's ok that we do not free them in this function,
|
||||
// Python will take care of it for us.
|
||||
//
|
||||
tmp = PyArray_SimpleNewFromData(1,dims,NPY_LONG,(void *)ia);
|
||||
PyTuple_SetItem(res,0,(PyObject *)tmp);
|
||||
tmp = PyArray_SimpleNewFromData(1, dims, NPY_LONG, (void *)ia);
|
||||
PyTuple_SetItem(res, 0, (PyObject *)tmp);
|
||||
|
||||
tmp = PyArray_SimpleNewFromData(1,dims,NPY_LONG,(void *)ib);
|
||||
PyTuple_SetItem(res,1,(PyObject *)tmp);
|
||||
tmp = PyArray_SimpleNewFromData(1, dims, NPY_LONG, (void *)ib);
|
||||
PyTuple_SetItem(res, 1, (PyObject *)tmp);
|
||||
|
||||
tmp = PyArray_SimpleNewFromData(1,dims,NPY_DOUBLE,(void *)crit);
|
||||
PyTuple_SetItem(res,2,(PyObject *)tmp);
|
||||
tmp = PyArray_SimpleNewFromData(1, dims, NPY_DOUBLE, (void *)crit);
|
||||
PyTuple_SetItem(res, 2, (PyObject *)tmp);
|
||||
|
||||
return res;
|
||||
};
|
||||
|
||||
|
||||
|
||||
void distclusterit(real *dists,boost::int64_t n,boost::int64_t iopt,
|
||||
boost::int64_t *ia,boost::int64_t *ib,real *crit){
|
||||
void distclusterit(real *dists, boost::int64_t n, boost::int64_t iopt,
|
||||
boost::int64_t *ia, boost::int64_t *ib, real *crit) {
|
||||
boost::int64_t len;
|
||||
|
||||
len = (n*(n-1))/2;
|
||||
distdriver_(&n,&len,dists,&iopt,ia,ib,crit);
|
||||
len = (n * (n - 1)) / 2;
|
||||
distdriver_(&n, &len, dists, &iopt, ia, ib, crit);
|
||||
};
|
||||
|
||||
|
||||
static PyObject *
|
||||
Clustering_MurtaghDistCluster(python::object data, int nPts, int option)
|
||||
{
|
||||
static PyObject *Clustering_MurtaghDistCluster(python::object data, int nPts,
|
||||
int option) {
|
||||
PyArrayObject *dataContig;
|
||||
boost::int64_t *ia,*ib;
|
||||
boost::int64_t *ia, *ib;
|
||||
real *crit;
|
||||
PyObject *res=PyTuple_New(3);
|
||||
PyObject *res = PyTuple_New(3);
|
||||
PyObject *tmp;
|
||||
npy_intp dims[] = {1};
|
||||
|
||||
if (PyArray_Check(data.ptr())) {
|
||||
dataContig
|
||||
= reinterpret_cast<PyArrayObject *>(PyArray_ContiguousFromObject(data.ptr(),PyArray_DOUBLE,1,1));
|
||||
}
|
||||
else {
|
||||
dataContig = reinterpret_cast<PyArrayObject *>(
|
||||
PyArray_ContiguousFromObject(data.ptr(), PyArray_DOUBLE, 1, 1));
|
||||
} else {
|
||||
throw_value_error("PyArray_Type expected as input");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
ia = (boost::int64_t *)calloc(nPts,sizeof(boost::int64_t));
|
||||
ib = (boost::int64_t *)calloc(nPts,sizeof(boost::int64_t));
|
||||
crit = (real *)calloc(nPts,sizeof(real));
|
||||
distclusterit((real *)dataContig->data,nPts,option,ia,ib,crit);
|
||||
ia = (boost::int64_t *)calloc(nPts, sizeof(boost::int64_t));
|
||||
ib = (boost::int64_t *)calloc(nPts, sizeof(boost::int64_t));
|
||||
crit = (real *)calloc(nPts, sizeof(real));
|
||||
distclusterit((real *)dataContig->data, nPts, option, ia, ib, crit);
|
||||
|
||||
dims[0] = nPts;
|
||||
|
||||
@@ -142,30 +134,26 @@ Clustering_MurtaghDistCluster(python::object data, int nPts, int option)
|
||||
// that's why it's ok that we do not free them in this function,
|
||||
// Python will take care of it for us.
|
||||
//
|
||||
tmp = PyArray_SimpleNewFromData(1,dims,NPY_LONG,(void *)ia);
|
||||
PyTuple_SetItem(res,0,tmp);
|
||||
tmp = PyArray_SimpleNewFromData(1, dims, NPY_LONG, (void *)ia);
|
||||
PyTuple_SetItem(res, 0, tmp);
|
||||
|
||||
tmp = PyArray_SimpleNewFromData(1,dims,NPY_LONG,(void *)ib);
|
||||
PyTuple_SetItem(res,1,tmp);
|
||||
tmp = PyArray_SimpleNewFromData(1, dims, NPY_LONG, (void *)ib);
|
||||
PyTuple_SetItem(res, 1, tmp);
|
||||
|
||||
tmp = PyArray_SimpleNewFromData(1, dims, NPY_DOUBLE, (void *)crit);
|
||||
PyTuple_SetItem(res, 2, tmp);
|
||||
|
||||
tmp = PyArray_SimpleNewFromData(1,dims,NPY_DOUBLE,(void *)crit);
|
||||
PyTuple_SetItem(res,2,tmp);
|
||||
|
||||
return res;
|
||||
};
|
||||
|
||||
|
||||
BOOST_PYTHON_MODULE(Clustering) {
|
||||
|
||||
rdkit_import_array();
|
||||
|
||||
python::def("MurtaghCluster", Clustering_MurtaghCluster,
|
||||
( python::arg("data"), python::arg("nPts"),
|
||||
python::arg("sz"), python::arg("option") ),
|
||||
"TODO: provide docstring");
|
||||
(python::arg("data"), python::arg("nPts"), python::arg("sz"),
|
||||
python::arg("option")),
|
||||
"TODO: provide docstring");
|
||||
python::def("MurtaghDistCluster", Clustering_MurtaghDistCluster,
|
||||
( python::arg("data"), python::arg("nPts"),
|
||||
python::arg("option") ),
|
||||
"TODO: provide docstring");
|
||||
(python::arg("data"), python::arg("nPts"), python::arg("option")),
|
||||
"TODO: provide docstring");
|
||||
}
|
||||
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
/** barf [ba:rf] 2. "He suggested using FORTRAN, and everybody barfed."
|
||||
|
||||
- From The Shogakukan DICTIONARY OF NEW ENGLISH (Second edition) */
|
||||
- From The Shogakukan DICTIONARY OF NEW ENGLISH (Second edition) */
|
||||
|
||||
#ifndef F2C_INCLUDE
|
||||
#define F2C_INCLUDE
|
||||
@@ -19,11 +19,11 @@ typedef long int logical;
|
||||
typedef short int shortlogical;
|
||||
typedef char logical1;
|
||||
typedef char integer1;
|
||||
#ifdef INTEGER_STAR_8 /* Adjust for integer*8. */
|
||||
typedef long long longint; /* system-dependent */
|
||||
typedef unsigned long long ulongint; /* system-dependent */
|
||||
#define qbit_clear(a,b) ((a) & ~((ulongint)1 << (b)))
|
||||
#define qbit_set(a,b) ((a) | ((ulongint)1 << (b)))
|
||||
#ifdef INTEGER_STAR_8 /* Adjust for integer*8. */
|
||||
typedef long long longint; /* system-dependent */
|
||||
typedef unsigned long long ulongint; /* system-dependent */
|
||||
#define qbit_clear(a, b) ((a) & ~((ulongint)1 << (b)))
|
||||
#define qbit_set(a, b) ((a) | ((ulongint)1 << (b)))
|
||||
#endif
|
||||
|
||||
#define TRUE_ (1)
|
||||
@@ -48,121 +48,121 @@ typedef long int ftnint;
|
||||
#endif
|
||||
|
||||
/*external read, write*/
|
||||
typedef struct
|
||||
{ flag cierr;
|
||||
ftnint ciunit;
|
||||
flag ciend;
|
||||
char *cifmt;
|
||||
ftnint cirec;
|
||||
typedef struct {
|
||||
flag cierr;
|
||||
ftnint ciunit;
|
||||
flag ciend;
|
||||
char *cifmt;
|
||||
ftnint cirec;
|
||||
} cilist;
|
||||
|
||||
/*internal read, write*/
|
||||
typedef struct
|
||||
{ flag icierr;
|
||||
char *iciunit;
|
||||
flag iciend;
|
||||
char *icifmt;
|
||||
ftnint icirlen;
|
||||
ftnint icirnum;
|
||||
typedef struct {
|
||||
flag icierr;
|
||||
char *iciunit;
|
||||
flag iciend;
|
||||
char *icifmt;
|
||||
ftnint icirlen;
|
||||
ftnint icirnum;
|
||||
} icilist;
|
||||
|
||||
/*open*/
|
||||
typedef struct
|
||||
{ flag oerr;
|
||||
ftnint ounit;
|
||||
char *ofnm;
|
||||
ftnlen ofnmlen;
|
||||
char *osta;
|
||||
char *oacc;
|
||||
char *ofm;
|
||||
ftnint orl;
|
||||
char *oblnk;
|
||||
typedef struct {
|
||||
flag oerr;
|
||||
ftnint ounit;
|
||||
char *ofnm;
|
||||
ftnlen ofnmlen;
|
||||
char *osta;
|
||||
char *oacc;
|
||||
char *ofm;
|
||||
ftnint orl;
|
||||
char *oblnk;
|
||||
} olist;
|
||||
|
||||
/*close*/
|
||||
typedef struct
|
||||
{ flag cerr;
|
||||
ftnint cunit;
|
||||
char *csta;
|
||||
typedef struct {
|
||||
flag cerr;
|
||||
ftnint cunit;
|
||||
char *csta;
|
||||
} cllist;
|
||||
|
||||
/*rewind, backspace, endfile*/
|
||||
typedef struct
|
||||
{ flag aerr;
|
||||
ftnint aunit;
|
||||
typedef struct {
|
||||
flag aerr;
|
||||
ftnint aunit;
|
||||
} alist;
|
||||
|
||||
/* inquire */
|
||||
typedef struct
|
||||
{ flag inerr;
|
||||
ftnint inunit;
|
||||
char *infile;
|
||||
ftnlen infilen;
|
||||
ftnint *inex; /*parameters in standard's order*/
|
||||
ftnint *inopen;
|
||||
ftnint *innum;
|
||||
ftnint *innamed;
|
||||
char *inname;
|
||||
ftnlen innamlen;
|
||||
char *inacc;
|
||||
ftnlen inacclen;
|
||||
char *inseq;
|
||||
ftnlen inseqlen;
|
||||
char *indir;
|
||||
ftnlen indirlen;
|
||||
char *infmt;
|
||||
ftnlen infmtlen;
|
||||
char *inform;
|
||||
ftnint informlen;
|
||||
char *inunf;
|
||||
ftnlen inunflen;
|
||||
ftnint *inrecl;
|
||||
ftnint *innrec;
|
||||
char *inblank;
|
||||
ftnlen inblanklen;
|
||||
typedef struct {
|
||||
flag inerr;
|
||||
ftnint inunit;
|
||||
char *infile;
|
||||
ftnlen infilen;
|
||||
ftnint *inex; /*parameters in standard's order*/
|
||||
ftnint *inopen;
|
||||
ftnint *innum;
|
||||
ftnint *innamed;
|
||||
char *inname;
|
||||
ftnlen innamlen;
|
||||
char *inacc;
|
||||
ftnlen inacclen;
|
||||
char *inseq;
|
||||
ftnlen inseqlen;
|
||||
char *indir;
|
||||
ftnlen indirlen;
|
||||
char *infmt;
|
||||
ftnlen infmtlen;
|
||||
char *inform;
|
||||
ftnint informlen;
|
||||
char *inunf;
|
||||
ftnlen inunflen;
|
||||
ftnint *inrecl;
|
||||
ftnint *innrec;
|
||||
char *inblank;
|
||||
ftnlen inblanklen;
|
||||
} inlist;
|
||||
|
||||
#define VOID void
|
||||
|
||||
union Multitype { /* for multiple entry points */
|
||||
integer1 g;
|
||||
shortint h;
|
||||
integer i;
|
||||
/* longint j; */
|
||||
real r;
|
||||
doublereal d;
|
||||
complex c;
|
||||
doublecomplex z;
|
||||
};
|
||||
union Multitype {/* for multiple entry points */
|
||||
integer1 g;
|
||||
shortint h;
|
||||
integer i;
|
||||
/* longint j; */
|
||||
real r;
|
||||
doublereal d;
|
||||
complex c;
|
||||
doublecomplex z;
|
||||
};
|
||||
|
||||
typedef union Multitype Multitype;
|
||||
|
||||
/*typedef long int Long;*/ /* No longer used; formerly in Namelist */
|
||||
/*typedef long int Long;*/ /* No longer used; formerly in Namelist */
|
||||
|
||||
struct Vardesc { /* for Namelist */
|
||||
char *name;
|
||||
char *addr;
|
||||
ftnlen *dims;
|
||||
int type;
|
||||
};
|
||||
struct Vardesc {/* for Namelist */
|
||||
char *name;
|
||||
char *addr;
|
||||
ftnlen *dims;
|
||||
int type;
|
||||
};
|
||||
typedef struct Vardesc Vardesc;
|
||||
|
||||
struct Namelist {
|
||||
char *name;
|
||||
Vardesc **vars;
|
||||
int nvars;
|
||||
};
|
||||
char *name;
|
||||
Vardesc **vars;
|
||||
int nvars;
|
||||
};
|
||||
typedef struct Namelist Namelist;
|
||||
|
||||
#define abs(x) ((x) >= 0 ? (x) : -(x))
|
||||
#define dabs(x) (doublereal)abs(x)
|
||||
#define min(a,b) ((a) <= (b) ? (a) : (b))
|
||||
#define max(a,b) ((a) >= (b) ? (a) : (b))
|
||||
#define dmin(a,b) (doublereal)min(a,b)
|
||||
#define dmax(a,b) (doublereal)max(a,b)
|
||||
#define bit_test(a,b) ((a) >> (b) & 1)
|
||||
#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b)))
|
||||
#define bit_set(a,b) ((a) | ((uinteger)1 << (b)))
|
||||
#define dabs(x) (doublereal) abs(x)
|
||||
#define min(a, b) ((a) <= (b) ? (a) : (b))
|
||||
#define max(a, b) ((a) >= (b) ? (a) : (b))
|
||||
#define dmin(a, b) (doublereal) min(a, b)
|
||||
#define dmax(a, b) (doublereal) max(a, b)
|
||||
#define bit_test(a, b) ((a) >> (b)&1)
|
||||
#define bit_clear(a, b) ((a) & ~((uinteger)1 << (b)))
|
||||
#define bit_set(a, b) ((a) | ((uinteger)1 << (b)))
|
||||
|
||||
/* procedure parameter types for -A and -C++ */
|
||||
|
||||
@@ -193,10 +193,10 @@ typedef /* Character */ VOID (*H_fp)();
|
||||
typedef /* Subroutine */ int (*S_fp)();
|
||||
#endif
|
||||
/* E_fp is for real functions when -R is not specified */
|
||||
typedef VOID C_f; /* complex function */
|
||||
typedef VOID H_f; /* character function */
|
||||
typedef VOID Z_f; /* double complex function */
|
||||
typedef doublereal E_f; /* real function with -R not specified */
|
||||
typedef VOID C_f; /* complex function */
|
||||
typedef VOID H_f; /* character function */
|
||||
typedef VOID Z_f; /* double complex function */
|
||||
typedef doublereal E_f; /* real function with -R not specified */
|
||||
|
||||
/* undef any lower-case symbols that your C compiler predefines, e.g.: */
|
||||
|
||||
|
||||
@@ -20,10 +20,11 @@ namespace python = boost::python;
|
||||
/***********************************************
|
||||
|
||||
constructs a variable table for the data passed in
|
||||
The table for a given variable records the number of times each possible value
|
||||
The table for a given variable records the number of times each possible
|
||||
value
|
||||
of that variable appears for each possible result of the function.
|
||||
|
||||
**Arguments**
|
||||
**Arguments**
|
||||
|
||||
- vals: pointer to double, contains the values of the variable,
|
||||
should be sorted
|
||||
@@ -34,14 +35,15 @@ namespace python = boost::python;
|
||||
|
||||
- nCuts: int, the length of _cuts_
|
||||
|
||||
- starts: pointer to int, the potential starting points for quantization bounds
|
||||
- starts: pointer to int, the potential starting points for quantization
|
||||
bounds
|
||||
|
||||
- nStarts: int, the length of _starts_
|
||||
|
||||
- results: pointer to int, the result codes
|
||||
|
||||
- nPossibleRes: int, the number of possible result codes
|
||||
|
||||
|
||||
|
||||
**Returns**
|
||||
|
||||
@@ -54,30 +56,29 @@ namespace python = boost::python;
|
||||
- the _results_ array is assumed to be _nVals_ long
|
||||
|
||||
***********************************************/
|
||||
long int *
|
||||
GenVarTable(double *vals,int nVals,long int *cuts,int nCuts,long int *starts,
|
||||
long int *results,int nPossibleRes,long int *varTable)
|
||||
{
|
||||
long int *GenVarTable(double *vals, int nVals, long int *cuts, int nCuts,
|
||||
long int *starts, long int *results, int nPossibleRes,
|
||||
long int *varTable) {
|
||||
RDUNUSED_PARAM(vals);
|
||||
int nBins = nCuts + 1;
|
||||
int idx,i,iTab;
|
||||
int idx, i, iTab;
|
||||
|
||||
memset(varTable,0,nBins*nPossibleRes*sizeof(long int));
|
||||
memset(varTable, 0, nBins * nPossibleRes * sizeof(long int));
|
||||
idx = 0;
|
||||
for(i=0;i<nCuts;i++){
|
||||
for (i = 0; i < nCuts; i++) {
|
||||
int cut = cuts[i];
|
||||
iTab = i*nPossibleRes;
|
||||
while(idx<starts[cut]){
|
||||
varTable[iTab+results[idx]] += 1;
|
||||
iTab = i * nPossibleRes;
|
||||
while (idx < starts[cut]) {
|
||||
varTable[iTab + results[idx]] += 1;
|
||||
idx++;
|
||||
}
|
||||
}
|
||||
iTab = nCuts*nPossibleRes;
|
||||
while(idx<nVals){
|
||||
varTable[iTab+results[idx]] += 1;
|
||||
iTab = nCuts * nPossibleRes;
|
||||
while (idx < nVals) {
|
||||
varTable[iTab + results[idx]] += 1;
|
||||
idx++;
|
||||
}
|
||||
return varTable;
|
||||
return varTable;
|
||||
}
|
||||
|
||||
/***********************************************
|
||||
@@ -86,7 +87,7 @@ GenVarTable(double *vals,int nVals,long int *cuts,int nCuts,long int *starts,
|
||||
we do things this way to avoid having to convert things back and forth
|
||||
from Python objects
|
||||
|
||||
**Arguments**
|
||||
**Arguments**
|
||||
|
||||
- vals: pointer to double, contains the values of the variable,
|
||||
should be sorted
|
||||
@@ -99,14 +100,15 @@ GenVarTable(double *vals,int nVals,long int *cuts,int nCuts,long int *starts,
|
||||
|
||||
- which: int, the quant bound being modified here
|
||||
|
||||
- starts: pointer to int, the potential starting points for quantization bounds
|
||||
- starts: pointer to int, the potential starting points for quantization
|
||||
bounds
|
||||
|
||||
- nStarts: int, the length of _starts_
|
||||
|
||||
- results: pointer to int, the result codes
|
||||
|
||||
- nPossibleRes: int, the number of possible result codes
|
||||
|
||||
|
||||
|
||||
**Returns**
|
||||
|
||||
@@ -120,66 +122,65 @@ GenVarTable(double *vals,int nVals,long int *cuts,int nCuts,long int *starts,
|
||||
- the _results_ array is assumed to be _nVals_ long
|
||||
|
||||
***********************************************/
|
||||
double
|
||||
RecurseHelper(double *vals,int nVals,long int *cuts,int nCuts,int which,
|
||||
long int *starts,int nStarts,long int *results,int nPossibleRes)
|
||||
{
|
||||
double maxGain=-1e6,gainHere;
|
||||
long int *bestCuts,*tCuts;
|
||||
long int *varTable=0;
|
||||
double RecurseHelper(double *vals, int nVals, long int *cuts, int nCuts,
|
||||
int which, long int *starts, int nStarts,
|
||||
long int *results, int nPossibleRes) {
|
||||
double maxGain = -1e6, gainHere;
|
||||
long int *bestCuts, *tCuts;
|
||||
long int *varTable = 0;
|
||||
int highestCutHere = nStarts - nCuts + which;
|
||||
int i,nBounds=nCuts;
|
||||
|
||||
varTable = (long int *)calloc((nCuts+1)*nPossibleRes,sizeof(long int));
|
||||
bestCuts = (long int *)calloc(nCuts,sizeof(long int));
|
||||
tCuts = (long int *)calloc(nCuts,sizeof(long int));
|
||||
GenVarTable(vals,nVals,cuts,nCuts,starts,results,nPossibleRes,varTable);
|
||||
while(cuts[which] <= highestCutHere){
|
||||
gainHere = RDInfoTheory::InfoEntropyGain(varTable,nCuts+1,nPossibleRes);
|
||||
if(gainHere > maxGain){
|
||||
int i, nBounds = nCuts;
|
||||
|
||||
varTable = (long int *)calloc((nCuts + 1) * nPossibleRes, sizeof(long int));
|
||||
bestCuts = (long int *)calloc(nCuts, sizeof(long int));
|
||||
tCuts = (long int *)calloc(nCuts, sizeof(long int));
|
||||
GenVarTable(vals, nVals, cuts, nCuts, starts, results, nPossibleRes,
|
||||
varTable);
|
||||
while (cuts[which] <= highestCutHere) {
|
||||
gainHere = RDInfoTheory::InfoEntropyGain(varTable, nCuts + 1, nPossibleRes);
|
||||
if (gainHere > maxGain) {
|
||||
maxGain = gainHere;
|
||||
memcpy(bestCuts,cuts,nCuts*sizeof(long int));
|
||||
memcpy(bestCuts, cuts, nCuts * sizeof(long int));
|
||||
}
|
||||
|
||||
// recurse on the next vars if needed
|
||||
if(which < nBounds-1){
|
||||
memcpy(tCuts,cuts,nCuts*sizeof(long int));
|
||||
gainHere = RecurseHelper(vals,nVals,tCuts,nCuts,which+1,starts,nStarts,
|
||||
results,nPossibleRes);
|
||||
if(gainHere > maxGain){
|
||||
if (which < nBounds - 1) {
|
||||
memcpy(tCuts, cuts, nCuts * sizeof(long int));
|
||||
gainHere = RecurseHelper(vals, nVals, tCuts, nCuts, which + 1, starts,
|
||||
nStarts, results, nPossibleRes);
|
||||
if (gainHere > maxGain) {
|
||||
maxGain = gainHere;
|
||||
memcpy(bestCuts,tCuts,nCuts*sizeof(long int));
|
||||
memcpy(bestCuts, tCuts, nCuts * sizeof(long int));
|
||||
}
|
||||
}
|
||||
|
||||
// update this cut
|
||||
int oldCut = cuts[which];
|
||||
cuts[which] += 1;
|
||||
int top,bot;
|
||||
int top, bot;
|
||||
bot = starts[oldCut];
|
||||
if(oldCut+1 < nStarts)
|
||||
top = starts[oldCut+1];
|
||||
if (oldCut + 1 < nStarts)
|
||||
top = starts[oldCut + 1];
|
||||
else
|
||||
top = starts[nStarts-1];
|
||||
for(i=bot;i<top;i++) {
|
||||
int v=results[i];
|
||||
varTable[which*nPossibleRes+v] += 1;
|
||||
varTable[(which+1)*nPossibleRes+v] -= 1;
|
||||
top = starts[nStarts - 1];
|
||||
for (i = bot; i < top; i++) {
|
||||
int v = results[i];
|
||||
varTable[which * nPossibleRes + v] += 1;
|
||||
varTable[(which + 1) * nPossibleRes + v] -= 1;
|
||||
}
|
||||
for(i=which+1;i<nBounds;i++){
|
||||
if(cuts[i] == cuts[i-1]) cuts[i] += 1;
|
||||
for (i = which + 1; i < nBounds; i++) {
|
||||
if (cuts[i] == cuts[i - 1]) cuts[i] += 1;
|
||||
}
|
||||
}
|
||||
memcpy(cuts,bestCuts,nCuts*sizeof(long int));
|
||||
memcpy(cuts, bestCuts, nCuts * sizeof(long int));
|
||||
free(tCuts);
|
||||
free(bestCuts);
|
||||
free(varTable);
|
||||
return maxGain;
|
||||
}
|
||||
|
||||
|
||||
/***********************************************
|
||||
|
||||
|
||||
Recursively finds the best quantization boundaries
|
||||
|
||||
**Arguments**
|
||||
@@ -206,21 +207,22 @@ RecurseHelper(double *vals,int nVals,long int *cuts,int nCuts,int which,
|
||||
1) the best information gain found so far
|
||||
|
||||
2) a list of the quantization bound indices ( _cuts_ for the best case)
|
||||
|
||||
|
||||
**Notes**
|
||||
|
||||
- this is not even remotely efficient, which is why a C replacement
|
||||
was written
|
||||
|
||||
- this is a drop-in replacement for *ML.Data.Quantize._PyRecurseBounds*
|
||||
|
||||
|
||||
***********************************************/
|
||||
static python::tuple
|
||||
cQuantize_RecurseOnBounds(python::object vals, python::list pyCuts, int which,
|
||||
python::list pyStarts, python::object results, int nPossibleRes)
|
||||
{
|
||||
PyArrayObject *contigVals,*contigResults;
|
||||
long int *cuts,*starts;
|
||||
static python::tuple cQuantize_RecurseOnBounds(python::object vals,
|
||||
python::list pyCuts, int which,
|
||||
python::list pyStarts,
|
||||
python::object results,
|
||||
int nPossibleRes) {
|
||||
PyArrayObject *contigVals, *contigResults;
|
||||
long int *cuts, *starts;
|
||||
|
||||
/*
|
||||
-------
|
||||
@@ -229,38 +231,37 @@ cQuantize_RecurseOnBounds(python::object vals, python::list pyCuts, int which,
|
||||
|
||||
-------
|
||||
*/
|
||||
contigVals
|
||||
= reinterpret_cast<PyArrayObject *>(PyArray_ContiguousFromObject(vals.ptr(),PyArray_DOUBLE,1,1));
|
||||
if(!contigVals){
|
||||
contigVals = reinterpret_cast<PyArrayObject *>(
|
||||
PyArray_ContiguousFromObject(vals.ptr(), PyArray_DOUBLE, 1, 1));
|
||||
if (!contigVals) {
|
||||
throw_value_error("could not convert value argument");
|
||||
}
|
||||
|
||||
contigResults
|
||||
= reinterpret_cast<PyArrayObject *>(PyArray_ContiguousFromObject(results.ptr(),PyArray_LONG,1,1));
|
||||
if(!contigResults){
|
||||
contigResults = reinterpret_cast<PyArrayObject *>(
|
||||
PyArray_ContiguousFromObject(results.ptr(), PyArray_LONG, 1, 1));
|
||||
if (!contigResults) {
|
||||
throw_value_error("could not convert results argument");
|
||||
}
|
||||
|
||||
python::ssize_t nCuts = python::len(pyCuts);
|
||||
cuts = (long int *)calloc(nCuts,sizeof(long int));
|
||||
for (python::ssize_t i=0; i<nCuts; i++) {
|
||||
cuts = (long int *)calloc(nCuts, sizeof(long int));
|
||||
for (python::ssize_t i = 0; i < nCuts; i++) {
|
||||
python::object elem = pyCuts[i];
|
||||
cuts[i] = python::extract<long int>(elem);
|
||||
}
|
||||
|
||||
python::ssize_t nStarts = python::len(pyStarts);
|
||||
starts = (long int *)calloc(nStarts,sizeof(long int));
|
||||
for (python::ssize_t i=0; i<nStarts; i++){
|
||||
starts = (long int *)calloc(nStarts, sizeof(long int));
|
||||
for (python::ssize_t i = 0; i < nStarts; i++) {
|
||||
python::object elem = pyStarts[i];
|
||||
starts[i] = python::extract<long int>(elem);
|
||||
}
|
||||
|
||||
// do the real work
|
||||
double gain
|
||||
= RecurseHelper((double *)contigVals->data,contigVals->dimensions[0],
|
||||
cuts,nCuts,which,starts,nStarts,
|
||||
(long int *)contigResults->data,nPossibleRes);
|
||||
|
||||
double gain = RecurseHelper(
|
||||
(double *)contigVals->data, contigVals->dimensions[0], cuts, nCuts, which,
|
||||
starts, nStarts, (long int *)contigResults->data, nPossibleRes);
|
||||
|
||||
/*
|
||||
-------
|
||||
|
||||
@@ -269,72 +270,71 @@ cQuantize_RecurseOnBounds(python::object vals, python::list pyCuts, int which,
|
||||
-------
|
||||
*/
|
||||
python::list cutObj;
|
||||
for (python::ssize_t i=0; i<nCuts; i++) {
|
||||
for (python::ssize_t i = 0; i < nCuts; i++) {
|
||||
cutObj.append(cuts[i]);
|
||||
}
|
||||
free(cuts);
|
||||
free(starts);
|
||||
return python::make_tuple(gain, cutObj);
|
||||
return python::make_tuple(gain, cutObj);
|
||||
}
|
||||
|
||||
static python::list
|
||||
cQuantize_FindStartPoints(python::object values, python::object results,
|
||||
int nData)
|
||||
{
|
||||
static python::list cQuantize_FindStartPoints(python::object values,
|
||||
python::object results,
|
||||
int nData) {
|
||||
python::list startPts;
|
||||
|
||||
if(nData<2){
|
||||
if (nData < 2) {
|
||||
return startPts;
|
||||
}
|
||||
|
||||
PyArrayObject *contigVals
|
||||
= reinterpret_cast<PyArrayObject *>(PyArray_ContiguousFromObject(values.ptr(),PyArray_DOUBLE,1,1));
|
||||
if(!contigVals){
|
||||
PyArrayObject *contigVals = reinterpret_cast<PyArrayObject *>(
|
||||
PyArray_ContiguousFromObject(values.ptr(), PyArray_DOUBLE, 1, 1));
|
||||
if (!contigVals) {
|
||||
throw_value_error("could not convert value argument");
|
||||
}
|
||||
|
||||
double *vals=(double *)contigVals->data;
|
||||
double *vals = (double *)contigVals->data;
|
||||
|
||||
PyArrayObject *contigResults
|
||||
= reinterpret_cast<PyArrayObject *>(PyArray_ContiguousFromObject(results.ptr(),PyArray_LONG,1,1));
|
||||
if(!contigResults){
|
||||
PyArrayObject *contigResults = reinterpret_cast<PyArrayObject *>(
|
||||
PyArray_ContiguousFromObject(results.ptr(), PyArray_LONG, 1, 1));
|
||||
if (!contigResults) {
|
||||
throw_value_error("could not convert results argument");
|
||||
}
|
||||
|
||||
long *res=(long *)contigResults->data;
|
||||
long *res = (long *)contigResults->data;
|
||||
|
||||
bool firstBlock=true;
|
||||
long lastBlockAct=-2,blockAct=res[0];
|
||||
int lastDiv=-1;
|
||||
double tol=1e-8;
|
||||
bool firstBlock = true;
|
||||
long lastBlockAct = -2, blockAct = res[0];
|
||||
int lastDiv = -1;
|
||||
double tol = 1e-8;
|
||||
|
||||
int i=1;
|
||||
while(i<nData){
|
||||
while(i<nData && vals[i]-vals[i-1]<=tol){
|
||||
if(res[i]!=blockAct){
|
||||
blockAct=-1;
|
||||
int i = 1;
|
||||
while (i < nData) {
|
||||
while (i < nData && vals[i] - vals[i - 1] <= tol) {
|
||||
if (res[i] != blockAct) {
|
||||
blockAct = -1;
|
||||
}
|
||||
++i;
|
||||
}
|
||||
if(firstBlock){
|
||||
firstBlock=false;
|
||||
lastBlockAct=blockAct;
|
||||
lastDiv=i;
|
||||
if (firstBlock) {
|
||||
firstBlock = false;
|
||||
lastBlockAct = blockAct;
|
||||
lastDiv = i;
|
||||
} else {
|
||||
if(blockAct==-1 || lastBlockAct==-1 || blockAct!=lastBlockAct){
|
||||
startPts.append(lastDiv);
|
||||
lastDiv=i;
|
||||
lastBlockAct=blockAct;
|
||||
if (blockAct == -1 || lastBlockAct == -1 || blockAct != lastBlockAct) {
|
||||
startPts.append(lastDiv);
|
||||
lastDiv = i;
|
||||
lastBlockAct = blockAct;
|
||||
} else {
|
||||
lastDiv=i;
|
||||
lastDiv = i;
|
||||
}
|
||||
}
|
||||
if(i<nData) blockAct=res[i];
|
||||
++i;
|
||||
if (i < nData) blockAct = res[i];
|
||||
++i;
|
||||
}
|
||||
|
||||
// catch the case that the last point also sets a bin:
|
||||
if( blockAct != lastBlockAct ){
|
||||
if (blockAct != lastBlockAct) {
|
||||
startPts.append(lastDiv);
|
||||
}
|
||||
|
||||
@@ -342,19 +342,15 @@ cQuantize_FindStartPoints(python::object values, python::object results,
|
||||
}
|
||||
|
||||
BOOST_PYTHON_MODULE(cQuantize) {
|
||||
|
||||
rdkit_import_array();
|
||||
|
||||
python::def("_RecurseOnBounds", cQuantize_RecurseOnBounds,
|
||||
( python::arg("vals"), python::arg("pyCuts"),
|
||||
python::arg("which"), python::arg("pyStarts"),
|
||||
python::arg("results"), python::arg("nPossibleRes") ),
|
||||
"TODO: provide docstring");
|
||||
python::def("_FindStartPoints", cQuantize_FindStartPoints,
|
||||
( python::arg("values"), python::arg("results"),
|
||||
python::arg("nData") ),
|
||||
"TODO: provide docstring");
|
||||
(python::arg("vals"), python::arg("pyCuts"), python::arg("which"),
|
||||
python::arg("pyStarts"), python::arg("results"),
|
||||
python::arg("nPossibleRes")),
|
||||
"TODO: provide docstring");
|
||||
python::def(
|
||||
"_FindStartPoints", cQuantize_FindStartPoints,
|
||||
(python::arg("values"), python::arg("results"), python::arg("nData")),
|
||||
"TODO: provide docstring");
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -15,105 +15,100 @@
|
||||
#include <boost/dynamic_bitset.hpp>
|
||||
|
||||
namespace RDInfoTheory {
|
||||
//FIX: won't worry about it now, but this class can be templated by the type of
|
||||
// container for the bit list and type of descriptors (fingerprint vs. real valued)
|
||||
class BitCorrMatGenerator {
|
||||
/*! \brief A class to generate a correlation matrix for a bunch of fingerprints
|
||||
*
|
||||
* The correlation matrix is done only for the bit IDs that are set by a call to the
|
||||
* function setDescriptorIdList
|
||||
*
|
||||
* cr = CorrMatGenerator();
|
||||
* cr.setDescriptorIdList(descList);
|
||||
* for each fingerprint in list of fingerprints {
|
||||
* cr.collectVotes(fingerprint);
|
||||
* }
|
||||
* double *corrMat = cr.getCorrMat()
|
||||
*
|
||||
* The resulting correlation matrix is a one dimension matrix with only the lower triangle elements
|
||||
* of the symmetric matrix
|
||||
*/
|
||||
public:
|
||||
BitCorrMatGenerator() {
|
||||
this->initGenerator();
|
||||
};
|
||||
// FIX: won't worry about it now, but this class can be templated by the type of
|
||||
// container for the bit list and type of descriptors (fingerprint vs. real
|
||||
// valued)
|
||||
class BitCorrMatGenerator {
|
||||
/*! \brief A class to generate a correlation matrix for a bunch of
|
||||
*fingerprints
|
||||
*
|
||||
* The correlation matrix is done only for the bit IDs that are set by a call
|
||||
*to the
|
||||
* function setDescriptorIdList
|
||||
*
|
||||
* cr = CorrMatGenerator();
|
||||
* cr.setDescriptorIdList(descList);
|
||||
* for each fingerprint in list of fingerprints {
|
||||
* cr.collectVotes(fingerprint);
|
||||
* }
|
||||
* double *corrMat = cr.getCorrMat()
|
||||
*
|
||||
* The resulting correlation matrix is a one dimension matrix with only the
|
||||
*lower triangle elements
|
||||
* of the symmetric matrix
|
||||
*/
|
||||
public:
|
||||
BitCorrMatGenerator() { this->initGenerator(); };
|
||||
|
||||
~BitCorrMatGenerator() {
|
||||
delete [] dp_corrMat;
|
||||
~BitCorrMatGenerator() { delete[] dp_corrMat; }
|
||||
|
||||
// Puts the generator into its empty state: no examples counted, no bit ids
// stored and no correlation matrix.
// NOTE(review): the matrix pointer is overwritten without being freed; the
// constructor calls this on a fresh object, so confirm before calling it on a
// generator that already holds a matrix.
void initGenerator() {
  d_nExamples = 0;
  d_descs.clear();
  dp_corrMat = 0;
};
|
||||
|
||||
/*! \brief Set the list of bits that we are interested in correlating
|
||||
*
|
||||
* \param bitIdList is a list of bit ids that need to be correlated e.g. a
|
||||
*list top ranked ensemble
|
||||
* of bits
|
||||
*/
|
||||
// Stores a copy of bitIdList and replaces the correlation matrix with a
// zero-filled buffer holding the strict lower triangle of an nd x nd
// symmetric matrix (nd * (nd - 1) / 2 entries, nd = number of bit ids).
//
// Everything that can throw (the copy and the allocation) happens before any
// member is modified, so a failure leaves the generator exactly as it was.
// The previous code deleted the old buffer first, which left dp_corrMat
// dangling (and later double-deleted by the destructor) if `new` threw.
void setBitIdList(const RDKit::INT_VECT &bitIdList) {
  typedef RDKit::INT_VECT::size_type size_type;

  RDKit::INT_VECT descs(bitIdList);
  // computed in the container's unsigned size type rather than int, so long
  // id lists cannot overflow signed arithmetic
  const size_type nd = descs.size();
  const size_type nelem = nd * (nd - 1) / 2;

  // the trailing () value-initializes the array: every count starts at 0.0
  double *fresh = new double[nelem]();

  // commit: nothing below can throw
  d_descs.swap(descs);
  delete[] dp_corrMat;
  dp_corrMat = fresh;
};
|
||||
|
||||
void initGenerator() {
|
||||
dp_corrMat = 0;
|
||||
d_descs.resize(0);
|
||||
d_nExamples = 0;
|
||||
};
|
||||
//! \brief get the number of examples we used so far to compute the
|
||||
//correlation matrix
|
||||
// The counter is zeroed by initGenerator() and incremented once per
// collectVotes() call.
int getNumExamples() const { return d_nExamples; };
|
||||
|
||||
/*! \brief Set the list bits that we are interested in correlating
|
||||
*
|
||||
* \param bitIdList is a list of bit ids that need to be correlated e.g. a list top ranked ensemble
|
||||
* of bits
|
||||
*/
|
||||
void setBitIdList(const RDKit::INT_VECT &bitIdList) {
|
||||
d_descs = bitIdList;
|
||||
int i, nd = d_descs.size();
|
||||
int nelem = nd*(nd-1)/2;
|
||||
delete [] dp_corrMat;
|
||||
//! \brief Get the list of bits ID that are used to generate the correlation
|
||||
//matrix
|
||||
// Returns the stored id list by value, i.e. the caller gets its own copy.
RDKit::INT_VECT getCorrBitList() const { return d_descs; };
|
||||
|
||||
dp_corrMat = new double[nd*(nd-1)/2];
|
||||
for (i = 0; i < nelem; i++) {
|
||||
dp_corrMat[i] = 0.0;
|
||||
//! \brief Gets a pointer to the correlation matrix
|
||||
// Returns the internal buffer itself, not a copy: it is null until
// setBitIdList() has been called, is replaced by every setBitIdList() call and
// is freed by the destructor, so callers must not delete it or keep it across
// those calls.
double *getCorrMat() { return dp_corrMat; };
|
||||
|
||||
//! \brief For each pair of on bits (bi, bj) in fp increase the correlation
|
||||
//count
|
||||
// for the pair by 1
|
||||
void collectVotes(const BitVect &fp) {
|
||||
unsigned int nd = d_descs.size();
|
||||
// use a temporary bit vector to first mask the fingerprint
|
||||
ExplicitBitVect ebv(nd);
|
||||
int bi;
|
||||
for (unsigned int i = 0; i < nd; i++) {
|
||||
bi = d_descs[i];
|
||||
if (fp[bi]) {
|
||||
ebv.setBit(i);
|
||||
}
|
||||
};
|
||||
|
||||
//! \brief get the number of examples we used so far to compute the correlation matrix
|
||||
int getNumExamples() const {
|
||||
return d_nExamples;
|
||||
};
|
||||
|
||||
//! \brief Get the list of bits ID that are used to generate the correlation matrix
|
||||
RDKit::INT_VECT getCorrBitList() const {
|
||||
return d_descs;
|
||||
};
|
||||
|
||||
//! \brief Gets a pointer to the correlation matrix
|
||||
double *getCorrMat() {
|
||||
return dp_corrMat;
|
||||
};
|
||||
|
||||
//! \brief For each pair of on bits (bi, bj) in fp increase the correlation count
|
||||
// for the pair by 1
|
||||
void collectVotes(const BitVect &fp) {
|
||||
unsigned int nd = d_descs.size();
|
||||
// use a temporary bit vector to first mask the fingerprint
|
||||
ExplicitBitVect ebv(nd);
|
||||
int bi;
|
||||
for (unsigned int i = 0; i < nd; i++) {
|
||||
bi = d_descs[i];
|
||||
if (fp[bi]) {
|
||||
ebv.setBit(i);
|
||||
}
|
||||
}
|
||||
for (unsigned i = 1; i < nd; i++) {
|
||||
unsigned int itab = i*(i-1)/2;
|
||||
if (ebv[i]) {
|
||||
for (unsigned int j = 0; j < i; j++) {
|
||||
if ( ebv[j]) {
|
||||
dp_corrMat[itab + j] += 1;
|
||||
}
|
||||
}
|
||||
for (unsigned i = 1; i < nd; i++) {
|
||||
unsigned int itab = i * (i - 1) / 2;
|
||||
if (ebv[i]) {
|
||||
for (unsigned int j = 0; j < i; j++) {
|
||||
if (ebv[j]) {
|
||||
dp_corrMat[itab + j] += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
d_nExamples++;
|
||||
};
|
||||
|
||||
private:
|
||||
RDKit::INT_VECT d_descs;
|
||||
double *dp_corrMat;
|
||||
int d_nExamples;
|
||||
}
|
||||
d_nExamples++;
|
||||
};
|
||||
|
||||
private:
|
||||
RDKit::INT_VECT d_descs;
|
||||
double *dp_corrMat;
|
||||
int d_nExamples;
|
||||
};
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
@@ -20,167 +20,168 @@
|
||||
#include <queue>
|
||||
|
||||
namespace RDInfoTheory {
|
||||
typedef std::pair<double, int> PAIR_D_I;
|
||||
typedef std::vector<PAIR_D_I> VECT_PDI;
|
||||
typedef std::pair<double, int> PAIR_D_I;
|
||||
typedef std::vector<PAIR_D_I> VECT_PDI;
|
||||
|
||||
struct gtDIPair {
|
||||
bool operator() ( const PAIR_D_I &pd1, const PAIR_D_I &pd2) const {
|
||||
return pd1.first > pd2.first;
|
||||
}
|
||||
};
|
||||
|
||||
typedef std::priority_queue<PAIR_D_I, VECT_PDI, gtDIPair> PR_QUEUE;
|
||||
|
||||
|
||||
|
||||
void InfoBitRanker::setBiasList(RDKit::INT_VECT &classList) {
|
||||
URANGE_CHECK(classList.size(), d_classes);
|
||||
d_biasList = classList;
|
||||
//make sure we don't have any duplicates
|
||||
std::sort(d_biasList.begin(), d_biasList.end());
|
||||
RDKit::INT_VECT_CI bi = std::unique(d_biasList.begin(), d_biasList.end());
|
||||
CHECK_INVARIANT(bi == d_biasList.end(), "There are duplicates in the class bias list");
|
||||
|
||||
// finally make sure all the class ID in d_biasList are within range
|
||||
for (bi = d_biasList.begin(); bi != d_biasList.end(); bi++) {
|
||||
URANGE_CHECK(static_cast<unsigned int>(*bi), d_classes-1);
|
||||
}
|
||||
struct gtDIPair {
|
||||
bool operator()(const PAIR_D_I &pd1, const PAIR_D_I &pd2) const {
|
||||
return pd1.first > pd2.first;
|
||||
}
|
||||
};
|
||||
|
||||
void InfoBitRanker::setMaskBits(RDKit::INT_VECT &maskBits) {
|
||||
delete dp_maskBits;
|
||||
dp_maskBits = new ExplicitBitVect(d_dims);
|
||||
for (RDKit::INT_VECT_CI bi = maskBits.begin();
|
||||
bi != maskBits.end(); ++bi) {
|
||||
dp_maskBits->setBit(*bi);
|
||||
}
|
||||
typedef std::priority_queue<PAIR_D_I, VECT_PDI, gtDIPair> PR_QUEUE;
|
||||
|
||||
// Installs the list of class ids that the biased ranking modes favour.
//
// The list must hold at most d_classes entries, contain no duplicates, and
// every id must lie in [0, d_classes - 1]. The ids are stored sorted.
//
// Validation runs on a private copy and the member is only replaced once all
// checks have passed. Previously the member was overwritten first, so a
// rejected list stayed installed and its out-of-range ids would later be used
// by BiasCheckBit to index the per-class fraction vector.
// NOTE(review): this assumes URANGE_CHECK / CHECK_INVARIANT report failure by
// throwing — confirm against the project's invariant macros.
void InfoBitRanker::setBiasList(RDKit::INT_VECT &classList) {
  URANGE_CHECK(classList.size(), d_classes);

  RDKit::INT_VECT sortedIds(classList);
  std::sort(sortedIds.begin(), sortedIds.end());

  // after sorting, duplicates (if any) sit next to each other
  CHECK_INVARIANT(std::adjacent_find(sortedIds.begin(), sortedIds.end()) ==
                      sortedIds.end(),
                  "There are duplicates in the class bias list");

  // make sure all the class ids are within range; the cast makes negative ids
  // show up as very large values, which the range check rejects
  for (RDKit::INT_VECT_CI bi = sortedIds.begin(); bi != sortedIds.end(); ++bi) {
    URANGE_CHECK(static_cast<unsigned int>(*bi), d_classes - 1);
  }

  // all checks passed: commit
  d_biasList.swap(sortedIds);
}
|
||||
|
||||
bool InfoBitRanker::BiasCheckBit(RDKit::USHORT *resMat) const {
|
||||
PRECONDITION(resMat,"bad results pointer");
|
||||
if ((d_biasList.size() == 0) || (d_biasList.size() == d_classes)) {
|
||||
//we will accept the bit
|
||||
return true;
|
||||
}
|
||||
RDKit::DOUBLE_VECT fracs;
|
||||
fracs.resize(d_classes);
|
||||
|
||||
// compute the fractions of items in each class that hit the bit
|
||||
// and record the maximum for the those classes not in the bias list
|
||||
double maxCor = 0.0;
|
||||
for (unsigned int i = 0; i < d_classes; i++) {
|
||||
if (d_clsCount[i] > 0) {
|
||||
fracs[i] = ((double)resMat[i])/d_clsCount[i];
|
||||
} else {
|
||||
fracs[i] = 0.0;
|
||||
}
|
||||
if (std::find(d_biasList.begin(), d_biasList.end(), i) == d_biasList.end()) {
|
||||
// if not in the biasList
|
||||
if (fracs[i] > maxCor) {
|
||||
// if this is fraction is greater than the previously known maximum
|
||||
maxCor = fracs[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
bool bitOk = false;
|
||||
for (RDKit::INT_VECT_CI bci = d_biasList.begin(); bci !=
|
||||
d_biasList.end(); ++bci) {
|
||||
if (fracs[*bci] >= maxCor) {
|
||||
bitOk = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
return bitOk;
|
||||
// Replaces the bit mask: builds a new d_dims-wide bit vector and switches on
// every bit id listed in maskBits.
//
// The new vector is allocated before the old one is released. The previous
// order (delete, then new) left dp_maskBits dangling if the allocation threw,
// and the destructor would then delete it a second time.
void InfoBitRanker::setMaskBits(RDKit::INT_VECT &maskBits) {
  ExplicitBitVect *fresh = new ExplicitBitVect(d_dims);
  delete dp_maskBits;
  dp_maskBits = fresh;

  for (RDKit::INT_VECT_CI bi = maskBits.begin(); bi != maskBits.end(); ++bi) {
    dp_maskBits->setBit(*bi);
  }
}
|
||||
|
||||
|
||||
double InfoBitRanker::BiasChiSquareGain(RDKit::USHORT *resMat) const {
|
||||
PRECONDITION(resMat,"bad result pointer");
|
||||
bool bitOk = this->BiasCheckBit(resMat);
|
||||
double info=0.0;
|
||||
if (bitOk) {
|
||||
info = ChiSquare(resMat, 2, d_classes);
|
||||
}
|
||||
return info;
|
||||
bool InfoBitRanker::BiasCheckBit(RDKit::USHORT *resMat) const {
|
||||
PRECONDITION(resMat, "bad results pointer");
|
||||
if ((d_biasList.size() == 0) || (d_biasList.size() == d_classes)) {
|
||||
// we will accept the bit
|
||||
return true;
|
||||
}
|
||||
RDKit::DOUBLE_VECT fracs;
|
||||
fracs.resize(d_classes);
|
||||
|
||||
double InfoBitRanker::BiasInfoEntropyGain(RDKit::USHORT *resMat) const {
|
||||
PRECONDITION(resMat,"bad result pointer");
|
||||
bool bitOk = this->BiasCheckBit(resMat);
|
||||
double info=0.0;
|
||||
if (bitOk) {
|
||||
info = InfoEntropyGain(resMat, 2, d_classes);
|
||||
// compute the fractions of items in each class that hit the bit
|
||||
// and record the maximum for those classes not in the bias list
|
||||
double maxCor = 0.0;
|
||||
for (unsigned int i = 0; i < d_classes; i++) {
|
||||
if (d_clsCount[i] > 0) {
|
||||
fracs[i] = ((double)resMat[i]) / d_clsCount[i];
|
||||
} else {
|
||||
fracs[i] = 0.0;
|
||||
}
|
||||
return info;
|
||||
}
|
||||
|
||||
void InfoBitRanker::accumulateVotes(const ExplicitBitVect &bv, unsigned int label) {
|
||||
URANGE_CHECK(label, d_classes-1);
|
||||
CHECK_INVARIANT(bv.getNumBits() == d_dims, "Incorrect bit vector size");
|
||||
|
||||
d_nInst += 1;
|
||||
d_clsCount[label] += 1;
|
||||
for (unsigned int i=0;i<bv.getNumBits();i++){
|
||||
if( (*bv.dp_bits)[i] && (!dp_maskBits || dp_maskBits->getBit(i)) ){
|
||||
d_counts[label][i] += 1;
|
||||
if (std::find(d_biasList.begin(), d_biasList.end(), i) ==
|
||||
d_biasList.end()) {
|
||||
// if not in the biasList
|
||||
if (fracs[i] > maxCor) {
|
||||
// if this fraction is greater than the previously known maximum
|
||||
maxCor = fracs[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void InfoBitRanker::accumulateVotes(const SparseBitVect &bv, unsigned int label) {
|
||||
URANGE_CHECK(label, d_classes-1);
|
||||
CHECK_INVARIANT(bv.getNumBits() == d_dims, "Incorrect bit vector size");
|
||||
|
||||
d_nInst += 1;
|
||||
d_clsCount[label] += 1;
|
||||
for (IntSet::const_iterator obi = bv.dp_bits->begin();
|
||||
obi != bv.dp_bits->end();
|
||||
++obi) {
|
||||
if(!dp_maskBits || dp_maskBits->getBit(*obi)){
|
||||
d_counts[label][(*obi)] += 1;
|
||||
}
|
||||
bool bitOk = false;
|
||||
for (RDKit::INT_VECT_CI bci = d_biasList.begin(); bci != d_biasList.end();
|
||||
++bci) {
|
||||
if (fracs[*bci] >= maxCor) {
|
||||
bitOk = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
double *InfoBitRanker::getTopN(unsigned int num) {
|
||||
// this is a place holder to pass along to infogain function
|
||||
// the size of this container should nVals*d_classes, where nVals
|
||||
// is the number of values a variable can take.
|
||||
// since we are dealing with a binary bit vector nVals = 2
|
||||
// in addition the infogain function pretends that this is a 2D matrix
|
||||
// with the number of rows equal to nVals and num of columns equal to
|
||||
// d_classes
|
||||
if(num>d_dims) throw ValueErrorException("attempt to rank more bits than present in the bit vectors");
|
||||
if(dp_maskBits)
|
||||
CHECK_INVARIANT(num <= dp_maskBits->getNumOnBits(), "Can't rank more bits than the ensemble size");
|
||||
RDKit::USHORT *resMat = new RDKit::USHORT[2*d_classes];
|
||||
|
||||
PR_QUEUE topN;
|
||||
return bitOk;
|
||||
}
|
||||
|
||||
for (unsigned int i = 0; i < d_dims; i++) {
|
||||
// we may want to ignore bits that are not turned on in any item of class
|
||||
// "ignoreNoClass"
|
||||
/*
|
||||
if ((0 <= ignoreNoClass) && (d_classes > ignoreNoClass)) {
|
||||
if (d_counts[ignoreNoClass][i] == 0) {
|
||||
continue;
|
||||
}
|
||||
}*/
|
||||
|
||||
|
||||
if (dp_maskBits && !dp_maskBits->getBit(i)) {
|
||||
continue;
|
||||
}
|
||||
// Chi-square score for one bit, gated by the bias list: bits rejected by
// BiasCheckBit score 0.0, accepted bits get ChiSquare over the 2 x d_classes
// count matrix in resMat.
double InfoBitRanker::BiasChiSquareGain(RDKit::USHORT *resMat) const {
  PRECONDITION(resMat, "bad result pointer");
  if (!this->BiasCheckBit(resMat)) {
    return 0.0;
  }
  const double score = ChiSquare(resMat, 2, d_classes);
  return score;
}
|
||||
|
||||
// fill up dmat
|
||||
for (unsigned int j = 0; j < d_classes; j++) {
|
||||
// we know that we have only two rows here
|
||||
resMat[j] = d_counts[j][i];
|
||||
resMat[d_classes + j] = (d_clsCount[j] - d_counts[j][i]);
|
||||
// Entropy-gain score for one bit, gated by the bias list: bits rejected by
// BiasCheckBit score 0.0, accepted bits get InfoEntropyGain over the
// 2 x d_classes count matrix in resMat.
double InfoBitRanker::BiasInfoEntropyGain(RDKit::USHORT *resMat) const {
  PRECONDITION(resMat, "bad result pointer");
  if (!this->BiasCheckBit(resMat)) {
    return 0.0;
  }
  const double gain = InfoEntropyGain(resMat, 2, d_classes);
  return gain;
}
|
||||
|
||||
void InfoBitRanker::accumulateVotes(const ExplicitBitVect &bv,
|
||||
unsigned int label) {
|
||||
URANGE_CHECK(label, d_classes - 1);
|
||||
CHECK_INVARIANT(bv.getNumBits() == d_dims, "Incorrect bit vector size");
|
||||
|
||||
d_nInst += 1;
|
||||
d_clsCount[label] += 1;
|
||||
for (unsigned int i = 0; i < bv.getNumBits(); i++) {
|
||||
if ((*bv.dp_bits)[i] && (!dp_maskBits || dp_maskBits->getBit(i))) {
|
||||
d_counts[label][i] += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void InfoBitRanker::accumulateVotes(const SparseBitVect &bv,
|
||||
unsigned int label) {
|
||||
URANGE_CHECK(label, d_classes - 1);
|
||||
CHECK_INVARIANT(bv.getNumBits() == d_dims, "Incorrect bit vector size");
|
||||
|
||||
d_nInst += 1;
|
||||
d_clsCount[label] += 1;
|
||||
for (IntSet::const_iterator obi = bv.dp_bits->begin();
|
||||
obi != bv.dp_bits->end(); ++obi) {
|
||||
if (!dp_maskBits || dp_maskBits->getBit(*obi)) {
|
||||
d_counts[label][(*obi)] += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
double *InfoBitRanker::getTopN(unsigned int num) {
|
||||
// this is a place holder to pass along to infogain function
|
||||
// the size of this container should nVals*d_classes, where nVals
|
||||
// is the number of values a variable can take.
|
||||
// since we are dealing with a binary bit vector nVals = 2
|
||||
// in addition the infogain function pretends that this is a 2D matrix
|
||||
// with the number of rows equal to nVals and num of columns equal to
|
||||
// d_classes
|
||||
if (num > d_dims)
|
||||
throw ValueErrorException(
|
||||
"attempt to rank more bits than present in the bit vectors");
|
||||
if (dp_maskBits)
|
||||
CHECK_INVARIANT(num <= dp_maskBits->getNumOnBits(),
|
||||
"Can't rank more bits than the ensemble size");
|
||||
RDKit::USHORT *resMat = new RDKit::USHORT[2 * d_classes];
|
||||
|
||||
PR_QUEUE topN;
|
||||
|
||||
for (unsigned int i = 0; i < d_dims; i++) {
|
||||
// we may want to ignore bits that are not turned on in any item of class
|
||||
// "ignoreNoClass"
|
||||
/*
|
||||
if ((0 <= ignoreNoClass) && (d_classes > ignoreNoClass)) {
|
||||
if (d_counts[ignoreNoClass][i] == 0) {
|
||||
continue;
|
||||
}
|
||||
double info = 0.0;
|
||||
switch (d_type) {
|
||||
}*/
|
||||
|
||||
if (dp_maskBits && !dp_maskBits->getBit(i)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// fill up dmat
|
||||
for (unsigned int j = 0; j < d_classes; j++) {
|
||||
// we know that we have only two rows here
|
||||
resMat[j] = d_counts[j][i];
|
||||
resMat[d_classes + j] = (d_clsCount[j] - d_counts[j][i]);
|
||||
}
|
||||
double info = 0.0;
|
||||
switch (d_type) {
|
||||
case ENTROPY:
|
||||
info = InfoEntropyGain(resMat, 2, d_classes);
|
||||
break;
|
||||
@@ -195,100 +196,93 @@ namespace RDInfoTheory {
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
PAIR_D_I entry(info, i);
|
||||
|
||||
if (info >= 0.0) {
|
||||
if (topN.size() < num) {
|
||||
topN.push(entry);
|
||||
}
|
||||
else if (info > topN.top().first) {
|
||||
topN.pop();
|
||||
topN.push(entry);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
delete [] resMat;
|
||||
|
||||
// now fill up the result matrix for the topN bits
|
||||
// the result from this function is a double * of size
|
||||
// num*4. The caller of this function interprets this
|
||||
// array as a two dimensional array of size num*(2+d_classes) with each row
|
||||
// containing the following entries
|
||||
// bitId, infogain, 1 additional column for number of hits for each class
|
||||
//double *res = new double[num*(2+d_classes)];
|
||||
|
||||
d_top = num;
|
||||
int ncols = 2+d_classes;
|
||||
|
||||
delete [] dp_topBits;
|
||||
dp_topBits = new double[num*ncols];
|
||||
|
||||
int offset, bid;
|
||||
|
||||
RDKit::INT_VECT maskBits;
|
||||
if (dp_maskBits && topN.size() < num) {
|
||||
dp_maskBits->getOnBits(maskBits);
|
||||
}
|
||||
|
||||
for (int i = num - 1; i >= 0; i--) {
|
||||
offset = i*ncols;
|
||||
if (topN.size() == 0 ) {
|
||||
if (dp_maskBits) {
|
||||
bid = maskBits[i];
|
||||
} else {
|
||||
bid = i;
|
||||
}
|
||||
dp_topBits[offset + 1] = 0.0;
|
||||
} else {
|
||||
bid = topN.top().second; // bit id
|
||||
dp_topBits[offset + 1] = topN.top().first; // value of the infogain
|
||||
PAIR_D_I entry(info, i);
|
||||
|
||||
if (info >= 0.0) {
|
||||
if (topN.size() < num) {
|
||||
topN.push(entry);
|
||||
} else if (info > topN.top().first) {
|
||||
topN.pop();
|
||||
}
|
||||
dp_topBits[offset] = (double)bid;
|
||||
|
||||
for (unsigned int j = 0; j < d_classes; j++) {
|
||||
dp_topBits[offset + 2 + j] = (double)d_counts[j][bid];
|
||||
topN.push(entry);
|
||||
}
|
||||
}
|
||||
return dp_topBits;
|
||||
}
|
||||
|
||||
void InfoBitRanker::writeTopBitsToStream(std::ostream *outStream) const {
|
||||
(*outStream) << std::setw(12) << "Bit" << std::setw(12) << "InfoContent";
|
||||
for (unsigned int ic = 0; ic < d_classes; ic++) {
|
||||
(*outStream) << std::setw(10) << "class" << ic;
|
||||
}
|
||||
(*outStream) << std::endl;
|
||||
|
||||
unsigned int ncols = 2 + d_classes;
|
||||
for (unsigned int i = 0; i < d_top; i++) {
|
||||
(*outStream) << std::setw(12) << (int)dp_topBits[i*ncols]
|
||||
<< std::setw(12) << std::setprecision(5)
|
||||
<< dp_topBits[i*ncols + 1];
|
||||
for (unsigned int ic = 0; ic < d_classes; ic++) {
|
||||
(*outStream) << std::setw(10) << (int)dp_topBits[i*ncols + 2 + ic];
|
||||
}
|
||||
(*outStream) << "\n";
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
void InfoBitRanker::writeTopBitsToFile(const std::string &fileName) const {
|
||||
std::ofstream tmpStream(fileName.c_str());
|
||||
if ((!tmpStream) || (tmpStream.bad()) ) {
|
||||
std::ostringstream errout;
|
||||
errout << "Bad output file " << fileName;
|
||||
throw RDKit::FileParseException(errout.str());
|
||||
}
|
||||
delete[] resMat;
|
||||
|
||||
std::ostream &outStream = static_cast<std::ostream &>(tmpStream);
|
||||
this->writeTopBitsToStream(&outStream);
|
||||
// now fill up the result matrix for the topN bits
|
||||
// the result from this function is a double * of size
|
||||
// num*(2+d_classes). The caller of this function interprets this
|
||||
// array as a two dimensional array of size num*(2+d_classes) with each row
|
||||
// containing the following entries
|
||||
// bitId, infogain, 1 additional column for number of hits for each class
|
||||
// double *res = new double[num*(2+d_classes)];
|
||||
|
||||
d_top = num;
|
||||
int ncols = 2 + d_classes;
|
||||
|
||||
delete[] dp_topBits;
|
||||
dp_topBits = new double[num * ncols];
|
||||
|
||||
int offset, bid;
|
||||
|
||||
RDKit::INT_VECT maskBits;
|
||||
if (dp_maskBits && topN.size() < num) {
|
||||
dp_maskBits->getOnBits(maskBits);
|
||||
}
|
||||
|
||||
|
||||
for (int i = num - 1; i >= 0; i--) {
|
||||
offset = i * ncols;
|
||||
if (topN.size() == 0) {
|
||||
if (dp_maskBits) {
|
||||
bid = maskBits[i];
|
||||
} else {
|
||||
bid = i;
|
||||
}
|
||||
dp_topBits[offset + 1] = 0.0;
|
||||
} else {
|
||||
bid = topN.top().second; // bit id
|
||||
dp_topBits[offset + 1] = topN.top().first; // value of the infogain
|
||||
topN.pop();
|
||||
}
|
||||
dp_topBits[offset] = (double)bid;
|
||||
|
||||
for (unsigned int j = 0; j < d_classes; j++) {
|
||||
dp_topBits[offset + 2 + j] = (double)d_counts[j][bid];
|
||||
}
|
||||
}
|
||||
return dp_topBits;
|
||||
}
|
||||
|
||||
|
||||
|
||||
void InfoBitRanker::writeTopBitsToStream(std::ostream *outStream) const {
|
||||
(*outStream) << std::setw(12) << "Bit" << std::setw(12) << "InfoContent";
|
||||
for (unsigned int ic = 0; ic < d_classes; ic++) {
|
||||
(*outStream) << std::setw(10) << "class" << ic;
|
||||
}
|
||||
(*outStream) << std::endl;
|
||||
|
||||
unsigned int ncols = 2 + d_classes;
|
||||
for (unsigned int i = 0; i < d_top; i++) {
|
||||
(*outStream) << std::setw(12) << (int)dp_topBits[i * ncols] << std::setw(12)
|
||||
<< std::setprecision(5) << dp_topBits[i * ncols + 1];
|
||||
for (unsigned int ic = 0; ic < d_classes; ic++) {
|
||||
(*outStream) << std::setw(10) << (int)dp_topBits[i * ncols + 2 + ic];
|
||||
}
|
||||
(*outStream) << "\n";
|
||||
}
|
||||
}
|
||||
|
||||
void InfoBitRanker::writeTopBitsToFile(const std::string &fileName) const {
|
||||
std::ofstream tmpStream(fileName.c_str());
|
||||
if ((!tmpStream) || (tmpStream.bad())) {
|
||||
std::ostringstream errout;
|
||||
errout << "Bad output file " << fileName;
|
||||
throw RDKit::FileParseException(errout.str());
|
||||
}
|
||||
|
||||
std::ostream &outStream = static_cast<std::ostream &>(tmpStream);
|
||||
this->writeTopBitsToStream(&outStream);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -15,236 +15,262 @@
|
||||
#include <DataStructs/BitVects.h>
|
||||
#include <iostream>
|
||||
|
||||
|
||||
/*! \brief Class used to rank bits based on a specified measure of information
|
||||
*
|
||||
* Basically a primitive mimic of the CombiChem "signal" functionality
|
||||
* To use:
|
||||
* - create an instance of this class
|
||||
* - loop over the fingerprints in the dataset by calling accumulateVotes method
|
||||
* - create an instance of this class
|
||||
* - loop over the fingerprints in the dataset by calling accumulateVotes
|
||||
*method
|
||||
* - call getTopN to get the top n ranked bits
|
||||
*
|
||||
* Sample usage and results from the python wrapper:
|
||||
* Here's a small set of vectors:
|
||||
* >>> for i,bv in enumerate(bvs): print bv.ToBitString(),acts[i]
|
||||
* ...
|
||||
* ...
|
||||
* 0001 0
|
||||
* 0101 0
|
||||
* 0010 1
|
||||
* 1110 1
|
||||
*
|
||||
*
|
||||
* Default ranker, using infogain:
|
||||
* >>> ranker = InfoBitRanker(4,2)
|
||||
* >>> ranker = InfoBitRanker(4,2)
|
||||
* >>> for i,bv in enumerate(bvs): ranker.AccumulateVotes(bv,acts[i])
|
||||
* ...
|
||||
* >>> for bit,gain,n0,n1 in ranker.GetTopN(3): print int(bit),'%.3f'%gain,int(n0),int(n1)
|
||||
* ...
|
||||
* ...
|
||||
* >>> for bit,gain,n0,n1 in ranker.GetTopN(3): print
|
||||
*int(bit),'%.3f'%gain,int(n0),int(n1)
|
||||
* ...
|
||||
* 3 1.000 2 0
|
||||
* 2 1.000 0 2
|
||||
* 0 0.311 0 1
|
||||
*
|
||||
*
|
||||
* Using the biased infogain:
|
||||
* >>> ranker = InfoBitRanker(4,2,InfoTheory.InfoType.BIASENTROPY)
|
||||
* >>> ranker.SetBiasList((1,))
|
||||
* >>> for i,bv in enumerate(bvs): ranker.AccumulateVotes(bv,acts[i])
|
||||
* ...
|
||||
* >>> for bit,gain,n0,n1 in ranker.GetTopN(3): print int(bit),'%.3f'%gain,int(n0),int(n1)
|
||||
* ...
|
||||
* ...
|
||||
* >>> for bit,gain,n0,n1 in ranker.GetTopN(3): print
|
||||
*int(bit),'%.3f'%gain,int(n0),int(n1)
|
||||
* ...
|
||||
* 2 1.000 0 2
|
||||
* 0 0.311 0 1
|
||||
* 1 0.000 1 1
|
||||
*
|
||||
*
|
||||
* A chi squared ranker is also available:
|
||||
* >>> ranker = InfoBitRanker(4,2,InfoTheory.InfoType.CHISQUARE)
|
||||
* >>> for i,bv in enumerate(bvs): ranker.AccumulateVotes(bv,acts[i])
|
||||
* ...
|
||||
* >>> for bit,gain,n0,n1 in ranker.GetTopN(3): print int(bit),'%.3f'%gain,int(n0),int(n1)
|
||||
* ...
|
||||
* ...
|
||||
* >>> for bit,gain,n0,n1 in ranker.GetTopN(3): print
|
||||
*int(bit),'%.3f'%gain,int(n0),int(n1)
|
||||
* ...
|
||||
* 3 4.000 2 0
|
||||
* 2 4.000 0 2
|
||||
* 0 1.333 0 1
|
||||
*
|
||||
*
|
||||
* As is a biased chi squared:
|
||||
* >>> ranker = InfoBitRanker(4,2,InfoTheory.InfoType.BIASCHISQUARE)
|
||||
* >>> ranker.SetBiasList((1,))
|
||||
* >>> for i,bv in enumerate(bvs): ranker.AccumulateVotes(bv,acts[i])
|
||||
* ...
|
||||
* >>> for bit,gain,n0,n1 in ranker.GetTopN(3): print int(bit),'%.3f'%gain,int(n0),int(n1)
|
||||
* ...
|
||||
* ...
|
||||
* >>> for bit,gain,n0,n1 in ranker.GetTopN(3): print
|
||||
*int(bit),'%.3f'%gain,int(n0),int(n1)
|
||||
* ...
|
||||
* 2 4.000 0 2
|
||||
* 0 1.333 0 1
|
||||
* 1 0.000 1 1
|
||||
*/
|
||||
namespace RDInfoTheory {
|
||||
typedef std::vector<RDKit::USHORT> USHORT_VECT;
|
||||
typedef std::vector<USHORT_VECT> VECT_USHORT_VECT;
|
||||
typedef std::vector<RDKit::USHORT> USHORT_VECT;
|
||||
typedef std::vector<USHORT_VECT> VECT_USHORT_VECT;
|
||||
|
||||
class InfoBitRanker {
|
||||
public:
|
||||
|
||||
/*! \brief the type of measure for information
|
||||
*
|
||||
*/
|
||||
typedef enum {
|
||||
ENTROPY=1,
|
||||
BIASENTROPY=2,
|
||||
CHISQUARE=3,
|
||||
BIASCHISQUARE=4
|
||||
} InfoType;
|
||||
|
||||
/*! \brief Constructor
|
||||
*
|
||||
* ARGUMENTS:
|
||||
*
|
||||
* - nBits: the dimension of the bit vectors or the fingerprint length
|
||||
* - nClasses: the number of classes used in the classification problem (e.g. active,
|
||||
* moderately active, inactive etc.). It is assumed that the classes are
|
||||
* numbered from 0 to (nClasses - 1)
|
||||
* - infoType: the type of information metric
|
||||
*/
|
||||
InfoBitRanker(unsigned int nBits, unsigned int nClasses, InfoType infoType=InfoBitRanker::ENTROPY) :
|
||||
d_dims(nBits), d_classes(nClasses), d_type(infoType) {
|
||||
d_counts.resize(0);
|
||||
for (unsigned int i = 0; i < nClasses; i++) {
|
||||
USHORT_VECT cCount;
|
||||
cCount.resize(d_dims, 0);
|
||||
d_counts.push_back(cCount);
|
||||
}
|
||||
d_clsCount.resize(d_classes, 0);
|
||||
d_nInst = 0;
|
||||
d_top = 0;
|
||||
dp_topBits=0;
|
||||
d_biasList.resize(0);
|
||||
dp_maskBits=0;
|
||||
}
|
||||
|
||||
~InfoBitRanker() {
|
||||
if(dp_topBits)
|
||||
delete [] dp_topBits;
|
||||
if(dp_maskBits)
|
||||
delete dp_maskBits;
|
||||
class InfoBitRanker {
|
||||
public:
|
||||
/*! \brief the type of measure for information
|
||||
*
|
||||
*/
|
||||
typedef enum {
|
||||
ENTROPY = 1,
|
||||
BIASENTROPY = 2,
|
||||
CHISQUARE = 3,
|
||||
BIASCHISQUARE = 4
|
||||
} InfoType;
|
||||
|
||||
/*! \brief Constructor
|
||||
*
|
||||
* ARGUMENTS:
|
||||
*
|
||||
* - nBits: the dimension of the bit vectors or the fingerprint length
|
||||
* - nClasses: the number of classes used in the classification problem
|
||||
*(e.g. active,
|
||||
* moderately active, inactive etc.). It is assumed that the
|
||||
*classes are
|
||||
* numbered from 0 to (nClasses - 1)
|
||||
* - infoType: the type of information metric
|
||||
*/
|
||||
InfoBitRanker(unsigned int nBits, unsigned int nClasses,
|
||||
InfoType infoType = InfoBitRanker::ENTROPY)
|
||||
: d_dims(nBits), d_classes(nClasses), d_type(infoType) {
|
||||
d_counts.resize(0);
|
||||
for (unsigned int i = 0; i < nClasses; i++) {
|
||||
USHORT_VECT cCount;
|
||||
cCount.resize(d_dims, 0);
|
||||
d_counts.push_back(cCount);
|
||||
}
|
||||
d_clsCount.resize(d_classes, 0);
|
||||
d_nInst = 0;
|
||||
d_top = 0;
|
||||
dp_topBits = 0;
|
||||
d_biasList.resize(0);
|
||||
dp_maskBits = 0;
|
||||
}
|
||||
|
||||
/*! \brief Accumulate the votes for all the bits turned on in a bit vector
|
||||
*
|
||||
* ARGUMENTS:
|
||||
*
|
||||
* - bv : bit vector that supports [] operator
|
||||
* - label : the class label for the bit vector. It is assumed that 0 <= class < nClasses
|
||||
*/
|
||||
void accumulateVotes(const ExplicitBitVect &bv, unsigned int label);
|
||||
void accumulateVotes(const SparseBitVect &bv, unsigned int label);
|
||||
|
||||
/*! \brief Returns the top n bits ranked by the information metric
|
||||
*
|
||||
* This is actually the function where most of the work of ranking is happening
|
||||
*
|
||||
* \param num the number of top ranked bits that are required
|
||||
*
|
||||
* \return a pointer to an information array. The client should *not*
|
||||
* delete this
|
||||
*/
|
||||
double *getTopN(unsigned int num);
|
||||
|
||||
/*! \brief return the number of labelled instances(examples) or fingerprints seen so far
|
||||
*
|
||||
*/
|
||||
unsigned int getNumInstances() const {
|
||||
return d_nInst;
|
||||
}
|
||||
|
||||
/*! \brief return the number of classes
|
||||
*
|
||||
*/
|
||||
unsigned int getNumClasses() const {
|
||||
return d_classes;
|
||||
}
|
||||
// Frees the ranking result array and the mask bit vector. Deleting a null
// pointer is a no-op, so no explicit null checks are required.
~InfoBitRanker() {
  delete[] dp_topBits;
  delete dp_maskBits;
}
|
||||
|
||||
/*! \brief Set the classes to which the entropy calculation should be biased
|
||||
*
|
||||
* This list contains a set of class ids used when in the BIASENTROPY mode of ranking bits.
|
||||
* In this mode, a bit must be correllated higher with one of the biased classes than all the
|
||||
* other classes. For example, in a two class problem with actives and inactives, the fraction of
|
||||
* actives that hit the bit has to be greater than the fraction of inactives that hit the bit
|
||||
*
|
||||
* ARGUMENTS:
|
||||
* classList - list of class ids that we want a bias towards
|
||||
*/
|
||||
void setBiasList(RDKit::INT_VECT &classList);
|
||||
/*! \brief Accumulate the votes for all the bits turned on in a bit vector
|
||||
*
|
||||
* ARGUMENTS:
|
||||
*
|
||||
* - bv : bit vector that supports [] operator
|
||||
* - label : the class label for the bit vector. It is assumed that 0 <=
|
||||
*class < nClasses
|
||||
*/
|
||||
void accumulateVotes(const ExplicitBitVect &bv, unsigned int label);
|
||||
void accumulateVotes(const SparseBitVect &bv, unsigned int label);
|
||||
|
||||
/*! \brief Returns the top n bits ranked by the information metric
|
||||
*
|
||||
* This is actually the function where most of the work of ranking is
|
||||
*happening
|
||||
*
|
||||
* \param num the number of top ranked bits that are required
|
||||
*
|
||||
* \return a pointer to an information array. The client should *not*
|
||||
* delete this
|
||||
*/
|
||||
double *getTopN(unsigned int num);
|
||||
|
||||
/*! \brief Set the bits to be used as a mask
|
||||
*
|
||||
* If this function is called, only the bits which are present in the
|
||||
* maskBits list will be used.
|
||||
*
|
||||
* ARGUMENTS:
|
||||
* maskBits - the bits to be considered
|
||||
*/
|
||||
void setMaskBits(RDKit::INT_VECT &maskBits);
|
||||
/*! \brief return the number of labelled instances(examples) or fingerprints
|
||||
*seen so far
|
||||
*
|
||||
*/
|
||||
// The count starts at 0 in the constructor and grows by one with every
// accumulateVotes() call.
unsigned int getNumInstances() const { return d_nInst; }
|
||||
|
||||
/*! \brief Write the top N bits to a stream
|
||||
*
|
||||
*/
|
||||
void writeTopBitsToStream(std::ostream *outStream) const;
|
||||
|
||||
/*! \brief Write the top bits to a file
|
||||
*
|
||||
*/
|
||||
void writeTopBitsToFile(const std::string &fileName) const;
|
||||
/*! \brief return the number of classes
|
||||
*
|
||||
*/
|
||||
// This is the nClasses value the ranker was constructed with.
unsigned int getNumClasses() const { return d_classes; }
|
||||
|
||||
private:
|
||||
/*! \brief check if we want to compute the info content for a bit based on the bias list
|
||||
*
|
||||
* This what happens here:
|
||||
* - the fraction of items in each class that hit a particular bit are computed
|
||||
* - the maximum of these fractions for classes that are not in the biasList are computed
|
||||
* - If this maximum is less than the fraction for atleast one of classes in the biaslist
|
||||
* the bit is considered good
|
||||
* ARGUMENTS:
|
||||
* - resMat : the result matrix, one dimensional matrix of dimension (2*(num of classes))
|
||||
* a 2D structure is assumed with the first row containing number of items of each class
|
||||
* with the bit set and the second row to entires of each class with the bit turned off
|
||||
*/
|
||||
bool BiasCheckBit(RDKit::USHORT *resMat) const;
|
||||
/*! \brief Set the classes to which the entropy calculation should be biased
|
||||
*
|
||||
* This list contains a set of class ids used when in the BIASENTROPY mode of
|
||||
*ranking bits.
|
||||
* In this mode, a bit must be correllated higher with one of the biased
|
||||
*classes than all the
|
||||
* other classes. For example, in a two class problem with actives and
|
||||
*inactives, the fraction of
|
||||
* actives that hit the bit has to be greater than the fraction of inactives
|
||||
*that hit the bit
|
||||
*
|
||||
* ARGUMENTS:
|
||||
* classList - list of class ids that we want a bias towards
|
||||
*/
|
||||
void setBiasList(RDKit::INT_VECT &classList);
|
||||
|
||||
/*! \brief Compute the biased info entropy gain based on the bias list
|
||||
*
|
||||
* This what happens here:
|
||||
* - we call BiasCheckBit to see if the bit qualifies to compute the infocontent
|
||||
* - If this bit is ok then we call InfoEntropyGain otherwise we return 0.0
|
||||
*
|
||||
* ARGUMENTS:
|
||||
* - resMat : the result matrix, one dimensional matrix of dimension (2*(num of classes))
|
||||
* a 2D structure is assumed with the first row containing number of items of each class
|
||||
* with the bit set and the second row to entires of each class with the bit turned off
|
||||
*/
|
||||
double BiasInfoEntropyGain(RDKit::USHORT *resMat) const;
|
||||
/*! \brief Set the bits to be used as a mask
|
||||
*
|
||||
* If this function is called, only the bits which are present in the
|
||||
* maskBits list will be used.
|
||||
*
|
||||
* ARGUMENTS:
|
||||
* maskBits - the bits to be considered
|
||||
*/
|
||||
void setMaskBits(RDKit::INT_VECT &maskBits);
|
||||
|
||||
/*! \brief Compute the biased chi qsure value based on the bias list
|
||||
*
|
||||
* This what happens here:
|
||||
* - we call BiasCheckBit to see if the bit qualifies to compute the infocontent
|
||||
* - If this bit is ok then we call InfoEntropyGain otherwise we return 0.0
|
||||
*
|
||||
* ARGUMENTS:
|
||||
* - resMat : the result matrix, one dimensional matrix of dimension (2*(num of classes))
|
||||
* a 2D structure is assumed with the first row containing number of items of each class
|
||||
* with the bit set and the second row to entires of each class with the bit turned off
|
||||
*/
|
||||
double BiasChiSquareGain(RDKit::USHORT *resMat) const;
|
||||
/*! \brief Write the top N bits to a stream
|
||||
*
|
||||
*/
|
||||
void writeTopBitsToStream(std::ostream *outStream) const;
|
||||
|
||||
unsigned int d_dims; // the number of bits in the fingerprints
|
||||
unsigned int d_classes; // the number of classes (active, inactive, moderately active etc.)
|
||||
InfoType d_type; // the type of information meassure - currently we support only entropy
|
||||
VECT_USHORT_VECT d_counts; // place holder of counting the number of hits for each bit for each class
|
||||
USHORT_VECT d_clsCount; // counter for the number of instances of each class
|
||||
double *dp_topBits; // storage for the top ranked bits and the corresponding statistics
|
||||
unsigned int d_top; // the number of bits that have been ranked
|
||||
unsigned int d_nInst; // total number of instances or fingerprints used accumulate votes
|
||||
RDKit::INT_VECT d_biasList; // if we want a bias towards certain classes in ranking bits
|
||||
ExplicitBitVect *dp_maskBits; // allows only certain bits to be considered
|
||||
|
||||
};
|
||||
/*! \brief Write the top bits to a file
|
||||
*
|
||||
*/
|
||||
void writeTopBitsToFile(const std::string &fileName) const;
|
||||
|
||||
private:
|
||||
/*! \brief check if we want to compute the info content for a bit based on the
|
||||
*bias list
|
||||
*
|
||||
* This what happens here:
|
||||
* - the fraction of items in each class that hit a particular bit are
|
||||
*computed
|
||||
* - the maximum of these fractions for classes that are not in the
|
||||
*biasList are computed
|
||||
* - If this maximum is less than the fraction for atleast one of classes
|
||||
*in the biaslist
|
||||
* the bit is considered good
|
||||
* ARGUMENTS:
|
||||
* - resMat : the result matrix, one dimensional matrix of dimension (2*(num
|
||||
*of classes))
|
||||
* a 2D structure is assumed with the first row containing number
|
||||
*of items of each class
|
||||
* with the bit set and the second row to entires of each class
|
||||
*with the bit turned off
|
||||
*/
|
||||
bool BiasCheckBit(RDKit::USHORT *resMat) const;
|
||||
|
||||
/*! \brief Compute the biased info entropy gain based on the bias list
|
||||
*
|
||||
* This what happens here:
|
||||
* - we call BiasCheckBit to see if the bit qualifies to compute the
|
||||
*infocontent
|
||||
* - If this bit is ok then we call InfoEntropyGain otherwise we return 0.0
|
||||
*
|
||||
* ARGUMENTS:
|
||||
* - resMat : the result matrix, one dimensional matrix of dimension (2*(num
|
||||
*of classes))
|
||||
* a 2D structure is assumed with the first row containing number
|
||||
*of items of each class
|
||||
* with the bit set and the second row to entires of each class
|
||||
*with the bit turned off
|
||||
*/
|
||||
double BiasInfoEntropyGain(RDKit::USHORT *resMat) const;
|
||||
|
||||
/*! \brief Compute the biased chi qsure value based on the bias list
|
||||
*
|
||||
* This what happens here:
|
||||
* - we call BiasCheckBit to see if the bit qualifies to compute the
|
||||
*infocontent
|
||||
* - If this bit is ok then we call InfoEntropyGain otherwise we return 0.0
|
||||
*
|
||||
* ARGUMENTS:
|
||||
* - resMat : the result matrix, one dimensional matrix of dimension (2*(num
|
||||
*of classes))
|
||||
* a 2D structure is assumed with the first row containing number
|
||||
*of items of each class
|
||||
* with the bit set and the second row to entires of each class
|
||||
*with the bit turned off
|
||||
*/
|
||||
double BiasChiSquareGain(RDKit::USHORT *resMat) const;
|
||||
|
||||
unsigned int d_dims; // the number of bits in the fingerprints
|
||||
unsigned int d_classes; // the number of classes (active, inactive,
|
||||
// moderately active etc.)
|
||||
InfoType d_type; // the type of information meassure - currently we support
|
||||
// only entropy
|
||||
VECT_USHORT_VECT d_counts; // place holder of counting the number of hits for
|
||||
// each bit for each class
|
||||
USHORT_VECT d_clsCount; // counter for the number of instances of each class
|
||||
double *dp_topBits; // storage for the top ranked bits and the corresponding
|
||||
// statistics
|
||||
unsigned int d_top; // the number of bits that have been ranked
|
||||
unsigned int d_nInst; // total number of instances or fingerprints used
|
||||
// accumulate votes
|
||||
RDKit::INT_VECT
|
||||
d_biasList; // if we want a bias towards certain classes in ranking bits
|
||||
ExplicitBitVect *dp_maskBits; // allows only certain bits to be considered
|
||||
};
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -10,129 +10,129 @@
|
||||
|
||||
namespace RDInfoTheory {
|
||||
|
||||
template<class T> double ChiSquare(T *dMat, long int dim1,long int dim2) {
|
||||
// For a contingency matrix with each column corresponding to a class and each row to a
|
||||
// the descriptor (or variable) state, the matrix looks something like for 3x3 problem
|
||||
//
|
||||
// 1 2 3 Totals
|
||||
// 1 | N11 N12 N13 R1
|
||||
// 2 | N21 N22 N23 R2
|
||||
// 3 | N31 N32 N33 R3
|
||||
// Totals | C1 C2 C3 N
|
||||
//
|
||||
// Th chi squere formula is
|
||||
// chi = sum((N/Ri)*sum(Nij^2/Cj) ) -N
|
||||
T *rowSums, *colSums;
|
||||
int i, j, tSum;
|
||||
// find the row sum
|
||||
tSum = 0;
|
||||
rowSums = new T[dim1];
|
||||
for (i = 0; i < dim1; i++) {
|
||||
int idx1 = i*dim2;
|
||||
rowSums[i] = (T)0.0;
|
||||
for (j = 0; j < dim2; j++) {
|
||||
rowSums[i] += dMat[idx1 + j];
|
||||
}
|
||||
tSum += (int)rowSums[i];
|
||||
template <class T>
|
||||
double ChiSquare(T *dMat, long int dim1, long int dim2) {
|
||||
// For a contingency matrix with each column corresponding to a class and each
|
||||
// row to a
|
||||
// the descriptor (or variable) state, the matrix looks something like for 3x3
|
||||
// problem
|
||||
//
|
||||
// 1 2 3 Totals
|
||||
// 1 | N11 N12 N13 R1
|
||||
// 2 | N21 N22 N23 R2
|
||||
// 3 | N31 N32 N33 R3
|
||||
// Totals | C1 C2 C3 N
|
||||
//
|
||||
// Th chi squere formula is
|
||||
// chi = sum((N/Ri)*sum(Nij^2/Cj) ) -N
|
||||
T *rowSums, *colSums;
|
||||
int i, j, tSum;
|
||||
// find the row sum
|
||||
tSum = 0;
|
||||
rowSums = new T[dim1];
|
||||
for (i = 0; i < dim1; i++) {
|
||||
int idx1 = i * dim2;
|
||||
rowSums[i] = (T)0.0;
|
||||
for (j = 0; j < dim2; j++) {
|
||||
rowSums[i] += dMat[idx1 + j];
|
||||
}
|
||||
|
||||
// find the column sums
|
||||
colSums = new T[dim2];
|
||||
for (i = 0; i < dim2; i++) {
|
||||
colSums[i] = (T)0.0;
|
||||
for (j = 0; j < dim1; j++) {
|
||||
colSums[i] += dMat[j*dim2 + i];
|
||||
}
|
||||
}
|
||||
|
||||
double chi = 0.0;
|
||||
for ( i = 0; i < dim1; i++) {
|
||||
double rchi = 0.0;
|
||||
for (j = 0; j < dim2; j++) {
|
||||
rchi += (pow((double)dMat[i*dim2 + j], 2)/colSums[j]);
|
||||
}
|
||||
chi += ( ((double)tSum/rowSums[i])*rchi );
|
||||
}
|
||||
chi -= tSum;
|
||||
delete [] rowSums;
|
||||
delete [] colSums;
|
||||
|
||||
return chi;
|
||||
tSum += (int)rowSums[i];
|
||||
}
|
||||
|
||||
template<class T> double InfoEntropy(T *tPtr, long int dim) {
|
||||
int i;
|
||||
T nInstances = 0;
|
||||
double accum=0.0,d;
|
||||
|
||||
for(i=0;i<dim;i++){
|
||||
nInstances += tPtr[i];
|
||||
// find the column sums
|
||||
colSums = new T[dim2];
|
||||
for (i = 0; i < dim2; i++) {
|
||||
colSums[i] = (T)0.0;
|
||||
for (j = 0; j < dim1; j++) {
|
||||
colSums[i] += dMat[j * dim2 + i];
|
||||
}
|
||||
|
||||
if(nInstances != 0){
|
||||
for(i=0;i<dim;i++){
|
||||
d = (double)tPtr[i]/nInstances;
|
||||
if(d != 0){
|
||||
accum += -d*log(d);
|
||||
}
|
||||
}
|
||||
}
|
||||
return accum/log(2.0);
|
||||
}
|
||||
|
||||
template<class T> double InfoEntropyGain(T *dMat, long int dim1,long int dim2) {
|
||||
T *variableRes, *overallRes;
|
||||
double gain,term2;
|
||||
int tSum;
|
||||
|
||||
//std::cerr<<" --------\n ieg: "<<dim1<<" "<<dim2<<std::endl;
|
||||
variableRes = new T[dim1];
|
||||
for(long int i=0;i<dim1;i++){
|
||||
long int idx1 = i*dim2;
|
||||
variableRes[i] = (T)0.0;
|
||||
for(long int j=0;j<dim2;j++){
|
||||
variableRes[i] += dMat[idx1+j];
|
||||
//std::cerr<<" "<<i<<" "<<j<<" "<<dMat[idx1+j]<<std::endl;
|
||||
}
|
||||
double chi = 0.0;
|
||||
for (i = 0; i < dim1; i++) {
|
||||
double rchi = 0.0;
|
||||
for (j = 0; j < dim2; j++) {
|
||||
rchi += (pow((double)dMat[i * dim2 + j], 2) / colSums[j]);
|
||||
}
|
||||
|
||||
overallRes = new T[dim2];
|
||||
// do the col sums
|
||||
for(long int i=0;i<dim2;i++){
|
||||
overallRes[i] = (T)0.0;
|
||||
for(long int j=0;j<dim1;j++){
|
||||
overallRes[i] += dMat[j*dim2+i];
|
||||
//std::cerr<<" "<<i<<" "<<j<<" "<<dMat[j*dim2+i]<<std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
term2 = 0.0;
|
||||
for(long int i=0;i<dim1;i++) {
|
||||
T *tPtr;
|
||||
tPtr = dMat + i*dim2;
|
||||
term2 += variableRes[i] * InfoEntropy(tPtr,dim2);
|
||||
}
|
||||
tSum = 0;
|
||||
for(long int i=0;i<dim2;i++){
|
||||
tSum += static_cast<int>(overallRes[i]);
|
||||
}
|
||||
|
||||
if(tSum != 0){
|
||||
term2 /= tSum;
|
||||
gain = InfoEntropy(overallRes,dim2) - term2;
|
||||
}
|
||||
else{
|
||||
gain = 0.0;
|
||||
}
|
||||
//std::cerr<<" >gain> "<<gain<<std::endl;
|
||||
|
||||
delete [] overallRes;
|
||||
delete [] variableRes;
|
||||
return gain;
|
||||
chi += (((double)tSum / rowSums[i]) * rchi);
|
||||
}
|
||||
|
||||
|
||||
chi -= tSum;
|
||||
delete[] rowSums;
|
||||
delete[] colSums;
|
||||
|
||||
return chi;
|
||||
}
|
||||
|
||||
template <class T>
|
||||
double InfoEntropy(T *tPtr, long int dim) {
|
||||
int i;
|
||||
T nInstances = 0;
|
||||
double accum = 0.0, d;
|
||||
|
||||
for (i = 0; i < dim; i++) {
|
||||
nInstances += tPtr[i];
|
||||
}
|
||||
|
||||
if (nInstances != 0) {
|
||||
for (i = 0; i < dim; i++) {
|
||||
d = (double)tPtr[i] / nInstances;
|
||||
if (d != 0) {
|
||||
accum += -d * log(d);
|
||||
}
|
||||
}
|
||||
}
|
||||
return accum / log(2.0);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
double InfoEntropyGain(T *dMat, long int dim1, long int dim2) {
|
||||
T *variableRes, *overallRes;
|
||||
double gain, term2;
|
||||
int tSum;
|
||||
|
||||
// std::cerr<<" --------\n ieg: "<<dim1<<" "<<dim2<<std::endl;
|
||||
variableRes = new T[dim1];
|
||||
for (long int i = 0; i < dim1; i++) {
|
||||
long int idx1 = i * dim2;
|
||||
variableRes[i] = (T)0.0;
|
||||
for (long int j = 0; j < dim2; j++) {
|
||||
variableRes[i] += dMat[idx1 + j];
|
||||
// std::cerr<<" "<<i<<" "<<j<<" "<<dMat[idx1+j]<<std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
overallRes = new T[dim2];
|
||||
// do the col sums
|
||||
for (long int i = 0; i < dim2; i++) {
|
||||
overallRes[i] = (T)0.0;
|
||||
for (long int j = 0; j < dim1; j++) {
|
||||
overallRes[i] += dMat[j * dim2 + i];
|
||||
// std::cerr<<" "<<i<<" "<<j<<" "<<dMat[j*dim2+i]<<std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
term2 = 0.0;
|
||||
for (long int i = 0; i < dim1; i++) {
|
||||
T *tPtr;
|
||||
tPtr = dMat + i * dim2;
|
||||
term2 += variableRes[i] * InfoEntropy(tPtr, dim2);
|
||||
}
|
||||
tSum = 0;
|
||||
for (long int i = 0; i < dim2; i++) {
|
||||
tSum += static_cast<int>(overallRes[i]);
|
||||
}
|
||||
|
||||
if (tSum != 0) {
|
||||
term2 /= tSum;
|
||||
gain = InfoEntropy(overallRes, dim2) - term2;
|
||||
} else {
|
||||
gain = 0.0;
|
||||
}
|
||||
// std::cerr<<" >gain> "<<gain<<std::endl;
|
||||
|
||||
delete[] overallRes;
|
||||
delete[] variableRes;
|
||||
return gain;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
@@ -8,7 +8,6 @@
|
||||
// of the RDKit source tree.
|
||||
//
|
||||
|
||||
|
||||
#define NO_IMPORT_ARRAY
|
||||
#include <RDBoost/python.h>
|
||||
#define PY_ARRAY_UNIQUE_SYMBOL rdinfotheory_array_API
|
||||
@@ -22,47 +21,48 @@
|
||||
namespace python = boost::python;
|
||||
|
||||
namespace RDInfoTheory {
|
||||
|
||||
PyObject *getCorrMatrix(BitCorrMatGenerator *cmGen) {
|
||||
double *dres = cmGen->getCorrMat();
|
||||
unsigned int nb = cmGen->getCorrBitList().size();
|
||||
npy_intp dim = nb*(nb-1)/2;
|
||||
PyArrayObject *res = (PyArrayObject *)PyArray_SimpleNew(1,&dim,NPY_DOUBLE);
|
||||
memcpy(static_cast<void *>(res->data),
|
||||
static_cast<void *>(dres), dim*sizeof(double));
|
||||
return PyArray_Return(res);
|
||||
}
|
||||
|
||||
void setBitList(BitCorrMatGenerator *cmGen, python::object bitList) {
|
||||
PySequenceHolder<int> blist(bitList);
|
||||
unsigned int nb = blist.size();
|
||||
RDKit::INT_VECT res;
|
||||
res.reserve(nb);
|
||||
for (unsigned int i = 0; i < nb; i++) {
|
||||
res.push_back(blist[i]);
|
||||
}
|
||||
cmGen->setBitIdList(res);
|
||||
}
|
||||
PyObject *getCorrMatrix(BitCorrMatGenerator *cmGen) {
|
||||
double *dres = cmGen->getCorrMat();
|
||||
unsigned int nb = cmGen->getCorrBitList().size();
|
||||
npy_intp dim = nb * (nb - 1) / 2;
|
||||
PyArrayObject *res = (PyArrayObject *)PyArray_SimpleNew(1, &dim, NPY_DOUBLE);
|
||||
memcpy(static_cast<void *>(res->data), static_cast<void *>(dres),
|
||||
dim * sizeof(double));
|
||||
return PyArray_Return(res);
|
||||
}
|
||||
|
||||
void CollectVotes(BitCorrMatGenerator *cmGen, python::object bitVect) {
|
||||
python::extract<ExplicitBitVect> ebvWorks(bitVect);
|
||||
python::extract<SparseBitVect> sbvWorks(bitVect);
|
||||
if (ebvWorks.check()) {
|
||||
ExplicitBitVect ev = python::extract<ExplicitBitVect>(bitVect);
|
||||
cmGen->collectVotes(ev);
|
||||
}
|
||||
else if (sbvWorks.check()) {
|
||||
SparseBitVect sv = python::extract<SparseBitVect>(bitVect);
|
||||
cmGen->collectVotes(sv);
|
||||
}
|
||||
else {
|
||||
throw_value_error("CollectVote can only take ExplicitBitVects or SparseBitVects");
|
||||
}
|
||||
void setBitList(BitCorrMatGenerator *cmGen, python::object bitList) {
|
||||
PySequenceHolder<int> blist(bitList);
|
||||
unsigned int nb = blist.size();
|
||||
RDKit::INT_VECT res;
|
||||
res.reserve(nb);
|
||||
for (unsigned int i = 0; i < nb; i++) {
|
||||
res.push_back(blist[i]);
|
||||
}
|
||||
cmGen->setBitIdList(res);
|
||||
}
|
||||
|
||||
struct corrmat_wrap {
|
||||
static void wrap() {
|
||||
std::string docString = "A class to generate a pariwise correlation matrix between a list of bits\n"
|
||||
void CollectVotes(BitCorrMatGenerator *cmGen, python::object bitVect) {
|
||||
python::extract<ExplicitBitVect> ebvWorks(bitVect);
|
||||
python::extract<SparseBitVect> sbvWorks(bitVect);
|
||||
if (ebvWorks.check()) {
|
||||
ExplicitBitVect ev = python::extract<ExplicitBitVect>(bitVect);
|
||||
cmGen->collectVotes(ev);
|
||||
} else if (sbvWorks.check()) {
|
||||
SparseBitVect sv = python::extract<SparseBitVect>(bitVect);
|
||||
cmGen->collectVotes(sv);
|
||||
} else {
|
||||
throw_value_error(
|
||||
"CollectVote can only take ExplicitBitVects or SparseBitVects");
|
||||
}
|
||||
}
|
||||
|
||||
struct corrmat_wrap {
|
||||
static void wrap() {
|
||||
std::string docString =
|
||||
"A class to generate a pariwise correlation matrix between a list of "
|
||||
"bits\n"
|
||||
"The mode of operation for this class is something like this\n"
|
||||
" >>> cmg = BitCorrMatGenerator() \n"
|
||||
" >>> cmg.SetBitList(blist) \n"
|
||||
@@ -70,28 +70,26 @@ namespace RDInfoTheory {
|
||||
" >>> cmg.CollectVotes(fp) \n"
|
||||
" >>> corrMat = cmg.GetCorrMatrix() \n"
|
||||
" \n"
|
||||
" The resulting correlation matrix is a one dimensional nummeric array containing the \n"
|
||||
" The resulting correlation matrix is a one dimensional nummeric "
|
||||
"array containing the \n"
|
||||
" lower triangle elements\n";
|
||||
python::class_<BitCorrMatGenerator>("BitCorrMatGenerator",
|
||||
docString.c_str())
|
||||
python::class_<BitCorrMatGenerator>("BitCorrMatGenerator",
|
||||
docString.c_str())
|
||||
.def("SetBitList", setBitList,
|
||||
"Set the list of bits that need to be correllated\n\n"
|
||||
" This may for example be ther top ranking ensemble bits\n\n"
|
||||
"ARGUMENTS:\n\n"
|
||||
" - bitList : an integer list of bit IDs\n")
|
||||
.def("CollectVotes", CollectVotes,
|
||||
"For each pair of on bits (bi, bj) in fp increase the correlation count for the pair by 1\n\n"
|
||||
"For each pair of on bits (bi, bj) in fp increase the correlation "
|
||||
"count for the pair by 1\n\n"
|
||||
"ARGUMENTS:\n\n"
|
||||
" - fp : a bit vector to collect the fingerprints from\n")
|
||||
.def("GetCorrMatrix", getCorrMatrix,
|
||||
"Get the correlation matrix following the collection of votes from a bunch of fingerprints\n")
|
||||
;
|
||||
};
|
||||
"Get the correlation matrix following the collection of votes "
|
||||
"from a bunch of fingerprints\n");
|
||||
};
|
||||
};
|
||||
}
|
||||
|
||||
void wrap_corrmatgen() {
|
||||
RDInfoTheory::corrmat_wrap::wrap();
|
||||
}
|
||||
|
||||
|
||||
void wrap_corrmatgen() { RDInfoTheory::corrmat_wrap::wrap(); }
|
||||
|
||||
@@ -22,161 +22,171 @@
|
||||
namespace python = boost::python;
|
||||
|
||||
namespace RDInfoTheory {
|
||||
|
||||
PyObject *getTopNbits(InfoBitRanker *ranker, int num){// int ignoreNoClass=-1) {
|
||||
double *dres = ranker->getTopN(num);
|
||||
npy_intp dims[2];
|
||||
dims[0] = num;
|
||||
dims[1] = ranker->getNumClasses() + 2;
|
||||
PyArrayObject *res = (PyArrayObject *)PyArray_SimpleNew(2,dims,NPY_DOUBLE);
|
||||
memcpy(static_cast<void *>(res->data),
|
||||
static_cast<void *>(dres), dims[0]*dims[1]*sizeof(double));
|
||||
return PyArray_Return(res);
|
||||
}
|
||||
|
||||
void AccumulateVotes(InfoBitRanker *ranker, python::object bitVect, int label) {
|
||||
python::extract<ExplicitBitVect> ebvWorks(bitVect);
|
||||
python::extract<SparseBitVect> sbvWorks(bitVect);
|
||||
if (ebvWorks.check()) {
|
||||
ExplicitBitVect ev = python::extract<ExplicitBitVect>(bitVect);
|
||||
ranker->accumulateVotes(ev, label);
|
||||
}
|
||||
else if (sbvWorks.check()) {
|
||||
SparseBitVect sv = python::extract<SparseBitVect>(bitVect);
|
||||
ranker->accumulateVotes(sv, label);
|
||||
}
|
||||
else {
|
||||
throw_value_error("Accumulate Vote can only take a explicitBitVects or SparseBitvects");
|
||||
}
|
||||
}
|
||||
|
||||
void SetBiasList(InfoBitRanker *ranker, python::object classList) {
|
||||
RDKit::INT_VECT cList;
|
||||
PySequenceHolder<int> bList(classList);
|
||||
cList.reserve(bList.size());
|
||||
for (unsigned int i = 0; i < bList.size(); i++) {
|
||||
cList.push_back(bList[i]);
|
||||
}
|
||||
ranker->setBiasList(cList);
|
||||
}
|
||||
PyObject *getTopNbits(InfoBitRanker *ranker,
|
||||
int num) { // int ignoreNoClass=-1) {
|
||||
double *dres = ranker->getTopN(num);
|
||||
npy_intp dims[2];
|
||||
dims[0] = num;
|
||||
dims[1] = ranker->getNumClasses() + 2;
|
||||
PyArrayObject *res = (PyArrayObject *)PyArray_SimpleNew(2, dims, NPY_DOUBLE);
|
||||
memcpy(static_cast<void *>(res->data), static_cast<void *>(dres),
|
||||
dims[0] * dims[1] * sizeof(double));
|
||||
return PyArray_Return(res);
|
||||
}
|
||||
|
||||
void SetMaskBits(InfoBitRanker *ranker, python::object maskBits) {
|
||||
RDKit::INT_VECT cList;
|
||||
PySequenceHolder<int> bList(maskBits);
|
||||
cList.reserve(bList.size());
|
||||
for (unsigned int i = 0; i < bList.size(); i++) {
|
||||
cList.push_back(bList[i]);
|
||||
}
|
||||
ranker->setMaskBits(cList);
|
||||
void AccumulateVotes(InfoBitRanker *ranker, python::object bitVect, int label) {
|
||||
python::extract<ExplicitBitVect> ebvWorks(bitVect);
|
||||
python::extract<SparseBitVect> sbvWorks(bitVect);
|
||||
if (ebvWorks.check()) {
|
||||
ExplicitBitVect ev = python::extract<ExplicitBitVect>(bitVect);
|
||||
ranker->accumulateVotes(ev, label);
|
||||
} else if (sbvWorks.check()) {
|
||||
SparseBitVect sv = python::extract<SparseBitVect>(bitVect);
|
||||
ranker->accumulateVotes(sv, label);
|
||||
} else {
|
||||
throw_value_error(
|
||||
"Accumulate Vote can only take a explicitBitVects or SparseBitvects");
|
||||
}
|
||||
}
|
||||
|
||||
void tester(InfoBitRanker *ranker, python::object bitVect) {
|
||||
RDUNUSED_PARAM(ranker);
|
||||
python::extract<SparseBitVect> sbvWorks(bitVect);
|
||||
if (sbvWorks.check()){
|
||||
SparseBitVect sv = python::extract<SparseBitVect>(bitVect);
|
||||
std::cout << "Num of on bits: " << sv.getNumOnBits() << "\n";
|
||||
}
|
||||
void SetBiasList(InfoBitRanker *ranker, python::object classList) {
|
||||
RDKit::INT_VECT cList;
|
||||
PySequenceHolder<int> bList(classList);
|
||||
cList.reserve(bList.size());
|
||||
for (unsigned int i = 0; i < bList.size(); i++) {
|
||||
cList.push_back(bList[i]);
|
||||
}
|
||||
ranker->setBiasList(cList);
|
||||
}
|
||||
|
||||
struct ranker_wrap {
|
||||
static void wrap() {
|
||||
std::string docString = "A class to rank the bits from a series of labelled fingerprints\n"
|
||||
"A simple demonstration may help clarify what this class does. \n"
|
||||
"Here's a small set of vectors:\n"
|
||||
">>> for i,bv in enumerate(bvs): print bv.ToBitString(),acts[i]\n"
|
||||
"... \n"
|
||||
"0001 0\n"
|
||||
"0101 0\n"
|
||||
"0010 1\n"
|
||||
"1110 1\n"
|
||||
"\n"
|
||||
"Default ranker, using infogain:\n"
|
||||
">>> ranker = InfoBitRanker(4,2) \n"
|
||||
">>> for i,bv in enumerate(bvs): ranker.AccumulateVotes(bv,acts[i])\n"
|
||||
"... \n"
|
||||
">>> for bit,gain,n0,n1 in ranker.GetTopN(3): print int(bit),'%.3f'%gain,int(n0),int(n1)\n"
|
||||
"... \n"
|
||||
"3 1.000 2 0\n"
|
||||
"2 1.000 0 2\n"
|
||||
"0 0.311 0 1\n"
|
||||
"\n"
|
||||
"Using the biased infogain:\n"
|
||||
">>> ranker = InfoBitRanker(4,2,InfoTheory.InfoType.BIASENTROPY)\n"
|
||||
">>> ranker.SetBiasList((1,))\n"
|
||||
">>> for i,bv in enumerate(bvs): ranker.AccumulateVotes(bv,acts[i])\n"
|
||||
"... \n"
|
||||
">>> for bit,gain,n0,n1 in ranker.GetTopN(3): print int(bit),'%.3f'%gain,int(n0),int(n1)\n"
|
||||
"... \n"
|
||||
"2 1.000 0 2\n"
|
||||
"0 0.311 0 1\n"
|
||||
"1 0.000 1 1\n"
|
||||
"\n"
|
||||
"A chi squared ranker is also available:\n"
|
||||
">>> ranker = InfoBitRanker(4,2,InfoTheory.InfoType.CHISQUARE)\n"
|
||||
">>> for i,bv in enumerate(bvs): ranker.AccumulateVotes(bv,acts[i])\n"
|
||||
"... \n"
|
||||
">>> for bit,gain,n0,n1 in ranker.GetTopN(3): print int(bit),'%.3f'%gain,int(n0),int(n1)\n"
|
||||
"... \n"
|
||||
"3 4.000 2 0\n"
|
||||
"2 4.000 0 2\n"
|
||||
"0 1.333 0 1\n"
|
||||
"\n"
|
||||
"As is a biased chi squared:\n"
|
||||
">>> ranker = InfoBitRanker(4,2,InfoTheory.InfoType.BIASCHISQUARE)\n"
|
||||
">>> ranker.SetBiasList((1,))\n"
|
||||
">>> for i,bv in enumerate(bvs): ranker.AccumulateVotes(bv,acts[i])\n"
|
||||
"... \n"
|
||||
">>> for bit,gain,n0,n1 in ranker.GetTopN(3): print int(bit),'%.3f'%gain,int(n0),int(n1)\n"
|
||||
"... \n"
|
||||
"2 4.000 0 2\n"
|
||||
"0 1.333 0 1\n"
|
||||
"1 0.000 1 1\n";
|
||||
void SetMaskBits(InfoBitRanker *ranker, python::object maskBits) {
|
||||
RDKit::INT_VECT cList;
|
||||
PySequenceHolder<int> bList(maskBits);
|
||||
cList.reserve(bList.size());
|
||||
for (unsigned int i = 0; i < bList.size(); i++) {
|
||||
cList.push_back(bList[i]);
|
||||
}
|
||||
ranker->setMaskBits(cList);
|
||||
}
|
||||
|
||||
python::class_<InfoBitRanker>("InfoBitRanker",
|
||||
docString.c_str(),
|
||||
python::init<int, int>(python::args("nBits", "nClasses")))
|
||||
.def(python::init<int, int, InfoBitRanker::InfoType>
|
||||
(python::args("nBits", "nClasses", "infoType")))
|
||||
void tester(InfoBitRanker *ranker, python::object bitVect) {
|
||||
RDUNUSED_PARAM(ranker);
|
||||
python::extract<SparseBitVect> sbvWorks(bitVect);
|
||||
if (sbvWorks.check()) {
|
||||
SparseBitVect sv = python::extract<SparseBitVect>(bitVect);
|
||||
std::cout << "Num of on bits: " << sv.getNumOnBits() << "\n";
|
||||
}
|
||||
}
|
||||
|
||||
struct ranker_wrap {
|
||||
static void wrap() {
|
||||
std::string docString =
|
||||
"A class to rank the bits from a series of labelled fingerprints\n"
|
||||
"A simple demonstration may help clarify what this class does. \n"
|
||||
"Here's a small set of vectors:\n"
|
||||
">>> for i,bv in enumerate(bvs): print bv.ToBitString(),acts[i]\n"
|
||||
"... \n"
|
||||
"0001 0\n"
|
||||
"0101 0\n"
|
||||
"0010 1\n"
|
||||
"1110 1\n"
|
||||
"\n"
|
||||
"Default ranker, using infogain:\n"
|
||||
">>> ranker = InfoBitRanker(4,2) \n"
|
||||
">>> for i,bv in enumerate(bvs): ranker.AccumulateVotes(bv,acts[i])\n"
|
||||
"... \n"
|
||||
">>> for bit,gain,n0,n1 in ranker.GetTopN(3): print "
|
||||
"int(bit),'%.3f'%gain,int(n0),int(n1)\n"
|
||||
"... \n"
|
||||
"3 1.000 2 0\n"
|
||||
"2 1.000 0 2\n"
|
||||
"0 0.311 0 1\n"
|
||||
"\n"
|
||||
"Using the biased infogain:\n"
|
||||
">>> ranker = InfoBitRanker(4,2,InfoTheory.InfoType.BIASENTROPY)\n"
|
||||
">>> ranker.SetBiasList((1,))\n"
|
||||
">>> for i,bv in enumerate(bvs): ranker.AccumulateVotes(bv,acts[i])\n"
|
||||
"... \n"
|
||||
">>> for bit,gain,n0,n1 in ranker.GetTopN(3): print "
|
||||
"int(bit),'%.3f'%gain,int(n0),int(n1)\n"
|
||||
"... \n"
|
||||
"2 1.000 0 2\n"
|
||||
"0 0.311 0 1\n"
|
||||
"1 0.000 1 1\n"
|
||||
"\n"
|
||||
"A chi squared ranker is also available:\n"
|
||||
">>> ranker = InfoBitRanker(4,2,InfoTheory.InfoType.CHISQUARE)\n"
|
||||
">>> for i,bv in enumerate(bvs): ranker.AccumulateVotes(bv,acts[i])\n"
|
||||
"... \n"
|
||||
">>> for bit,gain,n0,n1 in ranker.GetTopN(3): print "
|
||||
"int(bit),'%.3f'%gain,int(n0),int(n1)\n"
|
||||
"... \n"
|
||||
"3 4.000 2 0\n"
|
||||
"2 4.000 0 2\n"
|
||||
"0 1.333 0 1\n"
|
||||
"\n"
|
||||
"As is a biased chi squared:\n"
|
||||
">>> ranker = InfoBitRanker(4,2,InfoTheory.InfoType.BIASCHISQUARE)\n"
|
||||
">>> ranker.SetBiasList((1,))\n"
|
||||
">>> for i,bv in enumerate(bvs): ranker.AccumulateVotes(bv,acts[i])\n"
|
||||
"... \n"
|
||||
">>> for bit,gain,n0,n1 in ranker.GetTopN(3): print "
|
||||
"int(bit),'%.3f'%gain,int(n0),int(n1)\n"
|
||||
"... \n"
|
||||
"2 4.000 0 2\n"
|
||||
"0 1.333 0 1\n"
|
||||
"1 0.000 1 1\n";
|
||||
|
||||
python::class_<InfoBitRanker>(
|
||||
"InfoBitRanker", docString.c_str(),
|
||||
python::init<int, int>(python::args("nBits", "nClasses")))
|
||||
.def(python::init<int, int, InfoBitRanker::InfoType>(
|
||||
python::args("nBits", "nClasses", "infoType")))
|
||||
.def("AccumulateVotes", AccumulateVotes,
|
||||
"Accumulate the votes for all the bits turned on in a bit vector\n\n"
|
||||
"Accumulate the votes for all the bits turned on in a bit "
|
||||
"vector\n\n"
|
||||
"ARGUMENTS:\n\n"
|
||||
" - bv : bit vector either ExplicitBitVect or SparseBitVect operator\n"
|
||||
" - label : the class label for the bit vector. It is assumed that 0 <= class < nClasses \n")
|
||||
.def ("SetBiasList", SetBiasList,
|
||||
"Set the classes to which the entropy calculation should be biased\n\n"
|
||||
"This list contains a set of class ids used when in the BIASENTROPY mode of ranking bits. \n"
|
||||
"In this mode, a bit must be correlated higher with one of the biased classes than all the \n"
|
||||
"other classes. For example, in a two class problem with actives and inactives, the fraction of \n"
|
||||
"actives that hit the bit has to be greater than the fraction of inactives that hit the bit\n\n"
|
||||
"ARGUMENTS: \n\n"
|
||||
" - classList : list of class ids that we want a bias towards\n")
|
||||
.def ("SetMaskBits", SetMaskBits,
|
||||
"Set the mask bits for the calculation\n\n"
|
||||
"ARGUMENTS: \n\n"
|
||||
" - maskBits : list of mask bits to use\n")
|
||||
" - bv : bit vector either ExplicitBitVect or SparseBitVect "
|
||||
"operator\n"
|
||||
" - label : the class label for the bit vector. It is assumed "
|
||||
"that 0 <= class < nClasses \n")
|
||||
.def("SetBiasList", SetBiasList,
|
||||
"Set the classes to which the entropy calculation should be "
|
||||
"biased\n\n"
|
||||
"This list contains a set of class ids used when in the "
|
||||
"BIASENTROPY mode of ranking bits. \n"
|
||||
"In this mode, a bit must be correlated higher with one of the "
|
||||
"biased classes than all the \n"
|
||||
"other classes. For example, in a two class problem with actives "
|
||||
"and inactives, the fraction of \n"
|
||||
"actives that hit the bit has to be greater than the fraction of "
|
||||
"inactives that hit the bit\n\n"
|
||||
"ARGUMENTS: \n\n"
|
||||
" - classList : list of class ids that we want a bias towards\n")
|
||||
.def("SetMaskBits", SetMaskBits,
|
||||
"Set the mask bits for the calculation\n\n"
|
||||
"ARGUMENTS: \n\n"
|
||||
" - maskBits : list of mask bits to use\n")
|
||||
.def("GetTopN", getTopNbits,
|
||||
"Returns the top n bits ranked by the information metric\n"
|
||||
"This is actually the function where most of the work of ranking is happening\n\n"
|
||||
"This is actually the function where most of the work of ranking "
|
||||
"is happening\n\n"
|
||||
"ARGUMENTS:\n\n"
|
||||
" - num : the number of top ranked bits that are required\n")
|
||||
.def("WriteTopBitsToFile", &InfoBitRanker::writeTopBitsToFile,
|
||||
"Write the bits that have been ranked to a file")
|
||||
.def("Tester", tester)
|
||||
;
|
||||
|
||||
python::enum_<InfoBitRanker::InfoType>("InfoType")
|
||||
.def("Tester", tester);
|
||||
|
||||
python::enum_<InfoBitRanker::InfoType>("InfoType")
|
||||
.value("ENTROPY", InfoBitRanker::ENTROPY)
|
||||
.value("BIASENTROPY", InfoBitRanker::BIASENTROPY)
|
||||
.value("CHISQUARE", InfoBitRanker::CHISQUARE)
|
||||
.value("BIASCHISQUARE", InfoBitRanker::BIASCHISQUARE)
|
||||
.export_values();
|
||||
;
|
||||
};
|
||||
;
|
||||
};
|
||||
};
|
||||
}
|
||||
|
||||
void wrap_ranker() {
|
||||
RDInfoTheory::ranker_wrap::wrap();
|
||||
}
|
||||
|
||||
void wrap_ranker() { RDInfoTheory::ranker_wrap::wrap(); }
|
||||
|
||||
@@ -18,126 +18,127 @@ namespace python = boost::python;
|
||||
using namespace RDInfoTheory;
|
||||
|
||||
namespace RDInfoTheory {
|
||||
double infoEntropy(python::object resArr) {
|
||||
PyObject *matObj = resArr.ptr();
|
||||
if (!PyArray_Check(matObj)) {
|
||||
throw_value_error("Expecting a Numeric array object");
|
||||
}
|
||||
PyArrayObject *copy;
|
||||
copy = (PyArrayObject *)PyArray_ContiguousFromObject(matObj,
|
||||
((PyArrayObject *)matObj)->descr->type_num,
|
||||
1,1);
|
||||
double res=0.0;
|
||||
// we are expecting a 1 dimensional array
|
||||
long int ncols = (long int)((PyArrayObject *)matObj)->dimensions[0];
|
||||
CHECK_INVARIANT(ncols > 0, "");
|
||||
if (((PyArrayObject *)matObj)->descr->type_num == PyArray_DOUBLE) {
|
||||
double *data = (double *)copy->data;
|
||||
res = InfoEntropy(data, ncols);
|
||||
} else if (((PyArrayObject *)matObj)->descr->type_num == PyArray_FLOAT) {
|
||||
float *data = (float *)copy->data;
|
||||
res = InfoEntropy(data, ncols);
|
||||
} else if (((PyArrayObject *)matObj)->descr->type_num == PyArray_INT) {
|
||||
int *data = (int *)copy->data;
|
||||
res = InfoEntropy(data, ncols);
|
||||
} else if (((PyArrayObject *)matObj)->descr->type_num == PyArray_LONG) {
|
||||
long int *data = (long int *)copy->data;
|
||||
res = InfoEntropy(data, ncols);
|
||||
}
|
||||
Py_DECREF(copy);
|
||||
return res;
|
||||
double infoEntropy(python::object resArr) {
|
||||
PyObject *matObj = resArr.ptr();
|
||||
if (!PyArray_Check(matObj)) {
|
||||
throw_value_error("Expecting a Numeric array object");
|
||||
}
|
||||
|
||||
double infoGain(python::object resArr) {
|
||||
PyObject *matObj = resArr.ptr();
|
||||
if (!PyArray_Check(matObj)) {
|
||||
throw_value_error("Expecting a Numeric array object");
|
||||
}
|
||||
PyArrayObject *copy;
|
||||
copy = (PyArrayObject *)PyArray_ContiguousFromObject(matObj,
|
||||
((PyArrayObject *)matObj)->descr->type_num,
|
||||
2,2);
|
||||
long int rows = (long int)((PyArrayObject *)matObj)->dimensions[0];
|
||||
long int cols = (long int)((PyArrayObject *)matObj)->dimensions[1];
|
||||
double res=0.0;
|
||||
if (((PyArrayObject *)matObj)->descr->type_num == PyArray_DOUBLE) {
|
||||
double *data = (double *)copy->data;
|
||||
res = InfoEntropyGain(data, rows, cols);
|
||||
} else if (((PyArrayObject *)matObj)->descr->type_num == PyArray_FLOAT) {
|
||||
float *data = (float *)copy->data;
|
||||
res = InfoEntropyGain(data, rows, cols);
|
||||
} else if (((PyArrayObject *)matObj)->descr->type_num == PyArray_INT) {
|
||||
int *data = (int *)copy->data;
|
||||
res = InfoEntropyGain(data, rows, cols);
|
||||
} else if (((PyArrayObject *)matObj)->descr->type_num == PyArray_LONG) {
|
||||
long int *data = (long int *)copy->data;
|
||||
res = InfoEntropyGain(data, rows, cols);
|
||||
} else {
|
||||
throw_value_error("Numeric array object of type int or long or float or double");
|
||||
}
|
||||
Py_DECREF(copy);
|
||||
return res;
|
||||
PyArrayObject *copy;
|
||||
copy = (PyArrayObject *)PyArray_ContiguousFromObject(
|
||||
matObj, ((PyArrayObject *)matObj)->descr->type_num, 1, 1);
|
||||
double res = 0.0;
|
||||
// we are expecting a 1 dimensional array
|
||||
long int ncols = (long int)((PyArrayObject *)matObj)->dimensions[0];
|
||||
CHECK_INVARIANT(ncols > 0, "");
|
||||
if (((PyArrayObject *)matObj)->descr->type_num == PyArray_DOUBLE) {
|
||||
double *data = (double *)copy->data;
|
||||
res = InfoEntropy(data, ncols);
|
||||
} else if (((PyArrayObject *)matObj)->descr->type_num == PyArray_FLOAT) {
|
||||
float *data = (float *)copy->data;
|
||||
res = InfoEntropy(data, ncols);
|
||||
} else if (((PyArrayObject *)matObj)->descr->type_num == PyArray_INT) {
|
||||
int *data = (int *)copy->data;
|
||||
res = InfoEntropy(data, ncols);
|
||||
} else if (((PyArrayObject *)matObj)->descr->type_num == PyArray_LONG) {
|
||||
long int *data = (long int *)copy->data;
|
||||
res = InfoEntropy(data, ncols);
|
||||
}
|
||||
Py_DECREF(copy);
|
||||
return res;
|
||||
}
|
||||
|
||||
double chiSquare(python::object resArr) {
|
||||
PyObject *matObj = resArr.ptr();
|
||||
if (!PyArray_Check(matObj)) {
|
||||
throw_value_error("Expecting a Numeric array object");
|
||||
}
|
||||
PyArrayObject *copy;
|
||||
copy = (PyArrayObject *)PyArray_ContiguousFromObject(matObj,
|
||||
((PyArrayObject *)matObj)->descr->type_num,
|
||||
2,2);
|
||||
long int rows = (long int)((PyArrayObject *)matObj)->dimensions[0];
|
||||
long int cols = (long int)((PyArrayObject *)matObj)->dimensions[1];
|
||||
double res=0.0;
|
||||
if (((PyArrayObject *)matObj)->descr->type_num == PyArray_DOUBLE) {
|
||||
double *data = (double *)copy->data;
|
||||
res = ChiSquare(data, rows, cols);
|
||||
} else if (((PyArrayObject *)matObj)->descr->type_num == PyArray_FLOAT) {
|
||||
float *data = (float *)copy->data;
|
||||
res = ChiSquare(data, rows, cols);
|
||||
} else if (((PyArrayObject *)matObj)->descr->type_num == PyArray_INT) {
|
||||
int *data = (int *)copy->data;
|
||||
res = ChiSquare(data, rows, cols);
|
||||
} else if (((PyArrayObject *)matObj)->descr->type_num == PyArray_LONG) {
|
||||
long int *data = (long int *)copy->data;
|
||||
res = ChiSquare(data, rows, cols);
|
||||
} else {
|
||||
throw_value_error("Numeric array object of type int or long or float or double");
|
||||
}
|
||||
Py_DECREF(copy);
|
||||
return res;
|
||||
// Python-facing wrapper that computes the information gain of a 2D array.
//
// resArr must wrap an array object. It is converted to a contiguous 2D array
// of its own element type and passed, together with its row and column
// counts, to InfoEntropyGain().
//
// Returns the value produced by InfoEntropyGain().
//
// Errors:
//  - throw_value_error() is called when resArr is not an array, or when its
//    element type is not double, float, int or long.
//  - the pending Python error is propagated when the contiguous 2D conversion
//    fails.
double infoGain(python::object resArr) {
  PyObject *matObj = resArr.ptr();
  if (!PyArray_Check(matObj)) {
    throw_value_error("Expecting a Numeric array object");
  }

  const int typeNum = ((PyArrayObject *)matObj)->descr->type_num;
  PyArrayObject *copy =
      (PyArrayObject *)PyArray_ContiguousFromObject(matObj, typeNum, 2, 2);
  if (!copy) {
    // The conversion failed (for example the input is not two-dimensional)
    // and has left a Python exception set; surface it rather than
    // dereferencing a null pointer below.
    python::throw_error_already_set();
  }

  // Take the shape from the validated copy: the input itself is not
  // guaranteed to have a second dimension.
  long int rows = (long int)copy->dimensions[0];
  long int cols = (long int)copy->dimensions[1];

  double res = 0.0;
  bool supported = true;
  if (typeNum == PyArray_DOUBLE) {
    double *data = (double *)copy->data;
    res = InfoEntropyGain(data, rows, cols);
  } else if (typeNum == PyArray_FLOAT) {
    float *data = (float *)copy->data;
    res = InfoEntropyGain(data, rows, cols);
  } else if (typeNum == PyArray_INT) {
    int *data = (int *)copy->data;
    res = InfoEntropyGain(data, rows, cols);
  } else if (typeNum == PyArray_LONG) {
    long int *data = (long int *)copy->data;
    res = InfoEntropyGain(data, rows, cols);
  } else {
    supported = false;
  }

  // Release the temporary before reporting an unsupported element type so
  // the error path does not leak the reference.
  Py_DECREF(copy);
  if (!supported) {
    throw_value_error(
        "Numeric array object of type int or long or float or double");
  }
  return res;
}
|
||||
|
||||
// Python-facing wrapper that computes the chi squared value of a 2D array.
//
// resArr must wrap an array object. It is converted to a contiguous 2D array
// of its own element type and passed, together with its row and column
// counts, to ChiSquare().
//
// Returns the value produced by ChiSquare().
//
// Errors:
//  - throw_value_error() is called when resArr is not an array, or when its
//    element type is not double, float, int or long.
//  - the pending Python error is propagated when the contiguous 2D conversion
//    fails.
double chiSquare(python::object resArr) {
  PyObject *matObj = resArr.ptr();
  if (!PyArray_Check(matObj)) {
    throw_value_error("Expecting a Numeric array object");
  }

  const int typeNum = ((PyArrayObject *)matObj)->descr->type_num;
  PyArrayObject *copy =
      (PyArrayObject *)PyArray_ContiguousFromObject(matObj, typeNum, 2, 2);
  if (!copy) {
    // The conversion failed (for example the input is not two-dimensional)
    // and has left a Python exception set; surface it rather than
    // dereferencing a null pointer below.
    python::throw_error_already_set();
  }

  // Take the shape from the validated copy: the input itself is not
  // guaranteed to have a second dimension.
  long int rows = (long int)copy->dimensions[0];
  long int cols = (long int)copy->dimensions[1];

  double res = 0.0;
  bool supported = true;
  if (typeNum == PyArray_DOUBLE) {
    double *data = (double *)copy->data;
    res = ChiSquare(data, rows, cols);
  } else if (typeNum == PyArray_FLOAT) {
    float *data = (float *)copy->data;
    res = ChiSquare(data, rows, cols);
  } else if (typeNum == PyArray_INT) {
    int *data = (int *)copy->data;
    res = ChiSquare(data, rows, cols);
  } else if (typeNum == PyArray_LONG) {
    long int *data = (long int *)copy->data;
    res = ChiSquare(data, rows, cols);
  } else {
    supported = false;
  }

  // Release the temporary before reporting an unsupported element type so
  // the error path does not leak the reference.
  Py_DECREF(copy);
  if (!supported) {
    throw_value_error(
        "Numeric array object of type int or long or float or double");
  }
  return res;
}
|
||||
}
|
||||
|
||||
void wrap_ranker();
|
||||
void wrap_corrmatgen();
|
||||
|
||||
BOOST_PYTHON_MODULE(rdInfoTheory)
|
||||
{
|
||||
BOOST_PYTHON_MODULE(rdInfoTheory) {
|
||||
python::scope().attr("__doc__") =
|
||||
"Module containing bunch of functions for information metrics and a ranker to rank bits"
|
||||
;
|
||||
|
||||
"Module containing bunch of functions for information metrics and a "
|
||||
"ranker to rank bits";
|
||||
|
||||
rdkit_import_array();
|
||||
python::register_exception_translator<IndexErrorException>(&translate_index_error);
|
||||
python::register_exception_translator<ValueErrorException>(&translate_value_error);
|
||||
python::register_exception_translator<IndexErrorException>(
|
||||
&translate_index_error);
|
||||
python::register_exception_translator<ValueErrorException>(
|
||||
&translate_value_error);
|
||||
|
||||
wrap_ranker();
|
||||
wrap_corrmatgen();
|
||||
|
||||
std::string docString="calculates the informational entropy of the values in an array\n\n\
|
||||
std::string docString =
|
||||
"calculates the informational entropy of the values in an array\n\n\
|
||||
ARGUMENTS:\n\
|
||||
\n\
|
||||
- resMat: pointer to a long int array containing the data\n\
|
||||
- dim: long int containing the length of the _tPtr_ array.\n\n\
|
||||
RETURNS:\n\n\
|
||||
a double\n";
|
||||
python::def("InfoEntropy", RDInfoTheory::infoEntropy,
|
||||
docString.c_str());
|
||||
python::def("InfoEntropy", RDInfoTheory::infoEntropy, docString.c_str());
|
||||
|
||||
docString="Calculates the information gain for a variable\n\n\
|
||||
docString =
|
||||
"Calculates the information gain for a variable\n\n\
|
||||
ARGUMENTS:\n\n\
|
||||
- varMat: a Numeric Array object\n\
|
||||
varMat is a Numeric array with the number of possible occurances\n\
|
||||
@@ -148,11 +149,10 @@ BOOST_PYTHON_MODULE(rdInfoTheory)
|
||||
- a Python float object\n\n\
|
||||
NOTES\n\n\
|
||||
- this is a dropin replacement for _PyInfoGain()_ in entropy.py\n";
|
||||
python::def("InfoGain", RDInfoTheory::infoGain,
|
||||
docString.c_str());
|
||||
python::def("InfoGain", RDInfoTheory::infoGain, docString.c_str());
|
||||
|
||||
|
||||
docString="Calculates the chi squared value for a variable\n\n\
|
||||
docString =
|
||||
"Calculates the chi squared value for a variable\n\n\
|
||||
ARGUMENTS:\n\n\
|
||||
- varMat: a Numeric Array object\n\
|
||||
varMat is a Numeric array with the number of possible occurances\n\
|
||||
@@ -161,8 +161,5 @@ BOOST_PYTHON_MODULE(rdInfoTheory)
|
||||
has 3 possible values, varMat would be 4x3\n\n\
|
||||
RETURNS:\n\n\
|
||||
- a Python float object\n";
|
||||
python::def("ChiSquare", RDInfoTheory::chiSquare,
|
||||
docString.c_str());
|
||||
|
||||
python::def("ChiSquare", RDInfoTheory::chiSquare, docString.c_str());
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user