first pass, using google style

This commit is contained in:
Greg Landrum
2015-11-14 14:58:11 +01:00
parent 80bb809b31
commit e08e0d16d8
619 changed files with 138877 additions and 133381 deletions

View File

@@ -20,11 +20,9 @@ namespace python = boost::python;
typedef double real;
extern "C"
void distdriver_(boost::int64_t *n,boost::int64_t *len,
real *dists,
boost::int64_t *toggle,
boost::int64_t *ia,boost::int64_t *ib,real *crit);
extern "C" void distdriver_(boost::int64_t *n, boost::int64_t *len, real *dists,
boost::int64_t *toggle, boost::int64_t *ia,
boost::int64_t *ib, real *crit);
//
// Rather than deal with any nonsense like trying to get
@@ -32,54 +30,53 @@ void distdriver_(boost::int64_t *n,boost::int64_t *len,
// (thus drowning in the waves of f2c hate), we'll generate
// the distance matrix on our own here and then call distdriver_
//
void clusterit(real *dataP,boost::int64_t n,boost::int64_t m,boost::int64_t iopt,
boost::int64_t *ia,boost::int64_t *ib,real *crit){
void clusterit(real *dataP, boost::int64_t n, boost::int64_t m,
boost::int64_t iopt, boost::int64_t *ia, boost::int64_t *ib,
real *crit) {
real *dists;
boost::int64_t len;
boost::int64_t pos = 0;
boost::int64_t i,j,k,iTab,jTab;
boost::int64_t i, j, k, iTab, jTab;
double tmp;
len = (n*(n-1))/2;
dists = (real *)calloc(len,sizeof(real));
for(i=1;i<n;i++){
iTab = i*m;
for(j=0;j<i;j++){
jTab = j*m;
for(k=0;k<m;k++){
tmp = dataP[iTab+k]-dataP[jTab+k];
dists[pos] += tmp*tmp;
len = (n * (n - 1)) / 2;
dists = (real *)calloc(len, sizeof(real));
for (i = 1; i < n; i++) {
iTab = i * m;
for (j = 0; j < i; j++) {
jTab = j * m;
for (k = 0; k < m; k++) {
tmp = dataP[iTab + k] - dataP[jTab + k];
dists[pos] += tmp * tmp;
}
pos++;
}
}
distdriver_(&n,&len,dists,&iopt,ia,ib,crit);
distdriver_(&n, &len, dists, &iopt, ia, ib, crit);
free(dists);
};
static PyObject *
Clustering_MurtaghCluster(python::object data, int nPts, int sz, int option)
{
static PyObject *Clustering_MurtaghCluster(python::object data, int nPts,
int sz, int option) {
PyArrayObject *dataContig;
boost::int64_t *ia,*ib;
boost::int64_t *ia, *ib;
real *crit;
PyObject *res;
PyObject *tmp;
npy_intp dims[2];
if (PyArray_Check(data.ptr())) {
dataContig
= reinterpret_cast<PyArrayObject *>(PyArray_ContiguousFromObject(data.ptr(),PyArray_DOUBLE,2,2));
}
else {
dataContig = reinterpret_cast<PyArrayObject *>(
PyArray_ContiguousFromObject(data.ptr(), PyArray_DOUBLE, 2, 2));
} else {
throw_value_error("PyArray_Type expected as input");
return NULL;
}
ia = (boost::int64_t *)calloc(nPts,sizeof(boost::int64_t));
ib = (boost::int64_t *)calloc(nPts,sizeof(boost::int64_t));
crit = (real *)calloc(nPts,sizeof(real));
ia = (boost::int64_t *)calloc(nPts, sizeof(boost::int64_t));
ib = (boost::int64_t *)calloc(nPts, sizeof(boost::int64_t));
crit = (real *)calloc(nPts, sizeof(real));
clusterit((real *)dataContig->data,nPts,sz,option,ia,ib,crit);
clusterit((real *)dataContig->data, nPts, sz, option, ia, ib, crit);
dims[0] = nPts;
res = PyTuple_New(3);
@@ -88,52 +85,47 @@ Clustering_MurtaghCluster(python::object data, int nPts, int sz, int option)
// that's why it's ok that we do not free them in this function,
// Python will take care of it for us.
//
tmp = PyArray_SimpleNewFromData(1,dims,NPY_LONG,(void *)ia);
PyTuple_SetItem(res,0,(PyObject *)tmp);
tmp = PyArray_SimpleNewFromData(1, dims, NPY_LONG, (void *)ia);
PyTuple_SetItem(res, 0, (PyObject *)tmp);
tmp = PyArray_SimpleNewFromData(1,dims,NPY_LONG,(void *)ib);
PyTuple_SetItem(res,1,(PyObject *)tmp);
tmp = PyArray_SimpleNewFromData(1, dims, NPY_LONG, (void *)ib);
PyTuple_SetItem(res, 1, (PyObject *)tmp);
tmp = PyArray_SimpleNewFromData(1,dims,NPY_DOUBLE,(void *)crit);
PyTuple_SetItem(res,2,(PyObject *)tmp);
tmp = PyArray_SimpleNewFromData(1, dims, NPY_DOUBLE, (void *)crit);
PyTuple_SetItem(res, 2, (PyObject *)tmp);
return res;
};
void distclusterit(real *dists,boost::int64_t n,boost::int64_t iopt,
boost::int64_t *ia,boost::int64_t *ib,real *crit){
void distclusterit(real *dists, boost::int64_t n, boost::int64_t iopt,
boost::int64_t *ia, boost::int64_t *ib, real *crit) {
boost::int64_t len;
len = (n*(n-1))/2;
distdriver_(&n,&len,dists,&iopt,ia,ib,crit);
len = (n * (n - 1)) / 2;
distdriver_(&n, &len, dists, &iopt, ia, ib, crit);
};
static PyObject *
Clustering_MurtaghDistCluster(python::object data, int nPts, int option)
{
static PyObject *Clustering_MurtaghDistCluster(python::object data, int nPts,
int option) {
PyArrayObject *dataContig;
boost::int64_t *ia,*ib;
boost::int64_t *ia, *ib;
real *crit;
PyObject *res=PyTuple_New(3);
PyObject *res = PyTuple_New(3);
PyObject *tmp;
npy_intp dims[] = {1};
if (PyArray_Check(data.ptr())) {
dataContig
= reinterpret_cast<PyArrayObject *>(PyArray_ContiguousFromObject(data.ptr(),PyArray_DOUBLE,1,1));
}
else {
dataContig = reinterpret_cast<PyArrayObject *>(
PyArray_ContiguousFromObject(data.ptr(), PyArray_DOUBLE, 1, 1));
} else {
throw_value_error("PyArray_Type expected as input");
return NULL;
}
ia = (boost::int64_t *)calloc(nPts,sizeof(boost::int64_t));
ib = (boost::int64_t *)calloc(nPts,sizeof(boost::int64_t));
crit = (real *)calloc(nPts,sizeof(real));
distclusterit((real *)dataContig->data,nPts,option,ia,ib,crit);
ia = (boost::int64_t *)calloc(nPts, sizeof(boost::int64_t));
ib = (boost::int64_t *)calloc(nPts, sizeof(boost::int64_t));
crit = (real *)calloc(nPts, sizeof(real));
distclusterit((real *)dataContig->data, nPts, option, ia, ib, crit);
dims[0] = nPts;
@@ -142,30 +134,26 @@ Clustering_MurtaghDistCluster(python::object data, int nPts, int option)
// that's why it's ok that we do not free them in this function,
// Python will take care of it for us.
//
tmp = PyArray_SimpleNewFromData(1,dims,NPY_LONG,(void *)ia);
PyTuple_SetItem(res,0,tmp);
tmp = PyArray_SimpleNewFromData(1, dims, NPY_LONG, (void *)ia);
PyTuple_SetItem(res, 0, tmp);
tmp = PyArray_SimpleNewFromData(1,dims,NPY_LONG,(void *)ib);
PyTuple_SetItem(res,1,tmp);
tmp = PyArray_SimpleNewFromData(1, dims, NPY_LONG, (void *)ib);
PyTuple_SetItem(res, 1, tmp);
tmp = PyArray_SimpleNewFromData(1, dims, NPY_DOUBLE, (void *)crit);
PyTuple_SetItem(res, 2, tmp);
tmp = PyArray_SimpleNewFromData(1,dims,NPY_DOUBLE,(void *)crit);
PyTuple_SetItem(res,2,tmp);
return res;
};
BOOST_PYTHON_MODULE(Clustering) {
rdkit_import_array();
python::def("MurtaghCluster", Clustering_MurtaghCluster,
( python::arg("data"), python::arg("nPts"),
python::arg("sz"), python::arg("option") ),
"TODO: provide docstring");
(python::arg("data"), python::arg("nPts"), python::arg("sz"),
python::arg("option")),
"TODO: provide docstring");
python::def("MurtaghDistCluster", Clustering_MurtaghDistCluster,
( python::arg("data"), python::arg("nPts"),
python::arg("option") ),
"TODO: provide docstring");
(python::arg("data"), python::arg("nPts"), python::arg("option")),
"TODO: provide docstring");
}

View File

@@ -2,7 +2,7 @@
/** barf [ba:rf] 2. "He suggested using FORTRAN, and everybody barfed."
- From The Shogakukan DICTIONARY OF NEW ENGLISH (Second edition) */
- From The Shogakukan DICTIONARY OF NEW ENGLISH (Second edition) */
#ifndef F2C_INCLUDE
#define F2C_INCLUDE
@@ -19,11 +19,11 @@ typedef long int logical;
typedef short int shortlogical;
typedef char logical1;
typedef char integer1;
#ifdef INTEGER_STAR_8 /* Adjust for integer*8. */
typedef long long longint; /* system-dependent */
typedef unsigned long long ulongint; /* system-dependent */
#define qbit_clear(a,b) ((a) & ~((ulongint)1 << (b)))
#define qbit_set(a,b) ((a) | ((ulongint)1 << (b)))
#ifdef INTEGER_STAR_8 /* Adjust for integer*8. */
typedef long long longint; /* system-dependent */
typedef unsigned long long ulongint; /* system-dependent */
#define qbit_clear(a, b) ((a) & ~((ulongint)1 << (b)))
#define qbit_set(a, b) ((a) | ((ulongint)1 << (b)))
#endif
#define TRUE_ (1)
@@ -48,121 +48,121 @@ typedef long int ftnint;
#endif
/*external read, write*/
typedef struct
{ flag cierr;
ftnint ciunit;
flag ciend;
char *cifmt;
ftnint cirec;
typedef struct {
flag cierr;
ftnint ciunit;
flag ciend;
char *cifmt;
ftnint cirec;
} cilist;
/*internal read, write*/
typedef struct
{ flag icierr;
char *iciunit;
flag iciend;
char *icifmt;
ftnint icirlen;
ftnint icirnum;
typedef struct {
flag icierr;
char *iciunit;
flag iciend;
char *icifmt;
ftnint icirlen;
ftnint icirnum;
} icilist;
/*open*/
typedef struct
{ flag oerr;
ftnint ounit;
char *ofnm;
ftnlen ofnmlen;
char *osta;
char *oacc;
char *ofm;
ftnint orl;
char *oblnk;
typedef struct {
flag oerr;
ftnint ounit;
char *ofnm;
ftnlen ofnmlen;
char *osta;
char *oacc;
char *ofm;
ftnint orl;
char *oblnk;
} olist;
/*close*/
typedef struct
{ flag cerr;
ftnint cunit;
char *csta;
typedef struct {
flag cerr;
ftnint cunit;
char *csta;
} cllist;
/*rewind, backspace, endfile*/
typedef struct
{ flag aerr;
ftnint aunit;
typedef struct {
flag aerr;
ftnint aunit;
} alist;
/* inquire */
typedef struct
{ flag inerr;
ftnint inunit;
char *infile;
ftnlen infilen;
ftnint *inex; /*parameters in standard's order*/
ftnint *inopen;
ftnint *innum;
ftnint *innamed;
char *inname;
ftnlen innamlen;
char *inacc;
ftnlen inacclen;
char *inseq;
ftnlen inseqlen;
char *indir;
ftnlen indirlen;
char *infmt;
ftnlen infmtlen;
char *inform;
ftnint informlen;
char *inunf;
ftnlen inunflen;
ftnint *inrecl;
ftnint *innrec;
char *inblank;
ftnlen inblanklen;
typedef struct {
flag inerr;
ftnint inunit;
char *infile;
ftnlen infilen;
ftnint *inex; /*parameters in standard's order*/
ftnint *inopen;
ftnint *innum;
ftnint *innamed;
char *inname;
ftnlen innamlen;
char *inacc;
ftnlen inacclen;
char *inseq;
ftnlen inseqlen;
char *indir;
ftnlen indirlen;
char *infmt;
ftnlen infmtlen;
char *inform;
ftnint informlen;
char *inunf;
ftnlen inunflen;
ftnint *inrecl;
ftnint *innrec;
char *inblank;
ftnlen inblanklen;
} inlist;
#define VOID void
union Multitype { /* for multiple entry points */
integer1 g;
shortint h;
integer i;
/* longint j; */
real r;
doublereal d;
complex c;
doublecomplex z;
};
union Multitype {/* for multiple entry points */
integer1 g;
shortint h;
integer i;
/* longint j; */
real r;
doublereal d;
complex c;
doublecomplex z;
};
typedef union Multitype Multitype;
/*typedef long int Long;*/ /* No longer used; formerly in Namelist */
/*typedef long int Long;*/ /* No longer used; formerly in Namelist */
struct Vardesc { /* for Namelist */
char *name;
char *addr;
ftnlen *dims;
int type;
};
struct Vardesc {/* for Namelist */
char *name;
char *addr;
ftnlen *dims;
int type;
};
typedef struct Vardesc Vardesc;
struct Namelist {
char *name;
Vardesc **vars;
int nvars;
};
char *name;
Vardesc **vars;
int nvars;
};
typedef struct Namelist Namelist;
#define abs(x) ((x) >= 0 ? (x) : -(x))
#define dabs(x) (doublereal)abs(x)
#define min(a,b) ((a) <= (b) ? (a) : (b))
#define max(a,b) ((a) >= (b) ? (a) : (b))
#define dmin(a,b) (doublereal)min(a,b)
#define dmax(a,b) (doublereal)max(a,b)
#define bit_test(a,b) ((a) >> (b) & 1)
#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b)))
#define bit_set(a,b) ((a) | ((uinteger)1 << (b)))
#define dabs(x) (doublereal) abs(x)
#define min(a, b) ((a) <= (b) ? (a) : (b))
#define max(a, b) ((a) >= (b) ? (a) : (b))
#define dmin(a, b) (doublereal) min(a, b)
#define dmax(a, b) (doublereal) max(a, b)
#define bit_test(a, b) ((a) >> (b)&1)
#define bit_clear(a, b) ((a) & ~((uinteger)1 << (b)))
#define bit_set(a, b) ((a) | ((uinteger)1 << (b)))
/* procedure parameter types for -A and -C++ */
@@ -193,10 +193,10 @@ typedef /* Character */ VOID (*H_fp)();
typedef /* Subroutine */ int (*S_fp)();
#endif
/* E_fp is for real functions when -R is not specified */
typedef VOID C_f; /* complex function */
typedef VOID H_f; /* character function */
typedef VOID Z_f; /* double complex function */
typedef doublereal E_f; /* real function with -R not specified */
typedef VOID C_f; /* complex function */
typedef VOID H_f; /* character function */
typedef VOID Z_f; /* double complex function */
typedef doublereal E_f; /* real function with -R not specified */
/* undef any lower-case symbols that your C compiler predefines, e.g.: */

View File

@@ -20,10 +20,11 @@ namespace python = boost::python;
/***********************************************
constructs a variable table for the data passed in
The table for a given variable records the number of times each possible value
The table for a given variable records the number of times each possible
value
of that variable appears for each possible result of the function.
**Arguments**
**Arguments**
- vals: pointer to double, contains the values of the variable,
should be sorted
@@ -34,14 +35,15 @@ namespace python = boost::python;
- nCuts: int, the length of _cuts_
- starts: pointer to int, the potential starting points for quantization bounds
- starts: pointer to int, the potential starting points for quantization
bounds
- nStarts: int, the length of _starts_
- results: pointer to int, the result codes
- nPossibleRes: int, the number of possible result codes
**Returns**
@@ -54,30 +56,29 @@ namespace python = boost::python;
- the _results_ array is assumed to be _nVals_ long
***********************************************/
long int *
GenVarTable(double *vals,int nVals,long int *cuts,int nCuts,long int *starts,
long int *results,int nPossibleRes,long int *varTable)
{
long int *GenVarTable(double *vals, int nVals, long int *cuts, int nCuts,
long int *starts, long int *results, int nPossibleRes,
long int *varTable) {
RDUNUSED_PARAM(vals);
int nBins = nCuts + 1;
int idx,i,iTab;
int idx, i, iTab;
memset(varTable,0,nBins*nPossibleRes*sizeof(long int));
memset(varTable, 0, nBins * nPossibleRes * sizeof(long int));
idx = 0;
for(i=0;i<nCuts;i++){
for (i = 0; i < nCuts; i++) {
int cut = cuts[i];
iTab = i*nPossibleRes;
while(idx<starts[cut]){
varTable[iTab+results[idx]] += 1;
iTab = i * nPossibleRes;
while (idx < starts[cut]) {
varTable[iTab + results[idx]] += 1;
idx++;
}
}
iTab = nCuts*nPossibleRes;
while(idx<nVals){
varTable[iTab+results[idx]] += 1;
iTab = nCuts * nPossibleRes;
while (idx < nVals) {
varTable[iTab + results[idx]] += 1;
idx++;
}
return varTable;
return varTable;
}
/***********************************************
@@ -86,7 +87,7 @@ GenVarTable(double *vals,int nVals,long int *cuts,int nCuts,long int *starts,
we do things this way to avoid having to convert things back and forth
from Python objects
**Arguments**
**Arguments**
- vals: pointer to double, contains the values of the variable,
should be sorted
@@ -99,14 +100,15 @@ GenVarTable(double *vals,int nVals,long int *cuts,int nCuts,long int *starts,
- which: int, the quant bound being modified here
- starts: pointer to int, the potential starting points for quantization bounds
- starts: pointer to int, the potential starting points for quantization
bounds
- nStarts: int, the length of _starts_
- results: pointer to int, the result codes
- nPossibleRes: int, the number of possible result codes
**Returns**
@@ -120,66 +122,65 @@ GenVarTable(double *vals,int nVals,long int *cuts,int nCuts,long int *starts,
- the _results_ array is assumed to be _nVals_ long
***********************************************/
double
RecurseHelper(double *vals,int nVals,long int *cuts,int nCuts,int which,
long int *starts,int nStarts,long int *results,int nPossibleRes)
{
double maxGain=-1e6,gainHere;
long int *bestCuts,*tCuts;
long int *varTable=0;
double RecurseHelper(double *vals, int nVals, long int *cuts, int nCuts,
int which, long int *starts, int nStarts,
long int *results, int nPossibleRes) {
double maxGain = -1e6, gainHere;
long int *bestCuts, *tCuts;
long int *varTable = 0;
int highestCutHere = nStarts - nCuts + which;
int i,nBounds=nCuts;
varTable = (long int *)calloc((nCuts+1)*nPossibleRes,sizeof(long int));
bestCuts = (long int *)calloc(nCuts,sizeof(long int));
tCuts = (long int *)calloc(nCuts,sizeof(long int));
GenVarTable(vals,nVals,cuts,nCuts,starts,results,nPossibleRes,varTable);
while(cuts[which] <= highestCutHere){
gainHere = RDInfoTheory::InfoEntropyGain(varTable,nCuts+1,nPossibleRes);
if(gainHere > maxGain){
int i, nBounds = nCuts;
varTable = (long int *)calloc((nCuts + 1) * nPossibleRes, sizeof(long int));
bestCuts = (long int *)calloc(nCuts, sizeof(long int));
tCuts = (long int *)calloc(nCuts, sizeof(long int));
GenVarTable(vals, nVals, cuts, nCuts, starts, results, nPossibleRes,
varTable);
while (cuts[which] <= highestCutHere) {
gainHere = RDInfoTheory::InfoEntropyGain(varTable, nCuts + 1, nPossibleRes);
if (gainHere > maxGain) {
maxGain = gainHere;
memcpy(bestCuts,cuts,nCuts*sizeof(long int));
memcpy(bestCuts, cuts, nCuts * sizeof(long int));
}
// recurse on the next vars if needed
if(which < nBounds-1){
memcpy(tCuts,cuts,nCuts*sizeof(long int));
gainHere = RecurseHelper(vals,nVals,tCuts,nCuts,which+1,starts,nStarts,
results,nPossibleRes);
if(gainHere > maxGain){
if (which < nBounds - 1) {
memcpy(tCuts, cuts, nCuts * sizeof(long int));
gainHere = RecurseHelper(vals, nVals, tCuts, nCuts, which + 1, starts,
nStarts, results, nPossibleRes);
if (gainHere > maxGain) {
maxGain = gainHere;
memcpy(bestCuts,tCuts,nCuts*sizeof(long int));
memcpy(bestCuts, tCuts, nCuts * sizeof(long int));
}
}
// update this cut
int oldCut = cuts[which];
cuts[which] += 1;
int top,bot;
int top, bot;
bot = starts[oldCut];
if(oldCut+1 < nStarts)
top = starts[oldCut+1];
if (oldCut + 1 < nStarts)
top = starts[oldCut + 1];
else
top = starts[nStarts-1];
for(i=bot;i<top;i++) {
int v=results[i];
varTable[which*nPossibleRes+v] += 1;
varTable[(which+1)*nPossibleRes+v] -= 1;
top = starts[nStarts - 1];
for (i = bot; i < top; i++) {
int v = results[i];
varTable[which * nPossibleRes + v] += 1;
varTable[(which + 1) * nPossibleRes + v] -= 1;
}
for(i=which+1;i<nBounds;i++){
if(cuts[i] == cuts[i-1]) cuts[i] += 1;
for (i = which + 1; i < nBounds; i++) {
if (cuts[i] == cuts[i - 1]) cuts[i] += 1;
}
}
memcpy(cuts,bestCuts,nCuts*sizeof(long int));
memcpy(cuts, bestCuts, nCuts * sizeof(long int));
free(tCuts);
free(bestCuts);
free(varTable);
return maxGain;
}
/***********************************************
Recursively finds the best quantization boundaries
**Arguments**
@@ -206,21 +207,22 @@ RecurseHelper(double *vals,int nVals,long int *cuts,int nCuts,int which,
1) the best information gain found so far
2) a list of the quantization bound indices ( _cuts_ for the best case)
**Notes**
- this is not even remotely efficient, which is why a C replacement
was written
- this is a drop-in replacement for *ML.Data.Quantize._PyRecurseBounds*
***********************************************/
static python::tuple
cQuantize_RecurseOnBounds(python::object vals, python::list pyCuts, int which,
python::list pyStarts, python::object results, int nPossibleRes)
{
PyArrayObject *contigVals,*contigResults;
long int *cuts,*starts;
static python::tuple cQuantize_RecurseOnBounds(python::object vals,
python::list pyCuts, int which,
python::list pyStarts,
python::object results,
int nPossibleRes) {
PyArrayObject *contigVals, *contigResults;
long int *cuts, *starts;
/*
-------
@@ -229,38 +231,37 @@ cQuantize_RecurseOnBounds(python::object vals, python::list pyCuts, int which,
-------
*/
contigVals
= reinterpret_cast<PyArrayObject *>(PyArray_ContiguousFromObject(vals.ptr(),PyArray_DOUBLE,1,1));
if(!contigVals){
contigVals = reinterpret_cast<PyArrayObject *>(
PyArray_ContiguousFromObject(vals.ptr(), PyArray_DOUBLE, 1, 1));
if (!contigVals) {
throw_value_error("could not convert value argument");
}
contigResults
= reinterpret_cast<PyArrayObject *>(PyArray_ContiguousFromObject(results.ptr(),PyArray_LONG,1,1));
if(!contigResults){
contigResults = reinterpret_cast<PyArrayObject *>(
PyArray_ContiguousFromObject(results.ptr(), PyArray_LONG, 1, 1));
if (!contigResults) {
throw_value_error("could not convert results argument");
}
python::ssize_t nCuts = python::len(pyCuts);
cuts = (long int *)calloc(nCuts,sizeof(long int));
for (python::ssize_t i=0; i<nCuts; i++) {
cuts = (long int *)calloc(nCuts, sizeof(long int));
for (python::ssize_t i = 0; i < nCuts; i++) {
python::object elem = pyCuts[i];
cuts[i] = python::extract<long int>(elem);
}
python::ssize_t nStarts = python::len(pyStarts);
starts = (long int *)calloc(nStarts,sizeof(long int));
for (python::ssize_t i=0; i<nStarts; i++){
starts = (long int *)calloc(nStarts, sizeof(long int));
for (python::ssize_t i = 0; i < nStarts; i++) {
python::object elem = pyStarts[i];
starts[i] = python::extract<long int>(elem);
}
// do the real work
double gain
= RecurseHelper((double *)contigVals->data,contigVals->dimensions[0],
cuts,nCuts,which,starts,nStarts,
(long int *)contigResults->data,nPossibleRes);
double gain = RecurseHelper(
(double *)contigVals->data, contigVals->dimensions[0], cuts, nCuts, which,
starts, nStarts, (long int *)contigResults->data, nPossibleRes);
/*
-------
@@ -269,72 +270,71 @@ cQuantize_RecurseOnBounds(python::object vals, python::list pyCuts, int which,
-------
*/
python::list cutObj;
for (python::ssize_t i=0; i<nCuts; i++) {
for (python::ssize_t i = 0; i < nCuts; i++) {
cutObj.append(cuts[i]);
}
free(cuts);
free(starts);
return python::make_tuple(gain, cutObj);
return python::make_tuple(gain, cutObj);
}
static python::list
cQuantize_FindStartPoints(python::object values, python::object results,
int nData)
{
static python::list cQuantize_FindStartPoints(python::object values,
python::object results,
int nData) {
python::list startPts;
if(nData<2){
if (nData < 2) {
return startPts;
}
PyArrayObject *contigVals
= reinterpret_cast<PyArrayObject *>(PyArray_ContiguousFromObject(values.ptr(),PyArray_DOUBLE,1,1));
if(!contigVals){
PyArrayObject *contigVals = reinterpret_cast<PyArrayObject *>(
PyArray_ContiguousFromObject(values.ptr(), PyArray_DOUBLE, 1, 1));
if (!contigVals) {
throw_value_error("could not convert value argument");
}
double *vals=(double *)contigVals->data;
double *vals = (double *)contigVals->data;
PyArrayObject *contigResults
= reinterpret_cast<PyArrayObject *>(PyArray_ContiguousFromObject(results.ptr(),PyArray_LONG,1,1));
if(!contigResults){
PyArrayObject *contigResults = reinterpret_cast<PyArrayObject *>(
PyArray_ContiguousFromObject(results.ptr(), PyArray_LONG, 1, 1));
if (!contigResults) {
throw_value_error("could not convert results argument");
}
long *res=(long *)contigResults->data;
long *res = (long *)contigResults->data;
bool firstBlock=true;
long lastBlockAct=-2,blockAct=res[0];
int lastDiv=-1;
double tol=1e-8;
bool firstBlock = true;
long lastBlockAct = -2, blockAct = res[0];
int lastDiv = -1;
double tol = 1e-8;
int i=1;
while(i<nData){
while(i<nData && vals[i]-vals[i-1]<=tol){
if(res[i]!=blockAct){
blockAct=-1;
int i = 1;
while (i < nData) {
while (i < nData && vals[i] - vals[i - 1] <= tol) {
if (res[i] != blockAct) {
blockAct = -1;
}
++i;
}
if(firstBlock){
firstBlock=false;
lastBlockAct=blockAct;
lastDiv=i;
if (firstBlock) {
firstBlock = false;
lastBlockAct = blockAct;
lastDiv = i;
} else {
if(blockAct==-1 || lastBlockAct==-1 || blockAct!=lastBlockAct){
startPts.append(lastDiv);
lastDiv=i;
lastBlockAct=blockAct;
if (blockAct == -1 || lastBlockAct == -1 || blockAct != lastBlockAct) {
startPts.append(lastDiv);
lastDiv = i;
lastBlockAct = blockAct;
} else {
lastDiv=i;
lastDiv = i;
}
}
if(i<nData) blockAct=res[i];
++i;
if (i < nData) blockAct = res[i];
++i;
}
// catch the case that the last point also sets a bin:
if( blockAct != lastBlockAct ){
if (blockAct != lastBlockAct) {
startPts.append(lastDiv);
}
@@ -342,19 +342,15 @@ cQuantize_FindStartPoints(python::object values, python::object results,
}
BOOST_PYTHON_MODULE(cQuantize) {
rdkit_import_array();
python::def("_RecurseOnBounds", cQuantize_RecurseOnBounds,
( python::arg("vals"), python::arg("pyCuts"),
python::arg("which"), python::arg("pyStarts"),
python::arg("results"), python::arg("nPossibleRes") ),
"TODO: provide docstring");
python::def("_FindStartPoints", cQuantize_FindStartPoints,
( python::arg("values"), python::arg("results"),
python::arg("nData") ),
"TODO: provide docstring");
(python::arg("vals"), python::arg("pyCuts"), python::arg("which"),
python::arg("pyStarts"), python::arg("results"),
python::arg("nPossibleRes")),
"TODO: provide docstring");
python::def(
"_FindStartPoints", cQuantize_FindStartPoints,
(python::arg("values"), python::arg("results"), python::arg("nData")),
"TODO: provide docstring");
}

View File

@@ -15,105 +15,100 @@
#include <boost/dynamic_bitset.hpp>
namespace RDInfoTheory {
//FIX: won't worry about it now, but this class can be templated by the type of
// container for the bit list and type of descriptors (fingerprint vs. real valued)
class BitCorrMatGenerator {
/*! \brief A class to generate a correlation matrix for a bunch of fingerprints
*
* The correlation matrix is done only for the bit IDs that are set by a call to the
* function setDescriptorIdList
*
* cr = CorrMatGenerator();
* cr.setDescriptorIdList(descList);
* for each fingerprint in list of fingerprints {
* cr.collectVotes(fingerprint);
* }
* double *corrMat = cr.getCorrMat()
*
* The resulting correlation matrix is a one dimension matrix with only the lower triangle elements
* of the symmetric matrix
*/
public:
BitCorrMatGenerator() {
this->initGenerator();
};
// FIX: won't worry about it now, but this class can be templated by the type of
// container for the bit list and type of descriptors (fingerprint vs. real
// valued)
class BitCorrMatGenerator {
/*! \brief A class to generate a correlation matrix for a bunch of
*fingerprints
*
* The correlation matrix is done only for the bit IDs that are set by a call
*to the
* function setDescriptorIdList
*
* cr = CorrMatGenerator();
* cr.setDescriptorIdList(descList);
* for each fingerprint in list of fingerprints {
* cr.collectVotes(fingerprint);
* }
* double *corrMat = cr.getCorrMat()
*
* The resulting correlation matrix is a one dimension matrix with only the
*lower triangle elements
* of the symmetric matrix
*/
public:
BitCorrMatGenerator() { this->initGenerator(); };
~BitCorrMatGenerator() {
delete [] dp_corrMat;
~BitCorrMatGenerator() { delete[] dp_corrMat; }
void initGenerator() {
dp_corrMat = 0;
d_descs.resize(0);
d_nExamples = 0;
};
/*! \brief Set the list bits that we are interested in correlating
*
* \param bitIdList is a list of bit ids that need to be correlated e.g. a
*list top ranked ensemble
* of bits
*/
void setBitIdList(const RDKit::INT_VECT &bitIdList) {
d_descs = bitIdList;
int i, nd = d_descs.size();
int nelem = nd * (nd - 1) / 2;
delete[] dp_corrMat;
dp_corrMat = new double[nd * (nd - 1) / 2];
for (i = 0; i < nelem; i++) {
dp_corrMat[i] = 0.0;
}
};
void initGenerator() {
dp_corrMat = 0;
d_descs.resize(0);
d_nExamples = 0;
};
//! \brief get the number of examples we used so far to compute the
//correlation matrix
int getNumExamples() const { return d_nExamples; };
/*! \brief Set the list bits that we are interested in correlating
*
* \param bitIdList is a list of bit ids that need to be correlated e.g. a list top ranked ensemble
* of bits
*/
void setBitIdList(const RDKit::INT_VECT &bitIdList) {
d_descs = bitIdList;
int i, nd = d_descs.size();
int nelem = nd*(nd-1)/2;
delete [] dp_corrMat;
//! \brief Get the list of bits ID that are used to generate the correlation
//matrix
RDKit::INT_VECT getCorrBitList() const { return d_descs; };
dp_corrMat = new double[nd*(nd-1)/2];
for (i = 0; i < nelem; i++) {
dp_corrMat[i] = 0.0;
//! \brief Gets a pointer to the correlation matrix
double *getCorrMat() { return dp_corrMat; };
//! \brief For each pair of on bits (bi, bj) in fp increase the correlation
//count
// for the pair by 1
void collectVotes(const BitVect &fp) {
unsigned int nd = d_descs.size();
// use a temporary bit vector to first mask the fingerprint
ExplicitBitVect ebv(nd);
int bi;
for (unsigned int i = 0; i < nd; i++) {
bi = d_descs[i];
if (fp[bi]) {
ebv.setBit(i);
}
};
//! \brief get the number of examples we used so far to compute the correlation matrix
int getNumExamples() const {
return d_nExamples;
};
//! \brief Get the list of bits ID that are used to generate the correlation matrix
RDKit::INT_VECT getCorrBitList() const {
return d_descs;
};
//! \brief Gets a pointer to the correlation matrix
double *getCorrMat() {
return dp_corrMat;
};
//! \brief For each pair of on bits (bi, bj) in fp increase the correlation count
// for the pair by 1
void collectVotes(const BitVect &fp) {
unsigned int nd = d_descs.size();
// use a temporary bit vector to first mask the fingerprint
ExplicitBitVect ebv(nd);
int bi;
for (unsigned int i = 0; i < nd; i++) {
bi = d_descs[i];
if (fp[bi]) {
ebv.setBit(i);
}
}
for (unsigned i = 1; i < nd; i++) {
unsigned int itab = i*(i-1)/2;
if (ebv[i]) {
for (unsigned int j = 0; j < i; j++) {
if ( ebv[j]) {
dp_corrMat[itab + j] += 1;
}
}
for (unsigned i = 1; i < nd; i++) {
unsigned int itab = i * (i - 1) / 2;
if (ebv[i]) {
for (unsigned int j = 0; j < i; j++) {
if (ebv[j]) {
dp_corrMat[itab + j] += 1;
}
}
}
d_nExamples++;
};
private:
RDKit::INT_VECT d_descs;
double *dp_corrMat;
int d_nExamples;
}
d_nExamples++;
};
private:
RDKit::INT_VECT d_descs;
double *dp_corrMat;
int d_nExamples;
};
}
#endif

View File

@@ -20,167 +20,168 @@
#include <queue>
namespace RDInfoTheory {
typedef std::pair<double, int> PAIR_D_I;
typedef std::vector<PAIR_D_I> VECT_PDI;
typedef std::pair<double, int> PAIR_D_I;
typedef std::vector<PAIR_D_I> VECT_PDI;
struct gtDIPair {
bool operator() ( const PAIR_D_I &pd1, const PAIR_D_I &pd2) const {
return pd1.first > pd2.first;
}
};
typedef std::priority_queue<PAIR_D_I, VECT_PDI, gtDIPair> PR_QUEUE;
void InfoBitRanker::setBiasList(RDKit::INT_VECT &classList) {
URANGE_CHECK(classList.size(), d_classes);
d_biasList = classList;
//make sure we don't have any duplicates
std::sort(d_biasList.begin(), d_biasList.end());
RDKit::INT_VECT_CI bi = std::unique(d_biasList.begin(), d_biasList.end());
CHECK_INVARIANT(bi == d_biasList.end(), "There are duplicates in the class bias list");
// finally make sure all the class ID in d_biasList are within range
for (bi = d_biasList.begin(); bi != d_biasList.end(); bi++) {
URANGE_CHECK(static_cast<unsigned int>(*bi), d_classes-1);
}
struct gtDIPair {
bool operator()(const PAIR_D_I &pd1, const PAIR_D_I &pd2) const {
return pd1.first > pd2.first;
}
};
void InfoBitRanker::setMaskBits(RDKit::INT_VECT &maskBits) {
delete dp_maskBits;
dp_maskBits = new ExplicitBitVect(d_dims);
for (RDKit::INT_VECT_CI bi = maskBits.begin();
bi != maskBits.end(); ++bi) {
dp_maskBits->setBit(*bi);
}
typedef std::priority_queue<PAIR_D_I, VECT_PDI, gtDIPair> PR_QUEUE;
void InfoBitRanker::setBiasList(RDKit::INT_VECT &classList) {
URANGE_CHECK(classList.size(), d_classes);
d_biasList = classList;
// make sure we don't have any duplicates
std::sort(d_biasList.begin(), d_biasList.end());
RDKit::INT_VECT_CI bi = std::unique(d_biasList.begin(), d_biasList.end());
CHECK_INVARIANT(bi == d_biasList.end(),
"There are duplicates in the class bias list");
// finally make sure all the class ID in d_biasList are within range
for (bi = d_biasList.begin(); bi != d_biasList.end(); bi++) {
URANGE_CHECK(static_cast<unsigned int>(*bi), d_classes - 1);
}
}
bool InfoBitRanker::BiasCheckBit(RDKit::USHORT *resMat) const {
PRECONDITION(resMat,"bad results pointer");
if ((d_biasList.size() == 0) || (d_biasList.size() == d_classes)) {
//we will accept the bit
return true;
}
RDKit::DOUBLE_VECT fracs;
fracs.resize(d_classes);
// compute the fractions of items in each class that hit the bit
// and record the maximum for those classes not in the bias list
double maxCor = 0.0;
for (unsigned int i = 0; i < d_classes; i++) {
if (d_clsCount[i] > 0) {
fracs[i] = ((double)resMat[i])/d_clsCount[i];
} else {
fracs[i] = 0.0;
}
if (std::find(d_biasList.begin(), d_biasList.end(), i) == d_biasList.end()) {
// if not in the biasList
if (fracs[i] > maxCor) {
// if this fraction is greater than the previously known maximum
maxCor = fracs[i];
}
}
}
bool bitOk = false;
for (RDKit::INT_VECT_CI bci = d_biasList.begin(); bci !=
d_biasList.end(); ++bci) {
if (fracs[*bci] >= maxCor) {
bitOk = true;
break;
}
}
return bitOk;
void InfoBitRanker::setMaskBits(RDKit::INT_VECT &maskBits) {
delete dp_maskBits;
dp_maskBits = new ExplicitBitVect(d_dims);
for (RDKit::INT_VECT_CI bi = maskBits.begin(); bi != maskBits.end(); ++bi) {
dp_maskBits->setBit(*bi);
}
}
double InfoBitRanker::BiasChiSquareGain(RDKit::USHORT *resMat) const {
PRECONDITION(resMat,"bad result pointer");
bool bitOk = this->BiasCheckBit(resMat);
double info=0.0;
if (bitOk) {
info = ChiSquare(resMat, 2, d_classes);
}
return info;
bool InfoBitRanker::BiasCheckBit(RDKit::USHORT *resMat) const {
PRECONDITION(resMat, "bad results pointer");
if ((d_biasList.size() == 0) || (d_biasList.size() == d_classes)) {
// we will accept the bit
return true;
}
RDKit::DOUBLE_VECT fracs;
fracs.resize(d_classes);
double InfoBitRanker::BiasInfoEntropyGain(RDKit::USHORT *resMat) const {
PRECONDITION(resMat,"bad result pointer");
bool bitOk = this->BiasCheckBit(resMat);
double info=0.0;
if (bitOk) {
info = InfoEntropyGain(resMat, 2, d_classes);
// compute the fractions of items in each class that hit the bit
// and record the maximum for the those classes not in the bias list
double maxCor = 0.0;
for (unsigned int i = 0; i < d_classes; i++) {
if (d_clsCount[i] > 0) {
fracs[i] = ((double)resMat[i]) / d_clsCount[i];
} else {
fracs[i] = 0.0;
}
return info;
}
void InfoBitRanker::accumulateVotes(const ExplicitBitVect &bv, unsigned int label) {
URANGE_CHECK(label, d_classes-1);
CHECK_INVARIANT(bv.getNumBits() == d_dims, "Incorrect bit vector size");
d_nInst += 1;
d_clsCount[label] += 1;
for (unsigned int i=0;i<bv.getNumBits();i++){
if( (*bv.dp_bits)[i] && (!dp_maskBits || dp_maskBits->getBit(i)) ){
d_counts[label][i] += 1;
if (std::find(d_biasList.begin(), d_biasList.end(), i) ==
d_biasList.end()) {
// if not in the biasList
if (fracs[i] > maxCor) {
// if this is fraction is greater than the previously known maximum
maxCor = fracs[i];
}
}
}
void InfoBitRanker::accumulateVotes(const SparseBitVect &bv, unsigned int label) {
URANGE_CHECK(label, d_classes-1);
CHECK_INVARIANT(bv.getNumBits() == d_dims, "Incorrect bit vector size");
d_nInst += 1;
d_clsCount[label] += 1;
for (IntSet::const_iterator obi = bv.dp_bits->begin();
obi != bv.dp_bits->end();
++obi) {
if(!dp_maskBits || dp_maskBits->getBit(*obi)){
d_counts[label][(*obi)] += 1;
}
bool bitOk = false;
for (RDKit::INT_VECT_CI bci = d_biasList.begin(); bci != d_biasList.end();
++bci) {
if (fracs[*bci] >= maxCor) {
bitOk = true;
break;
}
}
double *InfoBitRanker::getTopN(unsigned int num) {
// this is a place holder to pass along to infogain function
// the size of this container should nVals*d_classes, where nVals
// is the number of values a variable can take.
// since we are dealing with a binary bit vector nVals = 2
// in addition the infogain function pretends that this is a 2D matrix
// with the number of rows equal to nVals and num of columns equal to
// d_classes
if(num>d_dims) throw ValueErrorException("attempt to rank more bits than present in the bit vectors");
if(dp_maskBits)
CHECK_INVARIANT(num <= dp_maskBits->getNumOnBits(), "Can't rank more bits than the ensemble size");
RDKit::USHORT *resMat = new RDKit::USHORT[2*d_classes];
PR_QUEUE topN;
return bitOk;
}
for (unsigned int i = 0; i < d_dims; i++) {
// we may want to ignore bits that are not turned on in any item of class
// "ignoreNoClass"
/*
if ((0 <= ignoreNoClass) && (d_classes > ignoreNoClass)) {
if (d_counts[ignoreNoClass][i] == 0) {
continue;
}
}*/
if (dp_maskBits && !dp_maskBits->getBit(i)) {
continue;
}
// Chi-square score for a single bit, gated by the class bias list.
//
// resMat is handed to ChiSquare() as a 2 x d_classes matrix. Bits rejected by
// BiasCheckBit() score 0.0.
double InfoBitRanker::BiasChiSquareGain(RDKit::USHORT *resMat) const {
  PRECONDITION(resMat, "bad result pointer");
  if (!this->BiasCheckBit(resMat)) {
    // the bit does not favour any of the biased classes
    return 0.0;
  }
  return ChiSquare(resMat, 2, d_classes);
}
// fill up dmat
for (unsigned int j = 0; j < d_classes; j++) {
// we know that we have only two rows here
resMat[j] = d_counts[j][i];
resMat[d_classes + j] = (d_clsCount[j] - d_counts[j][i]);
// Information-entropy gain for a single bit, gated by the class bias list.
//
// resMat is handed to InfoEntropyGain() as a 2 x d_classes matrix. Bits
// rejected by BiasCheckBit() score 0.0.
double InfoBitRanker::BiasInfoEntropyGain(RDKit::USHORT *resMat) const {
  PRECONDITION(resMat, "bad result pointer");
  if (!this->BiasCheckBit(resMat)) {
    // the bit does not favour any of the biased classes
    return 0.0;
  }
  return InfoEntropyGain(resMat, 2, d_classes);
}
// Record one labelled ExplicitBitVect.
//
// Bumps the instance total and the per-class total, then increments the
// per-class hit counter of every set bit that passes the mask (when a mask
// has been installed).
void InfoBitRanker::accumulateVotes(const ExplicitBitVect &bv,
                                    unsigned int label) {
  URANGE_CHECK(label, d_classes - 1);
  CHECK_INVARIANT(bv.getNumBits() == d_dims, "Incorrect bit vector size");
  ++d_nInst;
  ++d_clsCount[label];
  for (unsigned int bitIdx = 0; bitIdx < bv.getNumBits(); ++bitIdx) {
    if (!(*bv.dp_bits)[bitIdx]) {
      continue;  // bit is off in this vector
    }
    if (dp_maskBits && !dp_maskBits->getBit(bitIdx)) {
      continue;  // bit is excluded by the mask
    }
    ++d_counts[label][bitIdx];
  }
}
void InfoBitRanker::accumulateVotes(const SparseBitVect &bv,
unsigned int label) {
URANGE_CHECK(label, d_classes - 1);
CHECK_INVARIANT(bv.getNumBits() == d_dims, "Incorrect bit vector size");
d_nInst += 1;
d_clsCount[label] += 1;
for (IntSet::const_iterator obi = bv.dp_bits->begin();
obi != bv.dp_bits->end(); ++obi) {
if (!dp_maskBits || dp_maskBits->getBit(*obi)) {
d_counts[label][(*obi)] += 1;
}
}
}
double *InfoBitRanker::getTopN(unsigned int num) {
// this is a place holder to pass along to infogain function
// the size of this container should nVals*d_classes, where nVals
// is the number of values a variable can take.
// since we are dealing with a binary bit vector nVals = 2
// in addition the infogain function pretends that this is a 2D matrix
// with the number of rows equal to nVals and num of columns equal to
// d_classes
if (num > d_dims)
throw ValueErrorException(
"attempt to rank more bits than present in the bit vectors");
if (dp_maskBits)
CHECK_INVARIANT(num <= dp_maskBits->getNumOnBits(),
"Can't rank more bits than the ensemble size");
RDKit::USHORT *resMat = new RDKit::USHORT[2 * d_classes];
PR_QUEUE topN;
for (unsigned int i = 0; i < d_dims; i++) {
// we may want to ignore bits that are not turned on in any item of class
// "ignoreNoClass"
/*
if ((0 <= ignoreNoClass) && (d_classes > ignoreNoClass)) {
if (d_counts[ignoreNoClass][i] == 0) {
continue;
}
double info = 0.0;
switch (d_type) {
}*/
if (dp_maskBits && !dp_maskBits->getBit(i)) {
continue;
}
// fill up dmat
for (unsigned int j = 0; j < d_classes; j++) {
// we know that we have only two rows here
resMat[j] = d_counts[j][i];
resMat[d_classes + j] = (d_clsCount[j] - d_counts[j][i]);
}
double info = 0.0;
switch (d_type) {
case ENTROPY:
info = InfoEntropyGain(resMat, 2, d_classes);
break;
@@ -195,100 +196,93 @@ namespace RDInfoTheory {
break;
default:
break;
}
PAIR_D_I entry(info, i);
if (info >= 0.0) {
if (topN.size() < num) {
topN.push(entry);
}
else if (info > topN.top().first) {
topN.pop();
topN.push(entry);
}
}
}
delete [] resMat;
// now fill up the result matrix for the topN bits
// the result from this function is a double * of size
// num*4. The caller of this function interprets this
// array as a two dimensional array of size num*(2+d_classes) with each row
// containing the following entries
// bitId, infogain, 1 additional column for number of hits for each class
//double *res = new double[num*(2+d_classes)];
d_top = num;
int ncols = 2+d_classes;
delete [] dp_topBits;
dp_topBits = new double[num*ncols];
int offset, bid;
RDKit::INT_VECT maskBits;
if (dp_maskBits && topN.size() < num) {
dp_maskBits->getOnBits(maskBits);
}
for (int i = num - 1; i >= 0; i--) {
offset = i*ncols;
if (topN.size() == 0 ) {
if (dp_maskBits) {
bid = maskBits[i];
} else {
bid = i;
}
dp_topBits[offset + 1] = 0.0;
} else {
bid = topN.top().second; // bit id
dp_topBits[offset + 1] = topN.top().first; // value of the infogain
PAIR_D_I entry(info, i);
if (info >= 0.0) {
if (topN.size() < num) {
topN.push(entry);
} else if (info > topN.top().first) {
topN.pop();
}
dp_topBits[offset] = (double)bid;
for (unsigned int j = 0; j < d_classes; j++) {
dp_topBits[offset + 2 + j] = (double)d_counts[j][bid];
topN.push(entry);
}
}
return dp_topBits;
}
void InfoBitRanker::writeTopBitsToStream(std::ostream *outStream) const {
(*outStream) << std::setw(12) << "Bit" << std::setw(12) << "InfoContent";
for (unsigned int ic = 0; ic < d_classes; ic++) {
(*outStream) << std::setw(10) << "class" << ic;
}
(*outStream) << std::endl;
unsigned int ncols = 2 + d_classes;
for (unsigned int i = 0; i < d_top; i++) {
(*outStream) << std::setw(12) << (int)dp_topBits[i*ncols]
<< std::setw(12) << std::setprecision(5)
<< dp_topBits[i*ncols + 1];
for (unsigned int ic = 0; ic < d_classes; ic++) {
(*outStream) << std::setw(10) << (int)dp_topBits[i*ncols + 2 + ic];
}
(*outStream) << "\n";
}
}
void InfoBitRanker::writeTopBitsToFile(const std::string &fileName) const {
std::ofstream tmpStream(fileName.c_str());
if ((!tmpStream) || (tmpStream.bad()) ) {
std::ostringstream errout;
errout << "Bad output file " << fileName;
throw RDKit::FileParseException(errout.str());
}
delete[] resMat;
std::ostream &outStream = static_cast<std::ostream &>(tmpStream);
this->writeTopBitsToStream(&outStream);
// now fill up the result matrix for the topN bits
// the result from this function is a double * of size
// num*4. The caller of this function interprets this
// array as a two dimensional array of size num*(2+d_classes) with each row
// containing the following entries
// bitId, infogain, 1 additional column for number of hits for each class
// double *res = new double[num*(2+d_classes)];
d_top = num;
int ncols = 2 + d_classes;
delete[] dp_topBits;
dp_topBits = new double[num * ncols];
int offset, bid;
RDKit::INT_VECT maskBits;
if (dp_maskBits && topN.size() < num) {
dp_maskBits->getOnBits(maskBits);
}
for (int i = num - 1; i >= 0; i--) {
offset = i * ncols;
if (topN.size() == 0) {
if (dp_maskBits) {
bid = maskBits[i];
} else {
bid = i;
}
dp_topBits[offset + 1] = 0.0;
} else {
bid = topN.top().second; // bit id
dp_topBits[offset + 1] = topN.top().first; // value of the infogain
topN.pop();
}
dp_topBits[offset] = (double)bid;
for (unsigned int j = 0; j < d_classes; j++) {
dp_topBits[offset + 2 + j] = (double)d_counts[j][bid];
}
}
return dp_topBits;
}
// Write the ranked-bit table held in dp_topBits to *outStream.
//
// Output is a header line followed by d_top rows; each row is the bit id, the
// info value (precision 5) and one integer hit count per class.
void InfoBitRanker::writeTopBitsToStream(std::ostream *outStream) const {
  std::ostream &out = *outStream;

  // header
  out << std::setw(12) << "Bit" << std::setw(12) << "InfoContent";
  for (unsigned int cls = 0; cls < d_classes; ++cls) {
    out << std::setw(10) << "class" << cls;
  }
  out << std::endl;

  // dp_topBits is a flat array with (2 + d_classes) doubles per ranked bit
  const unsigned int rowLen = 2 + d_classes;
  for (unsigned int rank = 0; rank < d_top; ++rank) {
    const double *row = dp_topBits + rank * rowLen;
    out << std::setw(12) << static_cast<int>(row[0]) << std::setw(12)
        << std::setprecision(5) << row[1];
    for (unsigned int cls = 0; cls < d_classes; ++cls) {
      out << std::setw(10) << static_cast<int>(row[2 + cls]);
    }
    out << "\n";
  }
}
// Open fileName for writing and dump the ranked-bit table into it via
// writeTopBitsToStream(). Throws RDKit::FileParseException when the file
// cannot be opened.
void InfoBitRanker::writeTopBitsToFile(const std::string &fileName) const {
  std::ofstream outFile(fileName.c_str());
  const bool unusable = (!outFile) || outFile.bad();
  if (unusable) {
    std::ostringstream msg;
    msg << "Bad output file " << fileName;
    throw RDKit::FileParseException(msg.str());
  }
  // an ofstream is an ostream, so its address can be passed along directly
  std::ostream *sink = &outFile;
  this->writeTopBitsToStream(sink);
}
}

View File

@@ -15,236 +15,262 @@
#include <DataStructs/BitVects.h>
#include <iostream>
/*! \brief Class used to rank bits based on a specified measure of information
*
* Basically a primitive mimic of the CombiChem "signal" functionality
* To use:
* - create an instance of this class
* - loop over the fingerprints in the dataset by calling accumulateVotes method
* - create an instance of this class
* - loop over the fingerprints in the dataset by calling accumulateVotes
*method
* - call getTopN to get the top n ranked bits
*
* Sample usage and results from the python wrapper:
* Here's a small set of vectors:
* >>> for i,bv in enumerate(bvs): print bv.ToBitString(),acts[i]
* ...
* ...
* 0001 0
* 0101 0
* 0010 1
* 1110 1
*
*
* Default ranker, using infogain:
* >>> ranker = InfoBitRanker(4,2)
* >>> ranker = InfoBitRanker(4,2)
* >>> for i,bv in enumerate(bvs): ranker.AccumulateVotes(bv,acts[i])
* ...
* >>> for bit,gain,n0,n1 in ranker.GetTopN(3): print int(bit),'%.3f'%gain,int(n0),int(n1)
* ...
* ...
* >>> for bit,gain,n0,n1 in ranker.GetTopN(3): print
*int(bit),'%.3f'%gain,int(n0),int(n1)
* ...
* 3 1.000 2 0
* 2 1.000 0 2
* 0 0.311 0 1
*
*
* Using the biased infogain:
* >>> ranker = InfoBitRanker(4,2,InfoTheory.InfoType.BIASENTROPY)
* >>> ranker.SetBiasList((1,))
* >>> for i,bv in enumerate(bvs): ranker.AccumulateVotes(bv,acts[i])
* ...
* >>> for bit,gain,n0,n1 in ranker.GetTopN(3): print int(bit),'%.3f'%gain,int(n0),int(n1)
* ...
* ...
* >>> for bit,gain,n0,n1 in ranker.GetTopN(3): print
*int(bit),'%.3f'%gain,int(n0),int(n1)
* ...
* 2 1.000 0 2
* 0 0.311 0 1
* 1 0.000 1 1
*
*
* A chi squared ranker is also available:
* >>> ranker = InfoBitRanker(4,2,InfoTheory.InfoType.CHISQUARE)
* >>> for i,bv in enumerate(bvs): ranker.AccumulateVotes(bv,acts[i])
* ...
* >>> for bit,gain,n0,n1 in ranker.GetTopN(3): print int(bit),'%.3f'%gain,int(n0),int(n1)
* ...
* ...
* >>> for bit,gain,n0,n1 in ranker.GetTopN(3): print
*int(bit),'%.3f'%gain,int(n0),int(n1)
* ...
* 3 4.000 2 0
* 2 4.000 0 2
* 0 1.333 0 1
*
*
* As is a biased chi squared:
* >>> ranker = InfoBitRanker(4,2,InfoTheory.InfoType.BIASCHISQUARE)
* >>> ranker.SetBiasList((1,))
* >>> for i,bv in enumerate(bvs): ranker.AccumulateVotes(bv,acts[i])
* ...
* >>> for bit,gain,n0,n1 in ranker.GetTopN(3): print int(bit),'%.3f'%gain,int(n0),int(n1)
* ...
* ...
* >>> for bit,gain,n0,n1 in ranker.GetTopN(3): print
*int(bit),'%.3f'%gain,int(n0),int(n1)
* ...
* 2 4.000 0 2
* 0 1.333 0 1
* 1 0.000 1 1
*/
namespace RDInfoTheory {
typedef std::vector<RDKit::USHORT> USHORT_VECT;
typedef std::vector<USHORT_VECT> VECT_USHORT_VECT;
typedef std::vector<RDKit::USHORT> USHORT_VECT;
typedef std::vector<USHORT_VECT> VECT_USHORT_VECT;
class InfoBitRanker {
public:
/*! \brief the type of measure for information
*
*/
typedef enum {
ENTROPY=1,
BIASENTROPY=2,
CHISQUARE=3,
BIASCHISQUARE=4
} InfoType;
/*! \brief Constructor
*
* ARGUMENTS:
*
* - nBits: the dimension of the bit vectors or the fingerprint length
* - nClasses: the number of classes used in the classification problem (e.g. active,
* moderately active, inactive etc.). It is assumed that the classes are
* numbered from 0 to (nClasses - 1)
* - infoType: the type of information metric
*/
InfoBitRanker(unsigned int nBits, unsigned int nClasses, InfoType infoType=InfoBitRanker::ENTROPY) :
d_dims(nBits), d_classes(nClasses), d_type(infoType) {
d_counts.resize(0);
for (unsigned int i = 0; i < nClasses; i++) {
USHORT_VECT cCount;
cCount.resize(d_dims, 0);
d_counts.push_back(cCount);
}
d_clsCount.resize(d_classes, 0);
d_nInst = 0;
d_top = 0;
dp_topBits=0;
d_biasList.resize(0);
dp_maskBits=0;
}
~InfoBitRanker() {
if(dp_topBits)
delete [] dp_topBits;
if(dp_maskBits)
delete dp_maskBits;
class InfoBitRanker {
public:
/*! \brief the type of measure for information
*
*/
typedef enum {
ENTROPY = 1,
BIASENTROPY = 2,
CHISQUARE = 3,
BIASCHISQUARE = 4
} InfoType;
/*! \brief Constructor
*
* ARGUMENTS:
*
* - nBits: the dimension of the bit vectors or the fingerprint length
* - nClasses: the number of classes used in the classification problem
*(e.g. active,
* moderately active, inactive etc.). It is assumed that the
*classes are
* numbered from 0 to (nClasses - 1)
* - infoType: the type of information metric
*/
InfoBitRanker(unsigned int nBits, unsigned int nClasses,
              InfoType infoType = InfoBitRanker::ENTROPY)
    : d_dims(nBits), d_classes(nClasses), d_type(infoType) {
  // one zeroed per-bit hit counter row for every class
  d_counts.assign(nClasses, USHORT_VECT(d_dims, 0));
  // no instances seen yet, for any class
  d_clsCount.assign(d_classes, 0);
  d_nInst = 0;
  // nothing has been ranked so far
  d_top = 0;
  dp_topBits = 0;
  // no class bias and no bit mask until the caller installs them
  d_biasList.clear();
  dp_maskBits = 0;
}
/*! \brief Accumulate the votes for all the bits turned on in a bit vector
*
* ARGUMENTS:
*
* - bv : bit vector that supports [] operator
* - label : the class label for the bit vector. It is assumed that 0 <= class < nClasses
*/
void accumulateVotes(const ExplicitBitVect &bv, unsigned int label);
void accumulateVotes(const SparseBitVect &bv, unsigned int label);
/*! \brief Returns the top n bits ranked by the information metric
*
* This is actually the function where most of the work of ranking is happening
*
* \param num the number of top ranked bits that are required
*
* \return a pointer to an information array. The client should *not*
* delete this
*/
double *getTopN(unsigned int num);
/*! \brief return the number of labelled instances(examples) or fingerprints seen so far
*
*/
unsigned int getNumInstances() const {
return d_nInst;
}
/*! \brief return the number of classes
*
*/
unsigned int getNumClasses() const {
return d_classes;
}
// Releases the ranking buffer and the optional mask.
~InfoBitRanker() {
  // delete / delete[] on a null pointer is a no-op, so no guards are needed
  delete[] dp_topBits;
  delete dp_maskBits;
}
/*! \brief Set the classes to which the entropy calculation should be biased
*
* This list contains a set of class ids used when in the BIASENTROPY mode of ranking bits.
* In this mode, a bit must be correllated higher with one of the biased classes than all the
* other classes. For example, in a two class problem with actives and inactives, the fraction of
* actives that hit the bit has to be greater than the fraction of inactives that hit the bit
*
* ARGUMENTS:
* classList - list of class ids that we want a bias towards
*/
void setBiasList(RDKit::INT_VECT &classList);
/*! \brief Accumulate the votes for all the bits turned on in a bit vector
*
* ARGUMENTS:
*
* - bv : bit vector that supports [] operator
* - label : the class label for the bit vector. It is assumed that 0 <=
*class < nClasses
*/
void accumulateVotes(const ExplicitBitVect &bv, unsigned int label);
void accumulateVotes(const SparseBitVect &bv, unsigned int label);
/*! \brief Returns the top n bits ranked by the information metric
*
* This is actually the function where most of the work of ranking is
*happening
*
* \param num the number of top ranked bits that are required
*
* \return a pointer to an information array. The client should *not*
* delete this
*/
double *getTopN(unsigned int num);
/*! \brief Set the bits to be used as a mask
*
* If this function is called, only the bits which are present in the
* maskBits list will be used.
*
* ARGUMENTS:
* maskBits - the bits to be considered
*/
void setMaskBits(RDKit::INT_VECT &maskBits);
/*! \brief return the number of labelled instances(examples) or fingerprints
*seen so far
*
*/
unsigned int getNumInstances() const { return d_nInst; }
/*! \brief Write the top N bits to a stream
*
*/
void writeTopBitsToStream(std::ostream *outStream) const;
/*! \brief Write the top bits to a file
*
*/
void writeTopBitsToFile(const std::string &fileName) const;
/*! \brief return the number of classes
*
*/
unsigned int getNumClasses() const { return d_classes; }
private:
/*! \brief check if we want to compute the info content for a bit based on the bias list
*
* This what happens here:
* - the fraction of items in each class that hit a particular bit are computed
* - the maximum of these fractions for classes that are not in the biasList are computed
* - If this maximum is less than the fraction for atleast one of classes in the biaslist
* the bit is considered good
* ARGUMENTS:
* - resMat : the result matrix, one dimensional matrix of dimension (2*(num of classes))
* a 2D structure is assumed with the first row containing number of items of each class
* with the bit set and the second row to entires of each class with the bit turned off
*/
bool BiasCheckBit(RDKit::USHORT *resMat) const;
/*! \brief Set the classes to which the entropy calculation should be biased
*
* This list contains a set of class ids used when in the BIASENTROPY mode of
*ranking bits.
* In this mode, a bit must be correlated more strongly with one of the biased
*classes than all the
* other classes. For example, in a two class problem with actives and
*inactives, the fraction of
* actives that hit the bit has to be greater than the fraction of inactives
*that hit the bit
*
* ARGUMENTS:
* classList - list of class ids that we want a bias towards
*/
void setBiasList(RDKit::INT_VECT &classList);
/*! \brief Compute the biased info entropy gain based on the bias list
*
* This what happens here:
* - we call BiasCheckBit to see if the bit qualifies to compute the infocontent
* - If this bit is ok then we call InfoEntropyGain otherwise we return 0.0
*
* ARGUMENTS:
* - resMat : the result matrix, one dimensional matrix of dimension (2*(num of classes))
* a 2D structure is assumed with the first row containing number of items of each class
* with the bit set and the second row to entires of each class with the bit turned off
*/
double BiasInfoEntropyGain(RDKit::USHORT *resMat) const;
/*! \brief Set the bits to be used as a mask
*
* If this function is called, only the bits which are present in the
* maskBits list will be used.
*
* ARGUMENTS:
* maskBits - the bits to be considered
*/
void setMaskBits(RDKit::INT_VECT &maskBits);
/*! \brief Compute the biased chi qsure value based on the bias list
*
* This what happens here:
* - we call BiasCheckBit to see if the bit qualifies to compute the infocontent
* - If this bit is ok then we call InfoEntropyGain otherwise we return 0.0
*
* ARGUMENTS:
* - resMat : the result matrix, one dimensional matrix of dimension (2*(num of classes))
* a 2D structure is assumed with the first row containing number of items of each class
* with the bit set and the second row to entires of each class with the bit turned off
*/
double BiasChiSquareGain(RDKit::USHORT *resMat) const;
/*! \brief Write the top N bits to a stream
*
*/
void writeTopBitsToStream(std::ostream *outStream) const;
unsigned int d_dims; // the number of bits in the fingerprints
unsigned int d_classes; // the number of classes (active, inactive, moderately active etc.)
InfoType d_type; // the type of information meassure - currently we support only entropy
VECT_USHORT_VECT d_counts; // place holder of counting the number of hits for each bit for each class
USHORT_VECT d_clsCount; // counter for the number of instances of each class
double *dp_topBits; // storage for the top ranked bits and the corresponding statistics
unsigned int d_top; // the number of bits that have been ranked
unsigned int d_nInst; // total number of instances or fingerprints used accumulate votes
RDKit::INT_VECT d_biasList; // if we want a bias towards certain classes in ranking bits
ExplicitBitVect *dp_maskBits; // allows only certain bits to be considered
};
/*! \brief Write the top bits to a file
*
*/
void writeTopBitsToFile(const std::string &fileName) const;
private:
/*! \brief check if we want to compute the info content for a bit based on the
*bias list
*
* This what happens here:
* - the fraction of items in each class that hit a particular bit are
*computed
* - the maximum of these fractions for classes that are not in the
*biasList are computed
* - If this maximum is less than the fraction for atleast one of classes
*in the biaslist
* the bit is considered good
* ARGUMENTS:
* - resMat : the result matrix, one dimensional matrix of dimension (2*(num
*of classes))
* a 2D structure is assumed with the first row containing number
*of items of each class
* with the bit set and the second row to entires of each class
*with the bit turned off
*/
bool BiasCheckBit(RDKit::USHORT *resMat) const;
/*! \brief Compute the biased info entropy gain based on the bias list
*
* This what happens here:
* - we call BiasCheckBit to see if the bit qualifies to compute the
*infocontent
* - If this bit is ok then we call InfoEntropyGain otherwise we return 0.0
*
* ARGUMENTS:
* - resMat : the result matrix, one dimensional matrix of dimension (2*(num
*of classes))
* a 2D structure is assumed with the first row containing number
*of items of each class
* with the bit set and the second row to entires of each class
*with the bit turned off
*/
double BiasInfoEntropyGain(RDKit::USHORT *resMat) const;
/*! \brief Compute the biased chi square value based on the bias list
*
* This is what happens here:
* - we call BiasCheckBit to see if the bit qualifies to compute the
*infocontent
* - If this bit is ok then we call ChiSquare otherwise we return 0.0
*
* ARGUMENTS:
* - resMat : the result matrix, one dimensional matrix of dimension (2*(num
*of classes))
* a 2D structure is assumed with the first row containing number
*of items of each class
* with the bit set and the second row to entires of each class
*with the bit turned off
*/
double BiasChiSquareGain(RDKit::USHORT *resMat) const;
unsigned int d_dims; // the number of bits in the fingerprints
unsigned int d_classes; // the number of classes (active, inactive,
// moderately active etc.)
InfoType d_type; // the type of information meassure - currently we support
// only entropy
VECT_USHORT_VECT d_counts; // place holder of counting the number of hits for
// each bit for each class
USHORT_VECT d_clsCount; // counter for the number of instances of each class
double *dp_topBits; // storage for the top ranked bits and the corresponding
// statistics
unsigned int d_top; // the number of bits that have been ranked
unsigned int d_nInst; // total number of instances or fingerprints used
// accumulate votes
RDKit::INT_VECT
d_biasList; // if we want a bias towards certain classes in ranking bits
ExplicitBitVect *dp_maskBits; // allows only certain bits to be considered
};
}
#endif

View File

@@ -10,129 +10,129 @@
namespace RDInfoTheory {
template<class T> double ChiSquare(T *dMat, long int dim1,long int dim2) {
// For a contingency matrix with each column corresponding to a class and each row to a
// the descriptor (or variable) state, the matrix looks something like for 3x3 problem
//
// 1 2 3 Totals
// 1 | N11 N12 N13 R1
// 2 | N21 N22 N23 R2
// 3 | N31 N32 N33 R3
// Totals | C1 C2 C3 N
//
// Th chi squere formula is
// chi = sum((N/Ri)*sum(Nij^2/Cj) ) -N
T *rowSums, *colSums;
int i, j, tSum;
// find the row sum
tSum = 0;
rowSums = new T[dim1];
for (i = 0; i < dim1; i++) {
int idx1 = i*dim2;
rowSums[i] = (T)0.0;
for (j = 0; j < dim2; j++) {
rowSums[i] += dMat[idx1 + j];
}
tSum += (int)rowSums[i];
template <class T>
double ChiSquare(T *dMat, long int dim1, long int dim2) {
// For a contingency matrix with each column corresponding to a class and each
// row to a
// the descriptor (or variable) state, the matrix looks something like for 3x3
// problem
//
// 1 2 3 Totals
// 1 | N11 N12 N13 R1
// 2 | N21 N22 N23 R2
// 3 | N31 N32 N33 R3
// Totals | C1 C2 C3 N
//
// The chi-square formula is
// chi = sum((N/Ri)*sum(Nij^2/Cj) ) -N
T *rowSums, *colSums;
int i, j, tSum;
// find the row sum
tSum = 0;
rowSums = new T[dim1];
for (i = 0; i < dim1; i++) {
int idx1 = i * dim2;
rowSums[i] = (T)0.0;
for (j = 0; j < dim2; j++) {
rowSums[i] += dMat[idx1 + j];
}
// find the column sums
colSums = new T[dim2];
for (i = 0; i < dim2; i++) {
colSums[i] = (T)0.0;
for (j = 0; j < dim1; j++) {
colSums[i] += dMat[j*dim2 + i];
}
}
double chi = 0.0;
for ( i = 0; i < dim1; i++) {
double rchi = 0.0;
for (j = 0; j < dim2; j++) {
rchi += (pow((double)dMat[i*dim2 + j], 2)/colSums[j]);
}
chi += ( ((double)tSum/rowSums[i])*rchi );
}
chi -= tSum;
delete [] rowSums;
delete [] colSums;
return chi;
tSum += (int)rowSums[i];
}
template<class T> double InfoEntropy(T *tPtr, long int dim) {
int i;
T nInstances = 0;
double accum=0.0,d;
for(i=0;i<dim;i++){
nInstances += tPtr[i];
// find the column sums
colSums = new T[dim2];
for (i = 0; i < dim2; i++) {
colSums[i] = (T)0.0;
for (j = 0; j < dim1; j++) {
colSums[i] += dMat[j * dim2 + i];
}
if(nInstances != 0){
for(i=0;i<dim;i++){
d = (double)tPtr[i]/nInstances;
if(d != 0){
accum += -d*log(d);
}
}
}
return accum/log(2.0);
}
template<class T> double InfoEntropyGain(T *dMat, long int dim1,long int dim2) {
T *variableRes, *overallRes;
double gain,term2;
int tSum;
//std::cerr<<" --------\n ieg: "<<dim1<<" "<<dim2<<std::endl;
variableRes = new T[dim1];
for(long int i=0;i<dim1;i++){
long int idx1 = i*dim2;
variableRes[i] = (T)0.0;
for(long int j=0;j<dim2;j++){
variableRes[i] += dMat[idx1+j];
//std::cerr<<" "<<i<<" "<<j<<" "<<dMat[idx1+j]<<std::endl;
}
double chi = 0.0;
for (i = 0; i < dim1; i++) {
double rchi = 0.0;
for (j = 0; j < dim2; j++) {
rchi += (pow((double)dMat[i * dim2 + j], 2) / colSums[j]);
}
overallRes = new T[dim2];
// do the col sums
for(long int i=0;i<dim2;i++){
overallRes[i] = (T)0.0;
for(long int j=0;j<dim1;j++){
overallRes[i] += dMat[j*dim2+i];
//std::cerr<<" "<<i<<" "<<j<<" "<<dMat[j*dim2+i]<<std::endl;
}
}
term2 = 0.0;
for(long int i=0;i<dim1;i++) {
T *tPtr;
tPtr = dMat + i*dim2;
term2 += variableRes[i] * InfoEntropy(tPtr,dim2);
}
tSum = 0;
for(long int i=0;i<dim2;i++){
tSum += static_cast<int>(overallRes[i]);
}
if(tSum != 0){
term2 /= tSum;
gain = InfoEntropy(overallRes,dim2) - term2;
}
else{
gain = 0.0;
}
//std::cerr<<" >gain> "<<gain<<std::endl;
delete [] overallRes;
delete [] variableRes;
return gain;
chi += (((double)tSum / rowSums[i]) * rchi);
}
chi -= tSum;
delete[] rowSums;
delete[] colSums;
return chi;
}
/*! \brief Shannon entropy, in bits, of a vector of counts
 *
 * The entries of tPtr are normalized by their sum and the entropy
 * -sum(p * log2(p)) of the resulting distribution is returned. Zero entries
 * contribute nothing.
 *
 * \param tPtr  pointer to dim values (counts)
 * \param dim   number of values in tPtr
 *
 * \return the entropy in bits; 0.0 if the values sum to zero
 *
 * The total is accumulated in a double rather than in T: with a narrow
 * integer T (e.g. unsigned short counts) a total kept in T wraps around and
 * yields wrong, possibly negative, entropies.
 */
template <class T>
double InfoEntropy(T *tPtr, long int dim) {
  double total = 0.0;
  for (long int i = 0; i < dim; i++) {
    total += static_cast<double>(tPtr[i]);
  }
  if (total == 0.0) {
    // nothing to normalize by
    return 0.0;
  }
  double accum = 0.0;
  for (long int i = 0; i < dim; i++) {
    const double d = static_cast<double>(tPtr[i]) / total;
    if (d != 0) {
      accum += -d * log(d);
    }
  }
  // convert from nats to bits
  return accum / log(2.0);
}
/*! \brief Information (entropy) gain of a contingency matrix
 *
 * dMat is read as a row-major dim1 x dim2 matrix: one row per variable
 * state, one column per class. The gain is the entropy of the column totals
 * minus the row-total-weighted average of the per-row entropies.
 *
 * \param dMat  pointer to dim1*dim2 counts
 * \param dim1  number of rows (variable states)
 * \param dim2  number of columns (classes)
 *
 * \return the gain in bits; 0.0 if the matrix total is zero
 *
 * Row totals are accumulated in double rather than in T: with a narrow
 * integer T (e.g. unsigned short counts) a row total kept in T can wrap
 * around and corrupt the weighting.
 */
template <class T>
double InfoEntropyGain(T *dMat, long int dim1, long int dim2) {
  // column totals; kept as T because InfoEntropy() is called on them as T*
  T *overallRes = new T[dim2];
  for (long int i = 0; i < dim2; i++) {
    overallRes[i] = (T)0.0;
    for (long int j = 0; j < dim1; j++) {
      overallRes[i] += dMat[j * dim2 + i];
    }
  }

  // sum over rows of (row total) * (row entropy)
  double term2 = 0.0;
  for (long int i = 0; i < dim1; i++) {
    T *tPtr = dMat + i * dim2;
    double rowTotal = 0.0;
    for (long int j = 0; j < dim2; j++) {
      rowTotal += static_cast<double>(tPtr[j]);
    }
    term2 += rowTotal * InfoEntropy(tPtr, dim2);
  }

  // grand total
  int tSum = 0;
  for (long int i = 0; i < dim2; i++) {
    tSum += static_cast<int>(overallRes[i]);
  }

  double gain = 0.0;
  if (tSum != 0) {
    term2 /= tSum;
    gain = InfoEntropy(overallRes, dim2) - term2;
  }

  delete[] overallRes;
  return gain;
}
}
#endif

View File

@@ -8,7 +8,6 @@
// of the RDKit source tree.
//
#define NO_IMPORT_ARRAY
#include <RDBoost/python.h>
#define PY_ARRAY_UNIQUE_SYMBOL rdinfotheory_array_API
@@ -22,47 +21,48 @@
namespace python = boost::python;
namespace RDInfoTheory {
PyObject *getCorrMatrix(BitCorrMatGenerator *cmGen) {
double *dres = cmGen->getCorrMat();
unsigned int nb = cmGen->getCorrBitList().size();
npy_intp dim = nb*(nb-1)/2;
PyArrayObject *res = (PyArrayObject *)PyArray_SimpleNew(1,&dim,NPY_DOUBLE);
memcpy(static_cast<void *>(res->data),
static_cast<void *>(dres), dim*sizeof(double));
return PyArray_Return(res);
}
void setBitList(BitCorrMatGenerator *cmGen, python::object bitList) {
PySequenceHolder<int> blist(bitList);
unsigned int nb = blist.size();
RDKit::INT_VECT res;
res.reserve(nb);
for (unsigned int i = 0; i < nb; i++) {
res.push_back(blist[i]);
}
cmGen->setBitIdList(res);
}
// Copy the generator's correlation values into a new 1-D NumPy array of
// doubles holding nb*(nb-1)/2 entries, where nb is the size of the
// generator's bit list.
PyObject *getCorrMatrix(BitCorrMatGenerator *cmGen) {
  double *corrVals = cmGen->getCorrMat();
  unsigned int nBits = cmGen->getCorrBitList().size();
  npy_intp nElems = nBits * (nBits - 1) / 2;
  PyArrayObject *arr = reinterpret_cast<PyArrayObject *>(
      PyArray_SimpleNew(1, &nElems, NPY_DOUBLE));
  memcpy(arr->data, corrVals, nElems * sizeof(double));
  return PyArray_Return(arr);
}
void CollectVotes(BitCorrMatGenerator *cmGen, python::object bitVect) {
python::extract<ExplicitBitVect> ebvWorks(bitVect);
python::extract<SparseBitVect> sbvWorks(bitVect);
if (ebvWorks.check()) {
ExplicitBitVect ev = python::extract<ExplicitBitVect>(bitVect);
cmGen->collectVotes(ev);
}
else if (sbvWorks.check()) {
SparseBitVect sv = python::extract<SparseBitVect>(bitVect);
cmGen->collectVotes(sv);
}
else {
throw_value_error("CollectVote can only take ExplicitBitVects or SparseBitVects");
}
// Convert a Python sequence of ints into an RDKit::INT_VECT and install it
// as the generator's bit id list.
void setBitList(BitCorrMatGenerator *cmGen, python::object bitList) {
  PySequenceHolder<int> seq(bitList);
  const unsigned int nIds = seq.size();
  RDKit::INT_VECT bitIds;
  bitIds.reserve(nIds);
  unsigned int idx = 0;
  while (idx < nIds) {
    bitIds.push_back(seq[idx]);
    ++idx;
  }
  cmGen->setBitIdList(bitIds);
}
struct corrmat_wrap {
static void wrap() {
std::string docString = "A class to generate a pariwise correlation matrix between a list of bits\n"
// Feeds one fingerprint to the generator. The Python object is tried as an
// ExplicitBitVect first, then as a SparseBitVect; anything else is rejected
// through throw_value_error().
void CollectVotes(BitCorrMatGenerator *cmGen, python::object bitVect) {
  python::extract<ExplicitBitVect> asExplicit(bitVect);
  python::extract<SparseBitVect> asSparse(bitVect);

  if (asExplicit.check()) {
    ExplicitBitVect explicitFp = python::extract<ExplicitBitVect>(bitVect);
    cmGen->collectVotes(explicitFp);
    return;
  }

  if (asSparse.check()) {
    SparseBitVect sparseFp = python::extract<SparseBitVect>(bitVect);
    cmGen->collectVotes(sparseFp);
    return;
  }

  throw_value_error(
      "CollectVote can only take ExplicitBitVects or SparseBitVects");
}
struct corrmat_wrap {
static void wrap() {
std::string docString =
"A class to generate a pariwise correlation matrix between a list of "
"bits\n"
"The mode of operation for this class is something like this\n"
" >>> cmg = BitCorrMatGenerator() \n"
" >>> cmg.SetBitList(blist) \n"
@@ -70,28 +70,26 @@ namespace RDInfoTheory {
" >>> cmg.CollectVotes(fp) \n"
" >>> corrMat = cmg.GetCorrMatrix() \n"
" \n"
" The resulting correlation matrix is a one dimensional nummeric array containing the \n"
" The resulting correlation matrix is a one dimensional nummeric "
"array containing the \n"
" lower triangle elements\n";
python::class_<BitCorrMatGenerator>("BitCorrMatGenerator",
docString.c_str())
python::class_<BitCorrMatGenerator>("BitCorrMatGenerator",
docString.c_str())
.def("SetBitList", setBitList,
"Set the list of bits that need to be correllated\n\n"
" This may for example be ther top ranking ensemble bits\n\n"
"ARGUMENTS:\n\n"
" - bitList : an integer list of bit IDs\n")
.def("CollectVotes", CollectVotes,
"For each pair of on bits (bi, bj) in fp increase the correlation count for the pair by 1\n\n"
"For each pair of on bits (bi, bj) in fp increase the correlation "
"count for the pair by 1\n\n"
"ARGUMENTS:\n\n"
" - fp : a bit vector to collect the fingerprints from\n")
.def("GetCorrMatrix", getCorrMatrix,
"Get the correlation matrix following the collection of votes from a bunch of fingerprints\n")
;
};
"Get the correlation matrix following the collection of votes "
"from a bunch of fingerprints\n");
};
};
}
// Free-function entry point that forwards to
// RDInfoTheory::corrmat_wrap::wrap().
void wrap_corrmatgen() {
RDInfoTheory::corrmat_wrap::wrap();
}
void wrap_corrmatgen() { RDInfoTheory::corrmat_wrap::wrap(); }

View File

@@ -22,161 +22,171 @@
namespace python = boost::python;
namespace RDInfoTheory {
// Returns a new 2-D NPY_DOUBLE array of shape (num, getNumClasses() + 2)
// filled by copying from the buffer that ranker->getTopN(num) returns.
PyObject *getTopNbits(InfoBitRanker *ranker, int num){// int ignoreNoClass=-1) {
double *dres = ranker->getTopN(num);
npy_intp dims[2];
dims[0] = num;
dims[1] = ranker->getNumClasses() + 2;
PyArrayObject *res = (PyArrayObject *)PyArray_SimpleNew(2,dims,NPY_DOUBLE);
// raw copy of dims[0]*dims[1] doubles into the new array's storage
memcpy(static_cast<void *>(res->data),
static_cast<void *>(dres), dims[0]*dims[1]*sizeof(double));
return PyArray_Return(res);
}
// Passes one labelled fingerprint to ranker->accumulateVotes(). The Python
// object is tried as an ExplicitBitVect first, then as a SparseBitVect;
// anything else is rejected through throw_value_error().
void AccumulateVotes(InfoBitRanker *ranker, python::object bitVect, int label) {
python::extract<ExplicitBitVect> ebvWorks(bitVect);
python::extract<SparseBitVect> sbvWorks(bitVect);
if (ebvWorks.check()) {
ExplicitBitVect ev = python::extract<ExplicitBitVect>(bitVect);
ranker->accumulateVotes(ev, label);
}
else if (sbvWorks.check()) {
SparseBitVect sv = python::extract<SparseBitVect>(bitVect);
ranker->accumulateVotes(sv, label);
}
else {
throw_value_error("Accumulate Vote can only take a explicitBitVects or SparseBitvects");
}
}
// Copies the integers of the Python sequence classList into an
// RDKit::INT_VECT and passes that vector to ranker->setBiasList().
void SetBiasList(InfoBitRanker *ranker, python::object classList) {
RDKit::INT_VECT cList;
PySequenceHolder<int> bList(classList);
cList.reserve(bList.size());
for (unsigned int i = 0; i < bList.size(); i++) {
cList.push_back(bList[i]);
}
ranker->setBiasList(cList);
}
// Builds the Python-side result for GetTopN: a new 2-D NPY_DOUBLE array of
// shape (num, getNumClasses() + 2) whose contents are copied from the buffer
// that ranker->getTopN(num) returns.
// NOTE(review): num is forwarded as-is; a negative value is not rejected
// here - confirm whether callers can pass one.
PyObject *getTopNbits(InfoBitRanker *ranker, int num) {
  double *topBits = ranker->getTopN(num);

  npy_intp shape[2];
  shape[0] = num;
  shape[1] = ranker->getNumClasses() + 2;

  PyArrayObject *arr =
      (PyArrayObject *)PyArray_SimpleNew(2, shape, NPY_DOUBLE);
  void *dst = static_cast<void *>(arr->data);
  void *src = static_cast<void *>(topBits);
  memcpy(dst, src, shape[0] * shape[1] * sizeof(double));
  return PyArray_Return(arr);
}
void SetMaskBits(InfoBitRanker *ranker, python::object maskBits) {
RDKit::INT_VECT cList;
PySequenceHolder<int> bList(maskBits);
cList.reserve(bList.size());
for (unsigned int i = 0; i < bList.size(); i++) {
cList.push_back(bList[i]);
}
ranker->setMaskBits(cList);
// Passes one labelled fingerprint to ranker->accumulateVotes(). The Python
// object is tried as an ExplicitBitVect first, then as a SparseBitVect;
// anything else is rejected through throw_value_error().
void AccumulateVotes(InfoBitRanker *ranker, python::object bitVect, int label) {
  python::extract<ExplicitBitVect> asExplicit(bitVect);
  python::extract<SparseBitVect> asSparse(bitVect);

  if (asExplicit.check()) {
    ExplicitBitVect explicitFp = python::extract<ExplicitBitVect>(bitVect);
    ranker->accumulateVotes(explicitFp, label);
    return;
  }

  if (asSparse.check()) {
    SparseBitVect sparseFp = python::extract<SparseBitVect>(bitVect);
    ranker->accumulateVotes(sparseFp, label);
    return;
  }

  throw_value_error(
      "Accumulate Vote can only take a explicitBitVects or SparseBitvects");
}
void tester(InfoBitRanker *ranker, python::object bitVect) {
RDUNUSED_PARAM(ranker);
python::extract<SparseBitVect> sbvWorks(bitVect);
if (sbvWorks.check()){
SparseBitVect sv = python::extract<SparseBitVect>(bitVect);
std::cout << "Num of on bits: " << sv.getNumOnBits() << "\n";
}
// Transfers the class ids held in the Python sequence classList into an
// RDKit::INT_VECT and installs it on the ranker through setBiasList().
void SetBiasList(InfoBitRanker *ranker, python::object classList) {
  RDKit::INT_VECT biasClasses;
  PySequenceHolder<int> seq(classList);
  biasClasses.reserve(seq.size());

  unsigned int idx = 0;
  while (idx < seq.size()) {
    biasClasses.push_back(seq[idx]);
    ++idx;
  }

  ranker->setBiasList(biasClasses);
}
struct ranker_wrap {
static void wrap() {
std::string docString = "A class to rank the bits from a series of labelled fingerprints\n"
"A simple demonstration may help clarify what this class does. \n"
"Here's a small set of vectors:\n"
">>> for i,bv in enumerate(bvs): print bv.ToBitString(),acts[i]\n"
"... \n"
"0001 0\n"
"0101 0\n"
"0010 1\n"
"1110 1\n"
"\n"
"Default ranker, using infogain:\n"
">>> ranker = InfoBitRanker(4,2) \n"
">>> for i,bv in enumerate(bvs): ranker.AccumulateVotes(bv,acts[i])\n"
"... \n"
">>> for bit,gain,n0,n1 in ranker.GetTopN(3): print int(bit),'%.3f'%gain,int(n0),int(n1)\n"
"... \n"
"3 1.000 2 0\n"
"2 1.000 0 2\n"
"0 0.311 0 1\n"
"\n"
"Using the biased infogain:\n"
">>> ranker = InfoBitRanker(4,2,InfoTheory.InfoType.BIASENTROPY)\n"
">>> ranker.SetBiasList((1,))\n"
">>> for i,bv in enumerate(bvs): ranker.AccumulateVotes(bv,acts[i])\n"
"... \n"
">>> for bit,gain,n0,n1 in ranker.GetTopN(3): print int(bit),'%.3f'%gain,int(n0),int(n1)\n"
"... \n"
"2 1.000 0 2\n"
"0 0.311 0 1\n"
"1 0.000 1 1\n"
"\n"
"A chi squared ranker is also available:\n"
">>> ranker = InfoBitRanker(4,2,InfoTheory.InfoType.CHISQUARE)\n"
">>> for i,bv in enumerate(bvs): ranker.AccumulateVotes(bv,acts[i])\n"
"... \n"
">>> for bit,gain,n0,n1 in ranker.GetTopN(3): print int(bit),'%.3f'%gain,int(n0),int(n1)\n"
"... \n"
"3 4.000 2 0\n"
"2 4.000 0 2\n"
"0 1.333 0 1\n"
"\n"
"As is a biased chi squared:\n"
">>> ranker = InfoBitRanker(4,2,InfoTheory.InfoType.BIASCHISQUARE)\n"
">>> ranker.SetBiasList((1,))\n"
">>> for i,bv in enumerate(bvs): ranker.AccumulateVotes(bv,acts[i])\n"
"... \n"
">>> for bit,gain,n0,n1 in ranker.GetTopN(3): print int(bit),'%.3f'%gain,int(n0),int(n1)\n"
"... \n"
"2 4.000 0 2\n"
"0 1.333 0 1\n"
"1 0.000 1 1\n";
// Transfers the bit ids held in the Python sequence maskBits into an
// RDKit::INT_VECT and installs it on the ranker through setMaskBits().
void SetMaskBits(InfoBitRanker *ranker, python::object maskBits) {
  RDKit::INT_VECT maskIds;
  PySequenceHolder<int> seq(maskBits);
  maskIds.reserve(seq.size());

  unsigned int idx = 0;
  while (idx < seq.size()) {
    maskIds.push_back(seq[idx]);
    ++idx;
  }

  ranker->setMaskBits(maskIds);
}
python::class_<InfoBitRanker>("InfoBitRanker",
docString.c_str(),
python::init<int, int>(python::args("nBits", "nClasses")))
.def(python::init<int, int, InfoBitRanker::InfoType>
(python::args("nBits", "nClasses", "infoType")))
// Debug helper: when bitVect converts to a SparseBitVect, prints its number
// of on bits to std::cout; otherwise does nothing. The ranker is unused.
void tester(InfoBitRanker *ranker, python::object bitVect) {
  RDUNUSED_PARAM(ranker);

  python::extract<SparseBitVect> asSparse(bitVect);
  if (!asSparse.check()) {
    return;
  }

  SparseBitVect sparseFp = python::extract<SparseBitVect>(bitVect);
  std::cout << "Num of on bits: " << sparseFp.getNumOnBits() << "\n";
}
struct ranker_wrap {
static void wrap() {
std::string docString =
"A class to rank the bits from a series of labelled fingerprints\n"
"A simple demonstration may help clarify what this class does. \n"
"Here's a small set of vectors:\n"
">>> for i,bv in enumerate(bvs): print bv.ToBitString(),acts[i]\n"
"... \n"
"0001 0\n"
"0101 0\n"
"0010 1\n"
"1110 1\n"
"\n"
"Default ranker, using infogain:\n"
">>> ranker = InfoBitRanker(4,2) \n"
">>> for i,bv in enumerate(bvs): ranker.AccumulateVotes(bv,acts[i])\n"
"... \n"
">>> for bit,gain,n0,n1 in ranker.GetTopN(3): print "
"int(bit),'%.3f'%gain,int(n0),int(n1)\n"
"... \n"
"3 1.000 2 0\n"
"2 1.000 0 2\n"
"0 0.311 0 1\n"
"\n"
"Using the biased infogain:\n"
">>> ranker = InfoBitRanker(4,2,InfoTheory.InfoType.BIASENTROPY)\n"
">>> ranker.SetBiasList((1,))\n"
">>> for i,bv in enumerate(bvs): ranker.AccumulateVotes(bv,acts[i])\n"
"... \n"
">>> for bit,gain,n0,n1 in ranker.GetTopN(3): print "
"int(bit),'%.3f'%gain,int(n0),int(n1)\n"
"... \n"
"2 1.000 0 2\n"
"0 0.311 0 1\n"
"1 0.000 1 1\n"
"\n"
"A chi squared ranker is also available:\n"
">>> ranker = InfoBitRanker(4,2,InfoTheory.InfoType.CHISQUARE)\n"
">>> for i,bv in enumerate(bvs): ranker.AccumulateVotes(bv,acts[i])\n"
"... \n"
">>> for bit,gain,n0,n1 in ranker.GetTopN(3): print "
"int(bit),'%.3f'%gain,int(n0),int(n1)\n"
"... \n"
"3 4.000 2 0\n"
"2 4.000 0 2\n"
"0 1.333 0 1\n"
"\n"
"As is a biased chi squared:\n"
">>> ranker = InfoBitRanker(4,2,InfoTheory.InfoType.BIASCHISQUARE)\n"
">>> ranker.SetBiasList((1,))\n"
">>> for i,bv in enumerate(bvs): ranker.AccumulateVotes(bv,acts[i])\n"
"... \n"
">>> for bit,gain,n0,n1 in ranker.GetTopN(3): print "
"int(bit),'%.3f'%gain,int(n0),int(n1)\n"
"... \n"
"2 4.000 0 2\n"
"0 1.333 0 1\n"
"1 0.000 1 1\n";
python::class_<InfoBitRanker>(
"InfoBitRanker", docString.c_str(),
python::init<int, int>(python::args("nBits", "nClasses")))
.def(python::init<int, int, InfoBitRanker::InfoType>(
python::args("nBits", "nClasses", "infoType")))
.def("AccumulateVotes", AccumulateVotes,
"Accumulate the votes for all the bits turned on in a bit vector\n\n"
"Accumulate the votes for all the bits turned on in a bit "
"vector\n\n"
"ARGUMENTS:\n\n"
" - bv : bit vector either ExplicitBitVect or SparseBitVect operator\n"
" - label : the class label for the bit vector. It is assumed that 0 <= class < nClasses \n")
.def ("SetBiasList", SetBiasList,
"Set the classes to which the entropy calculation should be biased\n\n"
"This list contains a set of class ids used when in the BIASENTROPY mode of ranking bits. \n"
"In this mode, a bit must be correlated higher with one of the biased classes than all the \n"
"other classes. For example, in a two class problem with actives and inactives, the fraction of \n"
"actives that hit the bit has to be greater than the fraction of inactives that hit the bit\n\n"
"ARGUMENTS: \n\n"
" - classList : list of class ids that we want a bias towards\n")
.def ("SetMaskBits", SetMaskBits,
"Set the mask bits for the calculation\n\n"
"ARGUMENTS: \n\n"
" - maskBits : list of mask bits to use\n")
" - bv : bit vector either ExplicitBitVect or SparseBitVect "
"operator\n"
" - label : the class label for the bit vector. It is assumed "
"that 0 <= class < nClasses \n")
.def("SetBiasList", SetBiasList,
"Set the classes to which the entropy calculation should be "
"biased\n\n"
"This list contains a set of class ids used when in the "
"BIASENTROPY mode of ranking bits. \n"
"In this mode, a bit must be correlated higher with one of the "
"biased classes than all the \n"
"other classes. For example, in a two class problem with actives "
"and inactives, the fraction of \n"
"actives that hit the bit has to be greater than the fraction of "
"inactives that hit the bit\n\n"
"ARGUMENTS: \n\n"
" - classList : list of class ids that we want a bias towards\n")
.def("SetMaskBits", SetMaskBits,
"Set the mask bits for the calculation\n\n"
"ARGUMENTS: \n\n"
" - maskBits : list of mask bits to use\n")
.def("GetTopN", getTopNbits,
"Returns the top n bits ranked by the information metric\n"
"This is actually the function where most of the work of ranking is happening\n\n"
"This is actually the function where most of the work of ranking "
"is happening\n\n"
"ARGUMENTS:\n\n"
" - num : the number of top ranked bits that are required\n")
.def("WriteTopBitsToFile", &InfoBitRanker::writeTopBitsToFile,
"Write the bits that have been ranked to a file")
.def("Tester", tester)
;
python::enum_<InfoBitRanker::InfoType>("InfoType")
.def("Tester", tester);
python::enum_<InfoBitRanker::InfoType>("InfoType")
.value("ENTROPY", InfoBitRanker::ENTROPY)
.value("BIASENTROPY", InfoBitRanker::BIASENTROPY)
.value("CHISQUARE", InfoBitRanker::CHISQUARE)
.value("BIASCHISQUARE", InfoBitRanker::BIASCHISQUARE)
.export_values();
;
};
;
};
};
}
// Free-function entry point that forwards to
// RDInfoTheory::ranker_wrap::wrap().
void wrap_ranker() {
RDInfoTheory::ranker_wrap::wrap();
}
void wrap_ranker() { RDInfoTheory::ranker_wrap::wrap(); }

View File

@@ -18,126 +18,127 @@ namespace python = boost::python;
using namespace RDInfoTheory;
namespace RDInfoTheory {
double infoEntropy(python::object resArr) {
PyObject *matObj = resArr.ptr();
if (!PyArray_Check(matObj)) {
throw_value_error("Expecting a Numeric array object");
}
PyArrayObject *copy;
copy = (PyArrayObject *)PyArray_ContiguousFromObject(matObj,
((PyArrayObject *)matObj)->descr->type_num,
1,1);
double res=0.0;
// we are expecting a 1 dimensional array
long int ncols = (long int)((PyArrayObject *)matObj)->dimensions[0];
CHECK_INVARIANT(ncols > 0, "");
if (((PyArrayObject *)matObj)->descr->type_num == PyArray_DOUBLE) {
double *data = (double *)copy->data;
res = InfoEntropy(data, ncols);
} else if (((PyArrayObject *)matObj)->descr->type_num == PyArray_FLOAT) {
float *data = (float *)copy->data;
res = InfoEntropy(data, ncols);
} else if (((PyArrayObject *)matObj)->descr->type_num == PyArray_INT) {
int *data = (int *)copy->data;
res = InfoEntropy(data, ncols);
} else if (((PyArrayObject *)matObj)->descr->type_num == PyArray_LONG) {
long int *data = (long int *)copy->data;
res = InfoEntropy(data, ncols);
}
Py_DECREF(copy);
return res;
double infoEntropy(python::object resArr) {
PyObject *matObj = resArr.ptr();
if (!PyArray_Check(matObj)) {
throw_value_error("Expecting a Numeric array object");
}
double infoGain(python::object resArr) {
PyObject *matObj = resArr.ptr();
if (!PyArray_Check(matObj)) {
throw_value_error("Expecting a Numeric array object");
}
PyArrayObject *copy;
copy = (PyArrayObject *)PyArray_ContiguousFromObject(matObj,
((PyArrayObject *)matObj)->descr->type_num,
2,2);
long int rows = (long int)((PyArrayObject *)matObj)->dimensions[0];
long int cols = (long int)((PyArrayObject *)matObj)->dimensions[1];
double res=0.0;
if (((PyArrayObject *)matObj)->descr->type_num == PyArray_DOUBLE) {
double *data = (double *)copy->data;
res = InfoEntropyGain(data, rows, cols);
} else if (((PyArrayObject *)matObj)->descr->type_num == PyArray_FLOAT) {
float *data = (float *)copy->data;
res = InfoEntropyGain(data, rows, cols);
} else if (((PyArrayObject *)matObj)->descr->type_num == PyArray_INT) {
int *data = (int *)copy->data;
res = InfoEntropyGain(data, rows, cols);
} else if (((PyArrayObject *)matObj)->descr->type_num == PyArray_LONG) {
long int *data = (long int *)copy->data;
res = InfoEntropyGain(data, rows, cols);
} else {
throw_value_error("Numeric array object of type int or long or float or double");
}
Py_DECREF(copy);
return res;
PyArrayObject *copy;
copy = (PyArrayObject *)PyArray_ContiguousFromObject(
matObj, ((PyArrayObject *)matObj)->descr->type_num, 1, 1);
double res = 0.0;
// we are expecting a 1 dimensional array
long int ncols = (long int)((PyArrayObject *)matObj)->dimensions[0];
CHECK_INVARIANT(ncols > 0, "");
if (((PyArrayObject *)matObj)->descr->type_num == PyArray_DOUBLE) {
double *data = (double *)copy->data;
res = InfoEntropy(data, ncols);
} else if (((PyArrayObject *)matObj)->descr->type_num == PyArray_FLOAT) {
float *data = (float *)copy->data;
res = InfoEntropy(data, ncols);
} else if (((PyArrayObject *)matObj)->descr->type_num == PyArray_INT) {
int *data = (int *)copy->data;
res = InfoEntropy(data, ncols);
} else if (((PyArrayObject *)matObj)->descr->type_num == PyArray_LONG) {
long int *data = (long int *)copy->data;
res = InfoEntropy(data, ncols);
}
Py_DECREF(copy);
return res;
}
double chiSquare(python::object resArr) {
PyObject *matObj = resArr.ptr();
if (!PyArray_Check(matObj)) {
throw_value_error("Expecting a Numeric array object");
}
PyArrayObject *copy;
copy = (PyArrayObject *)PyArray_ContiguousFromObject(matObj,
((PyArrayObject *)matObj)->descr->type_num,
2,2);
long int rows = (long int)((PyArrayObject *)matObj)->dimensions[0];
long int cols = (long int)((PyArrayObject *)matObj)->dimensions[1];
double res=0.0;
if (((PyArrayObject *)matObj)->descr->type_num == PyArray_DOUBLE) {
double *data = (double *)copy->data;
res = ChiSquare(data, rows, cols);
} else if (((PyArrayObject *)matObj)->descr->type_num == PyArray_FLOAT) {
float *data = (float *)copy->data;
res = ChiSquare(data, rows, cols);
} else if (((PyArrayObject *)matObj)->descr->type_num == PyArray_INT) {
int *data = (int *)copy->data;
res = ChiSquare(data, rows, cols);
} else if (((PyArrayObject *)matObj)->descr->type_num == PyArray_LONG) {
long int *data = (long int *)copy->data;
res = ChiSquare(data, rows, cols);
} else {
throw_value_error("Numeric array object of type int or long or float or double");
}
Py_DECREF(copy);
return res;
// Python-facing wrapper around InfoEntropyGain().
//
// resArr must be a numpy array; it is converted to a contiguous 2-D array of
// its own element type and dispatched to the InfoEntropyGain() overload for
// double, float, int or long data. Any other element type, a non-array
// argument, or an argument that cannot be converted to 2-D is reported
// through throw_value_error().
double infoGain(python::object resArr) {
  PyObject *matObj = resArr.ptr();
  if (!PyArray_Check(matObj)) {
    throw_value_error("Expecting a Numeric array object");
  }
  const int typeNum = ((PyArrayObject *)matObj)->descr->type_num;
  PyArrayObject *copy =
      (PyArrayObject *)PyArray_ContiguousFromObject(matObj, typeNum, 2, 2);
  // The conversion yields NULL when the input cannot be turned into a 2-D
  // array; previously that NULL reached copy->data / Py_DECREF.
  if (!copy) {
    throw_value_error("Expecting a two-dimensional Numeric array object");
  }
  // Read the shape from the validated copy: the raw input is not guaranteed
  // to have a second dimension.
  long int rows = (long int)copy->dimensions[0];
  long int cols = (long int)copy->dimensions[1];
  double res = 0.0;
  if (typeNum == PyArray_DOUBLE) {
    double *data = (double *)copy->data;
    res = InfoEntropyGain(data, rows, cols);
  } else if (typeNum == PyArray_FLOAT) {
    float *data = (float *)copy->data;
    res = InfoEntropyGain(data, rows, cols);
  } else if (typeNum == PyArray_INT) {
    int *data = (int *)copy->data;
    res = InfoEntropyGain(data, rows, cols);
  } else if (typeNum == PyArray_LONG) {
    long int *data = (long int *)copy->data;
    res = InfoEntropyGain(data, rows, cols);
  } else {
    // Release the copy before reporting the error so it is not leaked.
    Py_DECREF(copy);
    throw_value_error(
        "Numeric array object of type int or long or float or double");
  }
  Py_DECREF(copy);
  return res;
}
// Python-facing wrapper around ChiSquare().
//
// resArr must be a numpy array; it is converted to a contiguous 2-D array of
// its own element type and dispatched to the ChiSquare() overload for
// double, float, int or long data. Any other element type, a non-array
// argument, or an argument that cannot be converted to 2-D is reported
// through throw_value_error().
double chiSquare(python::object resArr) {
  PyObject *matObj = resArr.ptr();
  if (!PyArray_Check(matObj)) {
    throw_value_error("Expecting a Numeric array object");
  }
  const int typeNum = ((PyArrayObject *)matObj)->descr->type_num;
  PyArrayObject *copy =
      (PyArrayObject *)PyArray_ContiguousFromObject(matObj, typeNum, 2, 2);
  // The conversion yields NULL when the input cannot be turned into a 2-D
  // array; previously that NULL reached copy->data / Py_DECREF.
  if (!copy) {
    throw_value_error("Expecting a two-dimensional Numeric array object");
  }
  // Read the shape from the validated copy: the raw input is not guaranteed
  // to have a second dimension.
  long int rows = (long int)copy->dimensions[0];
  long int cols = (long int)copy->dimensions[1];
  double res = 0.0;
  if (typeNum == PyArray_DOUBLE) {
    double *data = (double *)copy->data;
    res = ChiSquare(data, rows, cols);
  } else if (typeNum == PyArray_FLOAT) {
    float *data = (float *)copy->data;
    res = ChiSquare(data, rows, cols);
  } else if (typeNum == PyArray_INT) {
    int *data = (int *)copy->data;
    res = ChiSquare(data, rows, cols);
  } else if (typeNum == PyArray_LONG) {
    long int *data = (long int *)copy->data;
    res = ChiSquare(data, rows, cols);
  } else {
    // Release the copy before reporting the error so it is not leaked.
    Py_DECREF(copy);
    throw_value_error(
        "Numeric array object of type int or long or float or double");
  }
  Py_DECREF(copy);
  return res;
}
}
void wrap_ranker();
void wrap_corrmatgen();
BOOST_PYTHON_MODULE(rdInfoTheory)
{
BOOST_PYTHON_MODULE(rdInfoTheory) {
python::scope().attr("__doc__") =
"Module containing bunch of functions for information metrics and a ranker to rank bits"
;
"Module containing bunch of functions for information metrics and a "
"ranker to rank bits";
rdkit_import_array();
python::register_exception_translator<IndexErrorException>(&translate_index_error);
python::register_exception_translator<ValueErrorException>(&translate_value_error);
python::register_exception_translator<IndexErrorException>(
&translate_index_error);
python::register_exception_translator<ValueErrorException>(
&translate_value_error);
wrap_ranker();
wrap_corrmatgen();
std::string docString="calculates the informational entropy of the values in an array\n\n\
std::string docString =
"calculates the informational entropy of the values in an array\n\n\
ARGUMENTS:\n\
\n\
- resMat: pointer to a long int array containing the data\n\
- dim: long int containing the length of the _tPtr_ array.\n\n\
RETURNS:\n\n\
a double\n";
python::def("InfoEntropy", RDInfoTheory::infoEntropy,
docString.c_str());
python::def("InfoEntropy", RDInfoTheory::infoEntropy, docString.c_str());
docString="Calculates the information gain for a variable\n\n\
docString =
"Calculates the information gain for a variable\n\n\
ARGUMENTS:\n\n\
- varMat: a Numeric Array object\n\
varMat is a Numeric array with the number of possible occurances\n\
@@ -148,11 +149,10 @@ BOOST_PYTHON_MODULE(rdInfoTheory)
- a Python float object\n\n\
NOTES\n\n\
- this is a dropin replacement for _PyInfoGain()_ in entropy.py\n";
python::def("InfoGain", RDInfoTheory::infoGain,
docString.c_str());
python::def("InfoGain", RDInfoTheory::infoGain, docString.c_str());
docString="Calculates the chi squared value for a variable\n\n\
docString =
"Calculates the chi squared value for a variable\n\n\
ARGUMENTS:\n\n\
- varMat: a Numeric Array object\n\
varMat is a Numeric array with the number of possible occurances\n\
@@ -161,8 +161,5 @@ BOOST_PYTHON_MODULE(rdInfoTheory)
has 3 possible values, varMat would be 4x3\n\n\
RETURNS:\n\n\
- a Python float object\n";
python::def("ChiSquare", RDInfoTheory::chiSquare,
docString.c_str());
python::def("ChiSquare", RDInfoTheory::chiSquare, docString.c_str());
}