Files
rdkit/Code/ML/Data/cQuantize.cpp
Greg Landrum d41752d558 run clang-tidy with readability-braces-around-statements (#2899)
* run clang-tidy with readability-braces-around-statements
clang-format the results
clean up all the parts that clang-tidy-8 broke

* fix problem on windows
2020-01-25 14:19:32 +01:00

368 lines
10 KiB
C++

//
// Copyright 2003-2008 Rational Discovery LLC and Greg Landrum
// @@ All Rights Reserved @@
// This file is part of the RDKit.
// The contents are covered by the terms of the BSD license
// which is included in the file license.txt, found at the root
// of the RDKit source tree.
//
#include <cstring>
#include <RDBoost/Wrap.h>
#include <RDBoost/import_array.h>
namespace python = boost::python;
#include <ML/InfoTheory/InfoGainFuncs.h>
/***********************************************
constructs a variable table for the data passed in
The table for a given variable records the number of times each possible
value
of that variable appears for each possible result of the function.
**Arguments**
- vals: pointer to double, contains the values of the variable,
should be sorted
- nVals: int, the length of _vals_
- cuts: pointer to int, the indices of the quantization bounds
- nCuts: int, the length of _cuts_
- starts: pointer to int, the potential starting points for quantization
bounds
- nStarts: int, the length of _starts_
- results: poitner to int, the result codes
- nPossibleRes: int, the number of possible result codes
**Returns**
_varTable_ (a pointer to int), which is also modified in place.
**Notes:**
- _varTable_ is modified in place
- the _results_ array is assumed to be _nVals_ long
***********************************************/
long int *GenVarTable(double *vals, int nVals, long int *cuts, int nCuts,
long int *starts, long int *results, int nPossibleRes,
long int *varTable) {
RDUNUSED_PARAM(vals);
int nBins = nCuts + 1;
int idx, i, iTab;
memset(varTable, 0, nBins * nPossibleRes * sizeof(long int));
idx = 0;
for (i = 0; i < nCuts; i++) {
int cut = cuts[i];
iTab = i * nPossibleRes;
while (idx < starts[cut]) {
varTable[iTab + results[idx]] += 1;
idx++;
}
}
iTab = nCuts * nPossibleRes;
while (idx < nVals) {
varTable[iTab + results[idx]] += 1;
idx++;
}
return varTable;
}
/***********************************************
This actually does the recursion required by *cQuantize_RecurseOnBounds()*,
we do things this way to avoid having to convert things back and forth
from Python objects
**Arguments**
- vals: pointer to double, contains the values of the variable,
should be sorted
- nVals: int, the length of _vals_
- cuts: pointer to int, the indices of the quantization bounds
- nCuts: int, the length of _cuts_
- which: int, the quant bound being modified here
- starts: pointer to int, the potential starting points for quantization
bounds
- nStarts: int, the length of _starts_
- results: poitner to int, the result codes
- nPossibleRes: int, the number of possible result codes
**Returns**
a double, the expected information gain for the best bounds found
(which are found in _cuts_ )
**Notes:**
- _cuts_ is modified in place
- the _results_ array is assumed to be _nVals_ long
***********************************************/
double RecurseHelper(double *vals, int nVals, long int *cuts, int nCuts,
int which, long int *starts, int nStarts,
long int *results, int nPossibleRes) {
PRECONDITION(vals, "bad vals pointer");
double maxGain = -1e6, gainHere;
long int *bestCuts, *tCuts;
long int *varTable = nullptr;
int highestCutHere = nStarts - nCuts + which;
int i, nBounds = nCuts;
varTable = (long int *)calloc((nCuts + 1) * nPossibleRes, sizeof(long int));
bestCuts = (long int *)calloc(nCuts, sizeof(long int));
tCuts = (long int *)calloc(nCuts, sizeof(long int));
CHECK_INVARIANT(varTable, "failed to allocate memory");
CHECK_INVARIANT(bestCuts, "failed to allocate memory");
CHECK_INVARIANT(tCuts, "failed to allocate memory");
GenVarTable(vals, nVals, cuts, nCuts, starts, results, nPossibleRes,
varTable);
while (cuts[which] <= highestCutHere) {
gainHere = RDInfoTheory::InfoEntropyGain(varTable, nCuts + 1, nPossibleRes);
if (gainHere > maxGain) {
maxGain = gainHere;
memcpy(bestCuts, cuts, nCuts * sizeof(long int));
}
// recurse on the next vars if needed
if (which < nBounds - 1) {
memcpy(tCuts, cuts, nCuts * sizeof(long int));
gainHere = RecurseHelper(vals, nVals, tCuts, nCuts, which + 1, starts,
nStarts, results, nPossibleRes);
if (gainHere > maxGain) {
maxGain = gainHere;
memcpy(bestCuts, tCuts, nCuts * sizeof(long int));
}
}
// update this cut
int oldCut = cuts[which];
cuts[which] += 1;
int top, bot;
bot = starts[oldCut];
if (oldCut + 1 < nStarts) {
top = starts[oldCut + 1];
} else {
top = starts[nStarts - 1];
}
for (i = bot; i < top; i++) {
int v = results[i];
varTable[which * nPossibleRes + v] += 1;
varTable[(which + 1) * nPossibleRes + v] -= 1;
}
for (i = which + 1; i < nBounds; i++) {
if (cuts[i] == cuts[i - 1]) {
cuts[i] += 1;
}
}
}
memcpy(cuts, bestCuts, nCuts * sizeof(long int));
free(tCuts);
free(bestCuts);
free(varTable);
return maxGain;
}
/***********************************************
Recursively finds the best quantization boundaries
**Arguments**
- vals: a 1D Numeric array with the values of the variables,
this should be sorted
- cuts: a list with the indices of the quantization bounds
(indices are into _starts_ )
- which: an integer indicating which bound is being adjusted here
(and index into _cuts_ )
- starts: a list of potential starting points for quantization bounds
- results: a 1D Numeric array of integer result codes
- nPossibleRes: an integer with the number of possible result codes
**Returns**
- a 2-tuple containing:
1) the best information gain found so far
2) a list of the quantization bound indices ( _cuts_ for the best case)
**Notes**
- this is not even remotely efficient, which is why a C replacement
was written
- this is a drop-in replacement for *ML.Data.Quantize._PyRecurseBounds*
***********************************************/
static python::tuple cQuantize_RecurseOnBounds(python::object vals,
python::list pyCuts, int which,
python::list pyStarts,
python::object results,
int nPossibleRes) {
PyArrayObject *contigVals, *contigResults;
long int *cuts, *starts;
/*
-------
Setup code
-------
*/
contigVals = reinterpret_cast<PyArrayObject *>(
PyArray_ContiguousFromObject(vals.ptr(), NPY_DOUBLE, 1, 1));
if (!contigVals) {
throw_value_error("could not convert value argument");
}
contigResults = reinterpret_cast<PyArrayObject *>(
PyArray_ContiguousFromObject(results.ptr(), NPY_LONG, 1, 1));
if (!contigResults) {
throw_value_error("could not convert results argument");
}
python::ssize_t nCuts = python::len(pyCuts);
cuts = (long int *)calloc(nCuts, sizeof(long int));
CHECK_INVARIANT(cuts, "failed to allocate memory");
for (python::ssize_t i = 0; i < nCuts; i++) {
python::object elem = pyCuts[i];
cuts[i] = python::extract<long int>(elem);
}
python::ssize_t nStarts = python::len(pyStarts);
starts = (long int *)calloc(nStarts, sizeof(long int));
CHECK_INVARIANT(starts, "failed to allocate memory");
for (python::ssize_t i = 0; i < nStarts; i++) {
python::object elem = pyStarts[i];
starts[i] = python::extract<long int>(elem);
}
// do the real work
double gain = RecurseHelper(
(double *)PyArray_DATA(contigVals), PyArray_DIM(contigVals, 0), cuts,
nCuts, which, starts, nStarts, (long int *)PyArray_DATA(contigResults),
nPossibleRes);
/*
-------
Construct the return value
-------
*/
python::list cutObj;
for (python::ssize_t i = 0; i < nCuts; i++) {
cutObj.append(cuts[i]);
}
free(cuts);
free(starts);
return python::make_tuple(gain, cutObj);
}
static python::list cQuantize_FindStartPoints(python::object values,
python::object results,
int nData) {
python::list startPts;
if (nData < 2) {
return startPts;
}
auto *contigVals = reinterpret_cast<PyArrayObject *>(
PyArray_ContiguousFromObject(values.ptr(), NPY_DOUBLE, 1, 1));
if (!contigVals) {
throw_value_error("could not convert value argument");
}
auto *vals = (double *)PyArray_DATA(contigVals);
auto *contigResults = reinterpret_cast<PyArrayObject *>(
PyArray_ContiguousFromObject(results.ptr(), NPY_LONG, 1, 1));
if (!contigResults) {
throw_value_error("could not convert results argument");
}
long *res = (long *)PyArray_DATA(contigResults);
bool firstBlock = true;
long lastBlockAct = -2, blockAct = res[0];
int lastDiv = -1;
double tol = 1e-8;
int i = 1;
while (i < nData) {
while (i < nData && vals[i] - vals[i - 1] <= tol) {
if (res[i] != blockAct) {
blockAct = -1;
}
++i;
}
if (firstBlock) {
firstBlock = false;
lastBlockAct = blockAct;
lastDiv = i;
} else {
if (blockAct == -1 || lastBlockAct == -1 || blockAct != lastBlockAct) {
startPts.append(lastDiv);
lastDiv = i;
lastBlockAct = blockAct;
} else {
lastDiv = i;
}
}
if (i < nData) {
blockAct = res[i];
}
++i;
}
// catch the case that the last point also sets a bin:
if (blockAct != lastBlockAct) {
startPts.append(lastDiv);
}
return startPts;
}
BOOST_PYTHON_MODULE(cQuantize) {
rdkit_import_array();
python::def("_RecurseOnBounds", cQuantize_RecurseOnBounds,
(python::arg("vals"), python::arg("pyCuts"), python::arg("which"),
python::arg("pyStarts"), python::arg("results"),
python::arg("nPossibleRes")),
"TODO: provide docstring");
python::def(
"_FindStartPoints", cQuantize_FindStartPoints,
(python::arg("values"), python::arg("results"), python::arg("nData")),
"TODO: provide docstring");
}