mirror of
https://github.com/rdkit/rdkit.git
synced 2026-06-03 21:44:30 +08:00
* clean up iterator defs in types.h * do not use auto for inline constexpr * restore undef max,min * restore types.h declarations
288 lines
8.3 KiB
C++
288 lines
8.3 KiB
C++
// $Id$
|
|
//
|
|
// Copyright (C) 2003-2008 Greg Landrum and Rational Discovery LLC
|
|
// @@ All Rights Reserved @@
|
|
// This file is part of the RDKit.
|
|
// The contents are covered by the terms of the BSD license
|
|
// which is included in the file license.txt, found at the root
|
|
// of the RDKit source tree.
|
|
//
|
|
|
|
#include "InfoBitRanker.h"
|
|
#include "InfoGainFuncs.h"
|
|
#include <RDGeneral/Invariant.h>
|
|
#include <iomanip>
|
|
#include <fstream>
|
|
#include <RDGeneral/FileParseException.h>
|
|
#include <RDGeneral/Exceptions.h>
|
|
#include <algorithm>
|
|
#include <queue>
|
|
|
|
namespace RDInfoTheory {
|
|
typedef std::pair<double, int> PAIR_D_I;
|
|
typedef std::vector<PAIR_D_I> VECT_PDI;
|
|
|
|
struct gtDIPair {
|
|
bool operator()(const PAIR_D_I &pd1, const PAIR_D_I &pd2) const {
|
|
return pd1.first > pd2.first;
|
|
}
|
|
};
|
|
|
|
typedef std::priority_queue<PAIR_D_I, VECT_PDI, gtDIPair> PR_QUEUE;
|
|
|
|
void InfoBitRanker::setBiasList(RDKit::INT_VECT &classList) {
|
|
URANGE_CHECK(classList.size(), d_classes);
|
|
d_biasList = classList;
|
|
// make sure we don't have any duplicates
|
|
std::sort(d_biasList.begin(), d_biasList.end());
|
|
auto bi = std::unique(d_biasList.begin(), d_biasList.end());
|
|
CHECK_INVARIANT(bi == d_biasList.end(),
|
|
"There are duplicates in the class bias list");
|
|
|
|
// finally make sure all the class ID in d_biasList are within range
|
|
for (bi = d_biasList.begin(); bi != d_biasList.end(); bi++) {
|
|
URANGE_CHECK(static_cast<unsigned int>(*bi), d_classes);
|
|
}
|
|
}
|
|
|
|
void InfoBitRanker::setMaskBits(RDKit::INT_VECT &maskBits) {
|
|
delete dp_maskBits;
|
|
dp_maskBits = new ExplicitBitVect(d_dims);
|
|
for (auto bi : maskBits) {
|
|
dp_maskBits->setBit(bi);
|
|
}
|
|
}
|
|
|
|
bool InfoBitRanker::BiasCheckBit(RDKit::USHORT *resMat) const {
|
|
PRECONDITION(resMat, "bad results pointer");
|
|
if ((d_biasList.size() == 0) || (d_biasList.size() == d_classes)) {
|
|
// we will accept the bit
|
|
return true;
|
|
}
|
|
RDKit::DOUBLE_VECT fracs;
|
|
fracs.resize(d_classes);
|
|
|
|
// compute the fractions of items in each class that hit the bit
|
|
// and record the maximum for the those classes not in the bias list
|
|
double maxCor = 0.0;
|
|
for (unsigned int i = 0; i < d_classes; i++) {
|
|
if (d_clsCount[i] > 0) {
|
|
fracs[i] = ((double)resMat[i]) / d_clsCount[i];
|
|
} else {
|
|
fracs[i] = 0.0;
|
|
}
|
|
if (std::find(d_biasList.begin(), d_biasList.end(), i) ==
|
|
d_biasList.end()) {
|
|
// if not in the biasList
|
|
if (fracs[i] > maxCor) {
|
|
// if this is fraction is greater than the previously known maximum
|
|
maxCor = fracs[i];
|
|
}
|
|
}
|
|
}
|
|
|
|
bool bitOk = false;
|
|
for (int bci : d_biasList) {
|
|
if (fracs[bci] >= maxCor) {
|
|
bitOk = true;
|
|
break;
|
|
}
|
|
}
|
|
return bitOk;
|
|
}
|
|
|
|
double InfoBitRanker::BiasChiSquareGain(RDKit::USHORT *resMat) const {
|
|
PRECONDITION(resMat, "bad result pointer");
|
|
bool bitOk = this->BiasCheckBit(resMat);
|
|
double info = 0.0;
|
|
if (bitOk) {
|
|
info = ChiSquare(resMat, 2, d_classes);
|
|
}
|
|
return info;
|
|
}
|
|
|
|
double InfoBitRanker::BiasInfoEntropyGain(RDKit::USHORT *resMat) const {
|
|
PRECONDITION(resMat, "bad result pointer");
|
|
bool bitOk = this->BiasCheckBit(resMat);
|
|
double info = 0.0;
|
|
if (bitOk) {
|
|
info = InfoEntropyGain(resMat, 2, d_classes);
|
|
}
|
|
return info;
|
|
}
|
|
|
|
void InfoBitRanker::accumulateVotes(const ExplicitBitVect &bv,
|
|
unsigned int label) {
|
|
URANGE_CHECK(label, d_classes);
|
|
CHECK_INVARIANT(bv.getNumBits() == d_dims, "Incorrect bit vector size");
|
|
|
|
d_nInst += 1;
|
|
d_clsCount[label] += 1;
|
|
for (unsigned int i = 0; i < bv.getNumBits(); i++) {
|
|
if ((*bv.dp_bits)[i] && (!dp_maskBits || dp_maskBits->getBit(i))) {
|
|
d_counts[label][i] += 1;
|
|
}
|
|
}
|
|
}
|
|
|
|
void InfoBitRanker::accumulateVotes(const SparseBitVect &bv,
|
|
unsigned int label) {
|
|
URANGE_CHECK(label, d_classes);
|
|
CHECK_INVARIANT(bv.getNumBits() == d_dims, "Incorrect bit vector size");
|
|
|
|
d_nInst += 1;
|
|
d_clsCount[label] += 1;
|
|
for (int dp_bit : *bv.dp_bits) {
|
|
if (!dp_maskBits || dp_maskBits->getBit(dp_bit)) {
|
|
d_counts[label][dp_bit] += 1;
|
|
}
|
|
}
|
|
}
|
|
|
|
double *InfoBitRanker::getTopN(unsigned int num) {
|
|
// this is a place holder to pass along to infogain function
|
|
// the size of this container should nVals*d_classes, where nVals
|
|
// is the number of values a variable can take.
|
|
// since we are dealing with a binary bit vector nVals = 2
|
|
// in addition the infogain function pretends that this is a 2D matrix
|
|
// with the number of rows equal to nVals and num of columns equal to
|
|
// d_classes
|
|
if (num > d_dims) {
|
|
throw ValueErrorException(
|
|
"attempt to rank more bits than present in the bit vectors");
|
|
}
|
|
if (dp_maskBits) {
|
|
CHECK_INVARIANT(num <= dp_maskBits->getNumOnBits(),
|
|
"Can't rank more bits than the ensemble size");
|
|
}
|
|
auto *resMat = new RDKit::USHORT[2 * d_classes];
|
|
|
|
PR_QUEUE topN;
|
|
|
|
for (unsigned int i = 0; i < d_dims; i++) {
|
|
// we may want to ignore bits that are not turned on in any item of class
|
|
// "ignoreNoClass"
|
|
/*
|
|
if ((0 <= ignoreNoClass) && (d_classes > ignoreNoClass)) {
|
|
if (d_counts[ignoreNoClass][i] == 0) {
|
|
continue;
|
|
}
|
|
}*/
|
|
|
|
if (dp_maskBits && !dp_maskBits->getBit(i)) {
|
|
continue;
|
|
}
|
|
|
|
// fill up dmat
|
|
for (unsigned int j = 0; j < d_classes; j++) {
|
|
// we know that we have only two rows here
|
|
resMat[j] = d_counts[j][i];
|
|
resMat[d_classes + j] = (d_clsCount[j] - d_counts[j][i]);
|
|
}
|
|
double info = 0.0;
|
|
switch (d_type) {
|
|
case ENTROPY:
|
|
info = InfoEntropyGain(resMat, 2, d_classes);
|
|
break;
|
|
case BIASENTROPY:
|
|
info = this->BiasInfoEntropyGain(resMat);
|
|
break;
|
|
case CHISQUARE:
|
|
info = ChiSquare(resMat, 2, d_classes);
|
|
break;
|
|
case BIASCHISQUARE:
|
|
info = BiasChiSquareGain(resMat);
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
|
|
PAIR_D_I entry(info, i);
|
|
|
|
if (info >= 0.0) {
|
|
if (topN.size() < num) {
|
|
topN.push(entry);
|
|
} else if (info > topN.top().first) {
|
|
topN.pop();
|
|
topN.push(entry);
|
|
}
|
|
}
|
|
}
|
|
|
|
delete[] resMat;
|
|
|
|
// now fill up the result matrix for the topN bits
|
|
// the result from this function is a double * of size
|
|
// num*4. The caller of this function interprets this
|
|
// array as a two dimensional array of size num*(2+d_classes) with each row
|
|
// containing the following entries
|
|
// bitId, infogain, 1 additional column for number of hits for each class
|
|
// double *res = new double[num*(2+d_classes)];
|
|
|
|
d_top = num;
|
|
int ncols = 2 + d_classes;
|
|
|
|
delete[] dp_topBits;
|
|
dp_topBits = new double[num * ncols];
|
|
|
|
int offset, bid;
|
|
|
|
RDKit::INT_VECT maskBits;
|
|
if (dp_maskBits && topN.size() < num) {
|
|
dp_maskBits->getOnBits(maskBits);
|
|
}
|
|
|
|
for (int i = num - 1; i >= 0; i--) {
|
|
offset = i * ncols;
|
|
if (topN.size() == 0) {
|
|
if (dp_maskBits) {
|
|
bid = maskBits[i];
|
|
} else {
|
|
bid = i;
|
|
}
|
|
dp_topBits[offset + 1] = 0.0;
|
|
} else {
|
|
bid = topN.top().second; // bit id
|
|
dp_topBits[offset + 1] = topN.top().first; // value of the infogain
|
|
topN.pop();
|
|
}
|
|
dp_topBits[offset] = (double)bid;
|
|
|
|
for (unsigned int j = 0; j < d_classes; j++) {
|
|
dp_topBits[offset + 2 + j] = (double)d_counts[j][bid];
|
|
}
|
|
}
|
|
return dp_topBits;
|
|
}
|
|
|
|
void InfoBitRanker::writeTopBitsToStream(std::ostream *outStream) const {
|
|
(*outStream) << std::setw(12) << "Bit" << std::setw(12) << "InfoContent";
|
|
for (unsigned int ic = 0; ic < d_classes; ic++) {
|
|
(*outStream) << std::setw(10) << "class" << ic;
|
|
}
|
|
(*outStream) << std::endl;
|
|
|
|
unsigned int ncols = 2 + d_classes;
|
|
for (unsigned int i = 0; i < d_top; i++) {
|
|
(*outStream) << std::setw(12) << (int)dp_topBits[i * ncols] << std::setw(12)
|
|
<< std::setprecision(5) << dp_topBits[i * ncols + 1];
|
|
for (unsigned int ic = 0; ic < d_classes; ic++) {
|
|
(*outStream) << std::setw(10) << (int)dp_topBits[i * ncols + 2 + ic];
|
|
}
|
|
(*outStream) << "\n";
|
|
}
|
|
}
|
|
|
|
void InfoBitRanker::writeTopBitsToFile(const std::string &fileName) const {
|
|
std::ofstream tmpStream(fileName.c_str());
|
|
if ((!tmpStream) || (tmpStream.bad())) {
|
|
std::ostringstream errout;
|
|
errout << "Bad output file " << fileName;
|
|
throw RDKit::FileParseException(errout.str());
|
|
}
|
|
|
|
auto &outStream = static_cast<std::ostream &>(tmpStream);
|
|
this->writeTopBitsToStream(&outStream);
|
|
}
|
|
} // namespace RDInfoTheory
|