// $Id$ // // Copyright (C) 2003-2008 Greg Landrum and Rational Discovery LLC // @@ All Rights Reserved @@ // This file is part of the RDKit. // The contents are covered by the terms of the BSD license // which is included in the file license.txt, found at the root // of the RDKit source tree. // #include "InfoBitRanker.h" #include "InfoGainFuncs.h" #include #include #include #include #include #include #include namespace RDInfoTheory { typedef std::pair PAIR_D_I; typedef std::vector VECT_PDI; struct gtDIPair { bool operator()(const PAIR_D_I &pd1, const PAIR_D_I &pd2) const { return pd1.first > pd2.first; } }; typedef std::priority_queue PR_QUEUE; void InfoBitRanker::setBiasList(RDKit::INT_VECT &classList) { URANGE_CHECK(classList.size(), d_classes); d_biasList = classList; // make sure we don't have any duplicates std::sort(d_biasList.begin(), d_biasList.end()); auto bi = std::unique(d_biasList.begin(), d_biasList.end()); CHECK_INVARIANT(bi == d_biasList.end(), "There are duplicates in the class bias list"); // finally make sure all the class ID in d_biasList are within range for (bi = d_biasList.begin(); bi != d_biasList.end(); bi++) { URANGE_CHECK(static_cast(*bi), d_classes); } } void InfoBitRanker::setMaskBits(RDKit::INT_VECT &maskBits) { delete dp_maskBits; dp_maskBits = new ExplicitBitVect(d_dims); for (auto bi : maskBits) { dp_maskBits->setBit(bi); } } bool InfoBitRanker::BiasCheckBit(RDKit::USHORT *resMat) const { PRECONDITION(resMat, "bad results pointer"); if ((d_biasList.size() == 0) || (d_biasList.size() == d_classes)) { // we will accept the bit return true; } RDKit::DOUBLE_VECT fracs; fracs.resize(d_classes); // compute the fractions of items in each class that hit the bit // and record the maximum for the those classes not in the bias list double maxCor = 0.0; for (unsigned int i = 0; i < d_classes; i++) { if (d_clsCount[i] > 0) { fracs[i] = ((double)resMat[i]) / d_clsCount[i]; } else { fracs[i] = 0.0; } if (std::find(d_biasList.begin(), d_biasList.end(), i) == d_biasList.end()) { // if not in the biasList if (fracs[i] > maxCor) { // if this is fraction is greater than the previously known maximum maxCor = fracs[i]; } } } bool bitOk = false; for (int bci : d_biasList) { if (fracs[bci] >= maxCor) { bitOk = true; break; } } return bitOk; } double InfoBitRanker::BiasChiSquareGain(RDKit::USHORT *resMat) const { PRECONDITION(resMat, "bad result pointer"); bool bitOk = this->BiasCheckBit(resMat); double info = 0.0; if (bitOk) { info = ChiSquare(resMat, 2, d_classes); } return info; } double InfoBitRanker::BiasInfoEntropyGain(RDKit::USHORT *resMat) const { PRECONDITION(resMat, "bad result pointer"); bool bitOk = this->BiasCheckBit(resMat); double info = 0.0; if (bitOk) { info = InfoEntropyGain(resMat, 2, d_classes); } return info; } void InfoBitRanker::accumulateVotes(const ExplicitBitVect &bv, unsigned int label) { URANGE_CHECK(label, d_classes); CHECK_INVARIANT(bv.getNumBits() == d_dims, "Incorrect bit vector size"); d_nInst += 1; d_clsCount[label] += 1; for (unsigned int i = 0; i < bv.getNumBits(); i++) { if ((*bv.dp_bits)[i] && (!dp_maskBits || dp_maskBits->getBit(i))) { d_counts[label][i] += 1; } } } void InfoBitRanker::accumulateVotes(const SparseBitVect &bv, unsigned int label) { URANGE_CHECK(label, d_classes); CHECK_INVARIANT(bv.getNumBits() == d_dims, "Incorrect bit vector size"); d_nInst += 1; d_clsCount[label] += 1; for (int dp_bit : *bv.dp_bits) { if (!dp_maskBits || dp_maskBits->getBit(dp_bit)) { d_counts[label][dp_bit] += 1; } } } double *InfoBitRanker::getTopN(unsigned int num) { // this is a place holder to pass along to infogain function // the size of this container should nVals*d_classes, where nVals // is the number of values a variable can take. // since we are dealing with a binary bit vector nVals = 2 // in addition the infogain function pretends that this is a 2D matrix // with the number of rows equal to nVals and num of columns equal to // d_classes if (num > d_dims) { throw ValueErrorException( "attempt to rank more bits than present in the bit vectors"); } if (dp_maskBits) { CHECK_INVARIANT(num <= dp_maskBits->getNumOnBits(), "Can't rank more bits than the ensemble size"); } auto *resMat = new RDKit::USHORT[2 * d_classes]; PR_QUEUE topN; for (unsigned int i = 0; i < d_dims; i++) { // we may want to ignore bits that are not turned on in any item of class // "ignoreNoClass" /* if ((0 <= ignoreNoClass) && (d_classes > ignoreNoClass)) { if (d_counts[ignoreNoClass][i] == 0) { continue; } }*/ if (dp_maskBits && !dp_maskBits->getBit(i)) { continue; } // fill up dmat for (unsigned int j = 0; j < d_classes; j++) { // we know that we have only two rows here resMat[j] = d_counts[j][i]; resMat[d_classes + j] = (d_clsCount[j] - d_counts[j][i]); } double info = 0.0; switch (d_type) { case ENTROPY: info = InfoEntropyGain(resMat, 2, d_classes); break; case BIASENTROPY: info = this->BiasInfoEntropyGain(resMat); break; case CHISQUARE: info = ChiSquare(resMat, 2, d_classes); break; case BIASCHISQUARE: info = BiasChiSquareGain(resMat); break; default: break; } PAIR_D_I entry(info, i); if (info >= 0.0) { if (topN.size() < num) { topN.push(entry); } else if (info > topN.top().first) { topN.pop(); topN.push(entry); } } } delete[] resMat; // now fill up the result matrix for the topN bits // the result from this function is a double * of size // num*4. The caller of this function interprets this // array as a two dimensional array of size num*(2+d_classes) with each row // containing the following entries // bitId, infogain, 1 additional column for number of hits for each class // double *res = new double[num*(2+d_classes)]; d_top = num; int ncols = 2 + d_classes; delete[] dp_topBits; dp_topBits = new double[num * ncols]; int offset, bid; RDKit::INT_VECT maskBits; if (dp_maskBits && topN.size() < num) { dp_maskBits->getOnBits(maskBits); } for (int i = num - 1; i >= 0; i--) { offset = i * ncols; if (topN.size() == 0) { if (dp_maskBits) { bid = maskBits[i]; } else { bid = i; } dp_topBits[offset + 1] = 0.0; } else { bid = topN.top().second; // bit id dp_topBits[offset + 1] = topN.top().first; // value of the infogain topN.pop(); } dp_topBits[offset] = (double)bid; for (unsigned int j = 0; j < d_classes; j++) { dp_topBits[offset + 2 + j] = (double)d_counts[j][bid]; } } return dp_topBits; } void InfoBitRanker::writeTopBitsToStream(std::ostream *outStream) const { (*outStream) << std::setw(12) << "Bit" << std::setw(12) << "InfoContent"; for (unsigned int ic = 0; ic < d_classes; ic++) { (*outStream) << std::setw(10) << "class" << ic; } (*outStream) << std::endl; unsigned int ncols = 2 + d_classes; for (unsigned int i = 0; i < d_top; i++) { (*outStream) << std::setw(12) << (int)dp_topBits[i * ncols] << std::setw(12) << std::setprecision(5) << dp_topBits[i * ncols + 1]; for (unsigned int ic = 0; ic < d_classes; ic++) { (*outStream) << std::setw(10) << (int)dp_topBits[i * ncols + 2 + ic]; } (*outStream) << "\n"; } } void InfoBitRanker::writeTopBitsToFile(const std::string &fileName) const { std::ofstream tmpStream(fileName.c_str()); if ((!tmpStream) || (tmpStream.bad())) { std::ostringstream errout; errout << "Bad output file " << fileName; throw RDKit::FileParseException(errout.str()); } auto &outStream = static_cast(tmpStream); this->writeTopBitsToStream(&outStream); } } // namespace RDInfoTheory