// $Id$ // // Copyright (C) 2003-2012 greg Landrum and Rational Discovery LLC // // @@ All Rights Reserved @@ // This file is part of the RDKit. // The contents are covered by the terms of the BSD license // which is included in the file license.txt, found at the root // of the RDKit source tree. // #include "BitVects.h" #include "BitOps.h" #include #include #include #include #include #include #include #include #include #if _MSC_VER #include #endif using namespace RDKit; int getBitId(const char*& text, int format, int size, int curr) { PRECONDITION(text, "no text"); int res = -1; if ((format == 0) || ((format == 1) && (size >= std::numeric_limits::max()))) { int tmp; tmp = EndianSwapBytes(*(int*)text); text += sizeof(tmp); res = tmp; } else if (format == 1) { // version 16 and on bits sotred as short ints unsigned short tmp; tmp = EndianSwapBytes( *(unsigned short*)text); text += sizeof(tmp); res = tmp; } else if (format == 2) { // run length encoded format res = curr + RDKit::pullPackedIntFromString(text); } return res; } bool AllProbeBitsMatch(const std::string& probe, const std::string& ref) { return AllProbeBitsMatch(probe.c_str(), ref.c_str()); } bool AllProbeBitsMatch(const char* probe, const char* ref) { PRECONDITION(probe, "no probe text"); PRECONDITION(ref, "no probe text"); int probeFormat = 0; int refFormat = 0; int version = 0; int probeSize = EndianSwapBytes(*(int*)probe); probe += sizeof(probeSize); if (probeSize < 0) { version = -1 * probeSize; if (version == 16) { probeFormat = 1; } else if (version == 32) { probeFormat = 2; } else { throw("Unknown version type for the encode bit vect"); } probeSize = EndianSwapBytes(*(int*)probe); probe += sizeof(probeSize); } int refSize = EndianSwapBytes(*(int*)ref); ref += sizeof(refSize); if (refSize < 0) { version = -1 * refSize; if (version == 16) { refFormat = 1; } else if (version == 32) { refFormat = 2; } else { throw("Unknown version type for the encode bit vect"); } refSize = EndianSwapBytes(*(int*)ref); ref += sizeof(refSize); } int nProbeOn = EndianSwapBytes(*(int*)probe); probe += sizeof(nProbeOn); int nRefOn = EndianSwapBytes(*(int*)ref); ref += sizeof(nRefOn); int currProbeBit = 0; currProbeBit = getBitId(probe, probeFormat, probeSize, currProbeBit); nProbeOn--; int currRefBit = 0; currRefBit = getBitId(ref, refFormat, refSize, currRefBit); nRefOn--; while (nProbeOn) { while (currRefBit < currProbeBit && nRefOn > 0) { if (refFormat == 2) currRefBit++; currRefBit = getBitId(ref, refFormat, refSize, currRefBit); nRefOn--; } if (currRefBit != currProbeBit) return false; if (probeFormat == 2) currProbeBit++; currProbeBit = getBitId(probe, probeFormat, probeSize, currProbeBit); nProbeOn--; } return true; } template bool AllProbeBitsMatch(const T1& probe, const std::string& pkl) { const char* text = pkl.c_str(); int format = 0; int nOn = 0, size, version = 0; size = EndianSwapBytes(*(int*)text); text += sizeof(size); if (size < 0) { version = -1 * size; if (version == 16) { format = 1; } else if (version == 32) { format = 2; } else { throw("Unknown version type for the encode bit vect"); } size = EndianSwapBytes(*(int*)text); text += sizeof(size); } nOn = EndianSwapBytes(*(int*)text); text += sizeof(nOn); int currBit = 0; currBit = getBitId(text, format, size, currBit); nOn--; std::vector obl; probe.getOnBits(obl); // for(int i=0;i::const_iterator i = obl.begin(); i != obl.end(); i++) { while (currBit < *i && nOn > 0) { if (format == 2) currBit++; currBit = getBitId(text, format, size, currBit); nOn--; } if (currBit != *i) return false; //} } return true; } template bool AllProbeBitsMatch(const SparseBitVect& bv1, const std::string& pkl); template bool AllProbeBitsMatch(const ExplicitBitVect& bv1, const std::string& pkl); template bool AllProbeBitsMatch(const T1& probe, const T1& ref) { for (unsigned int i = 0; i < probe.getNumBits(); ++i) { if (probe.getBit(i) && !ref.getBit(i)) { return false; } } return true; } template bool AllProbeBitsMatch(const SparseBitVect& bv1, const SparseBitVect& bv2); // template bool AllProbeBitsMatch(const ExplicitBitVect& bv1,const // ExplicitBitVect &bv2); bool AllProbeBitsMatch(const ExplicitBitVect& probe, const ExplicitBitVect& ref) { return probe.dp_bits->is_subset_of(*(ref.dp_bits)); } // """ ------------------------------------------------------- // // NumOnBitsInCommon(T1,T2) // Returns the number of on bits which are set in both T1 and T2. // // """ ------------------------------------------------------- template int NumOnBitsInCommon(const T1& bv1, const T2& bv2) { return static_cast(OnBitsInCommon(bv1, bv2).size()); } namespace { struct bitset_impl { std::vector m_bits; std::size_t m_num_bits; }; const bool canUseBitmapHack = sizeof(boost::dynamic_bitset<>) == sizeof(bitset_impl); bool EBVToBitmap(const ExplicitBitVect& bv, const unsigned char*& fp, unsigned int& nBytes) { if (!canUseBitmapHack) return false; const bitset_impl* p1 = (const bitset_impl*)(const void*)bv.dp_bits; // Run-time sanity check (just in case) if (p1->m_num_bits != bv.dp_bits->size()) { return false; } fp = (const unsigned char*)p1->m_bits.data(); nBytes = (unsigned int)p1->m_num_bits / 8; if (p1->m_num_bits % 8) ++nBytes; return true; } } // end of local namespace unsigned int CalcBitmapNumBitsInCommon(const unsigned char* afp, const unsigned char* bfp, unsigned int nBytes); int NumOnBitsInCommon(const ExplicitBitVect& bv1, const ExplicitBitVect& bv2) { // Don't try this at home, we (hope we) know what we're doing const unsigned char *afp, *bfp; unsigned int nBytes; if (EBVToBitmap(bv1, afp, nBytes) && EBVToBitmap(bv2, bfp, nBytes)) { unsigned int result = CalcBitmapNumBitsInCommon(afp, bfp, nBytes); return (int)result; } return static_cast(((*bv1.dp_bits) & (*bv2.dp_bits)).count()); } // In all these similarity metrics the notation is selected to be // consistent with J.W. Raymond and P. Willett, JCAMD _16_ 59-71 (2002) // """ ------------------------------------------------------- // // TanimotoSimilarity(T1,T2) // returns the Tanamoto similarity between T1 and T2, a double. // // T1 and T2 should be the same length. // // C++ Notes: T1 and T2 must support operator&, getNumBits() // and getOnBits(). // // Python Notes: T1 and T2 are BitVects. // // """ ------------------------------------------------------- template double TanimotoSimilarity(const T1& bv1, const T2& bv2) { if (bv1.getNumBits() != bv2.getNumBits()) throw ValueErrorException("BitVects must be same length"); unsigned int total = bv1.getNumOnBits() + bv2.getNumOnBits(); if (total == 0) return 1.0; unsigned int common = NumOnBitsInCommon(bv1, bv2); return (double)common / (double)(total - common); } template double TverskySimilarity(const T1& bv1, const T2& bv2, double a, double b) { RANGE_CHECK(0, a, 1); RANGE_CHECK(0, b, 1); if (bv1.getNumBits() != bv2.getNumBits()) throw ValueErrorException("BitVects must be same length"); double x = NumOnBitsInCommon(bv1, bv2); double y = bv1.getNumOnBits(); double z = bv2.getNumOnBits(); double denom = a * y + b * z + (1 - a - b) * x; if (denom == 0.0) return 1.0; else return x / denom; } template double CosineSimilarity(const T1& bv1, const T2& bv2) { if (bv1.getNumBits() != bv2.getNumBits()) throw ValueErrorException("BitVects must be same length"); double x = NumOnBitsInCommon(bv1, bv2); double y = bv1.getNumOnBits(); double z = bv2.getNumOnBits(); if (y * z > 0.0) { return x / sqrt(y * z); } else { return 0.0; } } template double KulczynskiSimilarity(const T1& bv1, const T2& bv2) { if (bv1.getNumBits() != bv2.getNumBits()) throw ValueErrorException("BitVects must be same length"); double x = NumOnBitsInCommon(bv1, bv2); double y = bv1.getNumOnBits(); double z = bv2.getNumOnBits(); if (y * z > 0.0) { return x * (y + z) / (2 * y * z); } else { return 0.0; } } template double DiceSimilarity(const T1& bv1, const T2& bv2) { if (bv1.getNumBits() != bv2.getNumBits()) throw ValueErrorException("BitVects must be same length"); double x = NumOnBitsInCommon(bv1, bv2); double y = bv1.getNumOnBits(); double z = bv2.getNumOnBits(); if (y + z > 0.0) { return 2 * x / (y + z); } else { return 0.0; } } template double SokalSimilarity(const T1& bv1, const T2& bv2) { if (bv1.getNumBits() != bv2.getNumBits()) throw ValueErrorException("BitVects must be same length"); double x = NumOnBitsInCommon(bv1, bv2); double y = bv1.getNumOnBits(); double z = bv2.getNumOnBits(); return x / (2 * y + 2 * z - 3 * x); } template double McConnaugheySimilarity(const T1& bv1, const T2& bv2) { if (bv1.getNumBits() != bv2.getNumBits()) throw ValueErrorException("BitVects must be same length"); double x = NumOnBitsInCommon(bv1, bv2); double y = bv1.getNumOnBits(); double z = bv2.getNumOnBits(); if (y * z > 0.0) { return (x * (y + z) - (y * z)) / (y * z); } else { return 0.0; } } template inline T tmin(T v1, T v2) { if (v1 < v2) return v1; return v2; } template inline T tmax(T v1, T v2) { if (v1 > v2) return v1; return v2; } template double AsymmetricSimilarity(const T1& bv1, const T2& bv2) { if (bv1.getNumBits() != bv2.getNumBits()) throw ValueErrorException("BitVects must be same length"); double x = NumOnBitsInCommon(bv1, bv2); double y = bv1.getNumOnBits(); double z = bv2.getNumOnBits(); if (tmin(y, z) > 0) { return x / tmin(y, z); } else { return 0.0; } } template double BraunBlanquetSimilarity(const T1& bv1, const T2& bv2) { if (bv1.getNumBits() != bv2.getNumBits()) throw ValueErrorException("BitVects must be same length"); double x = NumOnBitsInCommon(bv1, bv2); double y = bv1.getNumOnBits(); double z = bv2.getNumOnBits(); if (tmax(y, z) > 0) { return x / tmax(y, z); } else { return 0.0; } } template double RusselSimilarity(const T1& bv1, const T2& bv2) { if (bv1.getNumBits() != bv2.getNumBits()) throw ValueErrorException("BitVects must be same length"); double x = NumOnBitsInCommon(bv1, bv2); return x / bv1.getNumBits(); } template double RogotGoldbergSimilarity(const T1& bv1, const T2& bv2) { if (bv1.getNumBits() != bv2.getNumBits()) throw ValueErrorException("BitVects must be same length"); double x = NumOnBitsInCommon(bv1, bv2); double y = bv1.getNumOnBits(); double z = bv2.getNumOnBits(); double l = bv1.getNumBits(); double d = l - y - z + x; if ((x == l) || (d == l)) return 1.0; else return (x / (y + z) + (d) / (2 * l - y - z)); } // """ ------------------------------------------------------- // // OnBitSimilarity(T1,T2) // Returns the percentage of possible on bits in common // between T1 and T2 (a double) // // C++ Notes: T1 and T2 must support operator|, operator& // and getOnBits(). // // Python Notes: T1 and T2 are BitVects. // // """ ------------------------------------------------------- template double OnBitSimilarity(const T1& bv1, const T2& bv2) { if (bv1.getNumBits() != bv2.getNumBits()) throw ValueErrorException("BitVects must be same length"); double num = NumOnBitsInCommon(bv1, bv2); double denom = (bv1 | bv2).getNumOnBits(); if (denom > 0) { return num / denom; } else { return 0; } } // """ ------------------------------------------------------- // // NumBitsInCommon(T1,T2) // Returns the number of bits in common (on and off) // between T1 and T2 (an int) // // T1 and T2 should be the same length. // // C++ Notes: T1 and T2 must support operator^, getNumBits(). // // Python Notes: T1 and T2 are BitVects. // // """ ------------------------------------------------------- template int NumBitsInCommon(const T1& bv1, const T2& bv2) { if (bv1.getNumBits() != bv2.getNumBits()) throw ValueErrorException("BitVects must be same length"); return bv1.getNumBits() - (bv1 ^ bv2).getNumOnBits(); } int NumBitsInCommon(const ExplicitBitVect& bv1, const ExplicitBitVect& bv2) { return bv1.getNumBits() - static_cast(((*bv1.dp_bits) ^ (*bv2.dp_bits)).count()); } // """ ------------------------------------------------------- // // AllBitSimilarity(T1,T2) // Returns the percentage of bits in common (on and off) // between T1 and T2 (a double) // // T1 and T2 should be the same length. // // C++ Notes: T1 and T2 must support operator^, getNumBits() // and getNumOnBits(). // // Python Notes: T1 and T2 are BitVects. // // """ ------------------------------------------------------- template double AllBitSimilarity(const T1& bv1, const T2& bv2) { if (bv1.getNumBits() != bv2.getNumBits()) throw ValueErrorException("BitVects must be same length"); return double(NumBitsInCommon(bv1, bv2)) / bv1.getNumBits(); } // """ ------------------------------------------------------- // // OnBitsInCommon(T1,T2) // Returns the on bits which are set in both T1 and T2. // // T1 and T2 should be the same length. // // C++ Notes: T1 and T2 must support operator&, getNumBits() // and getOnBits(), the return value is an IntVect. // // Python Notes: T1 and T2 are BitVects, the return value // is a tuple of ints. // // """ ------------------------------------------------------- template IntVect OnBitsInCommon(const T1& bv1, const T2& bv2) { if (bv1.getNumBits() != bv2.getNumBits()) throw ValueErrorException("BitVects must be same length"); IntVect res; (bv1 & bv2).getOnBits(res); return res; } // """ ------------------------------------------------------- // // OffBitsInCommon(T1,T2) // Returns the off bits which are set in both T1 and T2. // // T1 and T2 should be the same length. // // C++ Notes: T1 and T2 must support operator|, operator~, // getNumBits() and getOnBits(), the return value is an IntVect. // // Python Notes: T1 and T2 are BitVects, the return value // is a tuple of ints. // // """ ------------------------------------------------------- template IntVect OffBitsInCommon(const T1& bv1, const T2& bv2) { if (bv1.getNumBits() != bv2.getNumBits()) throw ValueErrorException("BitVects must be same length"); IntVect res; (~(bv1 | bv2)).getOnBits(res); return res; } // """ ------------------------------------------------------- // // OnBitProjSimilarity(T1,T2) // Returns the projected similarity between the on bits of // T1 and T2. // // The on bit projected similarity of T1 onto T2 is the // percentage of T1's on bits which are on in T2. // // This type of measure may be useful for substructure-type // searches. // // Two values are returned, the projection of T1 onto T2 // and the projection of T2 onto T1 // // T1 and T2 should be the same length. // // C++ Notes: T1 and T2 must support operator&, getNumBits() // and getNumOnBits(), the return value is an DoubleVect with // two elements. // // Python Notes: T1 and T2 are BitVects, the return value // is a 2-tuple of doubles. // // """ ------------------------------------------------------- template DoubleVect OnBitProjSimilarity(const T1& bv1, const T2& bv2) { if (bv1.getNumBits() != bv2.getNumBits()) throw ValueErrorException("BitVects must be same length"); DoubleVect res(2, 0.0); double num = NumOnBitsInCommon(bv1, bv2); if (num) { res[0] = num / bv1.getNumOnBits(); res[1] = num / bv2.getNumOnBits(); } return res; } // """ ------------------------------------------------------- // // OffBitProjSimilarity(T1,T2) // Returns the projected similarity between the off bits of // T1 and T2. // // The off bit projected similarity of T1 onto T2 is the // percentage of T1's off bits which are off in T2. // // This type of measure may be useful for substructure-type // searches. // // Two values are returned, the projection of T1 onto T2 // and the projection of T2 onto T1 // // T1 and T2 should be the same length. // // C++ Notes: T1 and T2 must support operator|, getNumBits() // and getNumOffBits(), the return value is an DoubleVect with // two elements. // // Python Notes: T1 and T2 are BitVects, the return value // is a 2-tuple of doubles. // // """ ------------------------------------------------------- template DoubleVect OffBitProjSimilarity(const T1& bv1, const T2& bv2) { if (bv1.getNumBits() != bv2.getNumBits()) throw ValueErrorException("BitVects must be same length"); DoubleVect res(2, 0.0); double num = (bv1 | bv2).getNumOffBits(); if (num) { res[0] = num / bv1.getNumOffBits(); res[1] = num / bv2.getNumOffBits(); } return res; } template T1* FoldFingerprint(const T1& bv1, unsigned int factor) { if (factor <= 0 || factor >= bv1.getNumBits()) throw ValueErrorException("invalid fold factor"); int initSize = bv1.getNumBits(); int resSize = initSize / factor; auto* res = new T1(resSize); IntVect onBits; bv1.getOnBits(onBits); for (int& onBit : onBits) { int pos = onBit % resSize; res->setBit(pos); } return res; } template std::string BitVectToText(const T1& bv1) { std::string res(bv1.getNumBits(), '0'); for (unsigned int i = 0; i < bv1.getNumBits(); i++) { if (bv1.getBit(i)) res[i] = '1'; } return res; } const char bin2Hex[] = {'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f'}; template std::string BitVectToFPSText(const T1& bv1) { unsigned int size = 2 * (bv1.getNumBits() / 8 + (bv1.getNumBits() % 8 ? 1 : 0)); std::string res(size, 0); unsigned char c = 0; unsigned int byte = 0; for (unsigned int i = 0; i < bv1.getNumBits(); i++) { if (bv1.getBit(i)) { c |= 1 << (i % 8); } if (!((i + 1) % 8)) { res[byte++] = bin2Hex[(c >> 4) % 16]; res[byte++] = bin2Hex[c % 16]; c = 0; } } if (byte < size) { res[byte++] = bin2Hex[(c >> 4) % 16]; res[byte++] = bin2Hex[c % 16]; } return res; } template std::string BitVectToBinaryText(const T1& bv1) { std::string res(bv1.getNumBits() / 8 + (bv1.getNumBits() % 8 ? 1 : 0), 0); unsigned char c = 0; unsigned int byte = 0; for (unsigned int i = 0; i < bv1.getNumBits(); i++) { if (bv1.getBit(i)) { c |= 1 << (i % 8); } if (!((i + 1) % 8)) { res[byte++] = c; c = 0; } } if (bv1.getNumBits() % 8) { res[byte] = c; } return res; } template void UpdateBitVectFromFPSText(T1& bv1, const std::string& fps) { PRECONDITION(fps.length() * 4 >= bv1.getNumBits(), "bad FPS length"); PRECONDITION(fps.length() % 2 == 0, "bad FPS length"); unsigned int bitIdx = 0; char tptr[3]; tptr[2] = (char)0; for (unsigned int i = 0; i < fps.size() && bitIdx < bv1.getNumBits(); i += 2) { unsigned short c = 0; try { tptr[0] = fps[i]; tptr[1] = fps[i + 1]; c = static_cast(strtol(tptr, nullptr, 16)); } catch (...) { std::ostringstream errout; errout << "Cannot convert FPS word: " << fps.substr(i, 2) << " to int"; std::cerr << errout.str() << std::endl; throw ValueErrorException(errout.str()); } for (unsigned int bit = 0; bit < 8 && bitIdx < bv1.getNumBits(); ++bit, ++bitIdx) { if (c & (1 << bit)) bv1.setBit(bitIdx); } } } template void UpdateBitVectFromBinaryText(T1& bv1, const std::string& fps) { PRECONDITION(fps.length() * 8 >= bv1.getNumBits(), "bad FPS length"); unsigned int bitIdx = 0; for (unsigned int i = 0; i < fps.size() && bitIdx < bv1.getNumBits(); i++) { unsigned short c = fps[i]; for (unsigned int bit = 0; bit < 8 && bitIdx < bv1.getNumBits(); ++bit, ++bitIdx) { if (c & (1 << bit)) bv1.setBit(bitIdx); } } } template double TanimotoSimilarity(const SparseBitVect& bv1, const SparseBitVect& bv2); template double TverskySimilarity(const SparseBitVect& bv1, const SparseBitVect& bv2, double a, double b); template double CosineSimilarity(const SparseBitVect& bv1, const SparseBitVect& bv2); template double KulczynskiSimilarity(const SparseBitVect& bv1, const SparseBitVect& bv2); template double DiceSimilarity(const SparseBitVect& bv1, const SparseBitVect& bv2); template double SokalSimilarity(const SparseBitVect& bv1, const SparseBitVect& bv2); template double McConnaugheySimilarity(const SparseBitVect& bv1, const SparseBitVect& bv2); template double AsymmetricSimilarity(const SparseBitVect& bv1, const SparseBitVect& bv2); template double BraunBlanquetSimilarity(const SparseBitVect& bv1, const SparseBitVect& bv2); template double RusselSimilarity(const SparseBitVect& bv1, const SparseBitVect& bv2); template double RogotGoldbergSimilarity(const SparseBitVect& bv1, const SparseBitVect& bv2); template double OnBitSimilarity(const SparseBitVect& bv1, const SparseBitVect& bv2); template int NumBitsInCommon(const SparseBitVect& bv1, const SparseBitVect& bv2); template double AllBitSimilarity(const SparseBitVect& bv1, const SparseBitVect& bv2); template int NumOnBitsInCommon(const SparseBitVect& bv1, const SparseBitVect& bv2); template IntVect OnBitsInCommon(const SparseBitVect& bv1, const SparseBitVect& bv2); template IntVect OffBitsInCommon(const SparseBitVect& bv1, const SparseBitVect& bv2); template DoubleVect OnBitProjSimilarity(const SparseBitVect& bv1, const SparseBitVect& bv2); template DoubleVect OffBitProjSimilarity(const SparseBitVect& bv1, const SparseBitVect& bv2); template double TanimotoSimilarity(const ExplicitBitVect& bv1, const ExplicitBitVect& bv2); template double TverskySimilarity(const ExplicitBitVect& bv1, const ExplicitBitVect& bv2, double a, double b); template double CosineSimilarity(const ExplicitBitVect& bv1, const ExplicitBitVect& bv2); template double KulczynskiSimilarity(const ExplicitBitVect& bv1, const ExplicitBitVect& bv2); template double DiceSimilarity(const ExplicitBitVect& bv1, const ExplicitBitVect& bv2); template double SokalSimilarity(const ExplicitBitVect& bv1, const ExplicitBitVect& bv2); template double McConnaugheySimilarity(const ExplicitBitVect& bv1, const ExplicitBitVect& bv2); template double AsymmetricSimilarity(const ExplicitBitVect& bv1, const ExplicitBitVect& bv2); template double BraunBlanquetSimilarity(const ExplicitBitVect& bv1, const ExplicitBitVect& bv2); template double RusselSimilarity(const ExplicitBitVect& bv1, const ExplicitBitVect& bv2); template double RogotGoldbergSimilarity(const ExplicitBitVect& bv1, const ExplicitBitVect& bv2); template double OnBitSimilarity(const ExplicitBitVect& bv1, const ExplicitBitVect& bv2); template int NumBitsInCommon(const ExplicitBitVect& bv1, const ExplicitBitVect& bv2); template double AllBitSimilarity(const ExplicitBitVect& bv1, const ExplicitBitVect& bv2); template IntVect OnBitsInCommon(const ExplicitBitVect& bv1, const ExplicitBitVect& bv2); template IntVect OffBitsInCommon(const ExplicitBitVect& bv1, const ExplicitBitVect& bv2); template DoubleVect OnBitProjSimilarity(const ExplicitBitVect& bv1, const ExplicitBitVect& bv2); template DoubleVect OffBitProjSimilarity(const ExplicitBitVect& bv1, const ExplicitBitVect& bv2); template SparseBitVect* FoldFingerprint(const SparseBitVect&, unsigned int); template ExplicitBitVect* FoldFingerprint(const ExplicitBitVect&, unsigned int); template std::string BitVectToText(const SparseBitVect&); template std::string BitVectToText(const ExplicitBitVect&); template std::string BitVectToFPSText(const SparseBitVect&); template std::string BitVectToFPSText(const ExplicitBitVect&); template void UpdateBitVectFromFPSText(SparseBitVect&, const std::string&); template void UpdateBitVectFromFPSText(ExplicitBitVect&, const std::string&); template std::string BitVectToBinaryText(const SparseBitVect&); template std::string BitVectToBinaryText(const ExplicitBitVect&); template void UpdateBitVectFromBinaryText(SparseBitVect&, const std::string&); template void UpdateBitVectFromBinaryText(ExplicitBitVect&, const std::string&); // from here: // http://stackoverflow.com/questions/3849337/msvc-equivalent-to-builtin-popcount // but corrected to get the ifdef right #ifdef _MSC_VER #include #ifdef _WIN64 #define BUILTIN_POPCOUNT_INSTR __popcnt64 #define BUILTIN_POPCOUNT_TYPE boost::uint64_t #else #define BUILTIN_POPCOUNT_INSTR __popcnt #define BUILTIN_POPCOUNT_TYPE boost::uint32_t #endif #else #define BUILTIN_POPCOUNT_INSTR __builtin_popcountll #define BUILTIN_POPCOUNT_TYPE boost::uint64_t #endif // the Bitmap Tanimoto and Dice similarity code is adapted // from Andrew Dalke's chem-fingerprints code // http://code.google.com/p/chem-fingerprints/ namespace { static int byte_popcounts[] = { 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8}; } unsigned int CalcBitmapPopcount(const unsigned char* afp, unsigned int nBytes) { PRECONDITION(afp, "no afp"); unsigned int popcount = 0; #ifndef USE_BUILTIN_POPCOUNT for (unsigned int i = 0; i < nBytes; i++) { popcount += byte_popcounts[afp[i]]; } #else unsigned int eidx = nBytes / sizeof(BUILTIN_POPCOUNT_TYPE); for (unsigned int i = 0; i < eidx; ++i) { popcount += static_cast( BUILTIN_POPCOUNT_INSTR(((BUILTIN_POPCOUNT_TYPE*)afp)[i])); } for (unsigned int i = eidx * sizeof(BUILTIN_POPCOUNT_TYPE); i < nBytes; ++i) { popcount += byte_popcounts[afp[i]]; } #endif return popcount; } unsigned int CalcBitmapNumBitsInCommon(const unsigned char* afp, const unsigned char* bfp, unsigned int nBytes) { PRECONDITION(afp, "no afp"); PRECONDITION(bfp, "no bfp"); unsigned int intersect_popcount = 0; #ifndef USE_BUILTIN_POPCOUNT for (unsigned int i = 0; i < nBytes; i++) { intersect_popcount += byte_popcounts[afp[i] & bfp[i]]; } #else BUILTIN_POPCOUNT_TYPE eidx = nBytes / sizeof(BUILTIN_POPCOUNT_TYPE); for (BUILTIN_POPCOUNT_TYPE i = 0; i < eidx; ++i) { intersect_popcount += static_cast(BUILTIN_POPCOUNT_INSTR( ((BUILTIN_POPCOUNT_TYPE*)afp)[i] & ((BUILTIN_POPCOUNT_TYPE*)bfp)[i])); } for (BUILTIN_POPCOUNT_TYPE i = eidx * sizeof(BUILTIN_POPCOUNT_TYPE); i < nBytes; ++i) { intersect_popcount += byte_popcounts[afp[i] & bfp[i]]; } #endif return intersect_popcount; } double CalcBitmapTanimoto(const unsigned char* afp, const unsigned char* bfp, unsigned int nBytes) { PRECONDITION(afp, "no afp"); PRECONDITION(bfp, "no bfp"); unsigned int union_popcount = 0, intersect_popcount = 0; #ifndef USE_BUILTIN_POPCOUNT for (unsigned int i = 0; i < nBytes; i++) { union_popcount += byte_popcounts[afp[i] | bfp[i]]; intersect_popcount += byte_popcounts[afp[i] & bfp[i]]; } #else BUILTIN_POPCOUNT_TYPE eidx = nBytes / sizeof(BUILTIN_POPCOUNT_TYPE); for (BUILTIN_POPCOUNT_TYPE i = 0; i < eidx; ++i) { union_popcount += static_cast(BUILTIN_POPCOUNT_INSTR( ((BUILTIN_POPCOUNT_TYPE*)afp)[i] | ((BUILTIN_POPCOUNT_TYPE*)bfp)[i])); intersect_popcount += static_cast(BUILTIN_POPCOUNT_INSTR( ((BUILTIN_POPCOUNT_TYPE*)afp)[i] & ((BUILTIN_POPCOUNT_TYPE*)bfp)[i])); } for (BUILTIN_POPCOUNT_TYPE i = eidx * sizeof(BUILTIN_POPCOUNT_TYPE); i < nBytes; ++i) { union_popcount += byte_popcounts[afp[i] | bfp[i]]; intersect_popcount += byte_popcounts[afp[i] & bfp[i]]; } #endif if (union_popcount == 0) { return 0.0; } return (intersect_popcount + 0.0) / union_popcount; /* +0.0 to coerce to double */ } double CalcBitmapDice(const unsigned char* afp, const unsigned char* bfp, unsigned int nBytes) { PRECONDITION(afp, "no afp"); PRECONDITION(bfp, "no bfp"); unsigned int intersect_popcount = 0, a_popcount = 0, b_popcount = 0; #ifndef USE_BUILTIN_POPCOUNT for (unsigned int i = 0; i < nBytes; i++) { a_popcount += byte_popcounts[afp[i]]; b_popcount += byte_popcounts[bfp[i]]; intersect_popcount += byte_popcounts[afp[i] & bfp[i]]; } #else BUILTIN_POPCOUNT_TYPE eidx = nBytes / sizeof(BUILTIN_POPCOUNT_TYPE); for (BUILTIN_POPCOUNT_TYPE i = 0; i < eidx; ++i) { a_popcount += static_cast( BUILTIN_POPCOUNT_INSTR(((BUILTIN_POPCOUNT_TYPE*)afp)[i])); b_popcount += static_cast( BUILTIN_POPCOUNT_INSTR(((BUILTIN_POPCOUNT_TYPE*)bfp)[i])); intersect_popcount += static_cast(BUILTIN_POPCOUNT_INSTR( ((BUILTIN_POPCOUNT_TYPE*)afp)[i] & ((BUILTIN_POPCOUNT_TYPE*)bfp)[i])); } for (BUILTIN_POPCOUNT_TYPE i = eidx * sizeof(BUILTIN_POPCOUNT_TYPE); i < nBytes; ++i) { a_popcount += byte_popcounts[afp[i]]; b_popcount += byte_popcounts[bfp[i]]; intersect_popcount += byte_popcounts[afp[i] & bfp[i]]; } #endif if (a_popcount + b_popcount == 0) { return 0.0; } return (2.0 * intersect_popcount) / (a_popcount + b_popcount); } double CalcBitmapTversky(const unsigned char* afp, const unsigned char* bfp, unsigned int nBytes, double ca, double cb) { PRECONDITION(afp, "no afp"); PRECONDITION(bfp, "no bfp"); unsigned int intersect_popcount = 0, acount = 0, bcount = 0; #ifndef USE_BUILTIN_POPCOUNT for (unsigned int i = 0; i < nBytes; i++) { intersect_popcount += byte_popcounts[afp[i] & bfp[i]]; acount += byte_popcounts[afp[i]]; bcount += byte_popcounts[bfp[i]]; } #else BUILTIN_POPCOUNT_TYPE eidx = nBytes / sizeof(BUILTIN_POPCOUNT_TYPE); for (BUILTIN_POPCOUNT_TYPE i = 0; i < eidx; ++i) { intersect_popcount += static_cast(BUILTIN_POPCOUNT_INSTR( ((BUILTIN_POPCOUNT_TYPE*)afp)[i] & ((BUILTIN_POPCOUNT_TYPE*)bfp)[i])); acount += static_cast( BUILTIN_POPCOUNT_INSTR(((BUILTIN_POPCOUNT_TYPE*)afp)[i])); bcount += static_cast( BUILTIN_POPCOUNT_INSTR(((BUILTIN_POPCOUNT_TYPE*)bfp)[i])); } for (BUILTIN_POPCOUNT_TYPE i = eidx * sizeof(BUILTIN_POPCOUNT_TYPE); i < nBytes; ++i) { intersect_popcount += byte_popcounts[afp[i] & bfp[i]]; acount += byte_popcounts[afp[i]]; bcount += byte_popcounts[bfp[i]]; } #endif double denom = ca * acount + cb * bcount + (1 - ca - cb) * intersect_popcount; if (denom == 0.0) { return 0.0; } return intersect_popcount / denom; } bool CalcBitmapAllProbeBitsMatch(const unsigned char* probe, const unsigned char* ref, unsigned int nBytes) { PRECONDITION(probe, "no probe"); PRECONDITION(ref, "no ref"); #ifndef USE_BUILTIN_POPCOUNT for (unsigned int i = 0; i < nBytes; i++) { if (byte_popcounts[probe[i] & ref[i]] != byte_popcounts[probe[i]]) { return false; } } #else unsigned int eidx = nBytes / sizeof(BUILTIN_POPCOUNT_TYPE); for (unsigned int i = 0; i < eidx; ++i) { if (BUILTIN_POPCOUNT_INSTR(((BUILTIN_POPCOUNT_TYPE*)probe)[i] & ((BUILTIN_POPCOUNT_TYPE*)ref)[i]) != BUILTIN_POPCOUNT_INSTR(((BUILTIN_POPCOUNT_TYPE*)probe)[i])) { return false; } } for (unsigned int i = eidx * sizeof(BUILTIN_POPCOUNT_TYPE); i < nBytes; ++i) { if (byte_popcounts[probe[i] & ref[i]] != byte_popcounts[probe[i]]) { return false; } } #endif return true; }