mirror of
https://github.com/rdkit/rdkit.git
synced 2026-06-04 21:54:27 +08:00
* for-loop modernization * declaration and initialization together * switch statements, min&max from std * switch statement * use of std min&max * fixed unsigned int to int comparison * implement switch statement * fix unsigned int to int comparison * revert previous mistake * Update Code/DataStructs/DiscreteValueVect.cpp Co-authored-by: Greg Landrum <greg.landrum@gmail.com> * implemented suggestions --------- Co-authored-by: Greg Landrum <greg.landrum@gmail.com>
1102 lines
35 KiB
C++
1102 lines
35 KiB
C++
// $Id$
|
|
//
|
|
// Copyright (C) 2003-2012 greg Landrum and Rational Discovery LLC
|
|
//
|
|
// @@ All Rights Reserved @@
|
|
// This file is part of the RDKit.
|
|
// The contents are covered by the terms of the BSD license
|
|
// which is included in the file license.txt, found at the root
|
|
// of the RDKit source tree.
|
|
//
|
|
#include "BitVects.h"
|
|
#include "BitOps.h"
|
|
#include <cmath>
|
|
#include <string>
|
|
#include <iostream>
|
|
#include <RDGeneral/StreamOps.h>
|
|
#include <RDGeneral/types.h>
|
|
#include <RDGeneral/Exceptions.h>
|
|
#include <sstream>
|
|
#include <cstdlib>
|
|
#include <algorithm>
|
|
|
|
#include <boost/lexical_cast.hpp>
|
|
|
|
#if _MSC_VER
|
|
#include <intrin.h>
|
|
#endif
|
|
|
|
using namespace RDKit;
|
|
|
|
int getBitId(const char*& text, int format, int size, int curr) {
|
|
PRECONDITION(text, "no text");
|
|
int res = -1;
|
|
if ((format == 0) ||
|
|
((format == 1) && (size >= std::numeric_limits<unsigned short>::max()))) {
|
|
int tmp =
|
|
EndianSwapBytes<LITTLE_ENDIAN_ORDER, HOST_ENDIAN_ORDER>(*(int*)text);
|
|
text += sizeof(tmp);
|
|
res = tmp;
|
|
} else if (format == 1) { // version 16 and on bits sorted as short ints
|
|
unsigned short tmp =
|
|
EndianSwapBytes<LITTLE_ENDIAN_ORDER, HOST_ENDIAN_ORDER>(
|
|
*(unsigned short*)text);
|
|
text += sizeof(tmp);
|
|
res = tmp;
|
|
} else if (format == 2) { // run length encoded format
|
|
res = curr + RDKit::pullPackedIntFromString(text);
|
|
}
|
|
return res;
|
|
}
|
|
|
|
bool AllProbeBitsMatch(const std::string& probe, const std::string& ref) {
|
|
return AllProbeBitsMatch(probe.c_str(), ref.c_str());
|
|
}
|
|
|
|
bool AllProbeBitsMatch(const char* probe, const char* ref) {
|
|
PRECONDITION(probe, "no probe text");
|
|
PRECONDITION(ref, "no probe text");
|
|
int probeFormat = 0;
|
|
int refFormat = 0;
|
|
int version = 0;
|
|
|
|
int probeSize =
|
|
EndianSwapBytes<LITTLE_ENDIAN_ORDER, HOST_ENDIAN_ORDER>(*(int*)probe);
|
|
probe += sizeof(probeSize);
|
|
if (probeSize < 0) {
|
|
version = -1 * probeSize;
|
|
switch (version) {
|
|
case 16:
|
|
probeFormat = 1;
|
|
break;
|
|
case 32:
|
|
probeFormat = 2;
|
|
break;
|
|
default:
|
|
throw ValueErrorException("Unknown version type for the encode bit vect");
|
|
break;
|
|
}
|
|
probeSize =
|
|
EndianSwapBytes<LITTLE_ENDIAN_ORDER, HOST_ENDIAN_ORDER>(*(int*)probe);
|
|
probe += sizeof(probeSize);
|
|
}
|
|
|
|
int refSize =
|
|
EndianSwapBytes<LITTLE_ENDIAN_ORDER, HOST_ENDIAN_ORDER>(*(int*)ref);
|
|
ref += sizeof(refSize);
|
|
if (refSize < 0) {
|
|
version = -1 * refSize;
|
|
switch (version) {
|
|
case 16:
|
|
refFormat = 1;
|
|
break;
|
|
case 32:
|
|
refFormat = 2;
|
|
break;
|
|
default:
|
|
throw ValueErrorException("Unknown version type for the encode bit vect");
|
|
break;
|
|
}
|
|
refSize =
|
|
EndianSwapBytes<LITTLE_ENDIAN_ORDER, HOST_ENDIAN_ORDER>(*(int*)ref);
|
|
ref += sizeof(refSize);
|
|
}
|
|
|
|
int nProbeOn =
|
|
EndianSwapBytes<LITTLE_ENDIAN_ORDER, HOST_ENDIAN_ORDER>(*(int*)probe);
|
|
probe += sizeof(nProbeOn);
|
|
int nRefOn =
|
|
EndianSwapBytes<LITTLE_ENDIAN_ORDER, HOST_ENDIAN_ORDER>(*(int*)ref);
|
|
ref += sizeof(nRefOn);
|
|
|
|
int currProbeBit = 0;
|
|
currProbeBit = getBitId(probe, probeFormat, probeSize, currProbeBit);
|
|
nProbeOn--;
|
|
|
|
int currRefBit = 0;
|
|
currRefBit = getBitId(ref, refFormat, refSize, currRefBit);
|
|
nRefOn--;
|
|
|
|
while (nProbeOn) {
|
|
while (currRefBit < currProbeBit && nRefOn > 0) {
|
|
if (refFormat == 2) {
|
|
currRefBit++;
|
|
}
|
|
currRefBit = getBitId(ref, refFormat, refSize, currRefBit);
|
|
nRefOn--;
|
|
}
|
|
if (currRefBit != currProbeBit) {
|
|
return false;
|
|
}
|
|
if (probeFormat == 2) {
|
|
currProbeBit++;
|
|
}
|
|
currProbeBit = getBitId(probe, probeFormat, probeSize, currProbeBit);
|
|
nProbeOn--;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
template <typename T1>
|
|
bool AllProbeBitsMatch(const T1& probe, const std::string& pkl) {
|
|
const char* text = pkl.c_str();
|
|
int format = 0;
|
|
int nOn = 0, size, version = 0;
|
|
size = EndianSwapBytes<LITTLE_ENDIAN_ORDER, HOST_ENDIAN_ORDER>(*(int*)text);
|
|
text += sizeof(size);
|
|
if (size < 0) {
|
|
version = -1 * size;
|
|
switch (version) {
|
|
case 16:
|
|
format = 1;
|
|
break;
|
|
case 32:
|
|
format = 2;
|
|
break;
|
|
default:
|
|
throw ValueErrorException("Unknown version type for the encode bit vect");
|
|
break;
|
|
}
|
|
size = EndianSwapBytes<LITTLE_ENDIAN_ORDER, HOST_ENDIAN_ORDER>(*(int*)text);
|
|
text += sizeof(size);
|
|
}
|
|
nOn = EndianSwapBytes<LITTLE_ENDIAN_ORDER, HOST_ENDIAN_ORDER>(*(int*)text);
|
|
text += sizeof(nOn);
|
|
|
|
int currBit = 0;
|
|
currBit = getBitId(text, format, size, currBit);
|
|
nOn--;
|
|
std::vector<int> obl;
|
|
probe.getOnBits(obl);
|
|
|
|
// for(int i=0;i<probe.getNumBits();i++){
|
|
// if(probe.getBit(i)){
|
|
for (std::vector<int>::const_iterator i = obl.begin(); i != obl.end(); i++) {
|
|
while (currBit < *i && nOn > 0) {
|
|
if (format == 2) {
|
|
currBit++;
|
|
}
|
|
currBit = getBitId(text, format, size, currBit);
|
|
nOn--;
|
|
}
|
|
if (currBit != *i) {
|
|
return false;
|
|
}
|
|
//}
|
|
}
|
|
return true;
|
|
}
|
|
template RDKIT_DATASTRUCTS_EXPORT bool AllProbeBitsMatch(
|
|
const SparseBitVect& bv1, const std::string& pkl);
|
|
template RDKIT_DATASTRUCTS_EXPORT bool AllProbeBitsMatch(
|
|
const ExplicitBitVect& bv1, const std::string& pkl);
|
|
template <typename T1>
|
|
bool AllProbeBitsMatch(const T1& probe, const T1& ref) {
|
|
for (unsigned int i = 0; i < probe.getNumBits(); ++i) {
|
|
if (probe.getBit(i) && !ref.getBit(i)) {
|
|
return false;
|
|
}
|
|
}
|
|
return true;
|
|
}
|
|
template RDKIT_DATASTRUCTS_EXPORT bool AllProbeBitsMatch(
|
|
const SparseBitVect& bv1, const SparseBitVect& bv2);
|
|
// template bool AllProbeBitsMatch(const ExplicitBitVect& bv1,const
|
|
// ExplicitBitVect &bv2);
|
|
|
|
bool AllProbeBitsMatch(const ExplicitBitVect& probe,
|
|
const ExplicitBitVect& ref) {
|
|
return probe.dp_bits->is_subset_of(*(ref.dp_bits));
|
|
}
|
|
|
|
// """ -------------------------------------------------------
|
|
//
|
|
// NumOnBitsInCommon(T1,T2)
|
|
// Returns the number of on bits which are set in both T1 and T2.
|
|
//
|
|
// """ -------------------------------------------------------
|
|
template <typename T1, typename T2>
|
|
int NumOnBitsInCommon(const T1& bv1, const T2& bv2) {
|
|
return static_cast<int>(OnBitsInCommon(bv1, bv2).size());
|
|
}
|
|
|
|
namespace {
|
|
struct bitset_impl {
|
|
std::vector<unsigned long> m_bits;
|
|
std::size_t m_num_bits;
|
|
};
|
|
|
|
const bool canUseBitmapHack =
|
|
sizeof(boost::dynamic_bitset<>) == sizeof(bitset_impl);
|
|
|
|
bool EBVToBitmap(const ExplicitBitVect& bv, const unsigned char*& fp,
|
|
unsigned int& nBytes) {
|
|
if (!canUseBitmapHack) {
|
|
return false;
|
|
}
|
|
const auto* p1 = (const bitset_impl*)(const void*)bv.dp_bits;
|
|
// Run-time sanity check (just in case)
|
|
if (p1->m_num_bits != bv.dp_bits->size()) {
|
|
return false;
|
|
}
|
|
fp = (const unsigned char*)p1->m_bits.data();
|
|
nBytes = (unsigned int)p1->m_num_bits / 8;
|
|
if (p1->m_num_bits % 8) {
|
|
++nBytes;
|
|
}
|
|
return true;
|
|
}
|
|
} // namespace
|
|
|
|
unsigned int CalcBitmapNumBitsInCommon(const unsigned char* afp,
|
|
const unsigned char* bfp,
|
|
unsigned int nBytes);
|
|
|
|
int NumOnBitsInCommon(const ExplicitBitVect& bv1, const ExplicitBitVect& bv2) {
|
|
// Don't try this at home, we (hope we) know what we're doing
|
|
const unsigned char *afp, *bfp;
|
|
unsigned int nBytes;
|
|
if (EBVToBitmap(bv1, afp, nBytes) && EBVToBitmap(bv2, bfp, nBytes)) {
|
|
unsigned int result = CalcBitmapNumBitsInCommon(afp, bfp, nBytes);
|
|
return (int)result;
|
|
}
|
|
|
|
return static_cast<int>(((*bv1.dp_bits) & (*bv2.dp_bits)).count());
|
|
}
|
|
|
|
// In all these similarity metrics the notation is selected to be
|
|
// consistent with J.W. Raymond and P. Willett, JCAMD _16_ 59-71 (2002)
|
|
// """ -------------------------------------------------------
|
|
//
|
|
// TanimotoSimilarity(T1,T2)
|
|
// returns the Tanimoto similarity between T1 and T2, a double.
|
|
//
|
|
// T1 and T2 should be the same length.
|
|
//
|
|
// C++ Notes: T1 and T2 must support operator&, getNumBits()
|
|
// and getOnBits().
|
|
//
|
|
// Python Notes: T1 and T2 are BitVects.
|
|
//
|
|
// """ -------------------------------------------------------
|
|
template <typename T1, typename T2>
|
|
double TanimotoSimilarity(const T1& bv1, const T2& bv2) {
|
|
if (bv1.getNumBits() != bv2.getNumBits()) {
|
|
throw ValueErrorException("BitVects must be same length");
|
|
}
|
|
unsigned int total = bv1.getNumOnBits() + bv2.getNumOnBits();
|
|
if (total == 0) {
|
|
return 1.0;
|
|
}
|
|
unsigned int common = NumOnBitsInCommon(bv1, bv2);
|
|
return (double)common / (double)(total - common);
|
|
}
|
|
|
|
template <typename T1, typename T2>
|
|
double TverskySimilarity(const T1& bv1, const T2& bv2, double a, double b) {
|
|
RANGE_CHECK(0, a, 1);
|
|
RANGE_CHECK(0, b, 1);
|
|
if (bv1.getNumBits() != bv2.getNumBits()) {
|
|
throw ValueErrorException("BitVects must be same length");
|
|
}
|
|
double x = NumOnBitsInCommon(bv1, bv2);
|
|
double y = bv1.getNumOnBits();
|
|
double z = bv2.getNumOnBits();
|
|
double denom = a * y + b * z + (1 - a - b) * x;
|
|
if (denom == 0.0) {
|
|
return 1.0;
|
|
} else {
|
|
return x / denom;
|
|
}
|
|
}
|
|
|
|
template <typename T1, typename T2>
|
|
double CosineSimilarity(const T1& bv1, const T2& bv2) {
|
|
if (bv1.getNumBits() != bv2.getNumBits()) {
|
|
throw ValueErrorException("BitVects must be same length");
|
|
}
|
|
double x = NumOnBitsInCommon(bv1, bv2);
|
|
double y = bv1.getNumOnBits();
|
|
double z = bv2.getNumOnBits();
|
|
|
|
if (y * z > 0.0) {
|
|
return x / sqrt(y * z);
|
|
} else {
|
|
return 0.0;
|
|
}
|
|
}
|
|
|
|
template <typename T1, typename T2>
|
|
double KulczynskiSimilarity(const T1& bv1, const T2& bv2) {
|
|
if (bv1.getNumBits() != bv2.getNumBits()) {
|
|
throw ValueErrorException("BitVects must be same length");
|
|
}
|
|
double x = NumOnBitsInCommon(bv1, bv2);
|
|
double y = bv1.getNumOnBits();
|
|
double z = bv2.getNumOnBits();
|
|
|
|
if (y * z > 0.0) {
|
|
return x * (y + z) / (2 * y * z);
|
|
} else {
|
|
return 0.0;
|
|
}
|
|
}
|
|
|
|
template <typename T1, typename T2>
|
|
double DiceSimilarity(const T1& bv1, const T2& bv2) {
|
|
if (bv1.getNumBits() != bv2.getNumBits()) {
|
|
throw ValueErrorException("BitVects must be same length");
|
|
}
|
|
double x = NumOnBitsInCommon(bv1, bv2);
|
|
double y = bv1.getNumOnBits();
|
|
double z = bv2.getNumOnBits();
|
|
|
|
if (y + z > 0.0) {
|
|
return 2 * x / (y + z);
|
|
} else {
|
|
return 0.0;
|
|
}
|
|
}
|
|
|
|
template <typename T1, typename T2>
|
|
double SokalSimilarity(const T1& bv1, const T2& bv2) {
|
|
if (bv1.getNumBits() != bv2.getNumBits()) {
|
|
throw ValueErrorException("BitVects must be same length");
|
|
}
|
|
double x = NumOnBitsInCommon(bv1, bv2);
|
|
double y = bv1.getNumOnBits();
|
|
double z = bv2.getNumOnBits();
|
|
|
|
return x / (2 * y + 2 * z - 3 * x);
|
|
}
|
|
|
|
template <typename T1, typename T2>
|
|
double McConnaugheySimilarity(const T1& bv1, const T2& bv2) {
|
|
if (bv1.getNumBits() != bv2.getNumBits()) {
|
|
throw ValueErrorException("BitVects must be same length");
|
|
}
|
|
double x = NumOnBitsInCommon(bv1, bv2);
|
|
double y = bv1.getNumOnBits();
|
|
double z = bv2.getNumOnBits();
|
|
|
|
if (y * z > 0.0) {
|
|
return (x * (y + z) - (y * z)) / (y * z);
|
|
} else {
|
|
return 0.0;
|
|
}
|
|
}
|
|
|
|
template <typename T>
|
|
inline T tmin(T v1, T v2) {
|
|
return std::min(v2, v1);
|
|
}
|
|
|
|
template <typename T>
|
|
inline T tmax(T v1, T v2) {
|
|
return std::max(v2, v1);
|
|
}
|
|
|
|
template <typename T1, typename T2>
|
|
double AsymmetricSimilarity(const T1& bv1, const T2& bv2) {
|
|
if (bv1.getNumBits() != bv2.getNumBits()) {
|
|
throw ValueErrorException("BitVects must be same length");
|
|
}
|
|
double x = NumOnBitsInCommon(bv1, bv2);
|
|
double y = bv1.getNumOnBits();
|
|
double z = bv2.getNumOnBits();
|
|
|
|
double min = tmin(y, z);
|
|
if (min > 0.0) {
|
|
return x / min;
|
|
} else {
|
|
return 0.0;
|
|
}
|
|
}
|
|
|
|
template <typename T1, typename T2>
|
|
double BraunBlanquetSimilarity(const T1& bv1, const T2& bv2) {
|
|
if (bv1.getNumBits() != bv2.getNumBits()) {
|
|
throw ValueErrorException("BitVects must be same length");
|
|
}
|
|
double x = NumOnBitsInCommon(bv1, bv2);
|
|
double y = bv1.getNumOnBits();
|
|
double z = bv2.getNumOnBits();
|
|
|
|
double max = tmax(y, z);
|
|
if (max > 0.0) {
|
|
return x / max;
|
|
} else {
|
|
return 0.0;
|
|
}
|
|
}
|
|
|
|
template <typename T1, typename T2>
|
|
double RusselSimilarity(const T1& bv1, const T2& bv2) {
|
|
if (bv1.getNumBits() != bv2.getNumBits()) {
|
|
throw ValueErrorException("BitVects must be same length");
|
|
}
|
|
double x = NumOnBitsInCommon(bv1, bv2);
|
|
return x / bv1.getNumBits();
|
|
}
|
|
|
|
template <typename T1, typename T2>
|
|
double RogotGoldbergSimilarity(const T1& bv1, const T2& bv2) {
|
|
if (bv1.getNumBits() != bv2.getNumBits()) {
|
|
throw ValueErrorException("BitVects must be same length");
|
|
}
|
|
double x = NumOnBitsInCommon(bv1, bv2);
|
|
double y = bv1.getNumOnBits();
|
|
double z = bv2.getNumOnBits();
|
|
double l = bv1.getNumBits();
|
|
double d = l - y - z + x;
|
|
|
|
double denom1 = y + z;
|
|
double denom2 = 2 * l - y - z;
|
|
if ((x == l) || (d == l)) {
|
|
return 1.0;
|
|
} else if (denom1 == 0 || denom2 == 0) {
|
|
return 0.0;
|
|
} else {
|
|
return (x / (y + z) + (d) / (2 * l - y - z));
|
|
}
|
|
}
|
|
|
|
// """ -------------------------------------------------------
|
|
//
|
|
// OnBitSimilarity(T1,T2)
|
|
// Returns the percentage of possible on bits in common
|
|
// between T1 and T2 (a double)
|
|
//
|
|
// C++ Notes: T1 and T2 must support operator|, operator&
|
|
// and getOnBits().
|
|
//
|
|
// Python Notes: T1 and T2 are BitVects.
|
|
//
|
|
// """ -------------------------------------------------------
|
|
template <typename T1, typename T2>
|
|
double OnBitSimilarity(const T1& bv1, const T2& bv2) {
|
|
if (bv1.getNumBits() != bv2.getNumBits()) {
|
|
throw ValueErrorException("BitVects must be same length");
|
|
}
|
|
|
|
double num = NumOnBitsInCommon(bv1, bv2);
|
|
double denom = (bv1 | bv2).getNumOnBits();
|
|
|
|
if (denom > 0) {
|
|
return num / denom;
|
|
} else {
|
|
return 0;
|
|
}
|
|
}
|
|
|
|
// """ -------------------------------------------------------
|
|
//
|
|
// NumBitsInCommon(T1,T2)
|
|
// Returns the number of bits in common (on and off)
|
|
// between T1 and T2 (an int)
|
|
//
|
|
// T1 and T2 should be the same length.
|
|
//
|
|
// C++ Notes: T1 and T2 must support operator^, getNumBits().
|
|
//
|
|
// Python Notes: T1 and T2 are BitVects.
|
|
//
|
|
// """ -------------------------------------------------------
|
|
template <typename T1, typename T2>
|
|
int NumBitsInCommon(const T1& bv1, const T2& bv2) {
|
|
if (bv1.getNumBits() != bv2.getNumBits()) {
|
|
throw ValueErrorException("BitVects must be same length");
|
|
}
|
|
|
|
return bv1.getNumBits() - (bv1 ^ bv2).getNumOnBits();
|
|
}
|
|
|
|
int NumBitsInCommon(const ExplicitBitVect& bv1, const ExplicitBitVect& bv2) {
|
|
return bv1.getNumBits() -
|
|
static_cast<int>(((*bv1.dp_bits) ^ (*bv2.dp_bits)).count());
|
|
}
|
|
|
|
// """ -------------------------------------------------------
|
|
//
|
|
// AllBitSimilarity(T1,T2)
|
|
// Returns the percentage of bits in common (on and off)
|
|
// between T1 and T2 (a double)
|
|
//
|
|
// T1 and T2 should be the same length.
|
|
//
|
|
// C++ Notes: T1 and T2 must support operator^, getNumBits()
|
|
// and getNumOnBits().
|
|
//
|
|
// Python Notes: T1 and T2 are BitVects.
|
|
//
|
|
// """ -------------------------------------------------------
|
|
template <typename T1, typename T2>
|
|
double AllBitSimilarity(const T1& bv1, const T2& bv2) {
|
|
if (bv1.getNumBits() != bv2.getNumBits()) {
|
|
throw ValueErrorException("BitVects must be same length");
|
|
}
|
|
|
|
return double(NumBitsInCommon(bv1, bv2)) / bv1.getNumBits();
|
|
}
|
|
|
|
// """ -------------------------------------------------------
|
|
//
|
|
// OnBitsInCommon(T1,T2)
|
|
// Returns the on bits which are set in both T1 and T2.
|
|
//
|
|
// T1 and T2 should be the same length.
|
|
//
|
|
// C++ Notes: T1 and T2 must support operator&, getNumBits()
|
|
// and getOnBits(), the return value is an IntVect.
|
|
//
|
|
// Python Notes: T1 and T2 are BitVects, the return value
|
|
// is a tuple of ints.
|
|
//
|
|
// """ -------------------------------------------------------
|
|
template <typename T1, typename T2>
|
|
IntVect OnBitsInCommon(const T1& bv1, const T2& bv2) {
|
|
if (bv1.getNumBits() != bv2.getNumBits()) {
|
|
throw ValueErrorException("BitVects must be same length");
|
|
}
|
|
IntVect res;
|
|
(bv1 & bv2).getOnBits(res);
|
|
return res;
|
|
}
|
|
|
|
// """ -------------------------------------------------------
|
|
//
|
|
// OffBitsInCommon(T1,T2)
|
|
// Returns the off bits which are set in both T1 and T2.
|
|
//
|
|
// T1 and T2 should be the same length.
|
|
//
|
|
// C++ Notes: T1 and T2 must support operator|, operator~,
|
|
// getNumBits() and getOnBits(), the return value is an IntVect.
|
|
//
|
|
// Python Notes: T1 and T2 are BitVects, the return value
|
|
// is a tuple of ints.
|
|
//
|
|
// """ -------------------------------------------------------
|
|
template <typename T1, typename T2>
|
|
IntVect OffBitsInCommon(const T1& bv1, const T2& bv2) {
|
|
if (bv1.getNumBits() != bv2.getNumBits()) {
|
|
throw ValueErrorException("BitVects must be same length");
|
|
}
|
|
IntVect res;
|
|
(~(bv1 | bv2)).getOnBits(res);
|
|
return res;
|
|
}
|
|
|
|
// """ -------------------------------------------------------
|
|
//
|
|
// OnBitProjSimilarity(T1,T2)
|
|
// Returns the projected similarity between the on bits of
|
|
// T1 and T2.
|
|
//
|
|
// The on bit projected similarity of T1 onto T2 is the
|
|
// percentage of T1's on bits which are on in T2.
|
|
//
|
|
// This type of measure may be useful for substructure-type
|
|
// searches.
|
|
//
|
|
// Two values are returned, the projection of T1 onto T2
|
|
// and the projection of T2 onto T1
|
|
//
|
|
// T1 and T2 should be the same length.
|
|
//
|
|
// C++ Notes: T1 and T2 must support operator&, getNumBits()
|
|
// and getNumOnBits(), the return value is an DoubleVect with
|
|
// two elements.
|
|
//
|
|
// Python Notes: T1 and T2 are BitVects, the return value
|
|
// is a 2-tuple of doubles.
|
|
//
|
|
// """ -------------------------------------------------------
|
|
template <typename T1, typename T2>
|
|
DoubleVect OnBitProjSimilarity(const T1& bv1, const T2& bv2) {
|
|
if (bv1.getNumBits() != bv2.getNumBits()) {
|
|
throw ValueErrorException("BitVects must be same length");
|
|
}
|
|
DoubleVect res(2, 0.0);
|
|
double num = NumOnBitsInCommon(bv1, bv2);
|
|
if (num) {
|
|
res[0] = num / bv1.getNumOnBits();
|
|
res[1] = num / bv2.getNumOnBits();
|
|
}
|
|
return res;
|
|
}
|
|
|
|
// """ -------------------------------------------------------
|
|
//
|
|
// OffBitProjSimilarity(T1,T2)
|
|
// Returns the projected similarity between the off bits of
|
|
// T1 and T2.
|
|
//
|
|
// The off bit projected similarity of T1 onto T2 is the
|
|
// percentage of T1's off bits which are off in T2.
|
|
//
|
|
// This type of measure may be useful for substructure-type
|
|
// searches.
|
|
//
|
|
// Two values are returned, the projection of T1 onto T2
|
|
// and the projection of T2 onto T1
|
|
//
|
|
// T1 and T2 should be the same length.
|
|
//
|
|
// C++ Notes: T1 and T2 must support operator|, getNumBits()
|
|
// and getNumOffBits(), the return value is an DoubleVect with
|
|
// two elements.
|
|
//
|
|
// Python Notes: T1 and T2 are BitVects, the return value
|
|
// is a 2-tuple of doubles.
|
|
//
|
|
// """ -------------------------------------------------------
|
|
template <typename T1, typename T2>
|
|
DoubleVect OffBitProjSimilarity(const T1& bv1, const T2& bv2) {
|
|
if (bv1.getNumBits() != bv2.getNumBits()) {
|
|
throw ValueErrorException("BitVects must be same length");
|
|
}
|
|
DoubleVect res(2, 0.0);
|
|
double num = (bv1 | bv2).getNumOffBits();
|
|
if (num) {
|
|
res[0] = num / bv1.getNumOffBits();
|
|
res[1] = num / bv2.getNumOffBits();
|
|
}
|
|
return res;
|
|
}
|
|
|
|
template <typename T1>
|
|
T1* FoldFingerprint(const T1& bv1, unsigned int factor) {
|
|
if (factor <= 0 || factor >= bv1.getNumBits()) {
|
|
throw ValueErrorException("invalid fold factor");
|
|
}
|
|
|
|
int initSize = bv1.getNumBits();
|
|
int resSize = initSize / factor;
|
|
auto* res = new T1(resSize);
|
|
|
|
IntVect onBits;
|
|
bv1.getOnBits(onBits);
|
|
for (int& onBit : onBits) {
|
|
int pos = onBit % resSize;
|
|
res->setBit(pos);
|
|
}
|
|
return res;
|
|
}
|
|
|
|
template <typename T1>
|
|
std::string BitVectToText(const T1& bv1) {
|
|
std::string res(bv1.getNumBits(), '0');
|
|
for (unsigned int i = 0; i < bv1.getNumBits(); i++) {
|
|
if (bv1.getBit(i)) {
|
|
res[i] = '1';
|
|
}
|
|
}
|
|
return res;
|
|
}
|
|
|
|
const char bin2Hex[] = {'0', '1', '2', '3', '4', '5', '6', '7',
|
|
'8', '9', 'a', 'b', 'c', 'd', 'e', 'f'};
|
|
template <typename T1>
|
|
std::string BitVectToFPSText(const T1& bv1) {
|
|
unsigned int size =
|
|
2 * (bv1.getNumBits() / 8 + (bv1.getNumBits() % 8 ? 1 : 0));
|
|
std::string res(size, 0);
|
|
unsigned char c = 0;
|
|
unsigned int byte = 0;
|
|
for (unsigned int i = 0; i < bv1.getNumBits(); i++) {
|
|
if (bv1.getBit(i)) {
|
|
c |= 1 << (i % 8);
|
|
}
|
|
if (!((i + 1) % 8)) {
|
|
res[byte++] = bin2Hex[(c >> 4) % 16];
|
|
res[byte++] = bin2Hex[c % 16];
|
|
c = 0;
|
|
}
|
|
}
|
|
if (byte < size) {
|
|
res[byte++] = bin2Hex[(c >> 4) % 16];
|
|
res[byte++] = bin2Hex[c % 16];
|
|
}
|
|
return res;
|
|
}
|
|
|
|
template <typename T1>
|
|
std::string BitVectToBinaryText(const T1& bv1) {
|
|
std::string res(bv1.getNumBits() / 8 + (bv1.getNumBits() % 8 ? 1 : 0), 0);
|
|
unsigned char c = 0;
|
|
unsigned int byte = 0;
|
|
for (unsigned int i = 0; i < bv1.getNumBits(); i++) {
|
|
if (bv1.getBit(i)) {
|
|
c |= 1 << (i % 8);
|
|
}
|
|
if (!((i + 1) % 8)) {
|
|
res[byte++] = c;
|
|
c = 0;
|
|
}
|
|
}
|
|
if (bv1.getNumBits() % 8) {
|
|
res[byte] = c;
|
|
}
|
|
return res;
|
|
}
|
|
|
|
template <typename T1>
|
|
void UpdateBitVectFromFPSText(T1& bv1, const std::string& fps) {
|
|
PRECONDITION(fps.length() * 4 >= bv1.getNumBits(), "bad FPS length");
|
|
PRECONDITION(fps.length() % 2 == 0, "bad FPS length");
|
|
unsigned int bitIdx = 0;
|
|
char tptr[3];
|
|
tptr[2] = (char)0;
|
|
for (unsigned int i = 0; i < fps.size() && bitIdx < bv1.getNumBits();
|
|
i += 2) {
|
|
unsigned short c = 0;
|
|
try {
|
|
tptr[0] = fps[i];
|
|
tptr[1] = fps[i + 1];
|
|
c = static_cast<unsigned short>(strtol(tptr, nullptr, 16));
|
|
} catch (...) {
|
|
std::ostringstream errout;
|
|
errout << "Cannot convert FPS word: " << fps.substr(i, 2) << " to int";
|
|
std::cerr << errout.str() << std::endl;
|
|
throw ValueErrorException(errout.str());
|
|
}
|
|
for (unsigned int bit = 0; bit < 8 && bitIdx < bv1.getNumBits();
|
|
++bit, ++bitIdx) {
|
|
if (c & (1 << bit)) {
|
|
bv1.setBit(bitIdx);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
template <typename T1>
|
|
void UpdateBitVectFromBinaryText(T1& bv1, const std::string& fps) {
|
|
PRECONDITION(fps.length() * 8 >= bv1.getNumBits(), "bad FPS length");
|
|
unsigned int bitIdx = 0;
|
|
for (unsigned int i = 0; i < fps.size() && bitIdx < bv1.getNumBits(); i++) {
|
|
unsigned short c = fps[i];
|
|
for (unsigned int bit = 0; bit < 8 && bitIdx < bv1.getNumBits();
|
|
++bit, ++bitIdx) {
|
|
if (c & (1 << bit)) {
|
|
bv1.setBit(bitIdx);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
template RDKIT_DATASTRUCTS_EXPORT double TanimotoSimilarity(
|
|
const SparseBitVect& bv1, const SparseBitVect& bv2);
|
|
template RDKIT_DATASTRUCTS_EXPORT double TverskySimilarity(
|
|
const SparseBitVect& bv1, const SparseBitVect& bv2, double a, double b);
|
|
template RDKIT_DATASTRUCTS_EXPORT double CosineSimilarity(
|
|
const SparseBitVect& bv1, const SparseBitVect& bv2);
|
|
template RDKIT_DATASTRUCTS_EXPORT double KulczynskiSimilarity(
|
|
const SparseBitVect& bv1, const SparseBitVect& bv2);
|
|
template RDKIT_DATASTRUCTS_EXPORT double DiceSimilarity(
|
|
const SparseBitVect& bv1, const SparseBitVect& bv2);
|
|
template RDKIT_DATASTRUCTS_EXPORT double SokalSimilarity(
|
|
const SparseBitVect& bv1, const SparseBitVect& bv2);
|
|
template RDKIT_DATASTRUCTS_EXPORT double McConnaugheySimilarity(
|
|
const SparseBitVect& bv1, const SparseBitVect& bv2);
|
|
template RDKIT_DATASTRUCTS_EXPORT double AsymmetricSimilarity(
|
|
const SparseBitVect& bv1, const SparseBitVect& bv2);
|
|
template RDKIT_DATASTRUCTS_EXPORT double BraunBlanquetSimilarity(
|
|
const SparseBitVect& bv1, const SparseBitVect& bv2);
|
|
template RDKIT_DATASTRUCTS_EXPORT double RusselSimilarity(
|
|
const SparseBitVect& bv1, const SparseBitVect& bv2);
|
|
template RDKIT_DATASTRUCTS_EXPORT double RogotGoldbergSimilarity(
|
|
const SparseBitVect& bv1, const SparseBitVect& bv2);
|
|
template RDKIT_DATASTRUCTS_EXPORT double OnBitSimilarity(
|
|
const SparseBitVect& bv1, const SparseBitVect& bv2);
|
|
template RDKIT_DATASTRUCTS_EXPORT int NumBitsInCommon(const SparseBitVect& bv1,
|
|
const SparseBitVect& bv2);
|
|
template RDKIT_DATASTRUCTS_EXPORT double AllBitSimilarity(
|
|
const SparseBitVect& bv1, const SparseBitVect& bv2);
|
|
template RDKIT_DATASTRUCTS_EXPORT int NumOnBitsInCommon(
|
|
const SparseBitVect& bv1, const SparseBitVect& bv2);
|
|
template RDKIT_DATASTRUCTS_EXPORT IntVect
|
|
OnBitsInCommon(const SparseBitVect& bv1, const SparseBitVect& bv2);
|
|
template RDKIT_DATASTRUCTS_EXPORT IntVect
|
|
OffBitsInCommon(const SparseBitVect& bv1, const SparseBitVect& bv2);
|
|
template RDKIT_DATASTRUCTS_EXPORT DoubleVect
|
|
OnBitProjSimilarity(const SparseBitVect& bv1, const SparseBitVect& bv2);
|
|
template RDKIT_DATASTRUCTS_EXPORT DoubleVect
|
|
OffBitProjSimilarity(const SparseBitVect& bv1, const SparseBitVect& bv2);
|
|
|
|
template RDKIT_DATASTRUCTS_EXPORT double TanimotoSimilarity(
|
|
const ExplicitBitVect& bv1, const ExplicitBitVect& bv2);
|
|
template RDKIT_DATASTRUCTS_EXPORT double TverskySimilarity(
|
|
const ExplicitBitVect& bv1, const ExplicitBitVect& bv2, double a, double b);
|
|
template RDKIT_DATASTRUCTS_EXPORT double CosineSimilarity(
|
|
const ExplicitBitVect& bv1, const ExplicitBitVect& bv2);
|
|
template RDKIT_DATASTRUCTS_EXPORT double KulczynskiSimilarity(
|
|
const ExplicitBitVect& bv1, const ExplicitBitVect& bv2);
|
|
template RDKIT_DATASTRUCTS_EXPORT double DiceSimilarity(
|
|
const ExplicitBitVect& bv1, const ExplicitBitVect& bv2);
|
|
template RDKIT_DATASTRUCTS_EXPORT double SokalSimilarity(
|
|
const ExplicitBitVect& bv1, const ExplicitBitVect& bv2);
|
|
template RDKIT_DATASTRUCTS_EXPORT double McConnaugheySimilarity(
|
|
const ExplicitBitVect& bv1, const ExplicitBitVect& bv2);
|
|
template RDKIT_DATASTRUCTS_EXPORT double AsymmetricSimilarity(
|
|
const ExplicitBitVect& bv1, const ExplicitBitVect& bv2);
|
|
template RDKIT_DATASTRUCTS_EXPORT double BraunBlanquetSimilarity(
|
|
const ExplicitBitVect& bv1, const ExplicitBitVect& bv2);
|
|
template RDKIT_DATASTRUCTS_EXPORT double RusselSimilarity(
|
|
const ExplicitBitVect& bv1, const ExplicitBitVect& bv2);
|
|
template RDKIT_DATASTRUCTS_EXPORT double RogotGoldbergSimilarity(
|
|
const ExplicitBitVect& bv1, const ExplicitBitVect& bv2);
|
|
template RDKIT_DATASTRUCTS_EXPORT double OnBitSimilarity(
|
|
const ExplicitBitVect& bv1, const ExplicitBitVect& bv2);
|
|
template RDKIT_DATASTRUCTS_EXPORT int NumBitsInCommon(
|
|
const ExplicitBitVect& bv1, const ExplicitBitVect& bv2);
|
|
template RDKIT_DATASTRUCTS_EXPORT double AllBitSimilarity(
|
|
const ExplicitBitVect& bv1, const ExplicitBitVect& bv2);
|
|
template RDKIT_DATASTRUCTS_EXPORT IntVect
|
|
OnBitsInCommon(const ExplicitBitVect& bv1, const ExplicitBitVect& bv2);
|
|
template RDKIT_DATASTRUCTS_EXPORT IntVect
|
|
OffBitsInCommon(const ExplicitBitVect& bv1, const ExplicitBitVect& bv2);
|
|
template RDKIT_DATASTRUCTS_EXPORT DoubleVect
|
|
OnBitProjSimilarity(const ExplicitBitVect& bv1, const ExplicitBitVect& bv2);
|
|
template RDKIT_DATASTRUCTS_EXPORT DoubleVect
|
|
OffBitProjSimilarity(const ExplicitBitVect& bv1, const ExplicitBitVect& bv2);
|
|
|
|
template RDKIT_DATASTRUCTS_EXPORT SparseBitVect* FoldFingerprint(
|
|
const SparseBitVect&, unsigned int);
|
|
template RDKIT_DATASTRUCTS_EXPORT ExplicitBitVect* FoldFingerprint(
|
|
const ExplicitBitVect&, unsigned int);
|
|
|
|
template RDKIT_DATASTRUCTS_EXPORT std::string BitVectToText(
|
|
const SparseBitVect&);
|
|
template RDKIT_DATASTRUCTS_EXPORT std::string BitVectToText(
|
|
const ExplicitBitVect&);
|
|
|
|
template RDKIT_DATASTRUCTS_EXPORT std::string BitVectToFPSText(
|
|
const SparseBitVect&);
|
|
template RDKIT_DATASTRUCTS_EXPORT std::string BitVectToFPSText(
|
|
const ExplicitBitVect&);
|
|
template RDKIT_DATASTRUCTS_EXPORT void UpdateBitVectFromFPSText(
|
|
SparseBitVect&, const std::string&);
|
|
template RDKIT_DATASTRUCTS_EXPORT void UpdateBitVectFromFPSText(
|
|
ExplicitBitVect&, const std::string&);
|
|
|
|
template RDKIT_DATASTRUCTS_EXPORT std::string BitVectToBinaryText(
|
|
const SparseBitVect&);
|
|
template RDKIT_DATASTRUCTS_EXPORT std::string BitVectToBinaryText(
|
|
const ExplicitBitVect&);
|
|
template RDKIT_DATASTRUCTS_EXPORT void UpdateBitVectFromBinaryText(
|
|
SparseBitVect&, const std::string&);
|
|
template RDKIT_DATASTRUCTS_EXPORT void UpdateBitVectFromBinaryText(
|
|
ExplicitBitVect&, const std::string&);
|
|
|
|
// from here:
|
|
// http://stackoverflow.com/questions/3849337/msvc-equivalent-to-builtin-popcount
|
|
// but corrected to get the ifdef right
|
|
#ifdef _MSC_VER
|
|
#include <intrin.h>
|
|
#ifdef _WIN64
|
|
#define BUILTIN_POPCOUNT_INSTR __popcnt64
|
|
using BUILTIN_POPCOUNT_TYPE = boost::uint64_t;
|
|
#else
|
|
#define BUILTIN_POPCOUNT_INSTR __popcnt
|
|
using BUILTIN_POPCOUNT_TYPE = std::uint32_t;
|
|
#endif
|
|
#else
|
|
#define BUILTIN_POPCOUNT_INSTR __builtin_popcountll
|
|
using BUILTIN_POPCOUNT_TYPE = boost::uint64_t;
|
|
#endif
|
|
|
|
// the Bitmap Tanimoto and Dice similarity code is adapted
|
|
// from Andrew Dalke's chem-fingerprints code
|
|
// http://code.google.com/p/chem-fingerprints/
|
|
namespace {
|
|
static int byte_popcounts[] = {
|
|
0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4,
|
|
2, 3, 3, 4, 3, 4, 4, 5, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
|
|
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 1, 2, 2, 3, 2, 3, 3, 4,
|
|
2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
|
|
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6,
|
|
4, 5, 5, 6, 5, 6, 6, 7, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
|
|
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5,
|
|
3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
|
|
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6,
|
|
4, 5, 5, 6, 5, 6, 6, 7, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
|
|
4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8};
|
|
}
|
|
unsigned int CalcBitmapPopcount(const unsigned char* afp, unsigned int nBytes) {
|
|
PRECONDITION(afp, "no afp");
|
|
unsigned int popcount = 0;
|
|
#ifndef RDK_OPTIMIZE_POPCNT
|
|
for (unsigned int i = 0; i < nBytes; i++) {
|
|
popcount += byte_popcounts[afp[i]];
|
|
}
|
|
#else
|
|
unsigned int eidx = nBytes / sizeof(BUILTIN_POPCOUNT_TYPE);
|
|
for (unsigned int i = 0; i < eidx; ++i) {
|
|
popcount += static_cast<unsigned int>(
|
|
BUILTIN_POPCOUNT_INSTR(((BUILTIN_POPCOUNT_TYPE*)afp)[i]));
|
|
}
|
|
for (unsigned int i = eidx * sizeof(BUILTIN_POPCOUNT_TYPE); i < nBytes; ++i) {
|
|
popcount += byte_popcounts[afp[i]];
|
|
}
|
|
#endif
|
|
return popcount;
|
|
}
|
|
|
|
unsigned int CalcBitmapNumBitsInCommon(const unsigned char* afp,
|
|
const unsigned char* bfp,
|
|
unsigned int nBytes) {
|
|
PRECONDITION(afp, "no afp");
|
|
PRECONDITION(bfp, "no bfp");
|
|
unsigned int intersect_popcount = 0;
|
|
#ifndef RDK_OPTIMIZE_POPCNT
|
|
for (unsigned int i = 0; i < nBytes; i++) {
|
|
intersect_popcount += byte_popcounts[afp[i] & bfp[i]];
|
|
}
|
|
#else
|
|
BUILTIN_POPCOUNT_TYPE eidx = nBytes / sizeof(BUILTIN_POPCOUNT_TYPE);
|
|
for (BUILTIN_POPCOUNT_TYPE i = 0; i < eidx; ++i) {
|
|
intersect_popcount += static_cast<unsigned int>(BUILTIN_POPCOUNT_INSTR(
|
|
((BUILTIN_POPCOUNT_TYPE*)afp)[i] & ((BUILTIN_POPCOUNT_TYPE*)bfp)[i]));
|
|
}
|
|
for (BUILTIN_POPCOUNT_TYPE i = eidx * sizeof(BUILTIN_POPCOUNT_TYPE);
|
|
i < nBytes; ++i) {
|
|
intersect_popcount += byte_popcounts[afp[i] & bfp[i]];
|
|
}
|
|
#endif
|
|
return intersect_popcount;
|
|
}
|
|
|
|
double CalcBitmapTanimoto(const unsigned char* afp, const unsigned char* bfp,
|
|
unsigned int nBytes) {
|
|
PRECONDITION(afp, "no afp");
|
|
PRECONDITION(bfp, "no bfp");
|
|
unsigned int union_popcount = 0, intersect_popcount = 0;
|
|
#ifndef RDK_OPTIMIZE_POPCNT
|
|
for (unsigned int i = 0; i < nBytes; i++) {
|
|
union_popcount += byte_popcounts[afp[i] | bfp[i]];
|
|
intersect_popcount += byte_popcounts[afp[i] & bfp[i]];
|
|
}
|
|
#else
|
|
BUILTIN_POPCOUNT_TYPE eidx = nBytes / sizeof(BUILTIN_POPCOUNT_TYPE);
|
|
for (BUILTIN_POPCOUNT_TYPE i = 0; i < eidx; ++i) {
|
|
union_popcount += static_cast<unsigned int>(BUILTIN_POPCOUNT_INSTR(
|
|
((BUILTIN_POPCOUNT_TYPE*)afp)[i] | ((BUILTIN_POPCOUNT_TYPE*)bfp)[i]));
|
|
intersect_popcount += static_cast<unsigned int>(BUILTIN_POPCOUNT_INSTR(
|
|
((BUILTIN_POPCOUNT_TYPE*)afp)[i] & ((BUILTIN_POPCOUNT_TYPE*)bfp)[i]));
|
|
}
|
|
for (BUILTIN_POPCOUNT_TYPE i = eidx * sizeof(BUILTIN_POPCOUNT_TYPE);
|
|
i < nBytes; ++i) {
|
|
union_popcount += byte_popcounts[afp[i] | bfp[i]];
|
|
intersect_popcount += byte_popcounts[afp[i] & bfp[i]];
|
|
}
|
|
#endif
|
|
if (union_popcount == 0) {
|
|
return 0.0;
|
|
}
|
|
return (intersect_popcount + 0.0) /
|
|
union_popcount; /* +0.0 to coerce to double */
|
|
}
|
|
|
|
double CalcBitmapDice(const unsigned char* afp, const unsigned char* bfp,
|
|
unsigned int nBytes) {
|
|
PRECONDITION(afp, "no afp");
|
|
PRECONDITION(bfp, "no bfp");
|
|
unsigned int intersect_popcount = 0, a_popcount = 0, b_popcount = 0;
|
|
|
|
#ifndef RDK_OPTIMIZE_POPCNT
|
|
for (unsigned int i = 0; i < nBytes; i++) {
|
|
a_popcount += byte_popcounts[afp[i]];
|
|
b_popcount += byte_popcounts[bfp[i]];
|
|
intersect_popcount += byte_popcounts[afp[i] & bfp[i]];
|
|
}
|
|
#else
|
|
BUILTIN_POPCOUNT_TYPE eidx = nBytes / sizeof(BUILTIN_POPCOUNT_TYPE);
|
|
for (BUILTIN_POPCOUNT_TYPE i = 0; i < eidx; ++i) {
|
|
a_popcount += static_cast<unsigned int>(
|
|
BUILTIN_POPCOUNT_INSTR(((BUILTIN_POPCOUNT_TYPE*)afp)[i]));
|
|
b_popcount += static_cast<unsigned int>(
|
|
BUILTIN_POPCOUNT_INSTR(((BUILTIN_POPCOUNT_TYPE*)bfp)[i]));
|
|
intersect_popcount += static_cast<unsigned int>(BUILTIN_POPCOUNT_INSTR(
|
|
((BUILTIN_POPCOUNT_TYPE*)afp)[i] & ((BUILTIN_POPCOUNT_TYPE*)bfp)[i]));
|
|
}
|
|
for (BUILTIN_POPCOUNT_TYPE i = eidx * sizeof(BUILTIN_POPCOUNT_TYPE);
|
|
i < nBytes; ++i) {
|
|
a_popcount += byte_popcounts[afp[i]];
|
|
b_popcount += byte_popcounts[bfp[i]];
|
|
intersect_popcount += byte_popcounts[afp[i] & bfp[i]];
|
|
}
|
|
#endif
|
|
|
|
if (a_popcount + b_popcount == 0) {
|
|
return 0.0;
|
|
}
|
|
return (2.0 * intersect_popcount) / (a_popcount + b_popcount);
|
|
}
|
|
|
|
double CalcBitmapTversky(const unsigned char* afp, const unsigned char* bfp,
|
|
unsigned int nBytes, double ca, double cb) {
|
|
PRECONDITION(afp, "no afp");
|
|
PRECONDITION(bfp, "no bfp");
|
|
unsigned int intersect_popcount = 0, acount = 0, bcount = 0;
|
|
|
|
#ifndef RDK_OPTIMIZE_POPCNT
|
|
for (unsigned int i = 0; i < nBytes; i++) {
|
|
intersect_popcount += byte_popcounts[afp[i] & bfp[i]];
|
|
acount += byte_popcounts[afp[i]];
|
|
bcount += byte_popcounts[bfp[i]];
|
|
}
|
|
#else
|
|
BUILTIN_POPCOUNT_TYPE eidx = nBytes / sizeof(BUILTIN_POPCOUNT_TYPE);
|
|
for (BUILTIN_POPCOUNT_TYPE i = 0; i < eidx; ++i) {
|
|
intersect_popcount += static_cast<unsigned int>(BUILTIN_POPCOUNT_INSTR(
|
|
((BUILTIN_POPCOUNT_TYPE*)afp)[i] & ((BUILTIN_POPCOUNT_TYPE*)bfp)[i]));
|
|
acount += static_cast<unsigned int>(
|
|
BUILTIN_POPCOUNT_INSTR(((BUILTIN_POPCOUNT_TYPE*)afp)[i]));
|
|
bcount += static_cast<unsigned int>(
|
|
BUILTIN_POPCOUNT_INSTR(((BUILTIN_POPCOUNT_TYPE*)bfp)[i]));
|
|
}
|
|
for (BUILTIN_POPCOUNT_TYPE i = eidx * sizeof(BUILTIN_POPCOUNT_TYPE);
|
|
i < nBytes; ++i) {
|
|
intersect_popcount += byte_popcounts[afp[i] & bfp[i]];
|
|
acount += byte_popcounts[afp[i]];
|
|
bcount += byte_popcounts[bfp[i]];
|
|
}
|
|
#endif
|
|
double denom = ca * acount + cb * bcount + (1 - ca - cb) * intersect_popcount;
|
|
if (denom == 0.0) {
|
|
return 0.0;
|
|
}
|
|
return intersect_popcount / denom;
|
|
}
|
|
|
|
bool CalcBitmapAllProbeBitsMatch(const unsigned char* probe,
|
|
const unsigned char* ref,
|
|
unsigned int nBytes) {
|
|
PRECONDITION(probe, "no probe");
|
|
PRECONDITION(ref, "no ref");
|
|
|
|
#ifndef RDK_OPTIMIZE_POPCNT
|
|
for (unsigned int i = 0; i < nBytes; i++) {
|
|
if (byte_popcounts[probe[i] & ref[i]] != byte_popcounts[probe[i]]) {
|
|
return false;
|
|
}
|
|
}
|
|
#else
|
|
unsigned int eidx = nBytes / sizeof(BUILTIN_POPCOUNT_TYPE);
|
|
for (unsigned int i = 0; i < eidx; ++i) {
|
|
if (BUILTIN_POPCOUNT_INSTR(((BUILTIN_POPCOUNT_TYPE*)probe)[i] &
|
|
((BUILTIN_POPCOUNT_TYPE*)ref)[i]) !=
|
|
BUILTIN_POPCOUNT_INSTR(((BUILTIN_POPCOUNT_TYPE*)probe)[i])) {
|
|
return false;
|
|
}
|
|
}
|
|
for (unsigned int i = eidx * sizeof(BUILTIN_POPCOUNT_TYPE); i < nBytes; ++i) {
|
|
if (byte_popcounts[probe[i] & ref[i]] != byte_popcounts[probe[i]]) {
|
|
return false;
|
|
}
|
|
}
|
|
#endif
|
|
return true;
|
|
}
|