// // Copyright (C) 2003-2012 greg Landrum and Rational Discovery LLC // // @@ All Rights Reserved @@ // This file is part of the RDKit. // The contents are covered by the terms of the BSD license // which is included in the file license.txt, found at the root // of the RDKit source tree. // #include #ifndef __RD_BITOPS_H__ #define __RD_BITOPS_H__ /*! \file BitOps.h \brief Contains general bit-comparison and similarity operations. The notation used to document the similarity metrics is: - \c V1_n: number of bits in vector 1 - \c V1_o: number of on bits in vector 1 - (V1&V2)_o: number of on bits in the intersection of vectors 1 and 2 */ #include "BitVects.h" #include //! general purpose wrapper for calculating the similarity between two bvs //! that may be of unequal size (will automatically fold as appropriate) template double SimilarityWrapper(const T &bv1, const T &bv2, double (*metric)(const T &, const T &), bool returnDistance = false) { double res = 0.0; if (bv1.getNumBits() > bv2.getNumBits()) { T *bv1tmp = FoldFingerprint(bv1, bv1.getNumBits() / bv2.getNumBits()); res = metric(*bv1tmp, bv2); delete bv1tmp; } else if (bv2.getNumBits() > bv1.getNumBits()) { T *bv2tmp = FoldFingerprint(bv2, bv2.getNumBits() / bv1.getNumBits()); res = metric(bv1, *bv2tmp); delete bv2tmp; } else { res = metric(bv1, bv2); } if (returnDistance) { res = 1.0 - res; } return res; } //! \overload template double SimilarityWrapper(const T &bv1, const T &bv2, double a, double b, double (*metric)(const T &, const T &, double, double), bool returnDistance = false) { double res = 0.0; if (bv1.getNumBits() > bv2.getNumBits()) { T *bv1tmp = FoldFingerprint(bv1, bv1.getNumBits() / bv2.getNumBits()); res = metric(*bv1tmp, bv2, a, b); delete bv1tmp; } else if (bv2.getNumBits() > bv1.getNumBits()) { T *bv2tmp = FoldFingerprint(bv2, bv2.getNumBits() / bv1.getNumBits()); res = metric(bv1, *bv2tmp, a, b); delete bv2tmp; } else { res = metric(bv1, bv2, a, b); } if (returnDistance) { res = 1.0 - res; } return res; } RDKIT_DATASTRUCTS_EXPORT bool AllProbeBitsMatch(const char *probe, const char *ref); RDKIT_DATASTRUCTS_EXPORT bool AllProbeBitsMatch(const std::string &probe, const std::string &ref); RDKIT_DATASTRUCTS_EXPORT bool AllProbeBitsMatch(const ExplicitBitVect &probe, const ExplicitBitVect &ref); template RDKIT_DATASTRUCTS_EXPORT bool AllProbeBitsMatch(const T1 &probe, const std::string &pkl); template RDKIT_DATASTRUCTS_EXPORT bool AllProbeBitsMatch(const T1 &probe, const T1 &ref); //! returns the number of on bits in common between two bit vectors /*! \return (bv1&bv2)_o */ template RDKIT_DATASTRUCTS_EXPORT int NumOnBitsInCommon(const T1 &bv1, const T2 &bv2); RDKIT_DATASTRUCTS_EXPORT int NumOnBitsInCommon(const ExplicitBitVect &bv1, const ExplicitBitVect &bv2); //! returns the Tanimoto similarity between two bit vects /*! \return (bv1&bv2)_o / [bv1_o + bv2_o - (bv1&bv2)_o] */ template RDKIT_DATASTRUCTS_EXPORT double TanimotoSimilarity(const T1 &bv1, const T2 &bv2); //! returns the Cosine similarity between two bit vects /*! \return (bv1&bv2)_o / sqrt(bv1_o + bv2_o) */ template RDKIT_DATASTRUCTS_EXPORT double CosineSimilarity(const T1 &bv1, const T2 &bv2); //! returns the Kulczynski similarity between two bit vects /*! \return (bv1&bv2)_o * [bv1_o + bv2_o] / [2 * bv1_o * bv2_o] */ template RDKIT_DATASTRUCTS_EXPORT double KulczynskiSimilarity(const T1 &bv1, const T2 &bv2); //! returns the Dice similarity between two bit vects /*! \return 2*(bv1&bv2)_o / [bv1_o + bv2_o] */ template RDKIT_DATASTRUCTS_EXPORT double DiceSimilarity(const T1 &bv1, const T2 &bv2); //! returns the Tversky similarity between two bit vects /*! \return (bv1&bv2)_o / [a*bv1_o + b*bv2_o + (1 - a - b)*(bv1&bv2)_o] Notes: # 0 <= a,b <= 1 # Tversky(a=1,b=1) = Tanimoto # Tversky(a=1/2,b=1/2) = Dice */ template RDKIT_DATASTRUCTS_EXPORT double TverskySimilarity(const T1 &bv1, const T2 &bv2, double a, double b); //! returns the Sokal similarity between two bit vects /*! \return (bv1&bv2)_o / [2*bv1_o + 2*bv2_o - 3*(bv1&bv2)_o] */ template RDKIT_DATASTRUCTS_EXPORT double SokalSimilarity(const T1 &bv1, const T2 &bv2); //! returns the McConnaughey similarity between two bit vects /*! \return [(bv1&bv2)_o * (bv1_o + bv2_o) - (bv1_o * bv2_o)] / (bv1_o * bv2_o) */ template RDKIT_DATASTRUCTS_EXPORT double McConnaugheySimilarity(const T1 &bv1, const T2 &bv2); //! returns the Asymmetric similarity between two bit vects /*! \return (bv1&bv2)_o / min(bv1_o,bv2_o) */ template RDKIT_DATASTRUCTS_EXPORT double AsymmetricSimilarity(const T1 &bv1, const T2 &bv2); //! returns the Braun-Blanquet similarity between two bit vects /*! \return (bv1&bv2)_o / max(bv1_o,bv2_o) */ template RDKIT_DATASTRUCTS_EXPORT double BraunBlanquetSimilarity(const T1 &bv1, const T2 &bv2); //! returns the Russel similarity between two bit vects /*! \return (bv1&bv2)_o / bv1_o Note: that this operation is non-commutative: RusselSimilarity(bv1,bv2) != RusselSimilarity(bv2,bv1) */ template RDKIT_DATASTRUCTS_EXPORT double RusselSimilarity(const T1 &bv1, const T2 &bv2); //! returns the Rogot-Goldberg similarity between two bit vects /*! \return (bv1&bv2)_o / (bv1_o + bv2_o) + (bv1_n - bv1_o - bv2_o + (bv1&bv2)_o) / (2*bv1_n - bv1_o - bv2_o) */ template RDKIT_DATASTRUCTS_EXPORT double RogotGoldbergSimilarity(const T1 &bv1, const T2 &bv2); //! returns the on bit similarity between two bit vects /*! \return (bv1&bv2)_o / (bv1|bv2)_o */ template RDKIT_DATASTRUCTS_EXPORT double OnBitSimilarity(const T1 &bv1, const T2 &bv2); //! returns the number of common bits (on and off) between two bit vects /*! \return bv1_n - (bv1^bv2)_o */ template RDKIT_DATASTRUCTS_EXPORT int NumBitsInCommon(const T1 &bv1, const T2 &bv2); RDKIT_DATASTRUCTS_EXPORT int NumBitsInCommon(const ExplicitBitVect &bv1, const ExplicitBitVect &bv2); //! returns the common-bit similarity (on and off) between two bit vects //! This is also called Manhattan similarity. /*! \return [bv1_n - (bv1^bv2)_o] / bv1_n */ template RDKIT_DATASTRUCTS_EXPORT double AllBitSimilarity(const T1 &bv1, const T2 &bv2); //! returns an IntVect with indices of all on bits in common between two bit /// vects template RDKIT_DATASTRUCTS_EXPORT IntVect OnBitsInCommon(const T1 &bv1, const T2 &bv2); //! returns an IntVect with indices of all off bits in common between two bit /// vects template RDKIT_DATASTRUCTS_EXPORT IntVect OffBitsInCommon(const T1 &bv1, const T2 &bv2); //! returns the on-bit projected similarities between two bit vects /*! \return two values, as a DoubleVect: - (bv1&bv2)_o / bv1_o - (bv1&bv2)_o / bv2_o */ template RDKIT_DATASTRUCTS_EXPORT DoubleVect OnBitProjSimilarity(const T1 &bv1, const T2 &bv2); //! returns the on-bit projected similarities between two bit vects /*! \return two values, as a DoubleVect: - [bv1_n - (bv1|bv2)_o] / [bv1_n - bv1_o] - [bv2_n - (bv1|bv2)_o] / [bv2_n - bv2_o] Note: bv1_n = bv2_n */ template RDKIT_DATASTRUCTS_EXPORT DoubleVect OffBitProjSimilarity(const T1 &bv1, const T2 &bv2); //! folds a bit vector \c factor times and returns the result /*! \param bv1 the vector to be folded \param factor (optional) the number of times to fold it \return a pointer to the folded fingerprint, which is bv1_n/factor long. Note: The caller is responsible for deleteing the result. */ template RDKIT_DATASTRUCTS_EXPORT T1 *FoldFingerprint(const T1 &bv1, unsigned int factor = 2); //! returns a text representation of a bit vector (a string of 0s and 1s) /*! \param bv1 the vector to use \return an std::string */ template RDKIT_DATASTRUCTS_EXPORT std::string BitVectToText(const T1 &bv1); //! returns a hex representation of a bit vector compatible with Andrew Dalke's /// FPS format /*! \param bv1 the vector to use \return an std::string */ template RDKIT_DATASTRUCTS_EXPORT std::string BitVectToFPSText(const T1 &bv1); //! returns a binary string representation of a bit vector (an array of bytes) /*! \param bv1 the vector to use \return an std::string */ template RDKIT_DATASTRUCTS_EXPORT std::string BitVectToBinaryText(const T1 &bv1); //! updates a bit vector from Andrew Dalke's FPS format /*! \param bv1 the vector to use \param fps the FPS hex string */ template RDKIT_DATASTRUCTS_EXPORT void UpdateBitVectFromFPSText(T1 &bv1, const std::string &fps); //! updates a bit vector from a binary string representation of a bit vector (an /// array of bytes) /*! \param bv1 the vector to use \param fps the binary string */ template RDKIT_DATASTRUCTS_EXPORT void UpdateBitVectFromBinaryText( T1 &bv1, const std::string &fps); // FIX: docs and tests please RDKIT_DATASTRUCTS_EXPORT unsigned int CalcBitmapPopcount( const unsigned char *bv1, unsigned int nBytes); RDKIT_DATASTRUCTS_EXPORT double CalcBitmapTanimoto(const unsigned char *bv1, const unsigned char *bv2, unsigned int nBytes); RDKIT_DATASTRUCTS_EXPORT double CalcBitmapDice(const unsigned char *bv1, const unsigned char *bv2, unsigned int nBytes); RDKIT_DATASTRUCTS_EXPORT double CalcBitmapTversky(const unsigned char *bv1, const unsigned char *bv2, unsigned int nBytes, double ca, double cb); RDKIT_DATASTRUCTS_EXPORT bool CalcBitmapAllProbeBitsMatch( const unsigned char *probe, const unsigned char *ref, unsigned int nBytes); #endif