// $Id$ // // Copyright (C) 2003-2012 greg Landrum and Rational Discovery LLC // // @@ All Rights Reserved @@ // This file is part of the RDKit. // The contents are covered by the terms of the BSD license // which is included in the file license.txt, found at the root // of the RDKit source tree. // #include #include "BitVects.h" #include "BitOps.h" #include #include #include #include #include #include #include #include using namespace RDKit; int getBitId(const char *&text,int format,int size,int curr){ PRECONDITION(text,"no text"); int res=-1; if( (format==0) || ( (format == 1) && (size >= std::numeric_limits::max()) ) ) { int tmp; tmp = EndianSwapBytes(*(int *)text); text += sizeof(tmp); res=tmp; } else if (format == 1) { // version 16 and on bits sotred as short ints unsigned short tmp; tmp = EndianSwapBytes(*(unsigned short *)text); text += sizeof(tmp); res=tmp; } else if (format == 2) { // run length encoded format res = curr + RDKit::pullPackedIntFromString(text); } return res; } bool AllProbeBitsMatch(const std::string &probe,const std::string &ref){ return AllProbeBitsMatch(probe.c_str(),ref.c_str()); } bool AllProbeBitsMatch(const char *probe,const char *ref){ PRECONDITION(probe,"no probe text"); PRECONDITION(ref,"no probe text"); int probeFormat=0; int refFormat=0; int version=0; int probeSize = EndianSwapBytes(*(int *)probe); probe+=sizeof(probeSize); if(probeSize<0){ version = -1*probeSize; if (version == 16) { probeFormat=1; } else if (version == 32) { probeFormat=2; } else { throw("Unknown version type for the encode bit vect"); } probeSize = EndianSwapBytes(*(int *)probe); probe+=sizeof(probeSize); } int refSize = EndianSwapBytes(*(int *)ref); ref+=sizeof(refSize); if(refSize<0){ version = -1*refSize; if (version == 16) { refFormat=1; } else if (version == 32) { refFormat=2; } else { throw("Unknown version type for the encode bit vect"); } refSize = EndianSwapBytes(*(int *)ref); ref+=sizeof(refSize); } int nProbeOn = EndianSwapBytes(*(int *)probe); probe+=sizeof(nProbeOn); int nRefOn = EndianSwapBytes(*(int *)ref); ref+=sizeof(nRefOn); int currProbeBit=0; currProbeBit=getBitId(probe,probeFormat,probeSize,currProbeBit); nProbeOn--; int currRefBit=0; currRefBit=getBitId(ref,refFormat,refSize,currRefBit); nRefOn--; while(nProbeOn){ while(currRefBit0){ if(refFormat==2) currRefBit++; currRefBit=getBitId(ref,refFormat,refSize,currRefBit); nRefOn--; } if(currRefBit!=currProbeBit) return false; if(probeFormat==2) currProbeBit++; currProbeBit=getBitId(probe,probeFormat,probeSize,currProbeBit); nProbeOn--; } return true; } template bool AllProbeBitsMatch(const T1 &probe,const std::string &pkl){ const char *text=pkl.c_str(); int format=0; int nOn=0,size,version=0; size = EndianSwapBytes(*(int *)text); text+=sizeof(size); if(size<0){ version = -1*size; if (version == 16) { format=1; } else if (version == 32) { format=2; } else { throw("Unknown version type for the encode bit vect"); } size = EndianSwapBytes(*(int *)text); text+=sizeof(size); } nOn = EndianSwapBytes(*(int *)text); text+=sizeof(nOn); int currBit=0; currBit=getBitId(text,format,size,currBit); nOn--; std::vector obl; probe.getOnBits(obl); //for(int i=0;i::const_iterator i=obl.begin();i!=obl.end();i++){ while(currBit<*i && nOn>0){ if(format==2) currBit++; currBit=getBitId(text,format,size,currBit); nOn--; } if(currBit!=*i) return false; //} } return true; } template bool AllProbeBitsMatch(const SparseBitVect& bv1,const std::string &pkl); template bool AllProbeBitsMatch(const ExplicitBitVect& bv1,const std::string &pkl); template bool AllProbeBitsMatch(const T1 &probe,const T1 &ref){ for(unsigned int i=0;iis_subset_of(*(ref.dp_bits)); } // """ ------------------------------------------------------- // // NumOnBitsInCommon(T1,T2) // Returns the number of on bits which are set in both T1 and T2. // // """ ------------------------------------------------------- template int NumOnBitsInCommon(const T1& bv1, const T2& bv2) { return OnBitsInCommon(bv1,bv2).size(); } int NumOnBitsInCommon(const ExplicitBitVect& bv1, const ExplicitBitVect& bv2) { return ((*bv1.dp_bits) & (*bv2.dp_bits)).count(); } // In all these similarity metrics the notation is selected to be // consistent with J.W. Raymond and P. Willett, JCAMD _16_ 59-71 (2002) // """ ------------------------------------------------------- // // TanimotoSimilarity(T1,T2) // returns the Tanamoto similarity between T1 and T2, a double. // // T1 and T2 should be the same length. // // C++ Notes: T1 and T2 must support operator&, getNumBits() // and getOnBits(). // // Python Notes: T1 and T2 are BitVects. // // """ ------------------------------------------------------- template double TanimotoSimilarity(const T1& bv1, const T2& bv2) { if(bv1.getNumBits()!=bv2.getNumBits()) throw ValueErrorException("BitVects must be same length"); double x = NumOnBitsInCommon(bv1,bv2); double y = bv1.getNumOnBits(); double z = bv2.getNumOnBits(); if((y+z-x)==0.0) return 1.0; else return x / (y+z-x); } template double TverskySimilarity(const T1& bv1, const T2& bv2, double a, double b) { RANGE_CHECK(0,a,1); RANGE_CHECK(0,b,1); if(bv1.getNumBits()!=bv2.getNumBits()) throw ValueErrorException("BitVects must be same length"); double x = NumOnBitsInCommon(bv1,bv2); double y = bv1.getNumOnBits(); double z = bv2.getNumOnBits(); double denom = a*y + b*z + (1-a-b)*x; if(denom==0.0) return 1.0; else return x / denom; } template double CosineSimilarity(const T1& bv1, const T2& bv2) { if(bv1.getNumBits()!=bv2.getNumBits()) throw ValueErrorException("BitVects must be same length"); double x = NumOnBitsInCommon(bv1,bv2); double y = bv1.getNumOnBits(); double z = bv2.getNumOnBits(); if(y*z>0.0){ return x / sqrt(y*z); } else { return 0.0; } } template double KulczynskiSimilarity(const T1& bv1, const T2& bv2) { if(bv1.getNumBits()!=bv2.getNumBits()) throw ValueErrorException("BitVects must be same length"); double x = NumOnBitsInCommon(bv1,bv2); double y = bv1.getNumOnBits(); double z = bv2.getNumOnBits(); if(y*z>0.0){ return x*(y+z)/(2*y*z); } else { return 0.0; } } template double DiceSimilarity(const T1& bv1, const T2& bv2) { if(bv1.getNumBits()!=bv2.getNumBits()) throw ValueErrorException("BitVects must be same length"); double x = NumOnBitsInCommon(bv1,bv2); double y = bv1.getNumOnBits(); double z = bv2.getNumOnBits(); if(y+z>0.0){ return 2*x/(y+z); } else { return 0.0; } } template double SokalSimilarity(const T1& bv1, const T2& bv2) { if(bv1.getNumBits()!=bv2.getNumBits()) throw ValueErrorException("BitVects must be same length"); double x = NumOnBitsInCommon(bv1,bv2); double y = bv1.getNumOnBits(); double z = bv2.getNumOnBits(); return x/(2*y+2*z-3*x); } template double McConnaugheySimilarity(const T1& bv1, const T2& bv2) { if(bv1.getNumBits()!=bv2.getNumBits()) throw ValueErrorException("BitVects must be same length"); double x = NumOnBitsInCommon(bv1,bv2); double y = bv1.getNumOnBits(); double z = bv2.getNumOnBits(); if(y*z>0.0){ return (x*(y+z)-(y*z))/(y*z); } else { return 0.0; } } template inline T tmin(T v1,T v2) { if(v1 inline T tmax(T v1,T v2) { if(v1>v2) return v1; return v2; } template double AsymmetricSimilarity(const T1& bv1, const T2& bv2) { if(bv1.getNumBits()!=bv2.getNumBits()) throw ValueErrorException("BitVects must be same length"); double x = NumOnBitsInCommon(bv1,bv2); double y = bv1.getNumOnBits(); double z = bv2.getNumOnBits(); if(tmin(y,z)>0){ return x/tmin(y,z); } else { return 0.0; } } template double BraunBlanquetSimilarity(const T1& bv1, const T2& bv2) { if(bv1.getNumBits()!=bv2.getNumBits()) throw ValueErrorException("BitVects must be same length"); double x = NumOnBitsInCommon(bv1,bv2); double y = bv1.getNumOnBits(); double z = bv2.getNumOnBits(); if(tmax(y,z)>0){ return x/tmax(y,z); } else { return 0.0; } } template double RusselSimilarity(const T1& bv1, const T2& bv2) { if(bv1.getNumBits()!=bv2.getNumBits()) throw ValueErrorException("BitVects must be same length"); double x = NumOnBitsInCommon(bv1,bv2); return x/bv1.getNumBits(); } template double RogotGoldbergSimilarity(const T1& bv1,const T2& bv2) { if(bv1.getNumBits()!=bv2.getNumBits()) throw ValueErrorException("BitVects must be same length"); double x = NumOnBitsInCommon(bv1,bv2); double y = bv1.getNumOnBits(); double z = bv2.getNumOnBits(); double l = bv1.getNumBits(); double d = l - y - z + x; if ((x == l) || (d == l)) return 1.0; else return (x/(y+z) + (d)/(2*l-y-z)); } // """ ------------------------------------------------------- // // OnBitSimilarity(T1,T2) // Returns the percentage of possible on bits in common // between T1 and T2 (a double) // // C++ Notes: T1 and T2 must support operator|, operator& // and getOnBits(). // // Python Notes: T1 and T2 are BitVects. // // """ ------------------------------------------------------- template double OnBitSimilarity(const T1& bv1, const T2& bv2) { if(bv1.getNumBits()!=bv2.getNumBits()) throw ValueErrorException("BitVects must be same length"); double num = NumOnBitsInCommon(bv1,bv2); double denom=(bv1|bv2).getNumOnBits(); if(denom>0){ return num/denom; } else { return 0; } } // """ ------------------------------------------------------- // // NumBitsInCommon(T1,T2) // Returns the number of bits in common (on and off) // between T1 and T2 (an int) // // T1 and T2 should be the same length. // // C++ Notes: T1 and T2 must support operator^, getNumBits(). // // Python Notes: T1 and T2 are BitVects. // // """ ------------------------------------------------------- template int NumBitsInCommon(const T1& bv1, const T2& bv2) { if(bv1.getNumBits()!=bv2.getNumBits()) throw ValueErrorException("BitVects must be same length"); return bv1.getNumBits() - (bv1^bv2).getNumOnBits(); } int NumBitsInCommon(const ExplicitBitVect& bv1, const ExplicitBitVect& bv2) { return bv1.getNumBits() - ((*bv1.dp_bits) ^ (*bv2.dp_bits)).count(); } // """ ------------------------------------------------------- // // AllBitSimilarity(T1,T2) // Returns the percentage of bits in common (on and off) // between T1 and T2 (a double) // // T1 and T2 should be the same length. // // C++ Notes: T1 and T2 must support operator^, getNumBits() // and getNumOnBits(). // // Python Notes: T1 and T2 are BitVects. // // """ ------------------------------------------------------- template double AllBitSimilarity(const T1& bv1, const T2& bv2) { if(bv1.getNumBits()!=bv2.getNumBits()) throw ValueErrorException("BitVects must be same length"); return double(NumBitsInCommon(bv1,bv2))/bv1.getNumBits(); } // """ ------------------------------------------------------- // // OnBitsInCommon(T1,T2) // Returns the on bits which are set in both T1 and T2. // // T1 and T2 should be the same length. // // C++ Notes: T1 and T2 must support operator&, getNumBits() // and getOnBits(), the return value is an IntVect. // // Python Notes: T1 and T2 are BitVects, the return value // is a tuple of ints. // // """ ------------------------------------------------------- template IntVect OnBitsInCommon(const T1& bv1, const T2& bv2) { if(bv1.getNumBits()!=bv2.getNumBits()) throw ValueErrorException("BitVects must be same length"); IntVect res; (bv1&bv2).getOnBits(res); return res; } // """ ------------------------------------------------------- // // OffBitsInCommon(T1,T2) // Returns the off bits which are set in both T1 and T2. // // T1 and T2 should be the same length. // // C++ Notes: T1 and T2 must support operator|, operator~, // getNumBits() and getOnBits(), the return value is an IntVect. // // Python Notes: T1 and T2 are BitVects, the return value // is a tuple of ints. // // """ ------------------------------------------------------- template IntVect OffBitsInCommon(const T1& bv1, const T2& bv2) { if(bv1.getNumBits()!=bv2.getNumBits()) throw ValueErrorException("BitVects must be same length"); IntVect res; (~(bv1|bv2)).getOnBits(res); return res; } // """ ------------------------------------------------------- // // OnBitProjSimilarity(T1,T2) // Returns the projected similarity between the on bits of // T1 and T2. // // The on bit projected similarity of T1 onto T2 is the // percentage of T1's on bits which are on in T2. // // This type of measure may be useful for substructure-type // searches. // // Two values are returned, the projection of T1 onto T2 // and the projection of T2 onto T1 // // T1 and T2 should be the same length. // // C++ Notes: T1 and T2 must support operator&, getNumBits() // and getNumOnBits(), the return value is an DoubleVect with // two elements. // // Python Notes: T1 and T2 are BitVects, the return value // is a 2-tuple of doubles. // // """ ------------------------------------------------------- template DoubleVect OnBitProjSimilarity(const T1& bv1, const T2& bv2) { if(bv1.getNumBits()!=bv2.getNumBits()) throw ValueErrorException("BitVects must be same length"); DoubleVect res(2,0.0); double num=NumOnBitsInCommon(bv1,bv2); if(num){ res[0] = num/bv1.getNumOnBits(); res[1] = num/bv2.getNumOnBits(); } return res; } // """ ------------------------------------------------------- // // OffBitProjSimilarity(T1,T2) // Returns the projected similarity between the off bits of // T1 and T2. // // The off bit projected similarity of T1 onto T2 is the // percentage of T1's off bits which are off in T2. // // This type of measure may be useful for substructure-type // searches. // // Two values are returned, the projection of T1 onto T2 // and the projection of T2 onto T1 // // T1 and T2 should be the same length. // // C++ Notes: T1 and T2 must support operator|, getNumBits() // and getNumOffBits(), the return value is an DoubleVect with // two elements. // // Python Notes: T1 and T2 are BitVects, the return value // is a 2-tuple of doubles. // // """ ------------------------------------------------------- template DoubleVect OffBitProjSimilarity(const T1& bv1, const T2& bv2) { if(bv1.getNumBits()!=bv2.getNumBits()) throw ValueErrorException("BitVects must be same length"); DoubleVect res(2,0.0); double num=(bv1|bv2).getNumOffBits(); if(num){ res[0] = num/bv1.getNumOffBits(); res[1] = num/bv2.getNumOffBits(); } return res; } template T1 * FoldFingerprint(const T1 &bv1,unsigned int factor) { if(factor <=0 || factor >= bv1.getNumBits()) throw ValueErrorException("invalid fold factor"); int initSize = bv1.getNumBits(); int resSize = initSize/factor; T1 *res = new T1(resSize); IntVect onBits; bv1.getOnBits(onBits); for(IntVectIter iv=onBits.begin();iv!=onBits.end();iv++){ int pos = (*iv) % resSize; res->setBit(pos); } return res; } template std::string BitVectToText(const T1& bv1){ std::string res(bv1.getNumBits(),'0'); for(unsigned int i=0;i std::string BitVectToFPSText(const T1& bv1){ unsigned int size=2*(bv1.getNumBits()/8 + (bv1.getNumBits()%8?1:0)); std::string res(size,0); unsigned char c=0; unsigned int byte=0; for(unsigned int i=0;i>4)%16]; res[byte++]=bin2Hex[c%16]; c=0; } } if(byte>4)%16]; res[byte++]=bin2Hex[c%16]; } return res; } template std::string BitVectToBinaryText(const T1& bv1){ std::string res(bv1.getNumBits()/8 + (bv1.getNumBits()%8?1:0),0); unsigned char c=0; unsigned int byte=0; for(unsigned int i=0;i void UpdateBitVectFromFPSText(T1& bv1,const std::string &fps){ PRECONDITION(fps.length()*4>=bv1.getNumBits(),"bad FPS length"); PRECONDITION(fps.length()%2==0,"bad FPS length"); unsigned int bitIdx=0; for(unsigned int i=0; i> std::hex >> c; } catch (...) { std::ostringstream errout; errout << "Cannot convert FPS word: " << fps.substr(i,2) << " to int"; std::cerr< void UpdateBitVectFromBinaryText(T1& bv1,const std::string &fps){ PRECONDITION(fps.length()*8>=bv1.getNumBits(),"bad FPS length"); unsigned int bitIdx=0; for(unsigned int i=0;i