rdkit/Code/DataStructs/SparseIntVect.h

// $Id$
//
//  Copyright (C) 2007-2008 Greg Landrum
//
//  @@ All Rights Reserved @@
//  This file is part of the RDKit.
//  The contents are covered by the terms of the BSD license
//  which is included in the file license.txt, found at the root
//  of the RDKit source tree.
//
#ifndef __RD_SPARSE_INT_VECT_20070921__
#define __RD_SPARSE_INT_VECT_20070921__

#include <map>
#include <string>
#include <RDGeneral/Invariant.h>
#include <sstream>
#include <RDGeneral/Exceptions.h>
#include <RDGeneral/StreamOps.h>
#include <boost/cstdint.hpp>


const int ci_SPARSEINTVECT_VERSION=0x0001; //!< version number to use in pickles
namespace RDKit{
  //! a class for efficiently storing sparse vectors of ints
  template <typename IndexType>
  class SparseIntVect {
  public:
    typedef std::map<IndexType,int> StorageType;

    SparseIntVect() : d_length(0) {};

    //! initialize with a particular length
    SparseIntVect(IndexType length) : d_length(length) {};

    //! Copy constructor
    SparseIntVect(const SparseIntVect<IndexType> &other){
      d_length=other.d_length;
      d_data.insert(other.d_data.begin(),other.d_data.end());
    }

    //! constructor from a pickle
    SparseIntVect(const std::string &pkl){
      initFromText(pkl.c_str(),pkl.size());
    };
    //! constructor from a pickle
    SparseIntVect(const char *pkl,const unsigned int len){
      initFromText(pkl,len);
    };

    //! destructor (doesn't need to do anything)
    ~SparseIntVect() {}

#ifdef __clang__
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wtautological-compare"
#elif ( defined(__GNUC__) || defined(__GNUG__) ) && \
  ( __GNUC__ > 4 ||                                    \
    (__GNUC__ == 4 && __GNUC_MINOR__ > 1 ) )
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wtype-limits"
#endif
    //! return the value at an index
    int getVal(IndexType idx) const {
      if(idx<0||idx>=d_length){
        throw IndexErrorException(static_cast<int>(idx));
      }
      int res=0;
      typename StorageType::const_iterator iter=d_data.find(idx);
      if(iter!=d_data.end()){
        res=iter->second;
      }
      return res;
    };

    //! set the value at an index
    void setVal(IndexType idx, int val){
      if(idx<0 || idx>=d_length){
        throw IndexErrorException(static_cast<int>(idx));
      }
      if(val!=0){
        d_data[idx]=val;
      } else {
        d_data.erase(idx);
      }
    };
#ifdef __clang__
#pragma clang diagnostic pop
#elif ( defined(__GNUC__) || defined(__GNUG__) ) &&     \
  ( __GNUC__ > 4 ||                                    \
    (__GNUC__ == 4 && __GNUC_MINOR__ > 1 ) )
#pragma GCC diagnostic pop
#endif
    //! support indexing using []
    int operator[] (IndexType idx) const { return getVal(idx); };

    //! returns the length
    IndexType getLength() const { return d_length; };

    //! returns the sum of all the elements in the vect
    //! the doAbs argument toggles summing the absolute values of the elements
    int getTotalVal(bool doAbs=false) const {
      int res=0;
      typename StorageType::const_iterator iter;
      for(iter=d_data.begin();iter!=d_data.end();++iter){
        if(!doAbs) res+=iter->second;
        else res+=abs(iter->second);
      }
      return res;
    };
    //! returns the length
    unsigned int size() const { return getLength(); };


    //! returns our nonzero elements as a map(IndexType->int)
    const StorageType &getNonzeroElements() const {
      return d_data;
    }


    //! this is a "fuzzy" intesection, the final value
    //! of each element is equal to the minimum from
    //! the two vects.
    SparseIntVect<IndexType> &
    operator&= (const SparseIntVect<IndexType> &other) {
      if(other.d_length!=d_length){
        throw ValueErrorException("SparseIntVect size mismatch");
      }

      typename StorageType::iterator iter=d_data.begin();
      typename StorageType::const_iterator oIter=other.d_data.begin();
      while(iter!=d_data.end()){
        // we're relying on the fact that the maps are sorted:
        while(oIter!=other.d_data.end() && oIter->first < iter->first){
          ++oIter;
        }
        if(oIter!=other.d_data.end() && oIter->first==iter->first){
          // found it:
          if(oIter->second<iter->second){
            iter->second=oIter->second;
          }
          ++oIter;
          ++iter;
        } else {
          // not there; our value is zero, which means
          // we should remove this value:
          typename StorageType::iterator tmpIter=iter;
          ++tmpIter;
          d_data.erase(iter);
          iter=tmpIter;
        }
      }
      return *this;
    };
    const SparseIntVect<IndexType>
    operator& (const SparseIntVect<IndexType> &other) const {
      SparseIntVect<IndexType> res(*this);
      return res&=other;
    }

    //! this is a "fuzzy" union, the final value
    //! of each element is equal to the maximum from
    //! the two vects.
    SparseIntVect<IndexType> &
    operator|= (const SparseIntVect<IndexType> &other) {
      if(other.d_length!=d_length){
        throw ValueErrorException("SparseIntVect size mismatch");
      }

      typename StorageType::iterator iter=d_data.begin();
      typename StorageType::const_iterator oIter=other.d_data.begin();
      while(iter!=d_data.end()){
        // we're relying on the fact that the maps are sorted:
        while(oIter!=other.d_data.end() &&
              oIter->first < iter->first){
          d_data[oIter->first]=oIter->second;
          ++oIter;
        }
        if(oIter!=other.d_data.end() && oIter->first==iter->first){
          // found it:
          if(oIter->second>iter->second){
            iter->second=oIter->second;
          }
          ++oIter;
        }
        ++iter;
      }
      // finish up the other vect:
      while(oIter!=other.d_data.end()){
        d_data[oIter->first]=oIter->second;
        ++oIter;
      }
      return *this;
    };
    const SparseIntVect<IndexType>
    operator| (const SparseIntVect<IndexType> &other) const {
      SparseIntVect<IndexType> res(*this);
      return res|=other;
    }

    SparseIntVect<IndexType> &
    operator+= (const SparseIntVect<IndexType> &other) {
      if(other.d_length!=d_length){
        throw ValueErrorException("SparseIntVect size mismatch");
      }
      typename StorageType::iterator iter=d_data.begin();
      typename StorageType::const_iterator oIter=other.d_data.begin();
      while(oIter!=other.d_data.end()){
        while(iter!=d_data.end() &&
              iter->first < oIter->first){
          ++iter;
        }
        if(iter!=d_data.end() && oIter->first==iter->first){
          // found it:
          iter->second+=oIter->second;
          if(!iter->second){
            typename StorageType::iterator tIter=iter;
            ++tIter;
            d_data.erase(iter);
            iter=tIter;
          } else {
            ++iter;
          }
        } else {
          d_data[oIter->first]=oIter->second;
        }
        ++oIter;
      }
      return *this;
    };
    const SparseIntVect<IndexType>
    operator+ (const SparseIntVect<IndexType> &other) const {
      SparseIntVect<IndexType> res(*this);
      return res+=other;
    }

    SparseIntVect<IndexType> &
    operator-= (const SparseIntVect<IndexType> &other) {
      if(other.d_length!=d_length){
        throw ValueErrorException("SparseIntVect size mismatch");
      }
      typename StorageType::iterator iter=d_data.begin();
      typename StorageType::const_iterator oIter=other.d_data.begin();
      while(oIter!=other.d_data.end()){
        while(iter!=d_data.end() &&
              iter->first < oIter->first){
          ++iter;
        }
        if(iter!=d_data.end() && oIter->first==iter->first){
          // found it:
          iter->second-=oIter->second;
          if(!iter->second){
            typename StorageType::iterator tIter=iter;
            ++tIter;
            d_data.erase(iter);
            iter=tIter;
          } else {
            ++iter;
          }
        } else {
          d_data[oIter->first] = -oIter->second;
        }
        ++oIter;
      }
      return *this;
    };
    const SparseIntVect<IndexType>
    operator- (const SparseIntVect<IndexType> &other) const {
      SparseIntVect<IndexType> res(*this);
      return res-=other;
    }
    SparseIntVect<IndexType> &
    operator*= (int v) {
      typename StorageType::iterator iter=d_data.begin();
      while(iter!=d_data.end()){
        iter->second *= v;
        ++iter;
      }
      return *this;
    };
    SparseIntVect<IndexType> &
    operator* (int v) {
      SparseIntVect<IndexType> res(*this);
      return res*=v;
    };
    SparseIntVect<IndexType> &
    operator/= (int v) {
      typename StorageType::iterator iter=d_data.begin();
      while(iter!=d_data.end()){
        iter->second /= v;
        ++iter;
      }
      return *this;
    };
    SparseIntVect<IndexType> &
    operator/ (int v) {
      SparseIntVect<IndexType> res(*this);
      return res/=v;
    };
    SparseIntVect<IndexType> &
    operator+= (int v) {
      typename StorageType::iterator iter=d_data.begin();
      while(iter!=d_data.end()){
        iter->second += v;
        ++iter;
      }
      return *this;
    };
    SparseIntVect<IndexType> &
    operator+ (int v) {
      SparseIntVect<IndexType> res(*this);
      return res+=v;
    };
    SparseIntVect<IndexType> &
    operator-= (int v) {
      typename StorageType::iterator iter=d_data.begin();
      while(iter!=d_data.end()){
        iter->second -= v;
        ++iter;
      }
      return *this;
    };
    SparseIntVect<IndexType> &
    operator- (int v) {
      SparseIntVect<IndexType> res(*this);
      return res-=v;
    };

    bool operator==(const SparseIntVect<IndexType> &v2) const{
      if(d_length!=v2.d_length){
        return false;
      }
      return d_data==v2.d_data;
    }
    bool operator!=(const SparseIntVect<IndexType> &v2) const {
      return !(*this==v2);
    }

    //! returns a binary string representation (pickle)
    std::string toString() const {
      std::stringstream ss(std::ios_base::binary|std::ios_base::out|std::ios_base::in);
      boost::uint32_t tInt;
      tInt=ci_SPARSEINTVECT_VERSION;
      streamWrite(ss,tInt);
      tInt=sizeof(IndexType);
      streamWrite(ss,tInt);
      streamWrite(ss,d_length);
      IndexType nEntries=d_data.size();
      streamWrite(ss,nEntries);

      typename StorageType::const_iterator iter=d_data.begin();
      while(iter!=d_data.end()){
        streamWrite(ss,iter->first);
        boost::int32_t tInt=iter->second;
        streamWrite(ss,tInt);
        ++iter;
      }
      return ss.str();
    };

    void fromString(const std::string &txt) {
      initFromText(txt.c_str(),txt.length());
    }

  private:
    IndexType d_length;
    StorageType d_data;

    void initFromText(const char *pkl,const unsigned int len) {
      d_data.clear();
      std::stringstream ss(std::ios_base::binary|std::ios_base::out|std::ios_base::in);
      ss.write(pkl,len);

      boost::uint32_t vers;
      streamRead(ss,vers);
      if(vers==0x0001){
        boost::uint32_t tInt;
        streamRead(ss,tInt);
        if(tInt>sizeof(IndexType)){
          throw ValueErrorException("IndexType cannot accomodate index size in SparseIntVect pickle");
        }
        switch(tInt){
        case sizeof(char):
          readVals<unsigned char>(ss);break;
        case sizeof(boost::int32_t):
          readVals<boost::uint32_t>(ss);break;
        case sizeof(boost::int64_t):
          readVals<boost::uint64_t>(ss);break;
        default:
          throw ValueErrorException("unreadable format");
        }
      } else {
        throw ValueErrorException("bad version in SparseIntVect pickle");
      }
    };
    template <typename T>
    void readVals(std::stringstream &ss){
      PRECONDITION(sizeof(T)<=sizeof(IndexType),"invalid size");
      T tVal;
      streamRead(ss,tVal);
      d_length=tVal;
      T nEntries;
      streamRead(ss,nEntries);
      for(T i=0;i<nEntries;++i){
        streamRead(ss,tVal);
        boost::int32_t val;
        streamRead(ss,val);
        d_data[tVal]=val;
      }
    }
  };

  template <typename IndexType, typename SequenceType>
  void updateFromSequence(SparseIntVect<IndexType> &vect,
                          const SequenceType &seq){
    typename SequenceType::const_iterator seqIt;
    for(seqIt=seq.begin();seqIt!=seq.end();++seqIt){
      // EFF: probably not the most efficient approach
      IndexType idx=*seqIt;
      vect.setVal(idx,vect.getVal(idx)+1);
    }
  }

  namespace {
    template <typename IndexType>
    void calcVectParams(const SparseIntVect<IndexType> &v1,
                          const SparseIntVect<IndexType> &v2,
                          double &v1Sum,double &v2Sum,
                          double &andSum){
      if(v1.getLength()!=v2.getLength()){
        throw ValueErrorException("SparseIntVect size mismatch");
      }
      v1Sum=v2Sum=andSum=0.0;
      // we're doing : (v1&v2).getTotalVal(), but w/o generating
      // the other vector:
      typename SparseIntVect<IndexType>::StorageType::const_iterator iter1,iter2;
      iter1=v1.getNonzeroElements().begin();
      if(iter1!=v1.getNonzeroElements().end()) v1Sum+=abs(iter1->second);
      iter2=v2.getNonzeroElements().begin();
      if(iter2!=v2.getNonzeroElements().end()) v2Sum+=abs(iter2->second);
      while(iter1 != v1.getNonzeroElements().end()){
        while(iter2!=v2.getNonzeroElements().end() && iter2->first < iter1->first){
          ++iter2;
          if(iter2!=v2.getNonzeroElements().end()) v2Sum+=abs(iter2->second);
        }
        if(iter2!=v2.getNonzeroElements().end()){
          if(iter2->first == iter1->first){
            if(abs(iter2->second)<abs(iter1->second)){
              andSum += abs(iter2->second);
            } else {
              andSum += abs(iter1->second);
            }
            ++iter2;
            if(iter2!=v2.getNonzeroElements().end()) v2Sum+=abs(iter2->second);
          }
          ++iter1;
          if(iter1!=v1.getNonzeroElements().end()) v1Sum+=abs(iter1->second);
        } else {
          break;
        }
      }
      if(iter1 != v1.getNonzeroElements().end()){
        ++iter1;
        while(iter1!=v1.getNonzeroElements().end()){
          v1Sum+=abs(iter1->second);
          ++iter1;
        }
      }
      if(iter2!=v2.getNonzeroElements().end()){
        ++iter2;
        while(iter2!=v2.getNonzeroElements().end()){
          v2Sum+=abs(iter2->second);
          ++iter2;
        }
      }
    }
  }

  template <typename IndexType>
  double DiceSimilarity(const SparseIntVect<IndexType> &v1,
                        const SparseIntVect<IndexType> &v2,
                        bool returnDistance=false,
                        double bounds=0.0){
    if(v1.getLength()!=v2.getLength()){
      throw ValueErrorException("SparseIntVect size mismatch");
    }
    double v1Sum=0.0;
    double v2Sum=0.0;
    if(!returnDistance && bounds>0.0){
      v1Sum=v1.getTotalVal(true);
      v2Sum=v2.getTotalVal(true);
      double denom=v1Sum+v2Sum;
      if(fabs(denom)<1e-6){
        if(returnDistance){
          return 1.0;
        } else {
          return 0.0;
        }
      }
      double minV=v1Sum<v2Sum?v1Sum:v2Sum;
      if(2.*minV/denom<bounds){
        return 0.0;
      }
      v1Sum=0.0;
      v2Sum=0.0;
    }

    double numer=0.0;

    calcVectParams(v1,v2,v1Sum,v2Sum,numer);

    double denom=v1Sum+v2Sum;
    double sim;
    if(fabs(denom)<1e-6){
      sim=0.0;
    } else {
      sim=2.*numer/denom;
    }
    if(returnDistance) sim = 1.-sim;
    //std::cerr<<" "<<v1Sum<<" "<<v2Sum<<" " << numer << " " << sim <<std::endl;
    return sim;
  }


  template <typename IndexType>
  double TverskySimilarity(const SparseIntVect<IndexType> &v1,
                           const SparseIntVect<IndexType> &v2,
                           double a, double b,
                           bool returnDistance=false,
                           double bounds=0.0){
    RDUNUSED_PARAM(bounds);
    if(v1.getLength()!=v2.getLength()){
      throw ValueErrorException("SparseIntVect size mismatch");
    }
    double v1Sum=0.0;
    double v2Sum=0.0;
    double andSum=0.0;

    calcVectParams(v1,v2,v1Sum,v2Sum,andSum);

    double denom=a*v1Sum+b*v2Sum+(1-a-b)*andSum;
    double sim;

    if(fabs(denom)<1e-6){
      sim=0.0;
    } else {
      sim=andSum/denom;
    }
    if(returnDistance) sim = 1.-sim;
    //std::cerr<<" "<<v1Sum<<" "<<v2Sum<<" " << numer << " " << sim <<std::endl;
    return sim;
  }

  template <typename IndexType>
  double TanimotoSimilarity(const SparseIntVect<IndexType> &v1,
                            const SparseIntVect<IndexType> &v2,
                            bool returnDistance=false,
                            double bounds=0.0){
    return TverskySimilarity(v1,v2,1.0,1.0,returnDistance,bounds);
  }

}


#endif