Files
rdkit/Code/DataStructs/SparseIntVect.h
2015-11-14 14:13:36 +01:00

567 lines
17 KiB
C++

// $Id$
//
// Copyright (C) 2007-2008 Greg Landrum
//
// @@ All Rights Reserved @@
// This file is part of the RDKit.
// The contents are covered by the terms of the BSD license
// which is included in the file license.txt, found at the root
// of the RDKit source tree.
//
#ifndef __RD_SPARSE_INT_VECT_20070921__
#define __RD_SPARSE_INT_VECT_20070921__
#include <map>
#include <string>
#include <RDGeneral/Invariant.h>
#include <sstream>
#include <RDGeneral/Exceptions.h>
#include <RDGeneral/StreamOps.h>
#include <boost/cstdint.hpp>
const int ci_SPARSEINTVECT_VERSION=0x0001; //!< version number to use in pickles
namespace RDKit{
//! a class for efficiently storing sparse vectors of ints
template <typename IndexType>
class SparseIntVect {
public:
typedef std::map<IndexType,int> StorageType;
SparseIntVect() : d_length(0) {};
//! initialize with a particular length
SparseIntVect(IndexType length) : d_length(length) {};
//! Copy constructor
SparseIntVect(const SparseIntVect<IndexType> &other){
d_length=other.d_length;
d_data.insert(other.d_data.begin(),other.d_data.end());
}
//! constructor from a pickle
SparseIntVect(const std::string &pkl){
initFromText(pkl.c_str(),pkl.size());
};
//! constructor from a pickle
SparseIntVect(const char *pkl,const unsigned int len){
initFromText(pkl,len);
};
//! destructor (doesn't need to do anything)
~SparseIntVect() {}
#ifdef __clang__
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wtautological-compare"
#elif ( defined(__GNUC__) || defined(__GNUG__) ) && \
( __GNUC__ > 4 || \
(__GNUC__ == 4 && __GNUC_MINOR__ > 1 ) )
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wtype-limits"
#endif
//! return the value at an index
int getVal(IndexType idx) const {
if(idx<0||idx>=d_length){
throw IndexErrorException(static_cast<int>(idx));
}
int res=0;
typename StorageType::const_iterator iter=d_data.find(idx);
if(iter!=d_data.end()){
res=iter->second;
}
return res;
};
//! set the value at an index
void setVal(IndexType idx, int val){
if(idx<0 || idx>=d_length){
throw IndexErrorException(static_cast<int>(idx));
}
if(val!=0){
d_data[idx]=val;
} else {
d_data.erase(idx);
}
};
#ifdef __clang__
#pragma clang diagnostic pop
#elif ( defined(__GNUC__) || defined(__GNUG__) ) && \
( __GNUC__ > 4 || \
(__GNUC__ == 4 && __GNUC_MINOR__ > 1 ) )
#pragma GCC diagnostic pop
#endif
//! support indexing using []
int operator[] (IndexType idx) const { return getVal(idx); };
//! returns the length
IndexType getLength() const { return d_length; };
//! returns the sum of all the elements in the vect
//! the doAbs argument toggles summing the absolute values of the elements
int getTotalVal(bool doAbs=false) const {
int res=0;
typename StorageType::const_iterator iter;
for(iter=d_data.begin();iter!=d_data.end();++iter){
if(!doAbs) res+=iter->second;
else res+=abs(iter->second);
}
return res;
};
//! returns the length
unsigned int size() const { return getLength(); };
//! returns our nonzero elements as a map(IndexType->int)
const StorageType &getNonzeroElements() const {
return d_data;
}
//! this is a "fuzzy" intesection, the final value
//! of each element is equal to the minimum from
//! the two vects.
SparseIntVect<IndexType> &
operator&= (const SparseIntVect<IndexType> &other) {
if(other.d_length!=d_length){
throw ValueErrorException("SparseIntVect size mismatch");
}
typename StorageType::iterator iter=d_data.begin();
typename StorageType::const_iterator oIter=other.d_data.begin();
while(iter!=d_data.end()){
// we're relying on the fact that the maps are sorted:
while(oIter!=other.d_data.end() && oIter->first < iter->first){
++oIter;
}
if(oIter!=other.d_data.end() && oIter->first==iter->first){
// found it:
if(oIter->second<iter->second){
iter->second=oIter->second;
}
++oIter;
++iter;
} else {
// not there; our value is zero, which means
// we should remove this value:
typename StorageType::iterator tmpIter=iter;
++tmpIter;
d_data.erase(iter);
iter=tmpIter;
}
}
return *this;
};
const SparseIntVect<IndexType>
operator& (const SparseIntVect<IndexType> &other) const {
SparseIntVect<IndexType> res(*this);
return res&=other;
}
//! this is a "fuzzy" union, the final value
//! of each element is equal to the maximum from
//! the two vects.
SparseIntVect<IndexType> &
operator|= (const SparseIntVect<IndexType> &other) {
if(other.d_length!=d_length){
throw ValueErrorException("SparseIntVect size mismatch");
}
typename StorageType::iterator iter=d_data.begin();
typename StorageType::const_iterator oIter=other.d_data.begin();
while(iter!=d_data.end()){
// we're relying on the fact that the maps are sorted:
while(oIter!=other.d_data.end() &&
oIter->first < iter->first){
d_data[oIter->first]=oIter->second;
++oIter;
}
if(oIter!=other.d_data.end() && oIter->first==iter->first){
// found it:
if(oIter->second>iter->second){
iter->second=oIter->second;
}
++oIter;
}
++iter;
}
// finish up the other vect:
while(oIter!=other.d_data.end()){
d_data[oIter->first]=oIter->second;
++oIter;
}
return *this;
};
const SparseIntVect<IndexType>
operator| (const SparseIntVect<IndexType> &other) const {
SparseIntVect<IndexType> res(*this);
return res|=other;
}
SparseIntVect<IndexType> &
operator+= (const SparseIntVect<IndexType> &other) {
if(other.d_length!=d_length){
throw ValueErrorException("SparseIntVect size mismatch");
}
typename StorageType::iterator iter=d_data.begin();
typename StorageType::const_iterator oIter=other.d_data.begin();
while(oIter!=other.d_data.end()){
while(iter!=d_data.end() &&
iter->first < oIter->first){
++iter;
}
if(iter!=d_data.end() && oIter->first==iter->first){
// found it:
iter->second+=oIter->second;
if(!iter->second){
typename StorageType::iterator tIter=iter;
++tIter;
d_data.erase(iter);
iter=tIter;
} else {
++iter;
}
} else {
d_data[oIter->first]=oIter->second;
}
++oIter;
}
return *this;
};
const SparseIntVect<IndexType>
operator+ (const SparseIntVect<IndexType> &other) const {
SparseIntVect<IndexType> res(*this);
return res+=other;
}
SparseIntVect<IndexType> &
operator-= (const SparseIntVect<IndexType> &other) {
if(other.d_length!=d_length){
throw ValueErrorException("SparseIntVect size mismatch");
}
typename StorageType::iterator iter=d_data.begin();
typename StorageType::const_iterator oIter=other.d_data.begin();
while(oIter!=other.d_data.end()){
while(iter!=d_data.end() &&
iter->first < oIter->first){
++iter;
}
if(iter!=d_data.end() && oIter->first==iter->first){
// found it:
iter->second-=oIter->second;
if(!iter->second){
typename StorageType::iterator tIter=iter;
++tIter;
d_data.erase(iter);
iter=tIter;
} else {
++iter;
}
} else {
d_data[oIter->first] = -oIter->second;
}
++oIter;
}
return *this;
};
const SparseIntVect<IndexType>
operator- (const SparseIntVect<IndexType> &other) const {
SparseIntVect<IndexType> res(*this);
return res-=other;
}
SparseIntVect<IndexType> &
operator*= (int v) {
typename StorageType::iterator iter=d_data.begin();
while(iter!=d_data.end()){
iter->second *= v;
++iter;
}
return *this;
};
SparseIntVect<IndexType> &
operator* (int v) {
SparseIntVect<IndexType> res(*this);
return res*=v;
};
SparseIntVect<IndexType> &
operator/= (int v) {
typename StorageType::iterator iter=d_data.begin();
while(iter!=d_data.end()){
iter->second /= v;
++iter;
}
return *this;
};
SparseIntVect<IndexType> &
operator/ (int v) {
SparseIntVect<IndexType> res(*this);
return res/=v;
};
SparseIntVect<IndexType> &
operator+= (int v) {
typename StorageType::iterator iter=d_data.begin();
while(iter!=d_data.end()){
iter->second += v;
++iter;
}
return *this;
};
SparseIntVect<IndexType> &
operator+ (int v) {
SparseIntVect<IndexType> res(*this);
return res+=v;
};
SparseIntVect<IndexType> &
operator-= (int v) {
typename StorageType::iterator iter=d_data.begin();
while(iter!=d_data.end()){
iter->second -= v;
++iter;
}
return *this;
};
SparseIntVect<IndexType> &
operator- (int v) {
SparseIntVect<IndexType> res(*this);
return res-=v;
};
bool operator==(const SparseIntVect<IndexType> &v2) const{
if(d_length!=v2.d_length){
return false;
}
return d_data==v2.d_data;
}
bool operator!=(const SparseIntVect<IndexType> &v2) const {
return !(*this==v2);
}
//! returns a binary string representation (pickle)
std::string toString() const {
std::stringstream ss(std::ios_base::binary|std::ios_base::out|std::ios_base::in);
boost::uint32_t tInt;
tInt=ci_SPARSEINTVECT_VERSION;
streamWrite(ss,tInt);
tInt=sizeof(IndexType);
streamWrite(ss,tInt);
streamWrite(ss,d_length);
IndexType nEntries=d_data.size();
streamWrite(ss,nEntries);
typename StorageType::const_iterator iter=d_data.begin();
while(iter!=d_data.end()){
streamWrite(ss,iter->first);
boost::int32_t tInt=iter->second;
streamWrite(ss,tInt);
++iter;
}
return ss.str();
};
void fromString(const std::string &txt) {
initFromText(txt.c_str(),txt.length());
}
private:
IndexType d_length;
StorageType d_data;
void initFromText(const char *pkl,const unsigned int len) {
d_data.clear();
std::stringstream ss(std::ios_base::binary|std::ios_base::out|std::ios_base::in);
ss.write(pkl,len);
boost::uint32_t vers;
streamRead(ss,vers);
if(vers==0x0001){
boost::uint32_t tInt;
streamRead(ss,tInt);
if(tInt>sizeof(IndexType)){
throw ValueErrorException("IndexType cannot accomodate index size in SparseIntVect pickle");
}
switch(tInt){
case sizeof(char):
readVals<unsigned char>(ss);break;
case sizeof(boost::int32_t):
readVals<boost::uint32_t>(ss);break;
case sizeof(boost::int64_t):
readVals<boost::uint64_t>(ss);break;
default:
throw ValueErrorException("unreadable format");
}
} else {
throw ValueErrorException("bad version in SparseIntVect pickle");
}
};
template <typename T>
void readVals(std::stringstream &ss){
PRECONDITION(sizeof(T)<=sizeof(IndexType),"invalid size");
T tVal;
streamRead(ss,tVal);
d_length=tVal;
T nEntries;
streamRead(ss,nEntries);
for(T i=0;i<nEntries;++i){
streamRead(ss,tVal);
boost::int32_t val;
streamRead(ss,val);
d_data[tVal]=val;
}
}
};
template <typename IndexType, typename SequenceType>
void updateFromSequence(SparseIntVect<IndexType> &vect,
const SequenceType &seq){
typename SequenceType::const_iterator seqIt;
for(seqIt=seq.begin();seqIt!=seq.end();++seqIt){
// EFF: probably not the most efficient approach
IndexType idx=*seqIt;
vect.setVal(idx,vect.getVal(idx)+1);
}
}
namespace {
template <typename IndexType>
void calcVectParams(const SparseIntVect<IndexType> &v1,
const SparseIntVect<IndexType> &v2,
double &v1Sum,double &v2Sum,
double &andSum){
if(v1.getLength()!=v2.getLength()){
throw ValueErrorException("SparseIntVect size mismatch");
}
v1Sum=v2Sum=andSum=0.0;
// we're doing : (v1&v2).getTotalVal(), but w/o generating
// the other vector:
typename SparseIntVect<IndexType>::StorageType::const_iterator iter1,iter2;
iter1=v1.getNonzeroElements().begin();
if(iter1!=v1.getNonzeroElements().end()) v1Sum+=abs(iter1->second);
iter2=v2.getNonzeroElements().begin();
if(iter2!=v2.getNonzeroElements().end()) v2Sum+=abs(iter2->second);
while(iter1 != v1.getNonzeroElements().end()){
while(iter2!=v2.getNonzeroElements().end() && iter2->first < iter1->first){
++iter2;
if(iter2!=v2.getNonzeroElements().end()) v2Sum+=abs(iter2->second);
}
if(iter2!=v2.getNonzeroElements().end()){
if(iter2->first == iter1->first){
if(abs(iter2->second)<abs(iter1->second)){
andSum += abs(iter2->second);
} else {
andSum += abs(iter1->second);
}
++iter2;
if(iter2!=v2.getNonzeroElements().end()) v2Sum+=abs(iter2->second);
}
++iter1;
if(iter1!=v1.getNonzeroElements().end()) v1Sum+=abs(iter1->second);
} else {
break;
}
}
if(iter1 != v1.getNonzeroElements().end()){
++iter1;
while(iter1!=v1.getNonzeroElements().end()){
v1Sum+=abs(iter1->second);
++iter1;
}
}
if(iter2!=v2.getNonzeroElements().end()){
++iter2;
while(iter2!=v2.getNonzeroElements().end()){
v2Sum+=abs(iter2->second);
++iter2;
}
}
}
}
template <typename IndexType>
double DiceSimilarity(const SparseIntVect<IndexType> &v1,
const SparseIntVect<IndexType> &v2,
bool returnDistance=false,
double bounds=0.0){
if(v1.getLength()!=v2.getLength()){
throw ValueErrorException("SparseIntVect size mismatch");
}
double v1Sum=0.0;
double v2Sum=0.0;
if(!returnDistance && bounds>0.0){
v1Sum=v1.getTotalVal(true);
v2Sum=v2.getTotalVal(true);
double denom=v1Sum+v2Sum;
if(fabs(denom)<1e-6){
if(returnDistance){
return 1.0;
} else {
return 0.0;
}
}
double minV=v1Sum<v2Sum?v1Sum:v2Sum;
if(2.*minV/denom<bounds){
return 0.0;
}
v1Sum=0.0;
v2Sum=0.0;
}
double numer=0.0;
calcVectParams(v1,v2,v1Sum,v2Sum,numer);
double denom=v1Sum+v2Sum;
double sim;
if(fabs(denom)<1e-6){
sim=0.0;
} else {
sim=2.*numer/denom;
}
if(returnDistance) sim = 1.-sim;
//std::cerr<<" "<<v1Sum<<" "<<v2Sum<<" " << numer << " " << sim <<std::endl;
return sim;
}
template <typename IndexType>
double TverskySimilarity(const SparseIntVect<IndexType> &v1,
const SparseIntVect<IndexType> &v2,
double a, double b,
bool returnDistance=false,
double bounds=0.0){
RDUNUSED_PARAM(bounds);
if(v1.getLength()!=v2.getLength()){
throw ValueErrorException("SparseIntVect size mismatch");
}
double v1Sum=0.0;
double v2Sum=0.0;
double andSum=0.0;
calcVectParams(v1,v2,v1Sum,v2Sum,andSum);
double denom=a*v1Sum+b*v2Sum+(1-a-b)*andSum;
double sim;
if(fabs(denom)<1e-6){
sim=0.0;
} else {
sim=andSum/denom;
}
if(returnDistance) sim = 1.-sim;
//std::cerr<<" "<<v1Sum<<" "<<v2Sum<<" " << numer << " " << sim <<std::endl;
return sim;
}
template <typename IndexType>
double TanimotoSimilarity(const SparseIntVect<IndexType> &v1,
const SparseIntVect<IndexType> &v2,
bool returnDistance=false,
double bounds=0.0){
return TverskySimilarity(v1,v2,1.0,1.0,returnDistance,bounds);
}
}
#endif