Files
rdkit/Code/DataStructs/SparseIntVect.h
Paolo Tosco c08ea49bda - enable building DLLs on Windows (#1861)
* - enable building DLLs on Windows

* - export.h and test.h are now auto-generated by CMake
2018-05-16 08:42:41 +02:00

549 lines
16 KiB
C++

// $Id$
//
// Copyright (C) 2007-2008 Greg Landrum
//
// @@ All Rights Reserved @@
// This file is part of the RDKit.
// The contents are covered by the terms of the BSD license
// which is included in the file license.txt, found at the root
// of the RDKit source tree.
//
#include <RDBoost/export.h>
#ifndef __RD_SPARSE_INT_VECT_20070921__
#define __RD_SPARSE_INT_VECT_20070921__
#include <cmath>
#include <cstdlib>
#include <map>
#include <sstream>
#include <string>
#include <RDGeneral/Invariant.h>
#include <RDGeneral/Exceptions.h>
#include <RDGeneral/StreamOps.h>
#include <boost/cstdint.hpp>
//! version number to use in pickles
const int ci_SPARSEINTVECT_VERSION = 0x0001;
namespace RDKit {
//! a class for efficiently storing sparse vectors of ints
template <typename IndexType>
class SparseIntVect {
public:
typedef std::map<IndexType, int> StorageType;
SparseIntVect() : d_length(0){};
//! initialize with a particular length
SparseIntVect(IndexType length) : d_length(length){};
//! Copy constructor
SparseIntVect(const SparseIntVect<IndexType> &other) {
d_length = other.d_length;
d_data.insert(other.d_data.begin(), other.d_data.end());
}
//! constructor from a pickle
SparseIntVect(const std::string &pkl) {
initFromText(pkl.c_str(), pkl.size());
};
//! constructor from a pickle
SparseIntVect(const char *pkl, const unsigned int len) {
initFromText(pkl, len);
};
//! destructor (doesn't need to do anything)
~SparseIntVect() {}
#ifdef __clang__
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wtautological-compare"
#elif (defined(__GNUC__) || defined(__GNUG__)) && \
(__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ > 1))
#if (__GNUC__ > 4 || __GNUC_MINOR__ > 5)
#pragma GCC diagnostic push
#endif
#pragma GCC diagnostic ignored "-Wtype-limits"
#endif
//! return the value at an index
int getVal(IndexType idx) const {
if (idx < 0 || idx >= d_length) {
throw IndexErrorException(static_cast<int>(idx));
}
int res = 0;
typename StorageType::const_iterator iter = d_data.find(idx);
if (iter != d_data.end()) {
res = iter->second;
}
return res;
};
//! set the value at an index
void setVal(IndexType idx, int val) {
if (idx < 0 || idx >= d_length) {
throw IndexErrorException(static_cast<int>(idx));
}
if (val != 0) {
d_data[idx] = val;
} else {
d_data.erase(idx);
}
};
#ifdef __clang__
#pragma clang diagnostic pop
#elif (defined(__GNUC__) || defined(__GNUG__)) && \
(__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ > 5))
#pragma GCC diagnostic pop
#endif
//! support indexing using []
int operator[](IndexType idx) const { return getVal(idx); };
//! returns the length
IndexType getLength() const { return d_length; };
//! returns the sum of all the elements in the vect
//! the doAbs argument toggles summing the absolute values of the elements
int getTotalVal(bool doAbs = false) const {
int res = 0;
typename StorageType::const_iterator iter;
for (iter = d_data.begin(); iter != d_data.end(); ++iter) {
if (!doAbs)
res += iter->second;
else
res += abs(iter->second);
}
return res;
};
//! returns the length
unsigned int size() const { return getLength(); };
//! returns our nonzero elements as a map(IndexType->int)
const StorageType &getNonzeroElements() const { return d_data; }
//! this is a "fuzzy" intesection, the final value
//! of each element is equal to the minimum from
//! the two vects.
SparseIntVect<IndexType> &operator&=(const SparseIntVect<IndexType> &other) {
if (other.d_length != d_length) {
throw ValueErrorException("SparseIntVect size mismatch");
}
typename StorageType::iterator iter = d_data.begin();
typename StorageType::const_iterator oIter = other.d_data.begin();
while (iter != d_data.end()) {
// we're relying on the fact that the maps are sorted:
while (oIter != other.d_data.end() && oIter->first < iter->first) {
++oIter;
}
if (oIter != other.d_data.end() && oIter->first == iter->first) {
// found it:
if (oIter->second < iter->second) {
iter->second = oIter->second;
}
++oIter;
++iter;
} else {
// not there; our value is zero, which means
// we should remove this value:
typename StorageType::iterator tmpIter = iter;
++tmpIter;
d_data.erase(iter);
iter = tmpIter;
}
}
return *this;
};
const SparseIntVect<IndexType> operator&(
const SparseIntVect<IndexType> &other) const {
SparseIntVect<IndexType> res(*this);
return res &= other;
}
//! this is a "fuzzy" union, the final value
//! of each element is equal to the maximum from
//! the two vects.
SparseIntVect<IndexType> &operator|=(const SparseIntVect<IndexType> &other) {
if (other.d_length != d_length) {
throw ValueErrorException("SparseIntVect size mismatch");
}
typename StorageType::iterator iter = d_data.begin();
typename StorageType::const_iterator oIter = other.d_data.begin();
while (iter != d_data.end()) {
// we're relying on the fact that the maps are sorted:
while (oIter != other.d_data.end() && oIter->first < iter->first) {
d_data[oIter->first] = oIter->second;
++oIter;
}
if (oIter != other.d_data.end() && oIter->first == iter->first) {
// found it:
if (oIter->second > iter->second) {
iter->second = oIter->second;
}
++oIter;
}
++iter;
}
// finish up the other vect:
while (oIter != other.d_data.end()) {
d_data[oIter->first] = oIter->second;
++oIter;
}
return *this;
};
const SparseIntVect<IndexType> operator|(
const SparseIntVect<IndexType> &other) const {
SparseIntVect<IndexType> res(*this);
return res |= other;
}
SparseIntVect<IndexType> &operator+=(const SparseIntVect<IndexType> &other) {
if (other.d_length != d_length) {
throw ValueErrorException("SparseIntVect size mismatch");
}
typename StorageType::iterator iter = d_data.begin();
typename StorageType::const_iterator oIter = other.d_data.begin();
while (oIter != other.d_data.end()) {
while (iter != d_data.end() && iter->first < oIter->first) {
++iter;
}
if (iter != d_data.end() && oIter->first == iter->first) {
// found it:
iter->second += oIter->second;
if (!iter->second) {
typename StorageType::iterator tIter = iter;
++tIter;
d_data.erase(iter);
iter = tIter;
} else {
++iter;
}
} else {
d_data[oIter->first] = oIter->second;
}
++oIter;
}
return *this;
};
const SparseIntVect<IndexType> operator+(
const SparseIntVect<IndexType> &other) const {
SparseIntVect<IndexType> res(*this);
return res += other;
}
SparseIntVect<IndexType> &operator-=(const SparseIntVect<IndexType> &other) {
if (other.d_length != d_length) {
throw ValueErrorException("SparseIntVect size mismatch");
}
typename StorageType::iterator iter = d_data.begin();
typename StorageType::const_iterator oIter = other.d_data.begin();
while (oIter != other.d_data.end()) {
while (iter != d_data.end() && iter->first < oIter->first) {
++iter;
}
if (iter != d_data.end() && oIter->first == iter->first) {
// found it:
iter->second -= oIter->second;
if (!iter->second) {
typename StorageType::iterator tIter = iter;
++tIter;
d_data.erase(iter);
iter = tIter;
} else {
++iter;
}
} else {
d_data[oIter->first] = -oIter->second;
}
++oIter;
}
return *this;
};
const SparseIntVect<IndexType> operator-(
const SparseIntVect<IndexType> &other) const {
SparseIntVect<IndexType> res(*this);
return res -= other;
}
SparseIntVect<IndexType> &operator*=(int v) {
typename StorageType::iterator iter = d_data.begin();
while (iter != d_data.end()) {
iter->second *= v;
++iter;
}
return *this;
};
SparseIntVect<IndexType> &operator*(int v) {
SparseIntVect<IndexType> res(*this);
return res *= v;
};
SparseIntVect<IndexType> &operator/=(int v) {
typename StorageType::iterator iter = d_data.begin();
while (iter != d_data.end()) {
iter->second /= v;
++iter;
}
return *this;
};
SparseIntVect<IndexType> &operator/(int v) {
SparseIntVect<IndexType> res(*this);
return res /= v;
};
SparseIntVect<IndexType> &operator+=(int v) {
typename StorageType::iterator iter = d_data.begin();
while (iter != d_data.end()) {
iter->second += v;
++iter;
}
return *this;
};
SparseIntVect<IndexType> &operator+(int v) {
SparseIntVect<IndexType> res(*this);
return res += v;
};
SparseIntVect<IndexType> &operator-=(int v) {
typename StorageType::iterator iter = d_data.begin();
while (iter != d_data.end()) {
iter->second -= v;
++iter;
}
return *this;
};
SparseIntVect<IndexType> &operator-(int v) {
SparseIntVect<IndexType> res(*this);
return res -= v;
};
bool operator==(const SparseIntVect<IndexType> &v2) const {
if (d_length != v2.d_length) {
return false;
}
return d_data == v2.d_data;
}
bool operator!=(const SparseIntVect<IndexType> &v2) const {
return !(*this == v2);
}
//! returns a binary string representation (pickle)
std::string toString() const {
std::stringstream ss(std::ios_base::binary | std::ios_base::out |
std::ios_base::in);
boost::uint32_t tInt;
tInt = ci_SPARSEINTVECT_VERSION;
streamWrite(ss, tInt);
tInt = sizeof(IndexType);
streamWrite(ss, tInt);
streamWrite(ss, d_length);
IndexType nEntries = d_data.size();
streamWrite(ss, nEntries);
typename StorageType::const_iterator iter = d_data.begin();
while (iter != d_data.end()) {
streamWrite(ss, iter->first);
boost::int32_t tInt = iter->second;
streamWrite(ss, tInt);
++iter;
}
return ss.str();
};
void fromString(const std::string &txt) {
initFromText(txt.c_str(), txt.length());
}
private:
IndexType d_length;
StorageType d_data;
void initFromText(const char *pkl, const unsigned int len) {
d_data.clear();
std::stringstream ss(std::ios_base::binary | std::ios_base::out |
std::ios_base::in);
ss.write(pkl, len);
boost::uint32_t vers;
streamRead(ss, vers);
if (vers == 0x0001) {
boost::uint32_t tInt;
streamRead(ss, tInt);
if (tInt > sizeof(IndexType)) {
throw ValueErrorException(
"IndexType cannot accomodate index size in SparseIntVect pickle");
}
switch (tInt) {
case sizeof(char):
readVals<unsigned char>(ss);
break;
case sizeof(boost::int32_t):
readVals<boost::uint32_t>(ss);
break;
case sizeof(boost::int64_t):
readVals<boost::uint64_t>(ss);
break;
default:
throw ValueErrorException("unreadable format");
}
} else {
throw ValueErrorException("bad version in SparseIntVect pickle");
}
};
template <typename T>
void readVals(std::stringstream &ss) {
PRECONDITION(sizeof(T) <= sizeof(IndexType), "invalid size");
T tVal;
streamRead(ss, tVal);
d_length = tVal;
T nEntries;
streamRead(ss, nEntries);
for (T i = 0; i < nEntries; ++i) {
streamRead(ss, tVal);
boost::int32_t val;
streamRead(ss, val);
d_data[tVal] = val;
}
}
};
//! increments the element of \c vect at each index found in \c seq
/*!
  An index that occurs several times in \c seq is incremented once per
  occurrence. Every index goes through getVal()/setVal() on \c vect.
*/
template <typename IndexType, typename SequenceType>
void updateFromSequence(SparseIntVect<IndexType> &vect,
                        const SequenceType &seq) {
  for (typename SequenceType::const_iterator entry = seq.begin();
       entry != seq.end(); ++entry) {
    // EFF: one lookup to read and another to write; probably not the most
    // efficient approach
    const IndexType where = *entry;
    const int bumped = vect.getVal(where) + 1;
    vect.setVal(where, bumped);
  }
}
namespace {
template <typename IndexType>
void calcVectParams(const SparseIntVect<IndexType> &v1,
const SparseIntVect<IndexType> &v2, double &v1Sum,
double &v2Sum, double &andSum) {
if (v1.getLength() != v2.getLength()) {
throw ValueErrorException("SparseIntVect size mismatch");
}
v1Sum = v2Sum = andSum = 0.0;
// we're doing : (v1&v2).getTotalVal(), but w/o generating
// the other vector:
typename SparseIntVect<IndexType>::StorageType::const_iterator iter1, iter2;
iter1 = v1.getNonzeroElements().begin();
if (iter1 != v1.getNonzeroElements().end()) v1Sum += abs(iter1->second);
iter2 = v2.getNonzeroElements().begin();
if (iter2 != v2.getNonzeroElements().end()) v2Sum += abs(iter2->second);
while (iter1 != v1.getNonzeroElements().end()) {
while (iter2 != v2.getNonzeroElements().end() &&
iter2->first < iter1->first) {
++iter2;
if (iter2 != v2.getNonzeroElements().end()) v2Sum += abs(iter2->second);
}
if (iter2 != v2.getNonzeroElements().end()) {
if (iter2->first == iter1->first) {
if (abs(iter2->second) < abs(iter1->second)) {
andSum += abs(iter2->second);
} else {
andSum += abs(iter1->second);
}
++iter2;
if (iter2 != v2.getNonzeroElements().end()) v2Sum += abs(iter2->second);
}
++iter1;
if (iter1 != v1.getNonzeroElements().end()) v1Sum += abs(iter1->second);
} else {
break;
}
}
if (iter1 != v1.getNonzeroElements().end()) {
++iter1;
while (iter1 != v1.getNonzeroElements().end()) {
v1Sum += abs(iter1->second);
++iter1;
}
}
if (iter2 != v2.getNonzeroElements().end()) {
++iter2;
while (iter2 != v2.getNonzeroElements().end()) {
v2Sum += abs(iter2->second);
++iter2;
}
}
}
}
//! Dice similarity of two vects: 2 * andSum / (v1Sum + v2Sum)
/*!
  The three sums are those produced by calcVectParams().

  \param v1              the first vect
  \param v2              the second vect
  \param returnDistance  if set, 1 - similarity is returned instead
  \param bounds          if positive (and \c returnDistance is not set), 0.0
                         is returned without comparing individual elements
                         whenever 2 * min(total1, total2) / (total1 + total2),
                         computed from the vects' total absolute values,
                         is below this value

  \return the similarity (or distance); the similarity is taken to be 0.0
          when the denominator is (close to) zero

  \throws ValueErrorException if the two vects differ in length
*/
template <typename IndexType>
double DiceSimilarity(const SparseIntVect<IndexType> &v1,
                      const SparseIntVect<IndexType> &v2,
                      bool returnDistance = false, double bounds = 0.0) {
  if (v1.getLength() != v2.getLength()) {
    throw ValueErrorException("SparseIntVect size mismatch");
  }
  if (!returnDistance && bounds > 0.0) {
    // cheap screen: the shared part can never exceed the smaller total, so
    // this ratio limits the similarity from above. We only get here when a
    // similarity (not a distance) was asked for.
    const double total1 = v1.getTotalVal(true);
    const double total2 = v2.getTotalVal(true);
    const double totalSum = total1 + total2;
    if (std::fabs(totalSum) < 1e-6) {
      return 0.0;
    }
    const double smaller = total1 < total2 ? total1 : total2;
    if (2. * smaller / totalSum < bounds) {
      return 0.0;
    }
  }

  double v1Sum = 0.0;
  double v2Sum = 0.0;
  double numer = 0.0;
  calcVectParams(v1, v2, v1Sum, v2Sum, numer);

  const double denom = v1Sum + v2Sum;
  double sim;
  if (std::fabs(denom) < 1e-6) {
    sim = 0.0;
  } else {
    sim = 2. * numer / denom;
  }
  if (returnDistance) sim = 1. - sim;
  return sim;
}
//! Tversky similarity of two vects:
//!   andSum / (a * v1Sum + b * v2Sum + (1 - a - b) * andSum)
/*!
  The three sums are those produced by calcVectParams().

  \param v1              the first vect
  \param v2              the second vect
  \param a               weight applied to the total of \c v1
  \param b               weight applied to the total of \c v2
  \param returnDistance  if set, 1 - similarity is returned instead
  \param bounds          not used

  \return the similarity (or distance); the similarity is taken to be 0.0
          when the denominator is (close to) zero

  \throws ValueErrorException if the two vects differ in length
*/
template <typename IndexType>
double TverskySimilarity(const SparseIntVect<IndexType> &v1,
                         const SparseIntVect<IndexType> &v2, double a, double b,
                         bool returnDistance = false, double bounds = 0.0) {
  RDUNUSED_PARAM(bounds);
  if (v1.getLength() != v2.getLength()) {
    throw ValueErrorException("SparseIntVect size mismatch");
  }

  double total1 = 0.0;
  double total2 = 0.0;
  double shared = 0.0;
  calcVectParams(v1, v2, total1, total2, shared);

  const double divisor = a * total1 + b * total2 + (1 - a - b) * shared;
  // a vanishing divisor means there is nothing to compare
  const double similarity = fabs(divisor) < 1e-6 ? 0.0 : shared / divisor;
  return returnDistance ? 1. - similarity : similarity;
}
//! Tanimoto similarity of two vects
/*!
  This forwards to TverskySimilarity() with both weights set to 1.0; the
  remaining arguments are passed through unchanged.
*/
template <typename IndexType>
double TanimotoSimilarity(const SparseIntVect<IndexType> &v1,
                          const SparseIntVect<IndexType> &v2,
                          bool returnDistance = false, double bounds = 0.0) {
  const double weightV1 = 1.0;
  const double weightV2 = 1.0;
  return TverskySimilarity(v1, v2, weightV1, weightV2, returnDistance, bounds);
}
}
#endif