mirror of
https://github.com/rdkit/rdkit.git
synced 2026-06-03 21:44:30 +08:00
549 lines
16 KiB
C++
549 lines
16 KiB
C++
// $Id$
//
// Copyright (C) 2007-2008 Greg Landrum
//
// @@ All Rights Reserved @@
// This file is part of the RDKit.
// The contents are covered by the terms of the BSD license
// which is included in the file license.txt, found at the root
// of the RDKit source tree.
//
|
|
#include <RDGeneral/export.h>
|
|
#ifndef __RD_SPARSE_INT_VECT_20070921__
|
|
#define __RD_SPARSE_INT_VECT_20070921__
|
|
|
|
#include <cmath>
#include <cstdint>
#include <cstdlib>
#include <map>
#include <sstream>
#include <string>

#include <RDGeneral/Invariant.h>
#include <RDGeneral/Exceptions.h>
#include <RDGeneral/StreamOps.h>
|
|
|
|
//! version number to use in pickles
const int ci_SPARSEINTVECT_VERSION = 0x0001;
|
|
namespace RDKit {
|
|
//! a class for efficiently storing sparse vectors of ints
|
|
template <typename IndexType>
|
|
class SparseIntVect {
|
|
public:
|
|
typedef std::map<IndexType, int> StorageType;
|
|
|
|
SparseIntVect() : d_length(0){};
|
|
|
|
//! initialize with a particular length
|
|
SparseIntVect(IndexType length) : d_length(length){};
|
|
|
|
//! Copy constructor
|
|
SparseIntVect(const SparseIntVect<IndexType> &other) {
|
|
d_length = other.d_length;
|
|
d_data.insert(other.d_data.begin(), other.d_data.end());
|
|
}
|
|
|
|
//! constructor from a pickle
|
|
SparseIntVect(const std::string &pkl) {
|
|
initFromText(pkl.c_str(), pkl.size());
|
|
};
|
|
//! constructor from a pickle
|
|
SparseIntVect(const char *pkl, const unsigned int len) {
|
|
initFromText(pkl, len);
|
|
};
|
|
|
|
//! destructor (doesn't need to do anything)
|
|
~SparseIntVect() {}
|
|
|
|
#ifdef __clang__
|
|
#pragma clang diagnostic push
|
|
#pragma clang diagnostic ignored "-Wtautological-compare"
|
|
#elif (defined(__GNUC__) || defined(__GNUG__)) && \
|
|
(__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ > 1))
|
|
#if (__GNUC__ > 4 || __GNUC_MINOR__ > 5)
|
|
#pragma GCC diagnostic push
|
|
#endif
|
|
#pragma GCC diagnostic ignored "-Wtype-limits"
|
|
#endif
|
|
//! return the value at an index
|
|
int getVal(IndexType idx) const {
|
|
if (idx < 0 || idx >= d_length) {
|
|
throw IndexErrorException(static_cast<int>(idx));
|
|
}
|
|
int res = 0;
|
|
typename StorageType::const_iterator iter = d_data.find(idx);
|
|
if (iter != d_data.end()) {
|
|
res = iter->second;
|
|
}
|
|
return res;
|
|
};
|
|
|
|
//! set the value at an index
|
|
void setVal(IndexType idx, int val) {
|
|
if (idx < 0 || idx >= d_length) {
|
|
throw IndexErrorException(static_cast<int>(idx));
|
|
}
|
|
if (val != 0) {
|
|
d_data[idx] = val;
|
|
} else {
|
|
d_data.erase(idx);
|
|
}
|
|
};
|
|
#ifdef __clang__
|
|
#pragma clang diagnostic pop
|
|
#elif (defined(__GNUC__) || defined(__GNUG__)) && \
|
|
(__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ > 5))
|
|
#pragma GCC diagnostic pop
|
|
#endif
|
|
//! support indexing using []
|
|
int operator[](IndexType idx) const { return getVal(idx); };
|
|
|
|
//! returns the length
|
|
IndexType getLength() const { return d_length; };
|
|
|
|
//! returns the sum of all the elements in the vect
|
|
//! the doAbs argument toggles summing the absolute values of the elements
|
|
int getTotalVal(bool doAbs = false) const {
|
|
int res = 0;
|
|
typename StorageType::const_iterator iter;
|
|
for (iter = d_data.begin(); iter != d_data.end(); ++iter) {
|
|
if (!doAbs)
|
|
res += iter->second;
|
|
else
|
|
res += abs(iter->second);
|
|
}
|
|
return res;
|
|
};
|
|
//! returns the length
|
|
unsigned int size() const { return getLength(); };
|
|
|
|
//! returns our nonzero elements as a map(IndexType->int)
|
|
const StorageType &getNonzeroElements() const { return d_data; }
|
|
|
|
//! this is a "fuzzy" intesection, the final value
|
|
//! of each element is equal to the minimum from
|
|
//! the two vects.
|
|
SparseIntVect<IndexType> &operator&=(const SparseIntVect<IndexType> &other) {
|
|
if (other.d_length != d_length) {
|
|
throw ValueErrorException("SparseIntVect size mismatch");
|
|
}
|
|
|
|
typename StorageType::iterator iter = d_data.begin();
|
|
typename StorageType::const_iterator oIter = other.d_data.begin();
|
|
while (iter != d_data.end()) {
|
|
// we're relying on the fact that the maps are sorted:
|
|
while (oIter != other.d_data.end() && oIter->first < iter->first) {
|
|
++oIter;
|
|
}
|
|
if (oIter != other.d_data.end() && oIter->first == iter->first) {
|
|
// found it:
|
|
if (oIter->second < iter->second) {
|
|
iter->second = oIter->second;
|
|
}
|
|
++oIter;
|
|
++iter;
|
|
} else {
|
|
// not there; our value is zero, which means
|
|
// we should remove this value:
|
|
typename StorageType::iterator tmpIter = iter;
|
|
++tmpIter;
|
|
d_data.erase(iter);
|
|
iter = tmpIter;
|
|
}
|
|
}
|
|
return *this;
|
|
};
|
|
const SparseIntVect<IndexType> operator&(
|
|
const SparseIntVect<IndexType> &other) const {
|
|
SparseIntVect<IndexType> res(*this);
|
|
return res &= other;
|
|
}
|
|
|
|
//! this is a "fuzzy" union, the final value
|
|
//! of each element is equal to the maximum from
|
|
//! the two vects.
|
|
SparseIntVect<IndexType> &operator|=(const SparseIntVect<IndexType> &other) {
|
|
if (other.d_length != d_length) {
|
|
throw ValueErrorException("SparseIntVect size mismatch");
|
|
}
|
|
|
|
typename StorageType::iterator iter = d_data.begin();
|
|
typename StorageType::const_iterator oIter = other.d_data.begin();
|
|
while (iter != d_data.end()) {
|
|
// we're relying on the fact that the maps are sorted:
|
|
while (oIter != other.d_data.end() && oIter->first < iter->first) {
|
|
d_data[oIter->first] = oIter->second;
|
|
++oIter;
|
|
}
|
|
if (oIter != other.d_data.end() && oIter->first == iter->first) {
|
|
// found it:
|
|
if (oIter->second > iter->second) {
|
|
iter->second = oIter->second;
|
|
}
|
|
++oIter;
|
|
}
|
|
++iter;
|
|
}
|
|
// finish up the other vect:
|
|
while (oIter != other.d_data.end()) {
|
|
d_data[oIter->first] = oIter->second;
|
|
++oIter;
|
|
}
|
|
return *this;
|
|
};
|
|
const SparseIntVect<IndexType> operator|(
|
|
const SparseIntVect<IndexType> &other) const {
|
|
SparseIntVect<IndexType> res(*this);
|
|
return res |= other;
|
|
}
|
|
|
|
SparseIntVect<IndexType> &operator+=(const SparseIntVect<IndexType> &other) {
|
|
if (other.d_length != d_length) {
|
|
throw ValueErrorException("SparseIntVect size mismatch");
|
|
}
|
|
typename StorageType::iterator iter = d_data.begin();
|
|
typename StorageType::const_iterator oIter = other.d_data.begin();
|
|
while (oIter != other.d_data.end()) {
|
|
while (iter != d_data.end() && iter->first < oIter->first) {
|
|
++iter;
|
|
}
|
|
if (iter != d_data.end() && oIter->first == iter->first) {
|
|
// found it:
|
|
iter->second += oIter->second;
|
|
if (!iter->second) {
|
|
typename StorageType::iterator tIter = iter;
|
|
++tIter;
|
|
d_data.erase(iter);
|
|
iter = tIter;
|
|
} else {
|
|
++iter;
|
|
}
|
|
} else {
|
|
d_data[oIter->first] = oIter->second;
|
|
}
|
|
++oIter;
|
|
}
|
|
return *this;
|
|
};
|
|
const SparseIntVect<IndexType> operator+(
|
|
const SparseIntVect<IndexType> &other) const {
|
|
SparseIntVect<IndexType> res(*this);
|
|
return res += other;
|
|
}
|
|
|
|
SparseIntVect<IndexType> &operator-=(const SparseIntVect<IndexType> &other) {
|
|
if (other.d_length != d_length) {
|
|
throw ValueErrorException("SparseIntVect size mismatch");
|
|
}
|
|
typename StorageType::iterator iter = d_data.begin();
|
|
typename StorageType::const_iterator oIter = other.d_data.begin();
|
|
while (oIter != other.d_data.end()) {
|
|
while (iter != d_data.end() && iter->first < oIter->first) {
|
|
++iter;
|
|
}
|
|
if (iter != d_data.end() && oIter->first == iter->first) {
|
|
// found it:
|
|
iter->second -= oIter->second;
|
|
if (!iter->second) {
|
|
typename StorageType::iterator tIter = iter;
|
|
++tIter;
|
|
d_data.erase(iter);
|
|
iter = tIter;
|
|
} else {
|
|
++iter;
|
|
}
|
|
} else {
|
|
d_data[oIter->first] = -oIter->second;
|
|
}
|
|
++oIter;
|
|
}
|
|
return *this;
|
|
};
|
|
const SparseIntVect<IndexType> operator-(
|
|
const SparseIntVect<IndexType> &other) const {
|
|
SparseIntVect<IndexType> res(*this);
|
|
return res -= other;
|
|
}
|
|
SparseIntVect<IndexType> &operator*=(int v) {
|
|
typename StorageType::iterator iter = d_data.begin();
|
|
while (iter != d_data.end()) {
|
|
iter->second *= v;
|
|
++iter;
|
|
}
|
|
return *this;
|
|
};
|
|
SparseIntVect<IndexType> &operator*(int v) {
|
|
SparseIntVect<IndexType> res(*this);
|
|
return res *= v;
|
|
};
|
|
SparseIntVect<IndexType> &operator/=(int v) {
|
|
typename StorageType::iterator iter = d_data.begin();
|
|
while (iter != d_data.end()) {
|
|
iter->second /= v;
|
|
++iter;
|
|
}
|
|
return *this;
|
|
};
|
|
SparseIntVect<IndexType> &operator/(int v) {
|
|
SparseIntVect<IndexType> res(*this);
|
|
return res /= v;
|
|
};
|
|
SparseIntVect<IndexType> &operator+=(int v) {
|
|
typename StorageType::iterator iter = d_data.begin();
|
|
while (iter != d_data.end()) {
|
|
iter->second += v;
|
|
++iter;
|
|
}
|
|
return *this;
|
|
};
|
|
SparseIntVect<IndexType> &operator+(int v) {
|
|
SparseIntVect<IndexType> res(*this);
|
|
return res += v;
|
|
};
|
|
SparseIntVect<IndexType> &operator-=(int v) {
|
|
typename StorageType::iterator iter = d_data.begin();
|
|
while (iter != d_data.end()) {
|
|
iter->second -= v;
|
|
++iter;
|
|
}
|
|
return *this;
|
|
};
|
|
SparseIntVect<IndexType> &operator-(int v) {
|
|
SparseIntVect<IndexType> res(*this);
|
|
return res -= v;
|
|
};
|
|
|
|
bool operator==(const SparseIntVect<IndexType> &v2) const {
|
|
if (d_length != v2.d_length) {
|
|
return false;
|
|
}
|
|
return d_data == v2.d_data;
|
|
}
|
|
bool operator!=(const SparseIntVect<IndexType> &v2) const {
|
|
return !(*this == v2);
|
|
}
|
|
|
|
//! returns a binary string representation (pickle)
|
|
std::string toString() const {
|
|
std::stringstream ss(std::ios_base::binary | std::ios_base::out |
|
|
std::ios_base::in);
|
|
std::uint32_t tInt;
|
|
tInt = ci_SPARSEINTVECT_VERSION;
|
|
streamWrite(ss, tInt);
|
|
tInt = sizeof(IndexType);
|
|
streamWrite(ss, tInt);
|
|
streamWrite(ss, d_length);
|
|
IndexType nEntries = d_data.size();
|
|
streamWrite(ss, nEntries);
|
|
|
|
typename StorageType::const_iterator iter = d_data.begin();
|
|
while (iter != d_data.end()) {
|
|
streamWrite(ss, iter->first);
|
|
std::int32_t tInt = iter->second;
|
|
streamWrite(ss, tInt);
|
|
++iter;
|
|
}
|
|
return ss.str();
|
|
};
|
|
|
|
void fromString(const std::string &txt) {
|
|
initFromText(txt.c_str(), txt.length());
|
|
}
|
|
|
|
private:
|
|
IndexType d_length;
|
|
StorageType d_data;
|
|
|
|
void initFromText(const char *pkl, const unsigned int len) {
|
|
d_data.clear();
|
|
std::stringstream ss(std::ios_base::binary | std::ios_base::out |
|
|
std::ios_base::in);
|
|
ss.write(pkl, len);
|
|
|
|
std::uint32_t vers;
|
|
streamRead(ss, vers);
|
|
if (vers == 0x0001) {
|
|
std::uint32_t tInt;
|
|
streamRead(ss, tInt);
|
|
if (tInt > sizeof(IndexType)) {
|
|
throw ValueErrorException(
|
|
"IndexType cannot accomodate index size in SparseIntVect pickle");
|
|
}
|
|
switch (tInt) {
|
|
case sizeof(char):
|
|
readVals<unsigned char>(ss);
|
|
break;
|
|
case sizeof(std::int32_t):
|
|
readVals<std::uint32_t>(ss);
|
|
break;
|
|
case sizeof(boost::int64_t):
|
|
readVals<boost::uint64_t>(ss);
|
|
break;
|
|
default:
|
|
throw ValueErrorException("unreadable format");
|
|
}
|
|
} else {
|
|
throw ValueErrorException("bad version in SparseIntVect pickle");
|
|
}
|
|
};
|
|
template <typename T>
|
|
void readVals(std::stringstream &ss) {
|
|
PRECONDITION(sizeof(T) <= sizeof(IndexType), "invalid size");
|
|
T tVal;
|
|
streamRead(ss, tVal);
|
|
d_length = tVal;
|
|
T nEntries;
|
|
streamRead(ss, nEntries);
|
|
for (T i = 0; i < nEntries; ++i) {
|
|
streamRead(ss, tVal);
|
|
std::int32_t val;
|
|
streamRead(ss, val);
|
|
d_data[tVal] = val;
|
|
}
|
|
}
|
|
};
|
|
|
|
template <typename IndexType, typename SequenceType>
|
|
void updateFromSequence(SparseIntVect<IndexType> &vect,
|
|
const SequenceType &seq) {
|
|
typename SequenceType::const_iterator seqIt;
|
|
for (seqIt = seq.begin(); seqIt != seq.end(); ++seqIt) {
|
|
// EFF: probably not the most efficient approach
|
|
IndexType idx = *seqIt;
|
|
vect.setVal(idx, vect.getVal(idx) + 1);
|
|
}
|
|
}
|
|
|
|
namespace {
|
|
template <typename IndexType>
|
|
void calcVectParams(const SparseIntVect<IndexType> &v1,
|
|
const SparseIntVect<IndexType> &v2, double &v1Sum,
|
|
double &v2Sum, double &andSum) {
|
|
if (v1.getLength() != v2.getLength()) {
|
|
throw ValueErrorException("SparseIntVect size mismatch");
|
|
}
|
|
v1Sum = v2Sum = andSum = 0.0;
|
|
// we're doing : (v1&v2).getTotalVal(), but w/o generating
|
|
// the other vector:
|
|
typename SparseIntVect<IndexType>::StorageType::const_iterator iter1, iter2;
|
|
iter1 = v1.getNonzeroElements().begin();
|
|
if (iter1 != v1.getNonzeroElements().end()) v1Sum += abs(iter1->second);
|
|
iter2 = v2.getNonzeroElements().begin();
|
|
if (iter2 != v2.getNonzeroElements().end()) v2Sum += abs(iter2->second);
|
|
while (iter1 != v1.getNonzeroElements().end()) {
|
|
while (iter2 != v2.getNonzeroElements().end() &&
|
|
iter2->first < iter1->first) {
|
|
++iter2;
|
|
if (iter2 != v2.getNonzeroElements().end()) v2Sum += abs(iter2->second);
|
|
}
|
|
if (iter2 != v2.getNonzeroElements().end()) {
|
|
if (iter2->first == iter1->first) {
|
|
if (abs(iter2->second) < abs(iter1->second)) {
|
|
andSum += abs(iter2->second);
|
|
} else {
|
|
andSum += abs(iter1->second);
|
|
}
|
|
++iter2;
|
|
if (iter2 != v2.getNonzeroElements().end()) v2Sum += abs(iter2->second);
|
|
}
|
|
++iter1;
|
|
if (iter1 != v1.getNonzeroElements().end()) v1Sum += abs(iter1->second);
|
|
} else {
|
|
break;
|
|
}
|
|
}
|
|
if (iter1 != v1.getNonzeroElements().end()) {
|
|
++iter1;
|
|
while (iter1 != v1.getNonzeroElements().end()) {
|
|
v1Sum += abs(iter1->second);
|
|
++iter1;
|
|
}
|
|
}
|
|
if (iter2 != v2.getNonzeroElements().end()) {
|
|
++iter2;
|
|
while (iter2 != v2.getNonzeroElements().end()) {
|
|
v2Sum += abs(iter2->second);
|
|
++iter2;
|
|
}
|
|
}
|
|
}
|
|
} // namespace
|
|
|
|
template <typename IndexType>
|
|
double DiceSimilarity(const SparseIntVect<IndexType> &v1,
|
|
const SparseIntVect<IndexType> &v2,
|
|
bool returnDistance = false, double bounds = 0.0) {
|
|
if (v1.getLength() != v2.getLength()) {
|
|
throw ValueErrorException("SparseIntVect size mismatch");
|
|
}
|
|
double v1Sum = 0.0;
|
|
double v2Sum = 0.0;
|
|
if (!returnDistance && bounds > 0.0) {
|
|
v1Sum = v1.getTotalVal(true);
|
|
v2Sum = v2.getTotalVal(true);
|
|
double denom = v1Sum + v2Sum;
|
|
if (fabs(denom) < 1e-6) {
|
|
if (returnDistance) {
|
|
return 1.0;
|
|
} else {
|
|
return 0.0;
|
|
}
|
|
}
|
|
double minV = v1Sum < v2Sum ? v1Sum : v2Sum;
|
|
if (2. * minV / denom < bounds) {
|
|
return 0.0;
|
|
}
|
|
v1Sum = 0.0;
|
|
v2Sum = 0.0;
|
|
}
|
|
|
|
double numer = 0.0;
|
|
|
|
calcVectParams(v1, v2, v1Sum, v2Sum, numer);
|
|
|
|
double denom = v1Sum + v2Sum;
|
|
double sim;
|
|
if (fabs(denom) < 1e-6) {
|
|
sim = 0.0;
|
|
} else {
|
|
sim = 2. * numer / denom;
|
|
}
|
|
if (returnDistance) sim = 1. - sim;
|
|
// std::cerr<<" "<<v1Sum<<" "<<v2Sum<<" " << numer << " " << sim <<std::endl;
|
|
return sim;
|
|
}
|
|
|
|
template <typename IndexType>
|
|
double TverskySimilarity(const SparseIntVect<IndexType> &v1,
|
|
const SparseIntVect<IndexType> &v2, double a, double b,
|
|
bool returnDistance = false, double bounds = 0.0) {
|
|
RDUNUSED_PARAM(bounds);
|
|
if (v1.getLength() != v2.getLength()) {
|
|
throw ValueErrorException("SparseIntVect size mismatch");
|
|
}
|
|
double v1Sum = 0.0;
|
|
double v2Sum = 0.0;
|
|
double andSum = 0.0;
|
|
|
|
calcVectParams(v1, v2, v1Sum, v2Sum, andSum);
|
|
|
|
double denom = a * v1Sum + b * v2Sum + (1 - a - b) * andSum;
|
|
double sim;
|
|
|
|
if (fabs(denom) < 1e-6) {
|
|
sim = 0.0;
|
|
} else {
|
|
sim = andSum / denom;
|
|
}
|
|
if (returnDistance) sim = 1. - sim;
|
|
// std::cerr<<" "<<v1Sum<<" "<<v2Sum<<" " << numer << " " << sim <<std::endl;
|
|
return sim;
|
|
}
|
|
|
|
template <typename IndexType>
|
|
double TanimotoSimilarity(const SparseIntVect<IndexType> &v1,
|
|
const SparseIntVect<IndexType> &v2,
|
|
bool returnDistance = false, double bounds = 0.0) {
|
|
return TverskySimilarity(v1, v2, 1.0, 1.0, returnDistance, bounds);
|
|
}
|
|
} // namespace RDKit
|
|
|
|
#endif
|