Optimization of the MaxMin diversity picker (#1574)

* initial submission; needs review

* minor

* line endings

* remove cache functionality since it's now redundant

* add test data

* add a test

* changes from review
This commit is contained in:
Greg Landrum
2017-09-17 15:13:14 +02:00
committed by Brian Kelley
parent c21605dbea
commit fd3524f77c
4 changed files with 1153 additions and 103 deletions

View File

@@ -1,5 +1,6 @@
//
// Copyright (C) 2003-2007 Greg Landrum and Rational Discovery LLC
// Copyright (C) 2017 Greg Landrum and NextMove Software
//
// @@ All Rights Reserved @@
// This file is part of the RDKit.
@@ -7,8 +8,8 @@
// which is included in the file license.txt, found at the root
// of the RDKit source tree.
//
#ifndef __RD_MAXMINPICKER_H__
#define __RD_MAXMINPICKER_H__
#ifndef RD_MAXMINPICKER_H
#define RD_MAXMINPICKER_H
#include <RDGeneral/types.h>
#include <RDGeneral/utils.h>
@@ -110,6 +111,7 @@ class MaxMinPicker : public DistPicker {
unsigned int pickSize, RDKit::INT_VECT firstPicks,
int seed = -1) const {
CHECK_INVARIANT(distMat, "Invalid Distance Matrix");
if (!poolSize) throw ValueErrorException("empty pool to pick from");
if (poolSize < pickSize)
throw ValueErrorException("pickSize cannot be larger than the poolSize");
distmatFunctor functor(distMat);
@@ -123,6 +125,13 @@ class MaxMinPicker : public DistPicker {
return pick(distMat, poolSize, pickSize, iv);
}
};
struct MaxMinPickInfo {
double dist_bound; // distance to closest reference
unsigned int picks; // number of references considered
unsigned int next; // singly linked list of candidates
};
// we implement this here in order to allow arbitrary functors without link
// errors
template <typename T>
@@ -130,74 +139,120 @@ RDKit::INT_VECT MaxMinPicker::lazyPick(T &func, unsigned int poolSize,
unsigned int pickSize,
RDKit::INT_VECT firstPicks,
int seed) const {
if (!poolSize) throw ValueErrorException("empty pool to pick from");
if (poolSize < pickSize)
throw ValueErrorException("pickSize cannot be larger than the poolSize");
RDKit::INT_LIST pool;
RDKit::INT_VECT picks;
unsigned int memsize = (unsigned int)(poolSize * sizeof(MaxMinPickInfo));
MaxMinPickInfo *pinfo = new MaxMinPickInfo[memsize];
if (!pinfo) return picks;
memset(pinfo, 0, memsize);
picks.reserve(pickSize);
unsigned int pick = 0;
// enter the pool into a list so that we can pick out of it easily
for (unsigned int i = 0; i < poolSize; i++) {
pool.push_back(i);
}
// get a seeded random number generator:
typedef boost::mt19937 rng_type;
typedef boost::uniform_int<> distrib_type;
typedef boost::variate_generator<rng_type &, distrib_type> source_type;
rng_type generator(42u);
distrib_type dist(0, poolSize-1);
source_type randomSource(generator, dist);
if (seed > 0) generator.seed(static_cast<rng_type::result_type>(seed));
unsigned int picked = 0; // picks.size()
unsigned int pick;
// pick the first entry
if (!firstPicks.size()) {
if (firstPicks.empty()) {
// get a seeded random number generator:
typedef boost::mt19937 rng_type;
typedef boost::uniform_int<> distrib_type;
typedef boost::variate_generator<rng_type &, distrib_type> source_type;
rng_type generator(42u);
distrib_type dist(0, poolSize - 1);
source_type randomSource(generator, dist);
if (seed > 0) generator.seed(static_cast<rng_type::result_type>(seed));
pick = randomSource();
// add the pick to the picks
picks.push_back(pick);
// and remove it from the pool
pool.remove(pick);
pinfo[pick].picks = 1;
picked = 1;
} else {
for (RDKit::INT_VECT::const_iterator pIdx = firstPicks.begin();
pIdx != firstPicks.end(); ++pIdx) {
pick = static_cast<unsigned int>(*pIdx);
if (pick >= poolSize)
if (pick >= poolSize) {
delete[] pinfo;
throw ValueErrorException("pick index was larger than the poolSize");
}
picks.push_back(pick);
pool.remove(pick);
pinfo[pick].picks = 1;
picked++;
}
}
if (picked >= pickSize) {
delete[] pinfo;
return picks;
}
unsigned int pool_list = 0;
unsigned int *prev = &pool_list;
// enter the pool into a list so that we can pick out of it easily
for (unsigned int i = 0; i < poolSize; i++)
if (pinfo[i].picks == 0) {
*prev = i;
prev = &pinfo[i].next;
}
*prev = 0;
unsigned int poolIdx;
unsigned int pickIdx;
// First pass initialize dist_bound
prev = &pool_list;
pickIdx = picks[0];
do {
poolIdx = *prev;
pinfo[poolIdx].dist_bound = func(poolIdx, pickIdx);
pinfo[poolIdx].picks = 1;
prev = &pinfo[poolIdx].next;
} while (*prev != 0);
// now pick 1 compound at a time
while (picks.size() < pickSize) {
while (picked < pickSize) {
unsigned int *pick_prev = 0;
double maxOFmin = -1.0;
RDKit::INT_LIST_I plri = pool.end();
for (RDKit::INT_LIST_I pli = pool.begin(); pli != pool.end(); ++pli) {
unsigned int poolIdx = (*pli);
double minTOi = RDKit::MAX_DOUBLE;
for (RDKit::INT_VECT_CI pi = picks.begin(); pi != picks.end(); ++pi) {
unsigned int pickIdx = (*pi);
CHECK_INVARIANT(poolIdx != pickIdx, "");
double dist = func(poolIdx, pickIdx);
if (dist <= minTOi) {
minTOi = dist;
prev = &pool_list;
do {
poolIdx = *prev;
double minTOi = pinfo[poolIdx].dist_bound;
if (minTOi > maxOFmin) {
unsigned int pi = pinfo[poolIdx].picks;
while (pi < picked) {
unsigned int picki = picks[pi];
CHECK_INVARIANT(poolIdx != picki, "pool index != pick index");
double dist = func(poolIdx, picki);
pi++;
if (dist <= minTOi) {
minTOi = dist;
if (minTOi <= maxOFmin) break;
}
}
pinfo[poolIdx].dist_bound = minTOi;
pinfo[poolIdx].picks = pi;
if (minTOi > maxOFmin) {
maxOFmin = minTOi;
pick_prev = prev;
pick = poolIdx;
}
}
if (minTOi > maxOFmin ||
(RDKit::feq(minTOi, maxOFmin) && poolIdx < pick)) {
maxOFmin = minTOi;
pick = poolIdx;
plri = pli;
}
}
prev = &pinfo[poolIdx].next;
} while (*prev != 0);
// now add the new pick to picks and remove it from the pool
// now add the new pick to picks and remove it from the pool
*pick_prev = pinfo[pick].next;
picks.push_back(pick);
CHECK_INVARIANT(plri != pool.end(), "");
pool.erase(plri);
picked++;
}
delete[] pinfo;
return picks;
}
};

View File

@@ -1,6 +1,5 @@
// $Id$
//
// Copyright (C) 2003-2008 Greg Landrum and Rational Discovery LLC
// Copyright (C) 2003-2017 Greg Landrum and Rational Discovery LLC
// @@ All Rights Reserved @@
// This file is part of the RDKit.
// The contents are covered by the terms of the BSD license
@@ -57,79 +56,57 @@ RDKit::INT_VECT MaxMinPicks(MaxMinPicker *picker, python::object distMat,
class pyobjFunctor {
public:
pyobjFunctor(python::object obj, bool useCache)
: dp_obj(obj), dp_cache(NULL) {
if (useCache)
dp_cache = new std::map<std::pair<unsigned int, unsigned int>, double>();
}
~pyobjFunctor() { delete dp_cache; }
pyobjFunctor(python::object obj) : dp_obj(obj) {}
~pyobjFunctor() {}
double operator()(unsigned int i, unsigned int j) {
double res;
std::pair<unsigned int, unsigned int> idxPair(i, j);
if (dp_cache && dp_cache->count(idxPair) > 0) {
res = (*dp_cache)[idxPair];
} else {
res = python::extract<double>(dp_obj(i, j));
if (dp_cache) (*dp_cache)[idxPair] = res;
}
return res;
return python::extract<double>(dp_obj(i, j));
}
private:
python::object dp_obj;
std::map<std::pair<unsigned int, unsigned int>, double> *dp_cache;
};
RDKit::INT_VECT LazyMaxMinPicks(MaxMinPicker *picker, python::object distFunc,
int poolSize, int pickSize,
python::object firstPicks, int seed,
bool useCache) {
python::object useCache) {
if (useCache != python::object()) {
BOOST_LOG(rdWarningLog) << "the useCache argument is deprecated and ignored"
<< std::endl;
}
RDKit::INT_VECT firstPickVect;
for (unsigned int i = 0;
i < python::extract<unsigned int>(firstPicks.attr("__len__")()); ++i) {
firstPickVect.push_back(python::extract<int>(firstPicks[i]));
}
RDKit::INT_VECT res;
pyobjFunctor functor(distFunc, useCache);
pyobjFunctor functor(distFunc);
res = picker->lazyPick(functor, poolSize, pickSize, firstPickVect, seed);
return res;
}
// NOTE: TANIMOTO and DICE provably return the same results for the diversity
// picking
// this is still here just in case we ever later want to support other
// picking this is still here just in case we ever later want to support other
// methods.
typedef enum { TANIMOTO = 1, DICE } DistanceMethod;
template <typename BV>
class pyBVFunctor {
public:
pyBVFunctor(const std::vector<const BV *> &obj, DistanceMethod method,
bool useCache)
: d_obj(obj), d_method(method), dp_cache(NULL) {
if (useCache)
dp_cache = new std::map<std::pair<unsigned int, unsigned int>, double>();
}
~pyBVFunctor() { delete dp_cache; }
pyBVFunctor(const std::vector<const BV *> &obj, DistanceMethod method)
: d_obj(obj), d_method(method) {}
~pyBVFunctor() {}
double operator()(unsigned int i, unsigned int j) {
double res = 0.0;
std::pair<unsigned int, unsigned int> idxPair(i, j);
if (dp_cache && dp_cache->count(idxPair) > 0) {
res = (*dp_cache)[idxPair];
} else {
switch (d_method) {
case TANIMOTO:
res = 1. - TanimotoSimilarity(*d_obj[i], *d_obj[j]);
break;
case DICE:
res = 1. - DiceSimilarity(*d_obj[i], *d_obj[j]);
break;
default:
throw_value_error("unsupported similarity value");
}
if (dp_cache) {
(*dp_cache)[idxPair] = res;
}
switch (d_method) {
case TANIMOTO:
res = 1. - TanimotoSimilarity(*d_obj[i], *d_obj[j]);
break;
case DICE:
res = 1. - DiceSimilarity(*d_obj[i], *d_obj[j]);
break;
default:
throw_value_error("unsupported similarity value");
}
return res;
}
@@ -137,18 +114,21 @@ class pyBVFunctor {
private:
const std::vector<const BV *> &d_obj;
DistanceMethod d_method;
std::map<std::pair<unsigned int, unsigned int>, double> *dp_cache;
};
RDKit::INT_VECT LazyVectorMaxMinPicks(MaxMinPicker *picker, python::object objs,
int poolSize, int pickSize,
python::object firstPicks, int seed,
bool useCache) {
python::object useCache) {
if (useCache != python::object()) {
BOOST_LOG(rdWarningLog) << "the useCache argument is deprecated and ignored"
<< std::endl;
}
std::vector<const ExplicitBitVect *> bvs(poolSize);
for (int i = 0; i < poolSize; ++i) {
bvs[i] = python::extract<const ExplicitBitVect *>(objs[i]);
}
pyBVFunctor<ExplicitBitVect> functor(bvs, TANIMOTO, useCache);
pyBVFunctor<ExplicitBitVect> functor(bvs, TANIMOTO);
RDKit::INT_VECT firstPickVect;
for (unsigned int i = 0;
i < python::extract<unsigned int>(firstPicks.attr("__len__")()); ++i) {
@@ -187,7 +167,8 @@ struct MaxMin_wrap {
(python::arg("self"), python::arg("distFunc"),
python::arg("poolSize"), python::arg("pickSize"),
python::arg("firstPicks") = python::tuple(),
python::arg("seed") = -1, python::arg("useCache") = true),
python::arg("seed") = -1,
python::arg("useCache") = python::object()),
"Pick a subset of items from a pool of items using the MaxMin "
"Algorithm\n"
"Ashton, M. et. al., Quant. Struct.-Act. Relat., 21 (2002), "
@@ -205,14 +186,13 @@ struct MaxMin_wrap {
" - firstPicks: (optional) the first items to be picked (seeds "
"the list)\n"
" - seed: (optional) seed for the random number generator\n"
" - useCache: (optional) toggles use of a cache for the distance "
"calculation\n"
" This trades memory usage for speed.\n")
" - useCache: IGNORED\n")
.def("LazyBitVectorPick", RDPickers::LazyVectorMaxMinPicks,
(python::arg("self"), python::arg("objects"),
python::arg("poolSize"), python::arg("pickSize"),
python::arg("firstPicks") = python::tuple(),
python::arg("seed") = -1, python::arg("useCache") = true),
python::arg("seed") = -1,
python::arg("useCache") = python::object()),
"Pick a subset of items from a pool of bit vectors using the "
"MaxMin Algorithm\n"
"Ashton, M. et. al., Quant. Struct.-Act. Relat., 21 (2002), "
@@ -225,11 +205,7 @@ struct MaxMin_wrap {
" - firstPicks: (optional) the first items to be picked (seeds "
"the list)\n"
" - seed: (optional) seed for the random number generator\n"
" - useCache: (optional) toggles use of a cache for the distance "
"calculation\n"
" This trades memory usage for speed.\n"
);
" - useCache: IGNORED.\n");
};
};

View File

@@ -222,6 +222,25 @@ class TestCase(unittest.TestCase):
self.assertEqual(len(mm2), N)
self.assertEqual(list(mm1), list(mm2))
def testBitVectorMaxMin3(self):
fname = os.path.join(RDConfig.RDBaseDir, 'Code', 'SimDivPickers', 'Wrap', 'test_data',
'chembl_cyps.head.fps')
fps = []
with open(fname) as infil:
for line in infil:
fp = DataStructs.CreateFromFPSText(line.strip())
fps.append(fp)
mmp =rdSimDivPickers.MaxMinPicker()
ids=list(mmp.LazyBitVectorPick(fps,len(fps),20))
self.assertEqual(ids,[374,720,690,339,875,842,404,725,120,385,115,868,630,\
881,516,497,412,718,869,407])
ids=list(mmp.LazyBitVectorPick(fps,len(fps),20,firstPicks=[374,720,690,339,875]))
self.assertEqual(ids,[374,720,690,339,875,842,404,725,120,385,115,868,630,\
881,516,497,412,718,869,407])
if __name__ == '__main__':
unittest.main()

File diff suppressed because it is too large Load Diff