[WIP] Add Leader picker implementation (#2724)

* initial commit of Roger's contrib

* add the new file to the headers list

* clang-format

* stub of the python wrapper

* code added for testing

* code added for testing

* add LeaderPicker.seq.h

* default pickSize value

* first crude python wrapper

* Add C++ tests for the LeaderPicker
combine the thread and non-thread versions

* that was not really a test

* support providing a functor from Python

* no longer need the .seq header

* temporarily disable the threaded version to allow CI runs to pass

* some refactoring and cleanup
This commit is contained in:
Greg Landrum
2019-10-24 13:14:15 +02:00
committed by GitHub
parent 66d92b1be0
commit 6cfc8f36a7
12 changed files with 1025 additions and 108 deletions

View File

@@ -4,12 +4,16 @@ rdkit_library(SimDivPickers
DistPicker.cpp MaxMinPicker.cpp HierarchicalClusterPicker.cpp
LINK_LIBRARIES hc RDGeneral)
rdkit_headers(DistPicker.h
rdkit_headers(DistPicker.h LeaderPicker.h
HierarchicalClusterPicker.h
MaxMinPicker.h DEST SimDivPickers)
rdkit_test(testSimDivPickers testPickers.cpp LINK_LIBRARIES SimDivPickers RDGeneral)
rdkit_catch_test(pickersTestsCatch catch_tests.cpp
LINK_LIBRARIES SimDivPickers DataStructs RDGeneral)
if(RDK_BUILD_PYTHON_WRAPPERS)
add_subdirectory(Wrap)
endif()

View File

@@ -73,6 +73,20 @@ class RDKIT_SIMDIVPICKERS_EXPORT DistPicker {
virtual RDKit::INT_VECT pick(const double *distMat, unsigned int poolSize,
unsigned int pickSize) const = 0;
};
namespace {
class RDKIT_SIMDIVPICKERS_EXPORT distmatFunctor {
public:
distmatFunctor(const double *distMat) : dp_distMat(distMat){};
double operator()(unsigned int i, unsigned int j) {
return getDistFromLTM(this->dp_distMat, i, j);
}
private:
const double *dp_distMat;
};
} // namespace
}; // namespace RDPickers
#endif

View File

@@ -0,0 +1,430 @@
//
// Copyright (C) 2003-2007 Greg Landrum and Rational Discovery LLC
// Copyright (C) 2017-2019 Greg Landrum and NextMove Software
//
// @@ All Rights Reserved @@
// This file is part of the RDKit.
// The contents are covered by the terms of the BSD license
// which is included in the file license.txt, found at the root
// of the RDKit source tree.
//
#include <RDGeneral/export.h>
#ifndef RD_LEADERPICKER_H
#define RD_LEADERPICKER_H
#include <RDGeneral/types.h>
#include <RDGeneral/utils.h>
#include <RDGeneral/Invariant.h>
#include <RDGeneral/RDLog.h>
#include <RDGeneral/Exceptions.h>
#include <cstdlib>
#include "DistPicker.h"
namespace RDPickers {
/*! \brief Implements the Leader algorithm for picking a subset of item from a
*pool
*
* This class inherits from the DistPicker and implements a specific picking
*strategy aimed at diversity. See documentation for "pick()" member function
*for the algorithm details
*/
class RDKIT_SIMDIVPICKERS_EXPORT LeaderPicker : public DistPicker {
public:
double default_threshold;
int default_nthreads;
/*! \brief Default Constructor
*
*/
LeaderPicker() : default_threshold(0.0), default_nthreads(1) {}
LeaderPicker(double threshold)
: default_threshold(threshold), default_nthreads(1) {}
LeaderPicker(double threshold, int nthreads)
: default_threshold(threshold), default_nthreads(nthreads) {}
/*! \brief Contains the implementation for a lazy Leader diversity picker
*
* See the documentation for the pick() method for details about the algorithm
*
* \param func - a function (or functor) taking two unsigned ints as
*arguments and returning the distance (as a double) between those two
*elements.
*
* \param poolSize - the size of the pool to pick the items from. It is
*assumed that the distance matrix above contains the right number of
*elements; i.e. poolSize*(poolSize-1)
*
* \param pickSize - the number items to pick from pool (<= poolSize)
*
* \param firstPicks - (optional)the first items in the pick list
*
* \param seed - (optional) seed for the random number generator. If this is
*<0 the generator will be seeded with a random number.
*/
template <typename T>
RDKit::INT_VECT lazyPick(T &func, unsigned int poolSize,
unsigned int pickSize) const;
template <typename T>
RDKit::INT_VECT lazyPick(T &func, unsigned int poolSize,
unsigned int pickSize, double threshold) const;
template <typename T>
RDKit::INT_VECT lazyPick(T &func, unsigned int poolSize,
unsigned int pickSize,
const RDKit::INT_VECT &firstPicks,
double threshold) const;
template <typename T>
RDKit::INT_VECT lazyPick(T &func, unsigned int poolSize,
unsigned int pickSize,
const RDKit::INT_VECT &firstPicks, double threshold,
int nthreads) const;
/*! \brief Contains the implementation for the Leader diversity picker
*
* \param distMat - distance matrix - a vector of double. It is assumed that
*only the lower triangle element of the matrix are supplied in a 1D array\n
*
* \param poolSize - the size of the pool to pick the items from. It is
*assumed that the distance matrix above contains the right number of
*elements; i.e. poolSize*(poolSize-1) \n
*
* \param pickSize - maximum number items to pick from pool (<= poolSize)
*
* \param firstPicks - indices of the items used to seed the pick set.
*/
RDKit::INT_VECT pick(const double *distMat, unsigned int poolSize,
unsigned int pickSize, const RDKit::INT_VECT &firstPicks,
double threshold, int nthreads) const {
CHECK_INVARIANT(distMat, "Invalid Distance Matrix");
if (!poolSize) throw ValueErrorException("empty pool to pick from");
if (poolSize < pickSize)
throw ValueErrorException("pickSize cannot be larger than the poolSize");
distmatFunctor functor(distMat);
return this->lazyPick(functor, poolSize, pickSize, firstPicks, threshold,
nthreads);
}
/*! \overload */
RDKit::INT_VECT pick(const double *distMat, unsigned int poolSize,
unsigned int pickSize) const {
RDKit::INT_VECT iv;
return pick(distMat, poolSize, pickSize, iv, default_threshold,
default_nthreads);
}
};
#ifdef USE_THREADED_LEADERPICKER
// Note that this block of code currently only works on linux (which is why it's disabled by default)
// We will revisit this during the 2020.03 release cycle in order to get a multi-threaded version of
// the LeaderPicker that works on all supported platforms
template <typename T>
void *LeaderPickerWork(void *arg);
template <typename T>
struct LeaderPickerState {
typedef struct {
int *ptr;
unsigned int capacity;
unsigned int len;
unsigned int next[2];
} LeaderPickerBlock;
typedef struct {
LeaderPickerState<T> *stat;
pthread_t tid;
unsigned int id;
} LeaderPickerThread;
std::vector<LeaderPickerThread> threads;
std::vector<LeaderPickerBlock> blocks;
pthread_barrier_t wait;
pthread_barrier_t done;
std::vector<int> v;
LeaderPickerBlock *head_block;
unsigned int thread_op;
unsigned int nthreads;
unsigned int tick;
double threshold;
int query;
T *func;
LeaderPickerState(unsigned int count, int nt) {
v.resize(count);
for (unsigned int i = 0; i < count; i++) v[i] = i;
// InitializeBlocks
unsigned int bcount;
unsigned int bsize;
if (nt > 1) {
bsize = 4096;
bcount = (count + (bsize - 1)) / bsize;
unsigned int tasks = (bcount + 1) / 2;
// limit number of threads to available work
if (nt > (int)tasks) nt = tasks;
} else {
bsize = 32768;
bcount = (count + (bsize - 1)) / bsize;
}
blocks.resize(bcount);
head_block = &blocks[0];
tick = 0;
if (bcount > 1) {
int *ptr = &v[0];
unsigned int len = count;
for (unsigned int i = 0; i < bcount; i++) {
LeaderPickerBlock *block = &blocks[i];
block->ptr = ptr;
if (len > bsize) {
block->capacity = bsize;
block->len = bsize;
block->next[0] = i + 1;
} else {
block->capacity = len;
block->len = len;
block->next[0] = 0;
break;
}
ptr += bsize;
len -= bsize;
}
} else {
head_block->capacity = count;
head_block->len = count;
head_block->next[0] = 0;
head_block->next[1] = 0;
head_block->ptr = &v[0];
}
// InitializeThreads
if (nt > 1) {
nthreads = nt;
pthread_barrier_init(&wait, NULL, nthreads + 1);
pthread_barrier_init(&done, NULL, nthreads + 1);
threads.resize(nt);
for (unsigned int i = 0; i < nthreads; i++) {
threads[i].id = i;
threads[i].stat = this;
pthread_create(&threads[i].tid, NULL, LeaderPickerWork<T>,
(void *)&threads[i]);
}
} else
nthreads = 1;
}
~LeaderPickerState() {
if (nthreads > 1) {
thread_op = 1;
pthread_barrier_wait(&wait);
for (unsigned int i = 0; i < nthreads; i++)
pthread_join(threads[i].tid, 0);
pthread_barrier_destroy(&wait);
pthread_barrier_destroy(&done);
}
}
bool empty() {
while (head_block) {
if (head_block->len) return false;
unsigned int next_tick = head_block->next[tick];
if (!next_tick) return true;
head_block = &blocks[next_tick];
}
return true;
}
unsigned int compact(int *dst, int *src, unsigned int len) {
unsigned int count = 0;
for (unsigned int i = 0; i < len; i++) {
if ((*func)(query, src[i]) > threshold) dst[count++] = src[i];
}
return count;
}
void compact_job(unsigned int cycle) {
// On entry, next[tick] for each block is the current linked list.
// On exit, next[tock] is the linked list for the next iteration.
unsigned int tock = tick ^ 1;
LeaderPickerBlock *list = head_block;
for (;;) {
unsigned int next_tick = list->next[tick];
if (next_tick) {
LeaderPickerBlock *next = &blocks[next_tick];
unsigned int next_next_tick = next->next[tick];
if (cycle == 0) {
list->len = compact(list->ptr, list->ptr, list->len);
if (list->len + next->len <= list->capacity) {
list->len += compact(list->ptr + list->len, next->ptr, next->len);
list->next[tock] = next_next_tick;
} else {
next->len = compact(next->ptr, next->ptr, next->len);
if (next->len) {
list->next[tock] = next_tick;
next->next[tock] = next_next_tick;
} else
list->next[tock] = next_next_tick;
}
cycle = nthreads - 1;
} else
cycle--;
if (next_next_tick) {
list = &blocks[next_next_tick];
} else
break;
} else {
if (cycle == 0) {
list->len = compact(list->ptr, list->ptr, list->len);
list->next[tock] = 0;
}
break;
}
}
}
void compact(int pick) {
query = pick;
if (nthreads > 1) {
thread_op = 0;
pthread_barrier_wait(&wait);
pthread_barrier_wait(&done);
} else
compact_job(0);
tick ^= 1;
}
int compact_next() {
compact(head_block->ptr[0]);
return query;
}
};
// This is the loop the worker threads run
template <typename T>
void *LeaderPickerWork(void *arg) {
typename LeaderPickerState<T>::LeaderPickerThread *thread;
thread = (typename LeaderPickerState<T>::LeaderPickerThread *)arg;
LeaderPickerState<T> *stat = thread->stat;
for (;;) {
pthread_barrier_wait(&stat->wait);
if (stat->thread_op) return (void *)0;
stat->compact_job(thread->id);
pthread_barrier_wait(&stat->done);
}
}
#else
template <typename T>
struct LeaderPickerState {
std::vector<int> v;
unsigned int left;
double threshold;
int query;
T *func;
LeaderPickerState(unsigned int count, int) {
v.resize(count);
for (unsigned int i = 0; i < count; i++) v[i] = i;
left = count;
}
bool empty() { return left == 0; }
unsigned int compact(int *dst, int *src, unsigned int len) {
unsigned int count = 0;
for (unsigned int i = 0; i < len; i++) {
double ld = (*func)(query, src[i]);
// std::cerr << query << "-" << src[i] << " " << ld << std::endl;
if (ld > threshold) dst[count++] = src[i];
}
return count;
}
void compact(int pick) {
query = pick;
left = compact(&v[0], &v[0], left);
}
int compact_next() {
query = v[0];
left = compact(&v[0], &v[1], left - 1);
return query;
}
};
#endif
// we implement this here in order to allow arbitrary functors without link
// errors
template <typename T>
RDKit::INT_VECT LeaderPicker::lazyPick(T &func, unsigned int poolSize,
unsigned int pickSize,
const RDKit::INT_VECT &firstPicks,
double threshold, int nthreads) const {
if (!poolSize) throw ValueErrorException("empty pool to pick from");
if (poolSize < pickSize)
throw ValueErrorException("pickSize cannot be larger than the poolSize");
if (!pickSize) pickSize = poolSize;
RDKit::INT_VECT picks;
LeaderPickerState<T> stat(poolSize, nthreads);
stat.threshold = threshold;
stat.func = &func;
unsigned int picked = 0; // picks.size()
unsigned int pick = 0;
if (!firstPicks.empty()) {
for (RDKit::INT_VECT::const_iterator pIdx = firstPicks.begin();
pIdx != firstPicks.end(); ++pIdx) {
pick = static_cast<unsigned int>(*pIdx);
if (pick >= poolSize) {
throw ValueErrorException("pick index was larger than the poolSize");
}
picks.push_back(pick);
stat.compact(pick);
picked++;
}
}
while (picked < pickSize && !stat.empty()) {
pick = stat.compact_next();
picks.push_back(pick);
picked++;
}
return picks;
}
template <typename T>
RDKit::INT_VECT LeaderPicker::lazyPick(T &func, unsigned int poolSize,
unsigned int pickSize) const {
RDKit::INT_VECT firstPicks;
return LeaderPicker::lazyPick(func, poolSize, pickSize, firstPicks,
default_threshold, default_nthreads);
}
template <typename T>
RDKit::INT_VECT LeaderPicker::lazyPick(T &func, unsigned int poolSize,
unsigned int pickSize,
double threshold) const {
RDKit::INT_VECT firstPicks;
return LeaderPicker::lazyPick(func, poolSize, pickSize, firstPicks, threshold,
default_nthreads);
}
template <typename T>
RDKit::INT_VECT LeaderPicker::lazyPick(T &func, unsigned int poolSize,
unsigned int pickSize,
const RDKit::INT_VECT &firstPicks,
double threshold) const {
return LeaderPicker::lazyPick(func, poolSize, pickSize, firstPicks, threshold,
default_nthreads);
}
}; // namespace RDPickers
#endif

View File

@@ -24,19 +24,6 @@
namespace RDPickers {
namespace {
class RDKIT_SIMDIVPICKERS_EXPORT distmatFunctor {
public:
distmatFunctor(const double *distMat) : dp_distMat(distMat){};
double operator()(unsigned int i, unsigned int j) {
return getDistFromLTM(this->dp_distMat, i, j);
}
private:
const double *dp_distMat;
};
} // namespace
/*! \brief Implements the MaxMin algorithm for picking a subset of item from a
*pool
*
@@ -272,7 +259,7 @@ RDKit::INT_VECT MaxMinPicker::lazyPick(T &func, unsigned int poolSize,
} while (*prev != 0);
// if the current distance is closer then threshold, we're done
if (threshold >= 0.0 && maxOFmin < threshold) break;
if (maxOFmin <= threshold && threshold >= 0.0) break;
tmpThreshold = maxOFmin;
// now add the new pick to picks and remove it from the pool
*pick_prev = pinfo[pick].next;

View File

@@ -1,6 +1,6 @@
remove_definitions(-DRDKIT_SIMDIVPICKERS_BUILD)
rdkit_python_extension(rdSimDivPickers
MaxMinPicker.cpp HierarchicalClusterPicker.cpp
MaxMinPicker.cpp LeaderPicker.cpp HierarchicalClusterPicker.cpp
rdSimDivPickers.cpp
DEST SimDivFilters
LINK_LIBRARIES SimDivPickers

View File

@@ -0,0 +1,109 @@
//
// Copyright (C) 2019 Greg Landrum
// @@ All Rights Reserved @@
// This file is part of the RDKit.
// The contents are covered by the terms of the BSD license
// which is included in the file license.txt, found at the root
// of the RDKit source tree.
//
#define NO_IMPORT_ARRAY
#define PY_ARRAY_UNIQUE_SYMBOL rdpicker_array_API
#include <RDBoost/python.h>
#include <RDBoost/Wrap.h>
#include <RDBoost/boost_numpy.h>
#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
#include <numpy/arrayobject.h>
#include <map>
#include "PickerHelpers.h"
#include <DataStructs/BitVects.h>
#include <DataStructs/BitOps.h>
#include <SimDivPickers/DistPicker.h>
#include <SimDivPickers/LeaderPicker.h>
#include <iostream>
#include <utility>
namespace python = boost::python;
namespace RDPickers {
namespace {
template <typename T>
void LazyLeaderHelper(LeaderPicker *picker, T functor, unsigned int poolSize,
double &threshold, unsigned int pickSize,
python::object firstPicks, RDKit::INT_VECT &res,
int nThreads) {
RDKit::INT_VECT firstPickVect;
for (unsigned int i = 0;
i < python::extract<unsigned int>(firstPicks.attr("__len__")()); ++i) {
firstPickVect.push_back(python::extract<int>(firstPicks[i]));
}
res = picker->lazyPick(functor, poolSize, pickSize, firstPickVect, threshold,
nThreads);
}
} // end of anonymous namespace
RDKit::INT_VECT LazyVectorLeaderPicks(LeaderPicker *picker, python::object objs,
int poolSize, double threshold,
int pickSize, python::object firstPicks,
int numThreads) {
std::vector<const ExplicitBitVect *> bvs(poolSize);
for (int i = 0; i < poolSize; ++i) {
bvs[i] = python::extract<const ExplicitBitVect *>(objs[i]);
}
pyBVFunctor<ExplicitBitVect> functor(bvs, TANIMOTO);
RDKit::INT_VECT res;
LazyLeaderHelper(picker, functor, poolSize, threshold, pickSize, firstPicks,
res, numThreads);
return res;
}
RDKit::INT_VECT LazyLeaderPicks(LeaderPicker *picker, python::object distFunc,
int poolSize, double threshold, int pickSize,
python::object firstPicks, int numThreads) {
pyobjFunctor functor(distFunc);
RDKit::INT_VECT res;
LazyLeaderHelper(picker, functor, poolSize, threshold, pickSize, firstPicks,
res, numThreads);
return res;
}
} // end of namespace RDPickers
struct LeaderPicker_wrap {
static void wrap() {
python::class_<RDPickers::LeaderPicker>(
"LeaderPicker",
"A class for diversity picking of items using Roger Sayle's Leader "
"algorithm (analogous to sphere exclusion). The algorithm is "
"currently unpublished, but a description is available in this "
"presentation from the 2019 RDKit UGM: "
"https://github.com/rdkit/UGM_2019/raw/master/Presentations/"
"Sayle_Clustering.pdf\n")
.def("LazyBitVectorPick", RDPickers::LazyVectorLeaderPicks,
(python::arg("self"), python::arg("objects"),
python::arg("poolSize"), python::arg("threshold"),
python::arg("pickSize") = 0,
python::arg("firstPicks") = python::tuple(),
python::arg("numThreads") = 1),
"Pick a subset of items from a collection of bit vectors using "
"Tanimoto distance. The threshold value is a "
"*distance* (i.e. 1-similarity). Note that the numThreads "
"argument is currently ignored.")
.def("LazyPick", RDPickers::LazyLeaderPicks,
(python::arg("self"), python::arg("distFunc"),
python::arg("poolSize"), python::arg("threshold"),
python::arg("pickSize") = 0,
python::arg("firstPicks") = python::tuple(),
python::arg("numThreads") = 1),
"Pick a subset of items from a pool of items using the "
"user-provided function to determine distances. Note that the "
"numThreads argument is currently ignored.")
;
};
};
void wrap_leaderpick() { LeaderPicker_wrap::wrap(); }

View File

@@ -16,6 +16,8 @@
#include <numpy/arrayobject.h>
#include <map>
#include "PickerHelpers.h"
#include <DataStructs/BitVects.h>
#include <DataStructs/BitOps.h>
#include <SimDivPickers/DistPicker.h>
@@ -55,18 +57,6 @@ RDKit::INT_VECT MaxMinPicks(MaxMinPicker *picker, python::object distMat,
return res;
}
class pyobjFunctor {
public:
pyobjFunctor(python::object obj) : dp_obj(std::move(obj)) {}
~pyobjFunctor() {}
double operator()(unsigned int i, unsigned int j) {
return python::extract<double>(dp_obj(i, j));
}
private:
python::object dp_obj;
};
namespace {
template <typename T>
void LazyMaxMinHelper(MaxMinPicker *picker, T functor, unsigned int poolSize,
@@ -107,37 +97,6 @@ python::tuple LazyMaxMinPicksWithThreshold(
return python::make_tuple(res, threshold);
}
// NOTE: TANIMOTO and DICE provably return the same results for the diversity
// picking this is still here just in case we ever later want to support other
// methods.
typedef enum { TANIMOTO = 1, DICE } DistanceMethod;
template <typename BV>
class pyBVFunctor {
public:
pyBVFunctor(const std::vector<const BV *> &obj, DistanceMethod method)
: d_obj(obj), d_method(method) {}
~pyBVFunctor() {}
double operator()(unsigned int i, unsigned int j) {
double res = 0.0;
switch (d_method) {
case TANIMOTO:
res = 1. - TanimotoSimilarity(*d_obj[i], *d_obj[j]);
break;
case DICE:
res = 1. - DiceSimilarity(*d_obj[i], *d_obj[j]);
break;
default:
throw_value_error("unsupported similarity value");
}
return res;
}
private:
const std::vector<const BV *> &d_obj;
DistanceMethod d_method;
};
RDKit::INT_VECT LazyVectorMaxMinPicks(MaxMinPicker *picker, python::object objs,
int poolSize, int pickSize,
python::object firstPicks, int seed,

View File

@@ -0,0 +1,60 @@
//
// Copyright (C) 2019 Greg Landrum
// @@ All Rights Reserved @@
// This file is part of the RDKit.
// The contents are covered by the terms of the BSD license
// which is included in the file license.txt, found at the root
// of the RDKit source tree.
//
#ifndef RDKIT_PICKERHELPERS_H
#define RDKIT_PICKERHELPERS_H
#include <vector>
#include <DataStructs/BitOps.h>
// NOTE: TANIMOTO and DICE provably return the same results for the diversity
// picking this is still here just in case we ever later want to support other
// methods.
typedef enum { TANIMOTO = 1, DICE } DistanceMethod;
template <typename BV>
class pyBVFunctor {
public:
pyBVFunctor(const std::vector<const BV *> &obj, DistanceMethod method)
: d_obj(obj), d_method(method) {}
~pyBVFunctor() {}
double operator()(unsigned int i, unsigned int j) {
double res = 0.0;
switch (d_method) {
case TANIMOTO:
res = 1. - TanimotoSimilarity(*d_obj[i], *d_obj[j]);
break;
case DICE:
res = 1. - DiceSimilarity(*d_obj[i], *d_obj[j]);
break;
default:
throw_value_error("unsupported similarity value");
}
return res;
}
private:
const std::vector<const BV *> &d_obj;
DistanceMethod d_method;
};
class pyobjFunctor {
public:
pyobjFunctor(python::object obj) : dp_obj(std::move(obj)) {}
~pyobjFunctor() {}
double operator()(unsigned int i, unsigned int j) {
return python::extract<double>(dp_obj(i, j));
}
private:
python::object dp_obj;
};
#endif // RDKIT_PICKERHELPERS_H

View File

@@ -1,6 +1,6 @@
// $Id$
//
// Copyright (C) 2003-2006 Rational Discovery LLC
// Copyright (C) 2019 Greg Landrum
//
// @@ All Rights Reserved @@
// This file is part of the RDKit.
@@ -15,6 +15,7 @@
namespace python = boost::python;
void wrap_maxminpick();
void wrap_leaderpick();
void wrap_HierarchCP();
BOOST_PYTHON_MODULE(rdSimDivPickers) {
@@ -24,5 +25,6 @@ BOOST_PYTHON_MODULE(rdSimDivPickers) {
rdkit_import_array();
wrap_maxminpick();
wrap_leaderpick();
wrap_HierarchCP();
}

View File

@@ -136,7 +136,7 @@ class TestCase(unittest.TestCase):
# we get the occasional dupe randomly,
# make sure we don't get three dupes in a row
self.assertTrue(tuple(mm2)!=tuple(mm1)) or (tuple(mm3) != tuple(mm1))
self.assertTrue(tuple(mm2) != tuple(mm1)) or (tuple(mm3) != tuple(mm1))
picker = None
ds = []
@@ -184,40 +184,41 @@ class TestCase(unittest.TestCase):
self.assertEqual(list(mm1), list(mm2))
def testBitVectorMaxMin2(self):
fps = ["11110010101000000000", "00000000000010010000", "11001010000000000001",
"00100110101000001000", "01010110000100011001", "11000110101001000011",
"00000000001100001111", "00011110110000001101", "00000011011110100010",
"11000010110001000000", "00000100010000010000", "10000001000010110010",
"00010010000000010100", "00011100100110101000", "10001001100110100000",
"10000110100110010000", "00101110000101000000", "11011101100011100000",
"10000110000100101000", "00101000100000010001", "01000001000010000000",
"00101101010100000110", "10001000100110110001", "00011000010100000001",
"00101000001000100011", "00010000100010011001", "01100001000100010001",
"10000101000001101101", "00001000011001011000", "11110000100100100000",
"10100110000000011010", "00110100010110010010", "00000000000001010010",
"00100000000010100001", "11110011000010001000", "10110001010100001000",
"00001100100110011011", "00010010100100001110", "10100101100010100010",
"01100100010100000001", "10101110011100000000", "01011000000001000001",
"00000011100110100010", "01100001010001001001", "00001000000001001100",
"10011001110000000100", "10110000001001100100", "00011000000001001011",
"11001011010001100010", "10010000000001001011", "00010000100111100000",
"00001000001110001000", "11010000010001100110", "01101001100000111000",
"01001000001110111000", "10000000000100010010", "11001000010010000000",
"01010010000100110001", "00010001010100100001", "01110010000000010000",
"10001010000011000001", "00000110000000100100", "00010000010001000000",
"11101100011010000011", "00000010100001010001", "00010000110010000101",
"00010001001000111001", "01000010001100100110", "00110110000000100001",
"00100010010110110010", "01000000110011001111", "00011000001000110010",
"01111010101000110100", "00001010000010110110", "00110011000011011010",
"00111010111010000110", "00010011101010000011", "00000001011000010000",
"00011011101110110000", "00010001101000000001", "00010000001010011010",
"00000010100100100010", "00000010001011000100", "11010000000001011100",
"00001000110101000001", "00000010000000110010", "10000000010011000001",
"11110110100100010000", "10001111000110001001", "00100110000110000100",
"00000100100000100100", "00110000101100010100", "00001010100000100000",
"01011000000011000111", "00010000100001010001", "10000010100000010000",
"00001000000000110010", "00001000101011010001", "00011110000100100000",
"11001001010001010100"]
fps = [
"11110010101000000000", "00000000000010010000", "11001010000000000001",
"00100110101000001000", "01010110000100011001", "11000110101001000011",
"00000000001100001111", "00011110110000001101", "00000011011110100010",
"11000010110001000000", "00000100010000010000", "10000001000010110010",
"00010010000000010100", "00011100100110101000", "10001001100110100000",
"10000110100110010000", "00101110000101000000", "11011101100011100000",
"10000110000100101000", "00101000100000010001", "01000001000010000000",
"00101101010100000110", "10001000100110110001", "00011000010100000001",
"00101000001000100011", "00010000100010011001", "01100001000100010001",
"10000101000001101101", "00001000011001011000", "11110000100100100000",
"10100110000000011010", "00110100010110010010", "00000000000001010010",
"00100000000010100001", "11110011000010001000", "10110001010100001000",
"00001100100110011011", "00010010100100001110", "10100101100010100010",
"01100100010100000001", "10101110011100000000", "01011000000001000001",
"00000011100110100010", "01100001010001001001", "00001000000001001100",
"10011001110000000100", "10110000001001100100", "00011000000001001011",
"11001011010001100010", "10010000000001001011", "00010000100111100000",
"00001000001110001000", "11010000010001100110", "01101001100000111000",
"01001000001110111000", "10000000000100010010", "11001000010010000000",
"01010010000100110001", "00010001010100100001", "01110010000000010000",
"10001010000011000001", "00000110000000100100", "00010000010001000000",
"11101100011010000011", "00000010100001010001", "00010000110010000101",
"00010001001000111001", "01000010001100100110", "00110110000000100001",
"00100010010110110010", "01000000110011001111", "00011000001000110010",
"01111010101000110100", "00001010000010110110", "00110011000011011010",
"00111010111010000110", "00010011101010000011", "00000001011000010000",
"00011011101110110000", "00010001101000000001", "00010000001010011010",
"00000010100100100010", "00000010001011000100", "11010000000001011100",
"00001000110101000001", "00000010000000110010", "10000000010011000001",
"11110110100100010000", "10001111000110001001", "00100110000110000100",
"00000100100000100100", "00110000101100010100", "00001010100000100000",
"01011000000011000111", "00010000100001010001", "10000010100000010000",
"00001000000000110010", "00001000101011010001", "00011110000100100000", "11001001010001010100"
]
N = 5
fps = [DataStructs.CreateFromBitString(x) for x in fps]
picker = rdSimDivPickers.MaxMinPicker()
@@ -235,14 +236,15 @@ class TestCase(unittest.TestCase):
fps = []
with open(fname) as infil:
for line in infil:
fp = DataStructs.CreateFromFPSText(line.strip())
fps.append(fp)
mmp =rdSimDivPickers.MaxMinPicker()
ids=list(mmp.LazyBitVectorPick(fps,len(fps),20,seed=42))
fp = DataStructs.CreateFromFPSText(line.strip())
fps.append(fp)
mmp = rdSimDivPickers.MaxMinPicker()
ids = list(mmp.LazyBitVectorPick(fps, len(fps), 20, seed=42))
self.assertEqual(ids,[374,720,690,339,875,842,404,725,120,385,115,868,630,\
881,516,497,412,718,869,407])
ids=list(mmp.LazyBitVectorPick(fps,len(fps),20,firstPicks=[374,720,690,339,875],seed=42))
ids = list(
mmp.LazyBitVectorPick(fps, len(fps), 20, firstPicks=[374, 720, 690, 339, 875], seed=42))
self.assertEqual(ids,[374,720,690,339,875,842,404,725,120,385,115,868,630,\
881,516,497,412,718,869,407])
@@ -253,18 +255,63 @@ class TestCase(unittest.TestCase):
fps = []
with open(fname) as infil:
for line in infil:
fp = DataStructs.CreateFromFPSText(line.strip())
fps.append(fp)
mmp =rdSimDivPickers.MaxMinPicker()
ids,threshold=mmp.LazyBitVectorPickWithThreshold(fps,len(fps),20,-1.0,seed=42)
fp = DataStructs.CreateFromFPSText(line.strip())
fps.append(fp)
mmp = rdSimDivPickers.MaxMinPicker()
ids, threshold = mmp.LazyBitVectorPickWithThreshold(fps, len(fps), 20, -1.0, seed=42)
self.assertEqual(list(ids),[374,720,690,339,875,842,404,725,120,385,115,868,630,\
881,516,497,412,718,869,407])
self.assertAlmostEqual(threshold,0.8977,4)
self.assertAlmostEqual(threshold, 0.8977, 4)
ids,threshold=mmp.LazyBitVectorPickWithThreshold(fps,len(fps),20,0.91,seed=42)
self.assertEqual(list(ids),[374,720,690,339,875,842,404,725,120,385,115,868,630])
self.assertTrue(threshold>=0.91)
ids, threshold = mmp.LazyBitVectorPickWithThreshold(fps, len(fps), 20, 0.91, seed=42)
self.assertEqual(list(ids), [374, 720, 690, 339, 875, 842, 404, 725, 120, 385, 115, 868, 630])
self.assertTrue(threshold >= 0.91)
def testBitVectorLeader1(self):
# threshold tests
fname = os.path.join(RDConfig.RDBaseDir, 'Code', 'SimDivPickers', 'Wrap', 'test_data',
'chembl_cyps.head.fps')
fps = []
with open(fname) as infil:
for line in infil:
fp = DataStructs.CreateFromFPSText(line.strip())
fps.append(fp)
mmp = rdSimDivPickers.LeaderPicker()
thresh = 0.8
ids = mmp.LazyBitVectorPick(fps, len(fps), thresh)
self.assertEqual(len(ids), 146)
for i in range(len(ids)):
for j in range(i):
self.assertGreaterEqual(1 - DataStructs.TanimotoSimilarity(fps[ids[i]], fps[ids[j]]),
thresh)
thresh = 0.9
ids = mmp.LazyBitVectorPick(fps, len(fps), thresh)
self.assertEqual(len(ids), 14)
for i in range(len(ids)):
for j in range(i):
self.assertGreaterEqual(1 - DataStructs.TanimotoSimilarity(fps[ids[i]], fps[ids[j]]),
thresh)
ids = mmp.LazyBitVectorPick(fps, len(fps), thresh, pickSize=10)
self.assertEqual(len(ids), 10)
for i in range(len(ids)):
for j in range(i):
self.assertGreaterEqual(1 - DataStructs.TanimotoSimilarity(fps[ids[i]], fps[ids[j]]),
thresh)
def testLazyLeader(self):
pkr = rdSimDivPickers.LeaderPicker()
def func(i, j):
if i == j:
return 0.0
if i < j:
j, i = i, j
return i - j
lres = pkr.LazyPick(func, 100, 20)
self.assertEqual(list(lres), [0, 21, 42, 63, 84])
if __name__ == '__main__':

View File

@@ -0,0 +1,88 @@
//
// Copyright (C) 2019
//
// @@ All Rights Reserved @@
// This file is part of the RDKit.
// The contents are covered by the terms of the BSD license
// which is included in the file license.txt, found at the root
// of the RDKit source tree.
//
#define CATCH_CONFIG_MAIN // This tells Catch to provide a main() - only do
// this in one cpp file
#include "catch.hpp"
#include <RDGeneral/types.h>
#include <RDGeneral/test.h>
#include <DataStructs/ExplicitBitVect.h>
#include <DataStructs/BitOps.h>
#include <SimDivPickers/LeaderPicker.h>
#include <iostream>
#include <fstream>
template <typename T>
class BVFunctor {
public:
BVFunctor(const T &obj) : d_obj(obj) {}
~BVFunctor() {}
double operator()(unsigned int i, unsigned int j) {
double res = 1. - TanimotoSimilarity(*d_obj[i], *d_obj[j]);
return res;
}
const T &d_obj;
};
TEST_CASE(
"Leader Picker basics"
"[LeaderPicker]") {
std::string rdbase = getenv("RDBASE");
std::string fName =
rdbase + "/Code/SimDivPickers/Wrap/test_data/chembl_cyps.head.fps";
std::ifstream inf(fName);
std::string fpsText;
std::getline(inf, fpsText);
std::vector<std::unique_ptr<ExplicitBitVect>> fps;
while (!inf.eof() && !fpsText.empty()) {
fps.emplace_back(new ExplicitBitVect(fpsText.size() * 4));
UpdateBitVectFromFPSText(*fps.back(), fpsText);
std::getline(inf, fpsText);
};
REQUIRE(fps.size() == 1000);
BVFunctor<std::vector<std::unique_ptr<ExplicitBitVect>>> bvf(fps);
RDPickers::LeaderPicker pkr;
SECTION("basics1") {
double threshold = 0.8;
auto res = pkr.lazyPick(bvf, fps.size(), 0, threshold);
CHECK(res.size() == 146);
for (unsigned i = 0; i < res.size(); ++i) {
for (unsigned j = 0; j < i; ++j) {
CHECK(bvf(res[i], res[j]) >= threshold);
}
}
}
SECTION("basics2") {
double threshold = 0.9;
auto res = pkr.lazyPick(bvf, fps.size(), 0, threshold);
CHECK(res.size() == 14);
for (unsigned i = 0; i < res.size(); ++i) {
for (unsigned j = 0; j < i; ++j) {
CHECK(bvf(res[i], res[j]) >= threshold);
}
}
}
#ifdef RDK_THREADSAFE_SSS
SECTION("basics multithreaded") {
double threshold = 0.8;
RDKit::INT_VECT firstPicks;
int nThreads = 0; // use max available
auto res =
pkr.lazyPick(bvf, fps.size(), 0, firstPicks, threshold, nThreads);
CHECK(res.size() == 146);
for (unsigned i = 0; i < res.size(); ++i) {
for (unsigned j = 0; j < i; ++j) {
CHECK(bvf(res[i], res[j]) >= threshold);
}
}
}
#endif
}

View File

@@ -0,0 +1,217 @@
/* MaxMinPicker.cpp */
#include <string.h>
#include <stdio.h>
#include <sys/time.h>
#include <vector>
#include <string>
#include <GraphMol/GraphMol.h>
#include <GraphMol/RDKitBase.h>
#include <GraphMol/SmilesParse/SmilesParse.h>
#include <GraphMol/Fingerprints/Fingerprints.h>
#include <GraphMol/Fingerprints/MorganFingerprints.h>
#include <DataStructs/BitOps.h>
#if 0
#include <SimDivPickers/MaxMinPicker.h>
#include <SimDivPickers/LeaderPicker.h>
#else
#include "MaxMinPicker.h"
#include "LeaderPicker.h"
#endif
std::vector<RDKit::ROMol *> mols;
std::vector<ExplicitBitVect *> fps;
// ExplicitBitVect **fps;
const char *poolname;
const char *pickname;
unsigned int origsize;
unsigned int picksize;
unsigned int poolsize;
unsigned int newpicks;
int seed;
unsigned long ticks = 0;
static unsigned int LoadDatabase(FILE *fp) {
char buffer[32768];
unsigned int result = 0;
while (fgets(buffer, 327666, fp)) {
if (buffer[0] == '#' || buffer[0] == ' ' || buffer[0] == '\t') continue;
char *ptr = buffer;
while (*ptr && *ptr != ' ' && *ptr != '\t') ptr++;
if (*ptr) {
*ptr++ = '\0';
char *end = ptr;
while (*end && *end != '\n' && *end != '\r') end++;
*end = '\0';
}
RDKit::RWMol *mol;
try {
mol = RDKit::SmilesToMol(buffer);
} catch (...) {
mol = (RDKit::RWMol *)0;
}
if (mol) {
#if 0
if (*ptr) {
static std::string key("_Name");
mol->setProp(key,std::string(ptr));
}
mols.push_back(mol);
#endif
ExplicitBitVect *fp;
fp = RDKit::MorganFingerprints::getFingerprintAsBitVect(*mol, 2, 2048);
fps.push_back(fp);
delete mol;
result++;
}
}
fprintf(stderr, "%u molecules\n", result);
return result;
}
static unsigned int LoadDatabase(const char *fname) {
if (fname && strcmp(fname, "-")) {
FILE *fp = fopen(fname, "rb");
if (!fp) return 0;
unsigned int result = LoadDatabase(fp);
fclose(fp);
return result;
} else
return LoadDatabase(stdin);
}
#ifdef UNUSED
static void GenerateFingerprints() {
unsigned int count = (unsigned int)mols.size();
unsigned int size = (unsigned int)(count * sizeof(void *));
fps = (ExplicitBitVect **)malloc(size);
for (unsigned int i = 0; i < count; i++)
fps[i] =
RDKit::MorganFingerprints::getFingerprintAsBitVect(*mols[i], 2, 2048);
fprintf(stderr, "%u Fingerprints\n", count);
}
static void DestroyFingerprints() {
unsigned int count = (unsigned int)mols.size();
for (unsigned int i = 0; i < count; i++) delete fps[i];
free(fps);
}
#endif
static void DisplayUsage() {
fputs("usage: MaxMinPicker [options] <poolfile> [<pickfile>]\n", stderr);
exit(1);
}
static void ProcessCommandLine(int argc, char *argv[]) {
poolname = (const char *)0;
pickname = (const char *)0;
newpicks = 1000;
for (int i = 1; i < argc; i++) {
const char *ptr = argv[i];
if (ptr[0] == '-' && ptr[1]) {
if (ptr[1] >= '0' && ptr[1] <= '9') {
newpicks = atoi(ptr + 1);
} else
DisplayUsage();
} else if (!poolname) {
poolname = ptr;
} else if (!pickname) {
pickname = ptr;
} else
DisplayUsage();
}
if (!poolname) DisplayUsage();
}
double MyDist(int i, int j) {
ticks++;
return 1.0 - TanimotoSimilarity(*fps[i], *fps[j]);
}
int main(int argc, char *argv[]) {
struct timeval beg, end;
double elapsed;
ProcessCommandLine(argc, argv);
gettimeofday(&beg, (struct timezone *)0);
origsize = LoadDatabase(poolname);
gettimeofday(&end, (struct timezone *)0);
elapsed = (end.tv_sec + 0.000001 * end.tv_usec) -
(beg.tv_sec + 0.000001 * beg.tv_usec);
fprintf(stderr, "Elapsed time: %g secs\n", elapsed);
if (pickname) {
gettimeofday(&beg, (struct timezone *)0);
picksize = LoadDatabase(pickname);
gettimeofday(&end, (struct timezone *)0);
elapsed = (end.tv_sec + 0.000001 * end.tv_usec) -
(beg.tv_sec + 0.000001 * beg.tv_usec);
fprintf(stderr, "Elapsed time: %g secs\n", elapsed);
} else
picksize = 0;
poolsize = origsize + picksize;
if (newpicks == 0) newpicks = origsize;
#ifdef UNUSED
gettimeofday(&beg, (struct timezone *)0);
GenerateFingerprints();
gettimeofday(&end, (struct timezone *)0);
elapsed = (end.tv_sec + 0.000001 * end.tv_usec) -
(beg.tv_sec + 0.000001 * beg.tv_usec);
fprintf(stderr, "Elapsed time: %g secs\n", elapsed);
#endif
RDKit::INT_VECT firstPicks;
if (picksize) {
firstPicks.reserve(picksize);
for (unsigned int i = 0; i < picksize; i++)
firstPicks.push_back((int)(i + origsize));
}
// RDPickers::MaxMinPicker mmpicker;
double threshold = 0.9; // 0.675;
RDPickers::LeaderPicker ldpicker(threshold, 4);
gettimeofday(&beg, (struct timezone *)0);
#if 0
RDKit::INT_VECT iv = mmpicker.lazyPick(MyDist,poolsize,picksize+newpicks,
firstPicks,seed,threshold);
#else
RDKit::INT_VECT iv = ldpicker.lazyPick(MyDist, poolsize, picksize + newpicks,
firstPicks, threshold, 16);
#endif
gettimeofday(&end, (struct timezone *)0);
elapsed = (end.tv_sec + 0.000001 * end.tv_usec) -
(beg.tv_sec + 0.000001 * beg.tv_usec);
fprintf(stderr, "Elapsed time: %g secs\n", elapsed);
unsigned int count = (unsigned int)iv.size();
#if 0
for (unsigned int j=0; j<count; j++)
printf("%d\n",iv[j]);
#endif
for (unsigned int i = 0; i < std::min(iv.size(), (size_t)10); ++i) {
for (unsigned int j = 0; j < i; ++j) {
std::cerr << iv[i] << " " << iv[j] << ": " << MyDist(iv[i], iv[j])
<< std::endl;
}
}
fprintf(stderr, "%u picks\n", count);
fprintf(stderr, "%g threshold\n", threshold);
fprintf(stderr, "%lu comparisons\n", ticks);
// DestroyFingerprints();
return 0;
}