From 6cfc8f36a7171f3e0bb0de42200471ee2e32d1b0 Mon Sep 17 00:00:00 2001 From: Greg Landrum Date: Thu, 24 Oct 2019 13:14:15 +0200 Subject: [PATCH] [WIP] Add Leader picker implementation (#2724) * initial commit of Roger's contrib * add the new file to the headers list * clang-format * stub of the python wrapper * code added for testing * code added for testing * add LeaderPicker.seq.h * default pickSize value * first crude python wrapper * Add C++ tests for the LeaderPicker combine the thread and non-thread versions * that was not really a test * support providing a functor from Python * no longer need the .seq header * temporarily disable the threaded version to allow CI runs to pass * some refactoring and cleanup --- Code/SimDivPickers/CMakeLists.txt | 6 +- Code/SimDivPickers/DistPicker.h | 14 + Code/SimDivPickers/LeaderPicker.h | 430 ++++++++++++++++++++ Code/SimDivPickers/MaxMinPicker.h | 15 +- Code/SimDivPickers/Wrap/CMakeLists.txt | 2 +- Code/SimDivPickers/Wrap/LeaderPicker.cpp | 109 +++++ Code/SimDivPickers/Wrap/MaxMinPicker.cpp | 45 +- Code/SimDivPickers/Wrap/PickerHelpers.h | 60 +++ Code/SimDivPickers/Wrap/rdSimDivPickers.cpp | 4 +- Code/SimDivPickers/Wrap/testPickers.py | 143 ++++--- Code/SimDivPickers/catch_tests.cpp | 88 ++++ Code/SimDivPickers/pickersCLI.cpp | 217 ++++++++++ 12 files changed, 1025 insertions(+), 108 deletions(-) create mode 100644 Code/SimDivPickers/LeaderPicker.h create mode 100644 Code/SimDivPickers/Wrap/LeaderPicker.cpp create mode 100644 Code/SimDivPickers/Wrap/PickerHelpers.h create mode 100644 Code/SimDivPickers/catch_tests.cpp create mode 100644 Code/SimDivPickers/pickersCLI.cpp diff --git a/Code/SimDivPickers/CMakeLists.txt b/Code/SimDivPickers/CMakeLists.txt index 5cab1f464..f6f6db6b6 100644 --- a/Code/SimDivPickers/CMakeLists.txt +++ b/Code/SimDivPickers/CMakeLists.txt @@ -4,12 +4,16 @@ rdkit_library(SimDivPickers DistPicker.cpp MaxMinPicker.cpp HierarchicalClusterPicker.cpp LINK_LIBRARIES hc RDGeneral) -rdkit_headers(DistPicker.h +rdkit_headers(DistPicker.h LeaderPicker.h HierarchicalClusterPicker.h MaxMinPicker.h DEST SimDivPickers) rdkit_test(testSimDivPickers testPickers.cpp LINK_LIBRARIES SimDivPickers RDGeneral) +rdkit_catch_test(pickersTestsCatch catch_tests.cpp + LINK_LIBRARIES SimDivPickers DataStructs RDGeneral) + + if(RDK_BUILD_PYTHON_WRAPPERS) add_subdirectory(Wrap) endif() diff --git a/Code/SimDivPickers/DistPicker.h b/Code/SimDivPickers/DistPicker.h index 43491e2f5..6ca29f372 100644 --- a/Code/SimDivPickers/DistPicker.h +++ b/Code/SimDivPickers/DistPicker.h @@ -73,6 +73,20 @@ class RDKIT_SIMDIVPICKERS_EXPORT DistPicker { virtual RDKit::INT_VECT pick(const double *distMat, unsigned int poolSize, unsigned int pickSize) const = 0; }; + +namespace { +class RDKIT_SIMDIVPICKERS_EXPORT distmatFunctor { + public: + distmatFunctor(const double *distMat) : dp_distMat(distMat){}; + double operator()(unsigned int i, unsigned int j) { + return getDistFromLTM(this->dp_distMat, i, j); + } + + private: + const double *dp_distMat; +}; +} // namespace + }; // namespace RDPickers #endif diff --git a/Code/SimDivPickers/LeaderPicker.h b/Code/SimDivPickers/LeaderPicker.h new file mode 100644 index 000000000..5e9acff1e --- /dev/null +++ b/Code/SimDivPickers/LeaderPicker.h @@ -0,0 +1,430 @@ +// +// Copyright (C) 2003-2007 Greg Landrum and Rational Discovery LLC +// Copyright (C) 2017-2019 Greg Landrum and NextMove Software +// +// @@ All Rights Reserved @@ +// This file is part of the RDKit. +// The contents are covered by the terms of the BSD license +// which is included in the file license.txt, found at the root +// of the RDKit source tree. +// +#include +#ifndef RD_LEADERPICKER_H +#define RD_LEADERPICKER_H + +#include +#include +#include +#include +#include +#include +#include "DistPicker.h" + +namespace RDPickers { + +/*! \brief Implements the Leader algorithm for picking a subset of item from a + *pool + * + * This class inherits from the DistPicker and implements a specific picking + *strategy aimed at diversity. See documentation for "pick()" member function + *for the algorithm details + */ +class RDKIT_SIMDIVPICKERS_EXPORT LeaderPicker : public DistPicker { + public: + double default_threshold; + int default_nthreads; + + /*! \brief Default Constructor + * + */ + LeaderPicker() : default_threshold(0.0), default_nthreads(1) {} + LeaderPicker(double threshold) + : default_threshold(threshold), default_nthreads(1) {} + LeaderPicker(double threshold, int nthreads) + : default_threshold(threshold), default_nthreads(nthreads) {} + + /*! \brief Contains the implementation for a lazy Leader diversity picker + * + * See the documentation for the pick() method for details about the algorithm + * + * \param func - a function (or functor) taking two unsigned ints as + *arguments and returning the distance (as a double) between those two + *elements. + * + * \param poolSize - the size of the pool to pick the items from. It is + *assumed that the distance matrix above contains the right number of + *elements; i.e. poolSize*(poolSize-1) + * + * \param pickSize - the number items to pick from pool (<= poolSize) + * + * \param firstPicks - (optional)the first items in the pick list + * + * \param seed - (optional) seed for the random number generator. If this is + *<0 the generator will be seeded with a random number. + */ + template + RDKit::INT_VECT lazyPick(T &func, unsigned int poolSize, + unsigned int pickSize) const; + + template + RDKit::INT_VECT lazyPick(T &func, unsigned int poolSize, + unsigned int pickSize, double threshold) const; + + template + RDKit::INT_VECT lazyPick(T &func, unsigned int poolSize, + unsigned int pickSize, + const RDKit::INT_VECT &firstPicks, + double threshold) const; + + template + RDKit::INT_VECT lazyPick(T &func, unsigned int poolSize, + unsigned int pickSize, + const RDKit::INT_VECT &firstPicks, double threshold, + int nthreads) const; + + /*! \brief Contains the implementation for the Leader diversity picker + * + * \param distMat - distance matrix - a vector of double. It is assumed that + *only the lower triangle element of the matrix are supplied in a 1D array\n + * + * \param poolSize - the size of the pool to pick the items from. It is + *assumed that the distance matrix above contains the right number of + *elements; i.e. poolSize*(poolSize-1) \n + * + * \param pickSize - maximum number items to pick from pool (<= poolSize) + * + * \param firstPicks - indices of the items used to seed the pick set. + */ + RDKit::INT_VECT pick(const double *distMat, unsigned int poolSize, + unsigned int pickSize, const RDKit::INT_VECT &firstPicks, + double threshold, int nthreads) const { + CHECK_INVARIANT(distMat, "Invalid Distance Matrix"); + if (!poolSize) throw ValueErrorException("empty pool to pick from"); + if (poolSize < pickSize) + throw ValueErrorException("pickSize cannot be larger than the poolSize"); + distmatFunctor functor(distMat); + return this->lazyPick(functor, poolSize, pickSize, firstPicks, threshold, + nthreads); + } + + /*! \overload */ + RDKit::INT_VECT pick(const double *distMat, unsigned int poolSize, + unsigned int pickSize) const { + RDKit::INT_VECT iv; + return pick(distMat, poolSize, pickSize, iv, default_threshold, + default_nthreads); + } +}; + +#ifdef USE_THREADED_LEADERPICKER +// Note that this block of code currently only works on linux (which is why it's disabled by default) +// We will revisit this during the 2020.03 release cycle in order to get a multi-threaded version of +// the LeaderPicker that works on all supported platforms +template +void *LeaderPickerWork(void *arg); + +template +struct LeaderPickerState { + typedef struct { + int *ptr; + unsigned int capacity; + unsigned int len; + unsigned int next[2]; + } LeaderPickerBlock; + typedef struct { + LeaderPickerState *stat; + pthread_t tid; + unsigned int id; + } LeaderPickerThread; + + std::vector threads; + std::vector blocks; + pthread_barrier_t wait; + pthread_barrier_t done; + std::vector v; + LeaderPickerBlock *head_block; + unsigned int thread_op; + unsigned int nthreads; + unsigned int tick; + double threshold; + int query; + T *func; + + LeaderPickerState(unsigned int count, int nt) { + v.resize(count); + for (unsigned int i = 0; i < count; i++) v[i] = i; + + // InitializeBlocks + unsigned int bcount; + unsigned int bsize; + if (nt > 1) { + bsize = 4096; + bcount = (count + (bsize - 1)) / bsize; + unsigned int tasks = (bcount + 1) / 2; + // limit number of threads to available work + if (nt > (int)tasks) nt = tasks; + } else { + bsize = 32768; + bcount = (count + (bsize - 1)) / bsize; + } + blocks.resize(bcount); + head_block = &blocks[0]; + tick = 0; + + if (bcount > 1) { + int *ptr = &v[0]; + unsigned int len = count; + for (unsigned int i = 0; i < bcount; i++) { + LeaderPickerBlock *block = &blocks[i]; + block->ptr = ptr; + if (len > bsize) { + block->capacity = bsize; + block->len = bsize; + block->next[0] = i + 1; + } else { + block->capacity = len; + block->len = len; + block->next[0] = 0; + break; + } + ptr += bsize; + len -= bsize; + } + } else { + head_block->capacity = count; + head_block->len = count; + head_block->next[0] = 0; + head_block->next[1] = 0; + head_block->ptr = &v[0]; + } + + // InitializeThreads + if (nt > 1) { + nthreads = nt; + pthread_barrier_init(&wait, NULL, nthreads + 1); + pthread_barrier_init(&done, NULL, nthreads + 1); + + threads.resize(nt); + for (unsigned int i = 0; i < nthreads; i++) { + threads[i].id = i; + threads[i].stat = this; + pthread_create(&threads[i].tid, NULL, LeaderPickerWork, + (void *)&threads[i]); + } + } else + nthreads = 1; + } + + ~LeaderPickerState() { + if (nthreads > 1) { + thread_op = 1; + pthread_barrier_wait(&wait); + for (unsigned int i = 0; i < nthreads; i++) + pthread_join(threads[i].tid, 0); + pthread_barrier_destroy(&wait); + pthread_barrier_destroy(&done); + } + } + + bool empty() { + while (head_block) { + if (head_block->len) return false; + unsigned int next_tick = head_block->next[tick]; + if (!next_tick) return true; + head_block = &blocks[next_tick]; + } + return true; + } + + unsigned int compact(int *dst, int *src, unsigned int len) { + unsigned int count = 0; + for (unsigned int i = 0; i < len; i++) { + if ((*func)(query, src[i]) > threshold) dst[count++] = src[i]; + } + return count; + } + + void compact_job(unsigned int cycle) { + // On entry, next[tick] for each block is the current linked list. + // On exit, next[tock] is the linked list for the next iteration. + unsigned int tock = tick ^ 1; + + LeaderPickerBlock *list = head_block; + for (;;) { + unsigned int next_tick = list->next[tick]; + if (next_tick) { + LeaderPickerBlock *next = &blocks[next_tick]; + unsigned int next_next_tick = next->next[tick]; + if (cycle == 0) { + list->len = compact(list->ptr, list->ptr, list->len); + if (list->len + next->len <= list->capacity) { + list->len += compact(list->ptr + list->len, next->ptr, next->len); + list->next[tock] = next_next_tick; + } else { + next->len = compact(next->ptr, next->ptr, next->len); + if (next->len) { + list->next[tock] = next_tick; + next->next[tock] = next_next_tick; + } else + list->next[tock] = next_next_tick; + } + cycle = nthreads - 1; + } else + cycle--; + if (next_next_tick) { + list = &blocks[next_next_tick]; + } else + break; + } else { + if (cycle == 0) { + list->len = compact(list->ptr, list->ptr, list->len); + list->next[tock] = 0; + } + break; + } + } + } + + void compact(int pick) { + query = pick; + if (nthreads > 1) { + thread_op = 0; + pthread_barrier_wait(&wait); + pthread_barrier_wait(&done); + } else + compact_job(0); + tick ^= 1; + } + + int compact_next() { + compact(head_block->ptr[0]); + return query; + } +}; + +// This is the loop the worker threads run +template +void *LeaderPickerWork(void *arg) { + typename LeaderPickerState::LeaderPickerThread *thread; + thread = (typename LeaderPickerState::LeaderPickerThread *)arg; + LeaderPickerState *stat = thread->stat; + + for (;;) { + pthread_barrier_wait(&stat->wait); + if (stat->thread_op) return (void *)0; + stat->compact_job(thread->id); + pthread_barrier_wait(&stat->done); + } +} +#else + +template +struct LeaderPickerState { + std::vector v; + unsigned int left; + double threshold; + int query; + T *func; + + LeaderPickerState(unsigned int count, int) { + v.resize(count); + for (unsigned int i = 0; i < count; i++) v[i] = i; + left = count; + } + + bool empty() { return left == 0; } + + unsigned int compact(int *dst, int *src, unsigned int len) { + unsigned int count = 0; + for (unsigned int i = 0; i < len; i++) { + double ld = (*func)(query, src[i]); + // std::cerr << query << "-" << src[i] << " " << ld << std::endl; + if (ld > threshold) dst[count++] = src[i]; + } + return count; + } + + void compact(int pick) { + query = pick; + left = compact(&v[0], &v[0], left); + } + + int compact_next() { + query = v[0]; + left = compact(&v[0], &v[1], left - 1); + return query; + } +}; + +#endif +// we implement this here in order to allow arbitrary functors without link +// errors +template +RDKit::INT_VECT LeaderPicker::lazyPick(T &func, unsigned int poolSize, + unsigned int pickSize, + const RDKit::INT_VECT &firstPicks, + double threshold, int nthreads) const { + if (!poolSize) throw ValueErrorException("empty pool to pick from"); + + if (poolSize < pickSize) + throw ValueErrorException("pickSize cannot be larger than the poolSize"); + + if (!pickSize) pickSize = poolSize; + RDKit::INT_VECT picks; + + LeaderPickerState stat(poolSize, nthreads); + stat.threshold = threshold; + stat.func = &func; + + unsigned int picked = 0; // picks.size() + unsigned int pick = 0; + + if (!firstPicks.empty()) { + for (RDKit::INT_VECT::const_iterator pIdx = firstPicks.begin(); + pIdx != firstPicks.end(); ++pIdx) { + pick = static_cast(*pIdx); + if (pick >= poolSize) { + throw ValueErrorException("pick index was larger than the poolSize"); + } + picks.push_back(pick); + stat.compact(pick); + picked++; + } + } + + while (picked < pickSize && !stat.empty()) { + pick = stat.compact_next(); + picks.push_back(pick); + picked++; + } + return picks; +} + +template +RDKit::INT_VECT LeaderPicker::lazyPick(T &func, unsigned int poolSize, + unsigned int pickSize) const { + RDKit::INT_VECT firstPicks; + return LeaderPicker::lazyPick(func, poolSize, pickSize, firstPicks, + default_threshold, default_nthreads); +} + +template +RDKit::INT_VECT LeaderPicker::lazyPick(T &func, unsigned int poolSize, + unsigned int pickSize, + double threshold) const { + RDKit::INT_VECT firstPicks; + return LeaderPicker::lazyPick(func, poolSize, pickSize, firstPicks, threshold, + default_nthreads); +} +template +RDKit::INT_VECT LeaderPicker::lazyPick(T &func, unsigned int poolSize, + unsigned int pickSize, + const RDKit::INT_VECT &firstPicks, + double threshold) const { + return LeaderPicker::lazyPick(func, poolSize, pickSize, firstPicks, threshold, + default_nthreads); +} + +}; // namespace RDPickers + +#endif diff --git a/Code/SimDivPickers/MaxMinPicker.h b/Code/SimDivPickers/MaxMinPicker.h index 0659609fb..f9ffa28f5 100644 --- a/Code/SimDivPickers/MaxMinPicker.h +++ b/Code/SimDivPickers/MaxMinPicker.h @@ -24,19 +24,6 @@ namespace RDPickers { -namespace { -class RDKIT_SIMDIVPICKERS_EXPORT distmatFunctor { - public: - distmatFunctor(const double *distMat) : dp_distMat(distMat){}; - double operator()(unsigned int i, unsigned int j) { - return getDistFromLTM(this->dp_distMat, i, j); - } - - private: - const double *dp_distMat; -}; -} // namespace - /*! \brief Implements the MaxMin algorithm for picking a subset of item from a *pool * @@ -272,7 +259,7 @@ RDKit::INT_VECT MaxMinPicker::lazyPick(T &func, unsigned int poolSize, } while (*prev != 0); // if the current distance is closer then threshold, we're done - if (threshold >= 0.0 && maxOFmin < threshold) break; + if (maxOFmin <= threshold && threshold >= 0.0) break; tmpThreshold = maxOFmin; // now add the new pick to picks and remove it from the pool *pick_prev = pinfo[pick].next; diff --git a/Code/SimDivPickers/Wrap/CMakeLists.txt b/Code/SimDivPickers/Wrap/CMakeLists.txt index c46bd31fb..3718a7c6f 100644 --- a/Code/SimDivPickers/Wrap/CMakeLists.txt +++ b/Code/SimDivPickers/Wrap/CMakeLists.txt @@ -1,6 +1,6 @@ remove_definitions(-DRDKIT_SIMDIVPICKERS_BUILD) rdkit_python_extension(rdSimDivPickers - MaxMinPicker.cpp HierarchicalClusterPicker.cpp + MaxMinPicker.cpp LeaderPicker.cpp HierarchicalClusterPicker.cpp rdSimDivPickers.cpp DEST SimDivFilters LINK_LIBRARIES SimDivPickers diff --git a/Code/SimDivPickers/Wrap/LeaderPicker.cpp b/Code/SimDivPickers/Wrap/LeaderPicker.cpp new file mode 100644 index 000000000..78a0cd07a --- /dev/null +++ b/Code/SimDivPickers/Wrap/LeaderPicker.cpp @@ -0,0 +1,109 @@ +// +// Copyright (C) 2019 Greg Landrum +// @@ All Rights Reserved @@ +// This file is part of the RDKit. +// The contents are covered by the terms of the BSD license +// which is included in the file license.txt, found at the root +// of the RDKit source tree. +// +#define NO_IMPORT_ARRAY + +#define PY_ARRAY_UNIQUE_SYMBOL rdpicker_array_API +#include +#include +#include +#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION +#include +#include + +#include "PickerHelpers.h" + + +#include +#include +#include +#include +#include +#include + +namespace python = boost::python; +namespace RDPickers { +namespace { +template +void LazyLeaderHelper(LeaderPicker *picker, T functor, unsigned int poolSize, + double &threshold, unsigned int pickSize, + python::object firstPicks, RDKit::INT_VECT &res, + int nThreads) { + RDKit::INT_VECT firstPickVect; + for (unsigned int i = 0; + i < python::extract(firstPicks.attr("__len__")()); ++i) { + firstPickVect.push_back(python::extract(firstPicks[i])); + } + res = picker->lazyPick(functor, poolSize, pickSize, firstPickVect, threshold, + nThreads); +} +} // end of anonymous namespace + + + +RDKit::INT_VECT LazyVectorLeaderPicks(LeaderPicker *picker, python::object objs, + int poolSize, double threshold, + int pickSize, python::object firstPicks, + int numThreads) { + std::vector bvs(poolSize); + for (int i = 0; i < poolSize; ++i) { + bvs[i] = python::extract(objs[i]); + } + pyBVFunctor functor(bvs, TANIMOTO); + RDKit::INT_VECT res; + LazyLeaderHelper(picker, functor, poolSize, threshold, pickSize, firstPicks, + res, numThreads); + return res; +} + +RDKit::INT_VECT LazyLeaderPicks(LeaderPicker *picker, python::object distFunc, + int poolSize, double threshold, int pickSize, + python::object firstPicks, int numThreads) { + pyobjFunctor functor(distFunc); + RDKit::INT_VECT res; + LazyLeaderHelper(picker, functor, poolSize, threshold, pickSize, firstPicks, + res, numThreads); + return res; +} + +} // end of namespace RDPickers + +struct LeaderPicker_wrap { + static void wrap() { + python::class_( + "LeaderPicker", + "A class for diversity picking of items using Roger Sayle's Leader " + "algorithm (analogous to sphere exclusion). The algorithm is " + "currently unpublished, but a description is available in this " + "presentation from the 2019 RDKit UGM: " + "https://github.com/rdkit/UGM_2019/raw/master/Presentations/" + "Sayle_Clustering.pdf\n") + .def("LazyBitVectorPick", RDPickers::LazyVectorLeaderPicks, + (python::arg("self"), python::arg("objects"), + python::arg("poolSize"), python::arg("threshold"), + python::arg("pickSize") = 0, + python::arg("firstPicks") = python::tuple(), + python::arg("numThreads") = 1), + "Pick a subset of items from a collection of bit vectors using " + "Tanimoto distance. The threshold value is a " + "*distance* (i.e. 1-similarity). Note that the numThreads " + "argument is currently ignored.") + .def("LazyPick", RDPickers::LazyLeaderPicks, + (python::arg("self"), python::arg("distFunc"), + python::arg("poolSize"), python::arg("threshold"), + python::arg("pickSize") = 0, + python::arg("firstPicks") = python::tuple(), + python::arg("numThreads") = 1), + "Pick a subset of items from a pool of items using the " + "user-provided function to determine distances. Note that the " + "numThreads argument is currently ignored.") + ; + }; +}; + +void wrap_leaderpick() { LeaderPicker_wrap::wrap(); } diff --git a/Code/SimDivPickers/Wrap/MaxMinPicker.cpp b/Code/SimDivPickers/Wrap/MaxMinPicker.cpp index 945a2b43b..16768dcfb 100644 --- a/Code/SimDivPickers/Wrap/MaxMinPicker.cpp +++ b/Code/SimDivPickers/Wrap/MaxMinPicker.cpp @@ -16,6 +16,8 @@ #include #include +#include "PickerHelpers.h" + #include #include #include @@ -55,18 +57,6 @@ RDKit::INT_VECT MaxMinPicks(MaxMinPicker *picker, python::object distMat, return res; } -class pyobjFunctor { - public: - pyobjFunctor(python::object obj) : dp_obj(std::move(obj)) {} - ~pyobjFunctor() {} - double operator()(unsigned int i, unsigned int j) { - return python::extract(dp_obj(i, j)); - } - - private: - python::object dp_obj; -}; - namespace { template void LazyMaxMinHelper(MaxMinPicker *picker, T functor, unsigned int poolSize, @@ -107,37 +97,6 @@ python::tuple LazyMaxMinPicksWithThreshold( return python::make_tuple(res, threshold); } -// NOTE: TANIMOTO and DICE provably return the same results for the diversity -// picking this is still here just in case we ever later want to support other -// methods. -typedef enum { TANIMOTO = 1, DICE } DistanceMethod; - -template -class pyBVFunctor { - public: - pyBVFunctor(const std::vector &obj, DistanceMethod method) - : d_obj(obj), d_method(method) {} - ~pyBVFunctor() {} - double operator()(unsigned int i, unsigned int j) { - double res = 0.0; - switch (d_method) { - case TANIMOTO: - res = 1. - TanimotoSimilarity(*d_obj[i], *d_obj[j]); - break; - case DICE: - res = 1. - DiceSimilarity(*d_obj[i], *d_obj[j]); - break; - default: - throw_value_error("unsupported similarity value"); - } - return res; - } - - private: - const std::vector &d_obj; - DistanceMethod d_method; -}; - RDKit::INT_VECT LazyVectorMaxMinPicks(MaxMinPicker *picker, python::object objs, int poolSize, int pickSize, python::object firstPicks, int seed, diff --git a/Code/SimDivPickers/Wrap/PickerHelpers.h b/Code/SimDivPickers/Wrap/PickerHelpers.h new file mode 100644 index 000000000..41f424950 --- /dev/null +++ b/Code/SimDivPickers/Wrap/PickerHelpers.h @@ -0,0 +1,60 @@ +// +// Copyright (C) 2019 Greg Landrum +// @@ All Rights Reserved @@ +// This file is part of the RDKit. +// The contents are covered by the terms of the BSD license +// which is included in the file license.txt, found at the root +// of the RDKit source tree. +// + +#ifndef RDKIT_PICKERHELPERS_H +#define RDKIT_PICKERHELPERS_H + +#include +#include + + +// NOTE: TANIMOTO and DICE provably return the same results for the diversity +// picking this is still here just in case we ever later want to support other +// methods. +typedef enum { TANIMOTO = 1, DICE } DistanceMethod; + +template +class pyBVFunctor { + public: + pyBVFunctor(const std::vector &obj, DistanceMethod method) + : d_obj(obj), d_method(method) {} + ~pyBVFunctor() {} + double operator()(unsigned int i, unsigned int j) { + double res = 0.0; + switch (d_method) { + case TANIMOTO: + res = 1. - TanimotoSimilarity(*d_obj[i], *d_obj[j]); + break; + case DICE: + res = 1. - DiceSimilarity(*d_obj[i], *d_obj[j]); + break; + default: + throw_value_error("unsupported similarity value"); + } + return res; + } + + private: + const std::vector &d_obj; + DistanceMethod d_method; +}; + +class pyobjFunctor { + public: + pyobjFunctor(python::object obj) : dp_obj(std::move(obj)) {} + ~pyobjFunctor() {} + double operator()(unsigned int i, unsigned int j) { + return python::extract(dp_obj(i, j)); + } + + private: + python::object dp_obj; +}; + +#endif // RDKIT_PICKERHELPERS_H \ No newline at end of file diff --git a/Code/SimDivPickers/Wrap/rdSimDivPickers.cpp b/Code/SimDivPickers/Wrap/rdSimDivPickers.cpp index 43cad426b..2fe957b87 100644 --- a/Code/SimDivPickers/Wrap/rdSimDivPickers.cpp +++ b/Code/SimDivPickers/Wrap/rdSimDivPickers.cpp @@ -1,6 +1,6 @@ -// $Id$ // // Copyright (C) 2003-2006 Rational Discovery LLC +// Copyright (C) 2019 Greg Landrum // // @@ All Rights Reserved @@ // This file is part of the RDKit. @@ -15,6 +15,7 @@ namespace python = boost::python; void wrap_maxminpick(); +void wrap_leaderpick(); void wrap_HierarchCP(); BOOST_PYTHON_MODULE(rdSimDivPickers) { @@ -24,5 +25,6 @@ BOOST_PYTHON_MODULE(rdSimDivPickers) { rdkit_import_array(); wrap_maxminpick(); + wrap_leaderpick(); wrap_HierarchCP(); } diff --git a/Code/SimDivPickers/Wrap/testPickers.py b/Code/SimDivPickers/Wrap/testPickers.py index 2460e3455..6b848a725 100755 --- a/Code/SimDivPickers/Wrap/testPickers.py +++ b/Code/SimDivPickers/Wrap/testPickers.py @@ -136,7 +136,7 @@ class TestCase(unittest.TestCase): # we get the occasional dupe randomly, # make sure we don't get three dupes in a row - self.assertTrue(tuple(mm2)!=tuple(mm1)) or (tuple(mm3) != tuple(mm1)) + self.assertTrue(tuple(mm2) != tuple(mm1)) or (tuple(mm3) != tuple(mm1)) picker = None ds = [] @@ -184,40 +184,41 @@ class TestCase(unittest.TestCase): self.assertEqual(list(mm1), list(mm2)) def testBitVectorMaxMin2(self): - fps = ["11110010101000000000", "00000000000010010000", "11001010000000000001", - "00100110101000001000", "01010110000100011001", "11000110101001000011", - "00000000001100001111", "00011110110000001101", "00000011011110100010", - "11000010110001000000", "00000100010000010000", "10000001000010110010", - "00010010000000010100", "00011100100110101000", "10001001100110100000", - "10000110100110010000", "00101110000101000000", "11011101100011100000", - "10000110000100101000", "00101000100000010001", "01000001000010000000", - "00101101010100000110", "10001000100110110001", "00011000010100000001", - "00101000001000100011", "00010000100010011001", "01100001000100010001", - "10000101000001101101", "00001000011001011000", "11110000100100100000", - "10100110000000011010", "00110100010110010010", "00000000000001010010", - "00100000000010100001", "11110011000010001000", "10110001010100001000", - "00001100100110011011", "00010010100100001110", "10100101100010100010", - "01100100010100000001", "10101110011100000000", "01011000000001000001", - "00000011100110100010", "01100001010001001001", "00001000000001001100", - "10011001110000000100", "10110000001001100100", "00011000000001001011", - "11001011010001100010", "10010000000001001011", "00010000100111100000", - "00001000001110001000", "11010000010001100110", "01101001100000111000", - "01001000001110111000", "10000000000100010010", "11001000010010000000", - "01010010000100110001", "00010001010100100001", "01110010000000010000", - "10001010000011000001", "00000110000000100100", "00010000010001000000", - "11101100011010000011", "00000010100001010001", "00010000110010000101", - "00010001001000111001", "01000010001100100110", "00110110000000100001", - "00100010010110110010", "01000000110011001111", "00011000001000110010", - "01111010101000110100", "00001010000010110110", "00110011000011011010", - "00111010111010000110", "00010011101010000011", "00000001011000010000", - "00011011101110110000", "00010001101000000001", "00010000001010011010", - "00000010100100100010", "00000010001011000100", "11010000000001011100", - "00001000110101000001", "00000010000000110010", "10000000010011000001", - "11110110100100010000", "10001111000110001001", "00100110000110000100", - "00000100100000100100", "00110000101100010100", "00001010100000100000", - "01011000000011000111", "00010000100001010001", "10000010100000010000", - "00001000000000110010", "00001000101011010001", "00011110000100100000", - "11001001010001010100"] + fps = [ + "11110010101000000000", "00000000000010010000", "11001010000000000001", + "00100110101000001000", "01010110000100011001", "11000110101001000011", + "00000000001100001111", "00011110110000001101", "00000011011110100010", + "11000010110001000000", "00000100010000010000", "10000001000010110010", + "00010010000000010100", "00011100100110101000", "10001001100110100000", + "10000110100110010000", "00101110000101000000", "11011101100011100000", + "10000110000100101000", "00101000100000010001", "01000001000010000000", + "00101101010100000110", "10001000100110110001", "00011000010100000001", + "00101000001000100011", "00010000100010011001", "01100001000100010001", + "10000101000001101101", "00001000011001011000", "11110000100100100000", + "10100110000000011010", "00110100010110010010", "00000000000001010010", + "00100000000010100001", "11110011000010001000", "10110001010100001000", + "00001100100110011011", "00010010100100001110", "10100101100010100010", + "01100100010100000001", "10101110011100000000", "01011000000001000001", + "00000011100110100010", "01100001010001001001", "00001000000001001100", + "10011001110000000100", "10110000001001100100", "00011000000001001011", + "11001011010001100010", "10010000000001001011", "00010000100111100000", + "00001000001110001000", "11010000010001100110", "01101001100000111000", + "01001000001110111000", "10000000000100010010", "11001000010010000000", + "01010010000100110001", "00010001010100100001", "01110010000000010000", + "10001010000011000001", "00000110000000100100", "00010000010001000000", + "11101100011010000011", "00000010100001010001", "00010000110010000101", + "00010001001000111001", "01000010001100100110", "00110110000000100001", + "00100010010110110010", "01000000110011001111", "00011000001000110010", + "01111010101000110100", "00001010000010110110", "00110011000011011010", + "00111010111010000110", "00010011101010000011", "00000001011000010000", + "00011011101110110000", "00010001101000000001", "00010000001010011010", + "00000010100100100010", "00000010001011000100", "11010000000001011100", + "00001000110101000001", "00000010000000110010", "10000000010011000001", + "11110110100100010000", "10001111000110001001", "00100110000110000100", + "00000100100000100100", "00110000101100010100", "00001010100000100000", + "01011000000011000111", "00010000100001010001", "10000010100000010000", + "00001000000000110010", "00001000101011010001", "00011110000100100000", "11001001010001010100" + ] N = 5 fps = [DataStructs.CreateFromBitString(x) for x in fps] picker = rdSimDivPickers.MaxMinPicker() @@ -235,14 +236,15 @@ class TestCase(unittest.TestCase): fps = [] with open(fname) as infil: for line in infil: - fp = DataStructs.CreateFromFPSText(line.strip()) - fps.append(fp) - mmp =rdSimDivPickers.MaxMinPicker() - ids=list(mmp.LazyBitVectorPick(fps,len(fps),20,seed=42)) + fp = DataStructs.CreateFromFPSText(line.strip()) + fps.append(fp) + mmp = rdSimDivPickers.MaxMinPicker() + ids = list(mmp.LazyBitVectorPick(fps, len(fps), 20, seed=42)) self.assertEqual(ids,[374,720,690,339,875,842,404,725,120,385,115,868,630,\ 881,516,497,412,718,869,407]) - ids=list(mmp.LazyBitVectorPick(fps,len(fps),20,firstPicks=[374,720,690,339,875],seed=42)) + ids = list( + mmp.LazyBitVectorPick(fps, len(fps), 20, firstPicks=[374, 720, 690, 339, 875], seed=42)) self.assertEqual(ids,[374,720,690,339,875,842,404,725,120,385,115,868,630,\ 881,516,497,412,718,869,407]) @@ -253,18 +255,63 @@ class TestCase(unittest.TestCase): fps = [] with open(fname) as infil: for line in infil: - fp = DataStructs.CreateFromFPSText(line.strip()) - fps.append(fp) - mmp =rdSimDivPickers.MaxMinPicker() - ids,threshold=mmp.LazyBitVectorPickWithThreshold(fps,len(fps),20,-1.0,seed=42) + fp = DataStructs.CreateFromFPSText(line.strip()) + fps.append(fp) + mmp = rdSimDivPickers.MaxMinPicker() + ids, threshold = mmp.LazyBitVectorPickWithThreshold(fps, len(fps), 20, -1.0, seed=42) self.assertEqual(list(ids),[374,720,690,339,875,842,404,725,120,385,115,868,630,\ 881,516,497,412,718,869,407]) - self.assertAlmostEqual(threshold,0.8977,4) + self.assertAlmostEqual(threshold, 0.8977, 4) - ids,threshold=mmp.LazyBitVectorPickWithThreshold(fps,len(fps),20,0.91,seed=42) - self.assertEqual(list(ids),[374,720,690,339,875,842,404,725,120,385,115,868,630]) - self.assertTrue(threshold>=0.91) + ids, threshold = mmp.LazyBitVectorPickWithThreshold(fps, len(fps), 20, 0.91, seed=42) + self.assertEqual(list(ids), [374, 720, 690, 339, 875, 842, 404, 725, 120, 385, 115, 868, 630]) + self.assertTrue(threshold >= 0.91) + + def testBitVectorLeader1(self): + # threshold tests + fname = os.path.join(RDConfig.RDBaseDir, 'Code', 'SimDivPickers', 'Wrap', 'test_data', + 'chembl_cyps.head.fps') + fps = [] + with open(fname) as infil: + for line in infil: + fp = DataStructs.CreateFromFPSText(line.strip()) + fps.append(fp) + mmp = rdSimDivPickers.LeaderPicker() + thresh = 0.8 + ids = mmp.LazyBitVectorPick(fps, len(fps), thresh) + self.assertEqual(len(ids), 146) + for i in range(len(ids)): + for j in range(i): + self.assertGreaterEqual(1 - DataStructs.TanimotoSimilarity(fps[ids[i]], fps[ids[j]]), + thresh) + thresh = 0.9 + ids = mmp.LazyBitVectorPick(fps, len(fps), thresh) + self.assertEqual(len(ids), 14) + for i in range(len(ids)): + for j in range(i): + self.assertGreaterEqual(1 - DataStructs.TanimotoSimilarity(fps[ids[i]], fps[ids[j]]), + thresh) + + ids = mmp.LazyBitVectorPick(fps, len(fps), thresh, pickSize=10) + self.assertEqual(len(ids), 10) + for i in range(len(ids)): + for j in range(i): + self.assertGreaterEqual(1 - DataStructs.TanimotoSimilarity(fps[ids[i]], fps[ids[j]]), + thresh) + + def testLazyLeader(self): + pkr = rdSimDivPickers.LeaderPicker() + + def func(i, j): + if i == j: + return 0.0 + if i < j: + j, i = i, j + return i - j + + lres = pkr.LazyPick(func, 100, 20) + self.assertEqual(list(lres), [0, 21, 42, 63, 84]) if __name__ == '__main__': diff --git a/Code/SimDivPickers/catch_tests.cpp b/Code/SimDivPickers/catch_tests.cpp new file mode 100644 index 000000000..766c94c9a --- /dev/null +++ b/Code/SimDivPickers/catch_tests.cpp @@ -0,0 +1,88 @@ +// +// Copyright (C) 2019 +// +// @@ All Rights Reserved @@ +// This file is part of the RDKit. +// The contents are covered by the terms of the BSD license +// which is included in the file license.txt, found at the root +// of the RDKit source tree. +// + +#define CATCH_CONFIG_MAIN // This tells Catch to provide a main() - only do + // this in one cpp file +#include "catch.hpp" +#include +#include +#include +#include +#include + +#include +#include + +template +class BVFunctor { + public: + BVFunctor(const T &obj) : d_obj(obj) {} + ~BVFunctor() {} + double operator()(unsigned int i, unsigned int j) { + double res = 1. - TanimotoSimilarity(*d_obj[i], *d_obj[j]); + return res; + } + const T &d_obj; +}; + +TEST_CASE( + "Leader Picker basics" + "[LeaderPicker]") { + std::string rdbase = getenv("RDBASE"); + std::string fName = + rdbase + "/Code/SimDivPickers/Wrap/test_data/chembl_cyps.head.fps"; + std::ifstream inf(fName); + std::string fpsText; + std::getline(inf, fpsText); + std::vector> fps; + while (!inf.eof() && !fpsText.empty()) { + fps.emplace_back(new ExplicitBitVect(fpsText.size() * 4)); + UpdateBitVectFromFPSText(*fps.back(), fpsText); + std::getline(inf, fpsText); + }; + REQUIRE(fps.size() == 1000); + BVFunctor>> bvf(fps); + RDPickers::LeaderPicker pkr; + SECTION("basics1") { + double threshold = 0.8; + auto res = pkr.lazyPick(bvf, fps.size(), 0, threshold); + CHECK(res.size() == 146); + for (unsigned i = 0; i < res.size(); ++i) { + for (unsigned j = 0; j < i; ++j) { + CHECK(bvf(res[i], res[j]) >= threshold); + } + } + } + SECTION("basics2") { + double threshold = 0.9; + auto res = pkr.lazyPick(bvf, fps.size(), 0, threshold); + CHECK(res.size() == 14); + for (unsigned i = 0; i < res.size(); ++i) { + for (unsigned j = 0; j < i; ++j) { + CHECK(bvf(res[i], res[j]) >= threshold); + } + } + } +#ifdef RDK_THREADSAFE_SSS + SECTION("basics multithreaded") { + double threshold = 0.8; + RDKit::INT_VECT firstPicks; + int nThreads = 0; // use max available + auto res = + pkr.lazyPick(bvf, fps.size(), 0, firstPicks, threshold, nThreads); + CHECK(res.size() == 146); + for (unsigned i = 0; i < res.size(); ++i) { + for (unsigned j = 0; j < i; ++j) { + CHECK(bvf(res[i], res[j]) >= threshold); + } + } + } +#endif +} diff --git a/Code/SimDivPickers/pickersCLI.cpp b/Code/SimDivPickers/pickersCLI.cpp new file mode 100644 index 000000000..8d107d848 --- /dev/null +++ b/Code/SimDivPickers/pickersCLI.cpp @@ -0,0 +1,217 @@ +/* MaxMinPicker.cpp */ +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include + +#if 0 +#include +#include +#else +#include "MaxMinPicker.h" +#include "LeaderPicker.h" +#endif + +std::vector mols; +std::vector fps; +// ExplicitBitVect **fps; + +const char *poolname; +const char *pickname; +unsigned int origsize; +unsigned int picksize; +unsigned int poolsize; +unsigned int newpicks; +int seed; + +unsigned long ticks = 0; + +static unsigned int LoadDatabase(FILE *fp) { + char buffer[32768]; + unsigned int result = 0; + + while (fgets(buffer, 327666, fp)) { + if (buffer[0] == '#' || buffer[0] == ' ' || buffer[0] == '\t') continue; + char *ptr = buffer; + while (*ptr && *ptr != ' ' && *ptr != '\t') ptr++; + if (*ptr) { + *ptr++ = '\0'; + char *end = ptr; + while (*end && *end != '\n' && *end != '\r') end++; + *end = '\0'; + } + + RDKit::RWMol *mol; + try { + mol = RDKit::SmilesToMol(buffer); + } catch (...) { + mol = (RDKit::RWMol *)0; + } + + if (mol) { +#if 0 + if (*ptr) { + static std::string key("_Name"); + mol->setProp(key,std::string(ptr)); + } + mols.push_back(mol); +#endif + ExplicitBitVect *fp; + fp = RDKit::MorganFingerprints::getFingerprintAsBitVect(*mol, 2, 2048); + fps.push_back(fp); + delete mol; + result++; + } + } + fprintf(stderr, "%u molecules\n", result); + return result; +} + +static unsigned int LoadDatabase(const char *fname) { + if (fname && strcmp(fname, "-")) { + FILE *fp = fopen(fname, "rb"); + if (!fp) return 0; + unsigned int result = LoadDatabase(fp); + fclose(fp); + return result; + } else + return LoadDatabase(stdin); +} + +#ifdef UNUSED +static void GenerateFingerprints() { + unsigned int count = (unsigned int)mols.size(); + unsigned int size = (unsigned int)(count * sizeof(void *)); + fps = (ExplicitBitVect **)malloc(size); + + for (unsigned int i = 0; i < count; i++) + fps[i] = + RDKit::MorganFingerprints::getFingerprintAsBitVect(*mols[i], 2, 2048); + fprintf(stderr, "%u Fingerprints\n", count); +} + +static void DestroyFingerprints() { + unsigned int count = (unsigned int)mols.size(); + for (unsigned int i = 0; i < count; i++) delete fps[i]; + free(fps); +} +#endif + +static void DisplayUsage() { + fputs("usage: MaxMinPicker [options] []\n", stderr); + exit(1); +} + +static void ProcessCommandLine(int argc, char *argv[]) { + poolname = (const char *)0; + pickname = (const char *)0; + newpicks = 1000; + + for (int i = 1; i < argc; i++) { + const char *ptr = argv[i]; + if (ptr[0] == '-' && ptr[1]) { + if (ptr[1] >= '0' && ptr[1] <= '9') { + newpicks = atoi(ptr + 1); + } else + DisplayUsage(); + } else if (!poolname) { + poolname = ptr; + } else if (!pickname) { + pickname = ptr; + } else + DisplayUsage(); + } + + if (!poolname) DisplayUsage(); +} + +double MyDist(int i, int j) { + ticks++; + return 1.0 - TanimotoSimilarity(*fps[i], *fps[j]); +} + +int main(int argc, char *argv[]) { + struct timeval beg, end; + double elapsed; + + ProcessCommandLine(argc, argv); + + gettimeofday(&beg, (struct timezone *)0); + origsize = LoadDatabase(poolname); + gettimeofday(&end, (struct timezone *)0); + elapsed = (end.tv_sec + 0.000001 * end.tv_usec) - + (beg.tv_sec + 0.000001 * beg.tv_usec); + fprintf(stderr, "Elapsed time: %g secs\n", elapsed); + + if (pickname) { + gettimeofday(&beg, (struct timezone *)0); + picksize = LoadDatabase(pickname); + gettimeofday(&end, (struct timezone *)0); + elapsed = (end.tv_sec + 0.000001 * end.tv_usec) - + (beg.tv_sec + 0.000001 * beg.tv_usec); + fprintf(stderr, "Elapsed time: %g secs\n", elapsed); + } else + picksize = 0; + poolsize = origsize + picksize; + if (newpicks == 0) newpicks = origsize; + +#ifdef UNUSED + gettimeofday(&beg, (struct timezone *)0); + GenerateFingerprints(); + gettimeofday(&end, (struct timezone *)0); + elapsed = (end.tv_sec + 0.000001 * end.tv_usec) - + (beg.tv_sec + 0.000001 * beg.tv_usec); + fprintf(stderr, "Elapsed time: %g secs\n", elapsed); +#endif + + RDKit::INT_VECT firstPicks; + if (picksize) { + firstPicks.reserve(picksize); + for (unsigned int i = 0; i < picksize; i++) + firstPicks.push_back((int)(i + origsize)); + } + + // RDPickers::MaxMinPicker mmpicker; + double threshold = 0.9; // 0.675; + RDPickers::LeaderPicker ldpicker(threshold, 4); + + gettimeofday(&beg, (struct timezone *)0); +#if 0 + RDKit::INT_VECT iv = mmpicker.lazyPick(MyDist,poolsize,picksize+newpicks, + firstPicks,seed,threshold); +#else + RDKit::INT_VECT iv = ldpicker.lazyPick(MyDist, poolsize, picksize + newpicks, + firstPicks, threshold, 16); +#endif + gettimeofday(&end, (struct timezone *)0); + elapsed = (end.tv_sec + 0.000001 * end.tv_usec) - + (beg.tv_sec + 0.000001 * beg.tv_usec); + fprintf(stderr, "Elapsed time: %g secs\n", elapsed); + + unsigned int count = (unsigned int)iv.size(); +#if 0 + for (unsigned int j=0; j