// // Copyright (C) 2003-2007 Greg Landrum and Rational Discovery LLC // Copyright (C) 2017 Greg Landrum and NextMove Software // // @@ All Rights Reserved @@ // This file is part of the RDKit. // The contents are covered by the terms of the BSD license // which is included in the file license.txt, found at the root // of the RDKit source tree. // #include #ifndef RD_MAXMINPICKER_H #define RD_MAXMINPICKER_H #include #include #include #include #include #include #include "DistPicker.h" #include #include namespace RDPickers { /*! \brief Implements the MaxMin algorithm for picking a subset of item from a *pool * * This class inherits from the DistPicker and implements a specific picking *strategy * aimed at diversity. See documentation for "pick()" member function for the *algorithm details */ class RDKIT_SIMDIVPICKERS_EXPORT MaxMinPicker : public DistPicker { public: /*! \brief Default Constructor * */ MaxMinPicker() {} /*! \brief Contains the implementation for a lazy MaxMin diversity picker * * See the documentation for the pick() method for details about the algorithm * * \param func - a function (or functor) taking two unsigned ints as *arguments * and returning the distance (as a double) between those two *elements. * \param poolSize - the size of the pool to pick the items from. It is *assumed that the * distance matrix above contains the right number of elements; *i.e. * poolSize*(poolSize-1) * \param pickSize - the number items to pick from pool (<= poolSize) * \param firstPicks - (optional)the first items in the pick list * \param seed - (optional) seed for the random number generator. * If this is <0 the generator will be seeded with a * random number. */ template RDKit::INT_VECT lazyPick(T &func, unsigned int poolSize, unsigned int pickSize) const; template RDKit::INT_VECT lazyPick(T &func, unsigned int poolSize, unsigned int pickSize, const RDKit::INT_VECT &firstPicks, int seed = -1) const; template RDKit::INT_VECT lazyPick(T &func, unsigned int poolSize, unsigned int pickSize, const RDKit::INT_VECT &firstPicks, int seed, double &threshold) const; /*! \brief Contains the implementation for the MaxMin diversity picker * * Here is how the picking algorithm works, refer to * Ashton, M. et. al., Quant. Struct.-Act. Relat., 21 (2002), 598-604 * for more detail: * * A subset of k items is to be selected from a pool containing N molecules. * Then the MaxMin method is as follows: * -# Initialise Subset with some appropriately chosen seed * compound and set x = 1. * -# For each of the N-x remaining compounds in Dataset * calculate its dissimilarity with each of the x compounds in * Subset and retain the smallest of these x dissimilarities for * each compound in Dataset. * -# Select the molecule from Dataset with the largest value * for the smallest dissimilarity calculated in Step 2 and * transfer it to Subset. * -# Set x = x + 1 and return to Step 2 if x < k. * * * * \param distMat - distance matrix - a vector of double. It is assumed that *only the * lower triangle element of the matrix are supplied in a 1D *array\n * \param poolSize - the size of the pool to pick the items from. It is *assumed that the * distance matrix above contains the right number of elements; *i.e. * poolSize*(poolSize-1) \n * \param pickSize - the number items to pick from pool (<= poolSize) * \param firstPicks - indices of the items used to seed the pick set. * \param seed - (optional) seed for the random number generator * If this is <0 the generator will be seeded with a * random number. */ RDKit::INT_VECT pick(const double *distMat, unsigned int poolSize, unsigned int pickSize, RDKit::INT_VECT firstPicks, int seed = -1) const { CHECK_INVARIANT(distMat, "Invalid Distance Matrix"); if (!poolSize) { throw ValueErrorException("empty pool to pick from"); } if (poolSize < pickSize) { throw ValueErrorException("pickSize cannot be larger than the poolSize"); } distmatFunctor functor(distMat); return this->lazyPick(functor, poolSize, pickSize, firstPicks, seed); } /*! \overload */ RDKit::INT_VECT pick(const double *distMat, unsigned int poolSize, unsigned int pickSize) const override { RDKit::INT_VECT iv; return pick(distMat, poolSize, pickSize, iv); } }; struct MaxMinPickInfo { double dist_bound; // distance to closest reference unsigned int picks; // number of references considered unsigned int next; // singly linked list of candidates }; // we implement this here in order to allow arbitrary functors without link // errors template RDKit::INT_VECT MaxMinPicker::lazyPick(T &func, unsigned int poolSize, unsigned int pickSize, const RDKit::INT_VECT &firstPicks, int seed, double &threshold) const { if (!poolSize) { throw ValueErrorException("empty pool to pick from"); } if (poolSize < pickSize) { throw ValueErrorException("pickSize cannot be larger than the poolSize"); } RDKit::INT_VECT picks; unsigned int memsize = (unsigned int)(poolSize * sizeof(MaxMinPickInfo)); std::unique_ptr pinfo(new MaxMinPickInfo[memsize]); memset(pinfo.get(), 0, memsize); picks.reserve(pickSize); unsigned int picked = 0; // picks.size() unsigned int pick = 0; // pick the first entry if (firstPicks.empty()) { // get a seeded random number generator: typedef boost::mt19937 rng_type; typedef boost::uniform_int<> distrib_type; typedef boost::variate_generator source_type; rng_type generator; distrib_type dist(0, poolSize - 1); if (seed >= 0) { generator.seed(static_cast(seed)); } else { generator.seed(std::random_device()()); } source_type randomSource(generator, dist); pick = randomSource(); // add the pick to the picks picks.push_back(pick); // and remove it from the pool pinfo[pick].picks = 1; picked = 1; } else { for (RDKit::INT_VECT::const_iterator pIdx = firstPicks.begin(); pIdx != firstPicks.end(); ++pIdx) { pick = static_cast(*pIdx); if (pick >= poolSize) { throw ValueErrorException("pick index was larger than the poolSize"); } picks.push_back(pick); pinfo[pick].picks = 1; picked++; } } if (picked >= pickSize) { threshold = -1.0; return picks; } unsigned int pool_list = 0; unsigned int *prev = &pool_list; // enter the pool into a list so that we can pick out of it easily for (unsigned int i = 0; i < poolSize; i++) { if (pinfo[i].picks == 0) { *prev = i; prev = &pinfo[i].next; } } *prev = 0; unsigned int poolIdx; unsigned int pickIdx; // First pass initialize dist_bound prev = &pool_list; pickIdx = picks[0]; do { poolIdx = *prev; pinfo[poolIdx].dist_bound = func(poolIdx, pickIdx); pinfo[poolIdx].picks = 1; prev = &pinfo[poolIdx].next; } while (*prev != 0); // now pick 1 compound at a time double maxOFmin = -1.0; double tmpThreshold = -1.0; while (picked < pickSize) { unsigned int *pick_prev = nullptr; maxOFmin = -1.0; prev = &pool_list; do { poolIdx = *prev; double minTOi = pinfo[poolIdx].dist_bound; if (minTOi > maxOFmin) { unsigned int pi = pinfo[poolIdx].picks; while (pi < picked) { unsigned int picki = picks[pi]; CHECK_INVARIANT(poolIdx != picki, "pool index != pick index"); double dist = func(poolIdx, picki); pi++; if (dist <= minTOi) { minTOi = dist; if (minTOi <= maxOFmin) { break; } } } pinfo[poolIdx].dist_bound = minTOi; pinfo[poolIdx].picks = pi; if (minTOi > maxOFmin) { maxOFmin = minTOi; pick_prev = prev; pick = poolIdx; } } prev = &pinfo[poolIdx].next; } while (*prev != 0); // if the current distance is closer then threshold, we're done if (maxOFmin <= threshold && threshold >= 0.0) { break; } tmpThreshold = maxOFmin; // now add the new pick to picks and remove it from the pool *pick_prev = pinfo[pick].next; picks.push_back(pick); ++picked; } threshold = tmpThreshold; return picks; } template RDKit::INT_VECT MaxMinPicker::lazyPick(T &func, unsigned int poolSize, unsigned int pickSize, const RDKit::INT_VECT &firstPicks, int seed) const { double threshold = -1.0; return MaxMinPicker::lazyPick(func, poolSize, pickSize, firstPicks, seed, threshold); } template RDKit::INT_VECT MaxMinPicker::lazyPick(T &func, unsigned int poolSize, unsigned int pickSize) const { RDKit::INT_VECT firstPicks; double threshold = -1.0; int seed = -1; return MaxMinPicker::lazyPick(func, poolSize, pickSize, firstPicks, seed, threshold); } }; // namespace RDPickers #endif