mirror of
https://github.com/rdkit/rdkit.git
synced 2026-06-03 21:44:30 +08:00
* Adds C++ Enumeration Engine to the RDKit * Adds Sanitization helpers, wrappers and tests * Clang format * Remove unused enumerationStateOnly flag * Fixes docStrings to current API * Adds doc strings * Removes RGroupPosition, adds getPosition to EnumerationBase * Fixes readability. * Adds EnumerateLibraryBase::reset and getReaction * Added getReagents method to EnumerateLibrary * Make the tests have the same naming * Need to save the initial state for resetting. * Stupid case-insensitive file systems * Moves ResetState to EnumerateLibraryBase * Adds removeNonmatchingReagents helper * Renames currentPosition to getPosition * Adds Enumeration Toolkit tutorial * Fixes Python3 serialization and enumerators * Verified to run on python2 and 3 * Fixes integer issues on windows * The number of enumeration should be unsigned. * Adds deserialization constructor * Moves boost_serialization to the end * Deprecates Clone in favor of copy * Update tests to use copy.copy not Clone * Move RGROUPS and BBS into an EnumerationTypes namespace * Make sure old pickles work * Adds pickle for backwards compatibility * Moves to uint64_t from size_t for public api * Whups, accidentally used the binary archiver. * Commits boost 1.55 serialization * Makes serialization turnoffable Like Filter Catalog * Fixes tests when serialization not available. Adds more enumeration strategy tests * Fixes a syntax error on some versions of python * Fixes sanitizeRxn to actually make proper RGroup atoms * Updates SanitizeRXN python API * Updates Enumeration API to a parameter class - fixes reagent removal * Adds a mess of tests * Change stats to return a string. * Exposes EvenPairSamplingStrategy Stats to python * Fixes a crash bug in SanitizeRxn * Adds better testing of the even pair sampling * Fixes namespace * One more try to fix gcc * Enum classes are c++11 and a microsoft extension. * Fix typo * Fixes np.median for python3 * Fixes atom iterators * Adds virtual tags to derived virtual functions (for clarity) * Fixes size comparison issues * Adds doc string * Small cleanup (has no effect since flags aren’t used) * fixes crash bug on windows * get the tests working on windows * Updates tutorial * Adds Glare implementation to Contrib
282 lines
9.2 KiB
C++
282 lines
9.2 KiB
C++
//
|
|
// Copyright (c) 2016, Novartis Institutes for BioMedical Research Inc.
|
|
// All rights reserved.
|
|
//
|
|
// Redistribution and use in source and binary forms, with or without
|
|
// modification, are permitted provided that the following conditions are
|
|
// met:
|
|
//
|
|
// * Redistributions of source code must retain the above copyright
|
|
// notice, this list of conditions and the following disclaimer.
|
|
// * Redistributions in binary form must reproduce the above
|
|
// copyright notice, this list of conditions and the following
|
|
// disclaimer in the documentation and/or other materials provided
|
|
// with the distribution.
|
|
// * Neither the name of Novartis Institutes for BioMedical Research Inc.
|
|
// nor the names of its contributors may be used to endorse or promote
|
|
// products derived from this software without specific prior written
|
|
// permission.
|
|
//
|
|
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
|
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
|
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
|
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
|
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
|
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
|
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
|
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
|
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
//
|
|
#include "EvenSamplePairs.h"
|
|
#include <boost/format.hpp>
|
|
#include <stdint.h>
|
|
|
|
|
|
namespace RDKit {
|
|
|
|
using namespace EnumerationTypes;
|
|
// Based on an implementation from a correspondance with Bernd Rohde.
|
|
void EvenSamplePairsStrategy::initializeStrategy(const ChemicalReaction &,
|
|
const BBS &bbs) {
|
|
size_t npos = bbs.size();
|
|
used_count.resize(npos);
|
|
std::fill(used_count.begin(), used_count.end(), 0);
|
|
|
|
var_used.resize(npos);
|
|
for (size_t i = 0; i < npos; ++i) {
|
|
var_used[i].resize(m_permutationSizes[i]);
|
|
std::fill(var_used[i].begin(), var_used[i].end(), 0);
|
|
}
|
|
|
|
boost::uint64_t nmonomers = 0;
|
|
for (size_t i = 0; i < bbs.size(); ++i) nmonomers += m_permutationSizes[i];
|
|
|
|
pair_used.resize(nmonomers);
|
|
for (size_t i = 0; i < nmonomers; ++i) {
|
|
pair_used[i].resize(nmonomers);
|
|
std::fill(pair_used[i].begin(), pair_used[i].end(), 0);
|
|
}
|
|
|
|
pair_counts.resize(npos);
|
|
for (size_t i = 0; i < npos; i++) {
|
|
pair_counts[i].resize(npos);
|
|
std::fill(pair_counts[i].begin(), pair_counts[i].end(), 0);
|
|
}
|
|
|
|
/* Initialize random number generator */
|
|
/* Find modulus */
|
|
PRECONDITION(m_numPermutations >= 0,
|
|
"Number of permutations too large to Evenly sample");
|
|
for (M = 1; M < rdcast<size_t>(m_numPermutations); M = 2 * M)
|
|
;
|
|
/* Set factor */
|
|
a = 5;
|
|
b = 7;
|
|
|
|
// control of random number and heuristics
|
|
seed = 0;
|
|
m_numPermutationsProcessed = 0;
|
|
nslack = 0; // increase this to break evenness criteria
|
|
rejected_period = 0;
|
|
rejected_unique = 0;
|
|
rejected_slack_condition = 0;
|
|
rejected_bb_sampling_condition = 0;
|
|
|
|
selected.clear(); // clear the selected (unique) set
|
|
}
|
|
|
|
// Try to add the given encoded seed position into
|
|
// the current set of return groups. This checks to
|
|
// see if the BBS are evenly sampled as pairs. If
|
|
// they currently are not, reject the selection.
|
|
// This is fairly suboptimal for large collections
|
|
// of building blocks and may take a while to
|
|
// terminate...
|
|
bool EvenSamplePairsStrategy::try_add(size_t seed) {
|
|
const RGROUPS &digits = decode(seed);
|
|
const RGROUPS &rgroups = m_permutationSizes;
|
|
size_t islack = 0;
|
|
size_t num_rgroups = m_permutationSizes.size();
|
|
|
|
for (size_t i = 0; i < num_rgroups; ++i) {
|
|
if (var_used[i][digits[i]]) islack += var_used[i][digits[i]];
|
|
if (islack > nslack) {
|
|
// add better heuristic here??
|
|
rejected_slack_condition += 1;
|
|
return false;
|
|
}
|
|
}
|
|
|
|
islack = 0;
|
|
size_t ioffset = 0;
|
|
// check that building block pairs get evenly sampled
|
|
for (size_t i = 0; i < num_rgroups; ++i) {
|
|
size_t joffset = 0;
|
|
for (size_t j = 0; j < num_rgroups; ++j) {
|
|
if (j == i) continue;
|
|
size_t ii = digits[i] + ioffset;
|
|
size_t jj = digits[j] + joffset;
|
|
if (pair_used[ii][jj] > 0) {
|
|
double numer = (double)pair_used[ii][jj];
|
|
double denom = sqrt((double)(rgroups[i]) * (double)(rgroups[j]));
|
|
islack = (int)(numer / denom);
|
|
}
|
|
joffset += rgroups[j];
|
|
}
|
|
ioffset += rgroups[i];
|
|
}
|
|
|
|
if (islack > nslack) {
|
|
rejected_bb_sampling_condition += 1;
|
|
return false;
|
|
}
|
|
|
|
// keep track of bb usage
|
|
for (size_t i = 0; i < num_rgroups; ++i) {
|
|
if (var_used[i][digits[i]] == 0) {
|
|
used_count[i]++;
|
|
}
|
|
var_used[i][digits[i]] += 1;
|
|
if (used_count[i] == rdcast<int64_t>(rgroups[i])) {
|
|
// complete variable scan => initialize
|
|
if (nslack > min_nslack && rgroups[i] > 1) // cleared slack on i
|
|
nslack = min_nslack;
|
|
|
|
used_count[i] = 0;
|
|
for (size_t j = 0; j < rgroups[i]; ++j) {
|
|
var_used[i][j]--;
|
|
assert(var_used[i][j] >= 0);
|
|
if (var_used[i][j] > 0) used_count[i]++;
|
|
}
|
|
} // end scan
|
|
}
|
|
|
|
// keep track of BB Pair usage
|
|
ioffset = 0;
|
|
for (size_t i = 0; i < num_rgroups; ioffset += rgroups[i], ++i) {
|
|
size_t joffset = 0;
|
|
for (size_t j = 0; j < num_rgroups; joffset += rgroups[j], ++j) {
|
|
if (j == i) {
|
|
continue;
|
|
}
|
|
size_t ii = digits[i] + ioffset;
|
|
size_t jj = digits[j] + joffset;
|
|
if (pair_used[ii][jj] == 0) {
|
|
pair_counts[i][j]++;
|
|
}
|
|
pair_used[ii][jj]++;
|
|
if (pair_counts[i][j] >= rgroups[i] * rgroups[j]) { // all pairs visited
|
|
if (nslack > min_nslack && (rgroups[i] > 1 || rgroups[j] > 1)) {
|
|
nslack = min_nslack;
|
|
}
|
|
pair_counts[i][j] = 0;
|
|
for (size_t ii = 0; ii < rgroups[i]; ++ii) {
|
|
for (size_t jj = 0; jj < rgroups[j]; ++jj) {
|
|
pair_used[ioffset + ii][joffset + jj]--;
|
|
if (pair_used[ioffset + ii][joffset + jj] > 0) {
|
|
pair_counts[i][j]++;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
selected.insert(seed);
|
|
return true;
|
|
}
|
|
|
|
const RGROUPS &EvenSamplePairsStrategy::next() {
|
|
nslack = 0;
|
|
while (m_numPermutationsProcessed < rdcast<size_t>(m_numPermutations)) {
|
|
bool added = false;
|
|
for (size_t l = 0; l < M; ++l) {
|
|
seed = ((seed * a + b) % M);
|
|
if (seed > rdcast<size_t>(m_numPermutations)) {
|
|
rejected_period += 1;
|
|
continue;
|
|
} else if (selected.find(seed) != selected.end()) {
|
|
rejected_unique += 1;
|
|
continue;
|
|
} else if (try_add(seed)) {
|
|
m_numPermutationsProcessed++;
|
|
added = true;
|
|
return decode(seed);
|
|
}
|
|
}
|
|
|
|
if (!added) {
|
|
// loosen heuristic
|
|
nslack += 1;
|
|
min_nslack += 1;
|
|
}
|
|
}
|
|
|
|
throw EnumerationStrategyException("Ran out of molecules");
|
|
}
|
|
|
|
std::string EvenSamplePairsStrategy::stats() const {
|
|
std::ostringstream ss;
|
|
|
|
size_t npos = m_permutationSizes.size();
|
|
const RGROUPS &nvars = m_permutationSizes;
|
|
size_t i, l, j, ii, jj, ioffset, joffset;
|
|
ss << "#BEGIN# BBSTAT\n";
|
|
for (i = 0; i < npos; i++) {
|
|
size_t maxcount = 0;
|
|
if (nvars[i] == 1) continue;
|
|
for (j = 0; j < nvars[i]; j++)
|
|
if (maxcount < var_used[i][j]) maxcount = var_used[i][j];
|
|
|
|
ss << boost::format("%lu\t%lu\t%6.2f") % (i + 1) % nvars[i] %
|
|
((double)m_numPermutationsProcessed / nvars[i]);
|
|
|
|
for (l = 0; l <= maxcount; l++) {
|
|
size_t n = 0;
|
|
for (j = 0; j < nvars[i]; j++)
|
|
if (var_used[i][j] == l) n++;
|
|
if (n > 0) ss << boost::format("\t%lu|%lu") % l % n;
|
|
}
|
|
ss << std::endl;
|
|
}
|
|
ss << "#END# BBSTAT\n";
|
|
|
|
ss << "#BEGIN# PAIRSTAT\n";
|
|
for (i = 0, ioffset = 0; i < npos; ioffset += nvars[i], i++) {
|
|
if (nvars[i] == 1) continue;
|
|
for (j = 0, joffset = 0; j < npos; joffset += nvars[j], j++) {
|
|
size_t maxcount = 0;
|
|
if (nvars[j] == 1) continue;
|
|
if (j <= i) continue;
|
|
for (ii = 0; ii < nvars[i]; ii++)
|
|
for (jj = 0; jj < nvars[j]; jj++)
|
|
if (maxcount < pair_used[ii + ioffset][jj + joffset])
|
|
maxcount = pair_used[ii + ioffset][jj + joffset];
|
|
ss << boost::format("%lu\t%lu\t%lu\t%lu\t%6.2f") % (i + 1) %
|
|
(j + 1) % nvars[i] % nvars[j] %
|
|
((double)m_numPermutationsProcessed /
|
|
(nvars[i] * nvars[j]));
|
|
for (l = 0; l <= maxcount; l++) {
|
|
int n = 0;
|
|
for (ii = 0; ii < nvars[i]; ii++)
|
|
for (jj = 0; jj < nvars[j]; jj++)
|
|
if (l == pair_used[ii + ioffset][jj + joffset]) n++;
|
|
if (n > 0) ss << boost::format("\t%ld|%d") % l % n;
|
|
}
|
|
ss << boost::format("\n");
|
|
}
|
|
}
|
|
ss << "#END# PAIRSTAT\n";
|
|
|
|
ss << "Rejected Period: " << rejected_period << std::endl;
|
|
ss << "Rejected (dupes): " << rejected_unique << std::endl;
|
|
ss << "Rejected Slack Conditions: " << rejected_slack_condition
|
|
<< std::endl;
|
|
ss << "Rejected Pair Sampling: " << rejected_bb_sampling_condition
|
|
<< std::endl;
|
|
return ss.str();
|
|
}
|
|
}
|