Dev/reaction enumeration (#1111)

* Adds C++ Enumeration Engine to the RDKit

* Adds Sanitization helpers, wrappers and tests

* Clang format

* Remove unused enumerationStateOnly flag

* Fixes docStrings to current API

* Adds doc strings

* Removes RGroupPosition, adds getPosition to EnumerationBase

* Fixes readability.

* Adds EnumerateLibraryBase::reset and getReaction

* Added getReagents method to EnumerateLibrary

* Make the tests have the same naming

* Need to save the initial state for resetting.

* Stupid case-insensitive file systems

* Moves ResetState to EnumerateLibraryBase

* Adds removeNonmatchingReagents helper

* Renames currentPosition to getPosition

* Adds Enumeration Toolkit tutorial

* Fixes Python3 serialization and enumerators

* Verified to run on python2 and 3

* Fixes integer issues on windows

* The number of enumeration should be unsigned.

* Adds deserialization constructor

* Moves boost_serialization to the end

* Deprecates Clone in favor of copy

* Update tests to use copy.copy not Clone

* Move RGROUPS and BBS into an EnumerationTypes namespace

* Make sure old pickles work

* Adds pickle for backwards compatibility

* Moves to uint64_t from size_t for public api

* Whups, accidentally used the binary archiver.

* Commits boost 1.55 serialization

* Makes serialization turnoffable Like Filter Catalog

* Fixes tests when serialization not available.  Adds more enumeration strategy tests

* Fixes a syntax error on some versions of python

* Fixes sanitizeRxn to actually make proper RGroup atoms

* Updates SanitizeRXN python API

* Updates Enumeration API to a parameter class - fixes reagent removal

* Adds a mess of tests

* Change stats to return a string.

* Exposes EvenPairSamplingStrategy Stats to python

* Fixes a crash bug in SanitizeRxn

* Adds better testing of the even pair sampling

* Fixes namespace

* One more try to fix gcc

* Enum classes are c++11 and a microsoft extension.

* Fix typo

* Fixes np.median for python3

* Fixes atom iterators

* Adds virtual tags to derived virtual functions (for clarity)

* Fixes size comparison issues

* Adds doc string

* Small cleanup (has no effect since flags aren’t used)

* fixes crash bug on windows

* get the tests working on windows

* Updates tutorial

* Adds Glare implementation to Contrib
This commit is contained in:
Brian Kelley
2016-11-05 09:42:52 -04:00
committed by Greg Landrum
parent 1b946794f0
commit fa89438358
32 changed files with 5927 additions and 40 deletions

View File

@@ -1,21 +1,47 @@
# Define RDK_USE_BOOST_SERIALIZATION for the sources in this directory only
# when the option is on and a Boost serialization library is set; otherwise
# print a notice that serialization support is left out.
if(NOT (RDK_USE_BOOST_SERIALIZATION AND Boost_SERIALIZATION_LIBRARY))
  message("== Making EnumerateLibrary without boost Serialization support")
else()
  add_definitions(-DRDK_USE_BOOST_SERIALIZATION)
endif()
rdkit_library(ChemReactions
Reaction.cpp MDLParser.cpp DaylightParser.cpp ReactionPickler.cpp
ReactionWriter.cpp ReactionDepict.cpp ReactionFingerprints.cpp ReactionUtils.cpp MoleculeParser.cpp ReactionRunner.cpp PreprocessRxn.cpp
LINK_LIBRARIES FilterCatalog Descriptors Fingerprints DataStructs Depictor FileParsers SubstructMatch ChemTransforms)
ReactionWriter.cpp ReactionDepict.cpp ReactionFingerprints.cpp ReactionUtils.cpp MoleculeParser.cpp ReactionRunner.cpp PreprocessRxn.cpp SanitizeRxn.cpp
Enumerate/Enumerate.cpp
Enumerate/EnumerationPickler.cpp
Enumerate/EvenSamplePairs.cpp
LINK_LIBRARIES
FilterCatalog Descriptors Fingerprints DataStructs Depictor
FileParsers SubstructMatch ChemTransforms ${Boost_SERIALIZATION_LIBRARY})
rdkit_headers(Reaction.h
ReactionParser.h
ReactionPickler.h
ReactionFingerprints.h
ReactionUtils.h
ReactionRunner.h PreprocessRxn.h DEST GraphMol/ChemReactions)
ReactionRunner.h
PreprocessRxn.h
SanitizeRxn.h
Enumerate/Enumerate.h
Enumerate/EnumerateBase.h
Enumerate/EnumerationPickler.h
Enumerate/EnumerationStrategyBase.h
Enumerate/CartesianProduct.h
Enumerate/RandomSample.h
Enumerate/RandomSampleAllBBs.h
DEST GraphMol/ChemReactions)
rdkit_test(testReaction testReaction.cpp LINK_LIBRARIES
ChemReactions FilterCatalog ChemTransforms Descriptors Fingerprints Subgraphs DataStructs Depictor FileParsers SmilesParse SubstructMatch
GraphMol RDGeneral RDGeometryLib )
ChemReactions ChemTransforms Descriptors Fingerprints Subgraphs DataStructs Depictor FileParsers SmilesParse SubstructMatch
GraphMol RDGeneral RDGeometryLib ${Boost_SERIALIZATION_LIBRARY} )
rdkit_test(testReactionFingerprints testReactionFingerprints.cpp LINK_LIBRARIES
ChemReactions FilterCatalog Descriptors Fingerprints Subgraphs DataStructs ChemTransforms Depictor FileParsers SmilesParse SubstructMatch
GraphMol RDGeneral RDGeometryLib )
ChemReactions Descriptors Fingerprints Subgraphs DataStructs ChemTransforms Depictor FileParsers SmilesParse SubstructMatch
GraphMol RDGeneral RDGeometryLib ${Boost_SERIALIZATION_LIBRARY} )
rdkit_test(testEnumeration Enumerate/testEnumerate.cpp LINK_LIBRARIES
ChemReactions ChemTransforms Descriptors Fingerprints Subgraphs DataStructs Depictor FileParsers SmilesParse SubstructMatch
GraphMol RDGeneral RDGeometryLib ${Boost_SERIALIZATION_LIBRARY} )
add_subdirectory(Wrap)

View File

@@ -0,0 +1,145 @@
//
// Copyright (c) 2015, Novartis Institutes for BioMedical Research Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following
// disclaimer in the documentation and/or other materials provided
// with the distribution.
// * Neither the name of Novartis Institutes for BioMedical Research Inc.
// nor the names of its contributors may be used to endorse or promote
// products derived from this software without specific prior written
// permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
#ifndef CARTESIANPRODUCT_H
#define CARTESIANPRODUCT_H
#include "EnumerationStrategyBase.h"
namespace RDKit {
//! This is a class for enumerating reagents using Cartesian Products of
// reagents.
/*!
CartesianProductStrategy produces a standard walk through all possible
reagent combinations:
(0,0,0), (1,0,0), (2,0,0) ...
basic usage:
\verbatim
std::vector<MOL_SPTR_VECT> bbs;
bbs.push_back( bbs_for_reactants_1 );
bbs.push_back( bbs_for_reactants_2 );
RGROUPS num_bbs;
num_bbs.push_back(bbs[0].size());
num_bbs.push_back(bbs[1].size());
CartesianProductStrategy rgroups(num_bbs);
for(size_t i=0; i<num_samples && rgroups; ++i) {
MOL_SPTR_VECT rvect = getReactantsFromRGroups(bbs, rgroups.next());
std::vector<MOL_SPTR_VECT> lprops = rxn.runReactants(rvect);
...
}
\endverbatim
See EnumerationStrategyBase for more details and usage.
*/
class CartesianProductStrategy : public EnumerationStrategyBase {
  // Number of permutations handed out so far; 0 until next() is first called.
  size_t m_numPermutationsProcessed;

 public:
  CartesianProductStrategy()
      : EnumerationStrategyBase(), m_numPermutationsProcessed(0) {}

  using EnumerationStrategyBase::initialize;

  //! Strategy-specific initialization: restarts the processed counter.
  //  The reaction and building blocks are not used by this strategy.
  virtual void initializeStrategy(const ChemicalReaction &,
                                  const EnumerationTypes::BBS &) {
    m_numPermutationsProcessed = 0;
  }

  //! Name of this strategy
  virtual const char *type() const { return "CartesianProductStrategy"; }

  //! The current permutation {r1, r2, ...}
  //  The first call returns the stored permutation as-is; every later call
  //  advances it by one step before returning it.
  virtual const EnumerationTypes::RGROUPS &next() {
    if (m_numPermutationsProcessed == 0) {
      ++m_numPermutationsProcessed;
    } else {
      increment();
    }
    return m_permutation;
  }

  //! Number of permutations processed so far
  virtual boost::uint64_t getPermutationIdx() const {
    return m_numPermutationsProcessed;
  }

  //! True while there are permutations left to enumerate
  virtual operator bool() const { return hasNext(); }

  //! Returns a heap-allocated copy of this strategy, including its position
  EnumerationStrategyBase *copy() const {
    return new CartesianProductStrategy(*this);
  }

 private:
  // Advance the permutation one step and count it.
  void increment() {
    next(0);
    ++m_numPermutationsProcessed;
  }

  // More permutations remain when the total overflowed the counter type or
  // when fewer than the total have been processed.
  bool hasNext() const {
    // Fix me -> use multiprecision int here???
    return m_numPermutations == EnumerationStrategyBase::EnumerationOverflow ||
           m_numPermutationsProcessed < rdcast<size_t>(m_numPermutations);
  }

  // Odometer-style step starting at rowToIncrement: bump the row and, when it
  // runs past its last index, wrap it to 0 and carry into the following row.
  void next(size_t rowToIncrement) {
    if (!hasNext()) return;
    for (size_t row = rowToIncrement;; ++row) {
      m_permutation[row] += 1;
      size_t max_index_of_row = m_permutationSizes[row] - 1;
      if (!(m_permutation[row] > max_index_of_row)) {
        break;
      }
      m_permutation[row] = 0;
    }
  }

 private:
#ifdef RDK_USE_BOOST_SERIALIZATION
  friend class boost::serialization::access;
  // Archives the base-class state followed by the processed counter.
  template <class Archive>
  void serialize(Archive &ar, const unsigned int /*version*/) {
    ar &boost::serialization::base_object<EnumerationStrategyBase>(*this);
    ar &m_numPermutationsProcessed;
  }
#endif
};
}
// When serialization is enabled, tag archives of CartesianProductStrategy
// with class version 1.
#ifdef RDK_USE_BOOST_SERIALIZATION
BOOST_CLASS_VERSION(RDKit::CartesianProductStrategy, 1)
#endif
#endif

View File

@@ -0,0 +1,259 @@
//
// Copyright (c) 2015, Novartis Institutes for BioMedical Research Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following
// disclaimer in the documentation and/or other materials provided
// with the distribution.
// * Neither the name of Novartis Institutes for BioMedical Research Inc.
// nor the names of its contributors may be used to endorse or promote
// products derived from this software without specific prior written
// permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
#include "Enumerate.h"
#include "CartesianProduct.h"
#include "RandomSample.h"
#include "RandomSampleAllBBs.h"
#include "EvenSamplePairs.h"
#include "../ReactionPickler.h"
#include <GraphMol/MolPickler.h>
#include <GraphMol/SmilesParse/SmilesWrite.h>
// Since we are exporting the classes for serialization,
// we should declare the archives types used here
#ifdef RDK_USE_BOOST_SERIALIZATION
#include <RDGeneral/BoostStartInclude.h>
#include <boost/archive/text_oarchive.hpp>
#include <boost/archive/text_iarchive.hpp>
#include <boost/serialization/shared_ptr.hpp>
#include <boost/serialization/export.hpp>
#include <RDGeneral/BoostEndInclude.h>
// Export the enumeration strategy classes and the library class to
// Boost.Serialization (needed so they can be archived through base-class
// pointers such as the shared_ptr<EnumerationStrategyBase> members).
BOOST_CLASS_EXPORT(RDKit::EnumerationStrategyBase);
BOOST_CLASS_EXPORT(RDKit::CartesianProductStrategy);
BOOST_CLASS_EXPORT(RDKit::RandomSampleStrategy);
BOOST_CLASS_EXPORT(RDKit::RandomSampleAllBBsStrategy);
BOOST_CLASS_EXPORT(RDKit::EvenSamplePairsStrategy);
BOOST_CLASS_EXPORT(RDKit::EnumerateLibrary);
#endif
namespace RDKit {
using namespace EnumerationTypes;
// Returns the current position reported by the enumeration strategy.
// Fails a precondition (rather than dereferencing a null pointer) when no
// strategy is set, e.g. on a default-constructed library; this mirrors the
// check done in getState().
const RGROUPS &EnumerateLibraryBase::getPosition() const {
  PRECONDITION(m_enumerator.get(), "Null Enumerator");
  return m_enumerator->getPosition();
}
// Pickles the current enumeration strategy and returns it as a string.
// Requires that a strategy is set.
std::string EnumerateLibraryBase::getState() const {
  PRECONDITION(m_enumerator.get(), "Null Enumerator");
  std::string pickled;
  EnumerationStrategyPickler::pickle(m_enumerator, pickled);
  return pickled;
}
// Replaces the current enumeration strategy with the one restored from
// `state`, a pickled strategy string such as the one returned by getState().
void EnumerateLibraryBase::setState(const std::string &state) {
m_enumerator = EnumerationStrategyPickler::fromPickle(state);
}
// Restarts the enumeration by replacing the current strategy with a fresh
// copy of the stored initial strategy. Requires that the initial strategy
// has been set.
void EnumerateLibraryBase::resetState() {
PRECONDITION(m_initialEnumerator.get(),
"Unset initial enumerator");
m_enumerator.reset(m_initialEnumerator->copy());
}
std::vector<std::vector<std::string> > EnumerateLibraryBase::nextSmiles() {
std::vector<std::vector<std::string> > result;
std::vector<MOL_SPTR_VECT> mols = next();
const bool doisomeric = true;
result.resize(mols.size());
for (size_t i = 0; i < mols.size(); ++i) {
result[i].resize(mols[i].size());
for (size_t j = 0; j < mols[i].size(); ++j) {
if (mols[i][j].get()) result[i][j] = MolToSmiles(*mols[i][j], doisomeric);
}
}
return result;
}
namespace {
size_t countMatches( const ROMol& bb, const ROMol& query, int maxMatches) {
std::vector<MatchVectType> matches;
const bool uniquify = true;
const bool useChirality = true;
const bool useQueryQueryMatches = false;
SubstructMatch(bb, query, matches,
uniquify, true, useChirality, useQueryQueryMatches,
maxMatches+1);
return matches.size();
}
}
// Returns a copy of `bbs` with every reagent removed that is incompatible
// with its reactant template in `rxn`.
//
// A reagent is dropped when
//  - it does not match its reactant template at all, or
//  - it matches more than params.reagentMaxMatchCount times, or
//  - params.sanePartialProducts is set and, for at least one product
//    template, none of the products of running the reagent alone can be
//    sanitized.
//
// The result has the same outer size as `bbs`; the number of reagents
// removed per template is reported on the info log.
// Requires bbs.size() <= rxn.getNumReactantTemplates().
BBS removeNonmatchingReagents(const ChemicalReaction &rxn, BBS bbs,
                              const EnumerationParams &params) {
  PRECONDITION(bbs.size() <= rxn.getNumReactantTemplates(),
               "Number of Reagents not compatible with reaction templates");
  BBS result;
  result.resize(bbs.size());

  for (size_t reactant_idx = 0; reactant_idx < bbs.size(); ++reactant_idx) {
    size_t removedCount = 0;
    // With the INT_MAX default no reagent can exceed the limit, so asking
    // the matcher for a single match (0 + 1) is enough.
    const unsigned int maxMatches =
        (params.reagentMaxMatchCount == INT_MAX)
            ? 0
            : rdcast<unsigned int>(params.reagentMaxMatchCount);

    ROMOL_SPTR reactantTemplate = rxn.getReactants()[reactant_idx];
    for (size_t reagent_idx = 0; reagent_idx < bbs[reactant_idx].size();
         ++reagent_idx) {
      ROMOL_SPTR mol = bbs[reactant_idx][reagent_idx];
      size_t matches =
          countMatches(*mol.get(), *reactantTemplate.get(), maxMatches);

      bool removeReagent = false;
      if (!matches || matches > rdcast<size_t>(params.reagentMaxMatchCount)) {
        removeReagent = true;
      }

      if (!removeReagent && params.sanePartialProducts) {
        // see if we have any sane products in the results
        std::vector<MOL_SPTR_VECT> partialProducts =
            rxn.runReactant(mol, reactant_idx);
        for (size_t productTemplate_idx = 0;
             productTemplate_idx < partialProducts.size();
             ++productTemplate_idx) {
          int saneProducts = 0;
          for (size_t product_idx = 0;
               product_idx < partialProducts[productTemplate_idx].size();
               ++product_idx) {
            ROMol *product =
                partialProducts[productTemplate_idx][product_idx].get();
            if (!product) {
              continue;  // nothing to sanitize, so it cannot count as sane
            }
            try {
              // The cast can fail; dereferencing the null result would be
              // undefined behaviour that catch (...) cannot intercept.
              RWMol *m = dynamic_cast<RWMol *>(product);
              if (m) {
                MolOps::sanitizeMol(*m);
              } else {
                // not editable in place: check a scratch copy instead
                RWMol scratch(*product);
                MolOps::sanitizeMol(scratch);
              }
              saneProducts++;
            } catch (...) {
              // deliberate best effort: a product that fails sanitization
              // simply is not counted
            }
          }

          if (!saneProducts) {
            // if any product template has no sane products, we bail
            removeReagent = true;
            break;
          }
        }
      }

      if (removeReagent)
        removedCount++;
      else
        result[reactant_idx].push_back(mol);
    }

    if (removedCount) {
      BOOST_LOG(rdInfoLog) << "Removed " << removedCount <<
        " non matching reagents at template " << reactant_idx << std::endl;
    }
  }
  return result;
}
// Builds a library for rxn that enumerates with a CartesianProductStrategy.
// The base class is constructed first, so m_rxn is already available when
// the building blocks are filtered through removeNonmatchingReagents().
EnumerateLibrary::EnumerateLibrary(const ChemicalReaction &rxn, const BBS &bbs,
const EnumerationParams &params)
: EnumerateLibraryBase(rxn, new CartesianProductStrategy),
m_bbs(removeNonmatchingReagents(m_rxn, bbs, params)) {
// initialize the strategy against the filtered building blocks
m_enumerator->initialize(m_rxn, m_bbs); // getSizesFromBBs(bbs));
// keep a copy of the initialized strategy so the library can be reset
m_initialEnumerator.reset(m_enumerator->copy());
}
// Builds a library for rxn that enumerates with a copy of the supplied
// strategy; the caller's enumerator object is not modified or retained.
// Building blocks are filtered through removeNonmatchingReagents() first.
EnumerateLibrary::EnumerateLibrary(const ChemicalReaction &rxn, const BBS &bbs,
const EnumerationStrategyBase &enumerator,
const EnumerationParams &params)
: EnumerateLibraryBase(rxn),
m_bbs(removeNonmatchingReagents(m_rxn, bbs, params)) {
// replace the strategy created by the base class with a copy of the
// caller's
m_enumerator.reset(enumerator.copy());
m_enumerator->initialize(m_rxn, m_bbs);
// keep a copy of the initialized strategy so the library can be reset
m_initialEnumerator.reset(m_enumerator->copy());
}
// Copy constructor: copies the base-class state and the vector of building
// blocks (the shared molecule pointers themselves are copied, not the
// molecules).
EnumerateLibrary::EnumerateLibrary(const EnumerateLibrary &rhs)
: EnumerateLibraryBase(rhs), m_bbs(rhs.m_bbs) {}
std::vector<MOL_SPTR_VECT> EnumerateLibrary::next() {
PRECONDITION(static_cast<bool>(*this), "No more enumerations");
const RGROUPS &reactantIndices = m_enumerator->next();
MOL_SPTR_VECT reactants(m_bbs.size());
for (size_t i = 0; i < m_bbs.size(); ++i) {
reactants[i] = m_bbs[i][reactantIndices[i]];
}
return m_rxn.runReactants(reactants);
}
// Serializes this library to ss as a Boost text archive.
// In builds without RDK_USE_BOOST_SERIALIZATION this always fails a
// precondition.
void EnumerateLibrary::toStream(std::ostream &ss) const {
#ifdef RDK_USE_BOOST_SERIALIZATION
boost::archive::text_oarchive ar(ss);
ar << *this;
#else
PRECONDITION(0, "BOOST SERIALIZATION NOT INSTALLED");
#endif
}
// Restores this library from a Boost text archive read from ss.
// In builds without RDK_USE_BOOST_SERIALIZATION this always fails a
// precondition.
void EnumerateLibrary::initFromStream(std::istream &ss) {
#ifdef RDK_USE_BOOST_SERIALIZATION
boost::archive::text_iarchive ar(ss);
ar >> *this;
#else
PRECONDITION(0, "BOOST SERIALIZATION NOT INSTALLED");
#endif
}
// Returns the product of all entries in `sizes`, or
// EnumerationStrategyBase::EnumerationOverflow when that product does not
// stay below the largest boost::uint64_t value. The multiplication is done
// in a multiprecision integer so the intermediate result is not truncated.
boost::uint64_t computeNumProducts(const RGROUPS &sizes) {
  boost::multiprecision::cpp_int total = 1;
  for (RGROUPS::const_iterator it = sizes.begin(); it != sizes.end(); ++it) {
    total *= *it;
  }

  if (total >= std::numeric_limits<boost::uint64_t>::max()) {
    return EnumerationStrategyBase::EnumerationOverflow;
  }
  return total.convert_to<boost::uint64_t>();
}
// Picks one reagent per reactant: entry r of the result is
// bbs[r][rgroups[r]]. Requires bbs and rgroups to have the same length.
MOL_SPTR_VECT getReactantsFromRGroups(const std::vector<MOL_SPTR_VECT> &bbs,
                                      const RGROUPS &rgroups) {
  PRECONDITION(bbs.size() == rgroups.size(),
               "BBS and RGROUPS must have the same # reactants");
  const size_t numReactants = bbs.size();
  MOL_SPTR_VECT reactants(numReactants);
  for (size_t r = 0; r < numReactants; ++r) {
    reactants[r] = bbs[r][rgroups[r]];
  }
  return reactants;
}
// Reports whether this build was compiled with RDK_USE_BOOST_SERIALIZATION,
// i.e. whether toStream()/initFromStream() can work.
bool EnumerateLibraryCanSerialize() {
#ifdef RDK_USE_BOOST_SERIALIZATION
return true;
#else
return false;
#endif
}
}

View File

@@ -0,0 +1,183 @@
//
// Copyright (c) 2015, Novartis Institutes for BioMedical Research Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following
// disclaimer in the documentation and/or other materials provided
// with the distribution.
// * Neither the name of Novartis Institutes for BioMedical Research Inc.
// nor the names of its contributors may be used to endorse or promote
// products derived from this software without specific prior written
// permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
#ifndef RDKIT_ENUMERATE_H
#define RDKIT_ENUMERATE_H
#include "EnumerateBase.h"
namespace RDKit {
//! This is a class for providing enumeration options that control
// how enumerations are performed.
/*!
Option
reagentMaxMatchCount [default INT_MAX]
This specifies how many times the reactant template can match a reagent.
sanePartialProducts [default false]
If true, forces all products of the reagent plus the product templates
to pass chemical sanitization. Note that if the product template itself
does not pass sanitization, then none of the products will.
*/
// Options controlling which reagents are kept for an enumeration.
struct EnumerationParams
{
// Maximum number of times a reactant template may match a reagent;
// defaults to INT_MAX.
int reagentMaxMatchCount;
// When true, reagents whose partial products cannot be sanitized are
// rejected; defaults to false.
bool sanePartialProducts;
// Default options: reagentMaxMatchCount = INT_MAX,
// sanePartialProducts = false.
EnumerationParams() :
reagentMaxMatchCount(INT_MAX), sanePartialProducts(false) {
}
// Member-wise copy.
EnumerationParams(const EnumerationParams &rhs) :
reagentMaxMatchCount(rhs.reagentMaxMatchCount),
sanePartialProducts(rhs.sanePartialProducts) {
}
};
//! Helper function: removes reagents that are incompatible
// with the reaction and returns the filtered building blocks.
// rxn must be sanitized, initialized and preprocessed;
// this happens automatically in EnumerateLibrary.
// \param rxn    the reaction whose reactant templates are matched
// \param bbs    the reagents, one vector per reactant template
// \param params options controlling which reagents are rejected
EnumerationTypes::BBS removeNonmatchingReagents(
const ChemicalReaction &rxn,
EnumerationTypes::BBS bbs,
const EnumerationParams &params=EnumerationParams());
//! This is a class for running reactions on sets of reagents.
/*!
This class is a fully self contained reaction engine that can be
serialized and restarted. For example, a million products can
be generated, the engine can be saved for later and reloaded
to retrieve the next million products.
basic usage will be something like:
\verbatim
ChemicalReaction rxn = ...
BBS bbs(num_rgroups);
... somehow LoadRGroups(bbs[0]);
... somehow LoadRGroups(bbs[1]..);
...
EnumerateLibrary en(rxn, bbs);
while ((bool)en) {
// This is the same as rxn.runReactants( reagents );
std::vector<MOL_SPTR_VECT> products = en.next();
...
}
\endverbatim
In general, reactions will enumerate to more products than desired,
a standard use is:
\verbatim
for(int i=0;i<num_samples && (bool)en; ++i) {
std::vector<MOL_SPTR_VECT> products = en.next();
...
}
\endverbatim
*/
class EnumerateLibrary : public EnumerateLibraryBase {
  // The reagents, one vector of molecules per reactant.
  EnumerationTypes::BBS m_bbs;

 public:
  //! Creates an empty library with no reaction, strategy or reagents
  EnumerateLibrary() : EnumerateLibraryBase(), m_bbs() {}

  //! Restores a library from a serialized string (see Serialize())
  EnumerateLibrary(const std::string &s) : EnumerateLibraryBase(), m_bbs() {
    initFromString(s);
  }

  //! Creates a library for rxn over the given reagents
  EnumerateLibrary(const ChemicalReaction &rxn,
                   const EnumerationTypes::BBS &reagents,
                   const EnumerationParams &params = EnumerationParams());

  //! Creates a library for rxn over the given reagents that enumerates
  //  with a copy of the supplied strategy
  EnumerateLibrary(const ChemicalReaction &rxn,
                   const EnumerationTypes::BBS &reagents,
                   const EnumerationStrategyBase &enumerator,
                   const EnumerationParams &params = EnumerationParams());

  //! Copy constructor
  EnumerateLibrary(const EnumerateLibrary &rhs);

  //! Return the reagents used in the library
  const EnumerationTypes::BBS &getReagents() const { return m_bbs; }

  //! Get the next product set
  std::vector<MOL_SPTR_VECT> next();

  //! serializes (pickles) to a stream
  void toStream(std::ostream &ss) const;

  //! initializes from a stream pickle
  void initFromStream(std::istream &ss);

 private:
#ifdef RDK_USE_BOOST_SERIALIZATION
  friend class boost::serialization::access;

  // Archive layout: base-class state, number of reagent lists, then for
  // each list its length followed by one molecule pickle per reagent.
  template <class Archive>
  void save(Archive &ar, const unsigned int /*version*/) const {
    ar &boost::serialization::base_object<EnumerateLibraryBase>(*this);
    size_t sz = m_bbs.size();
    ar &sz;

    std::string pickle;
    for (size_t i = 0; i < m_bbs.size(); ++i) {
      sz = m_bbs[i].size();
      ar &sz;
      for (size_t j = 0; j < m_bbs[i].size(); ++j) {
        MolPickler::pickleMol(*m_bbs[i][j], pickle);
        ar &pickle;
      }
    }
  }

  // Reads back the layout written by save().
  template <class Archive>
  void load(Archive &ar, const unsigned int /*version*/) {
    ar &boost::serialization::base_object<EnumerateLibraryBase>(*this);

    size_t sz;
    ar &sz;
    m_bbs.resize(sz);

    for (size_t i = 0; i < m_bbs.size(); ++i) {
      ar &sz;
      m_bbs[i].resize(sz);
      std::string pickle;
      for (size_t j = 0; j < m_bbs[i].size(); ++j) {
        ar &pickle;
        RWMol *mol = new RWMol();
        // Hand the molecule to its shared pointer before unpickling so it
        // is released, not leaked, if molFromPickle throws.
        m_bbs[i][j].reset(mol);
        MolPickler::molFromPickle(pickle, *mol);
      }
    }
  }

  BOOST_SERIALIZATION_SPLIT_MEMBER();
#endif
};
//! Returns true when the library was built with serialization support
bool EnumerateLibraryCanSerialize();
}
#endif

View File

@@ -0,0 +1,200 @@
//
// Copyright (c) 2015, Novartis Institutes for BioMedical Research Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following
// disclaimer in the documentation and/or other materials provided
// with the distribution.
// * Neither the name of Novartis Institutes for BioMedical Research Inc.
// nor the names of its contributors may be used to endorse or promote
// products derived from this software without specific prior written
// permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
#ifndef RDKIT_ENUMERATEBASE_H
#define RDKIT_ENUMERATEBASE_H
#include <vector>
#include "EnumerateTypes.h"
#include "../Reaction.h"
#include "EnumerationPickler.h"
#include "EnumerationStrategyBase.h"
#include "CartesianProduct.h"
#include "../ReactionPickler.h"
#include <GraphMol/MolPickler.h>
namespace RDKit {
//! Base class for enumerating chemical reactions from collections of
// building blocks and reagents.
/*!
basic usage:
\verbatim
EnumerateLibraryBase &enumerator;
while (enumerator) {
std::vector<MOL_SPTR_VECT> res = enumerator.next();
// do something with enumeration products here
}
\endverbatim
See Reaction.h for more details on how ChemicalReactions are
used.
*/
class EnumerateLibraryBase {
 protected:
  // The reaction that is run on each reagent combination.
  ChemicalReaction m_rxn;
  // Strategy that produces the next reagent combination.
  boost::shared_ptr<EnumerationStrategyBase> m_enumerator;
  // Copy of the strategy that reset()/resetState() return to.
  boost::shared_ptr<EnumerationStrategyBase> m_initialEnumerator;

 public:
  //! default constructor
  EnumerateLibraryBase() : m_rxn(), m_enumerator(), m_initialEnumerator() {}

  //! construct with a chemical reaction and an enumeration strategy
  //  The library takes ownership of \c enumerator; when it is null a
  //  CartesianProductStrategy is created instead.
  EnumerateLibraryBase(const ChemicalReaction &rxn,
                       EnumerationStrategyBase *enumerator = 0)
      : m_rxn(rxn),
        m_enumerator(enumerator ? enumerator : new CartesianProductStrategy),
        m_initialEnumerator(m_enumerator->copy()) {
    m_rxn.initReactantMatchers();
  }

  //! Copy constructor
  //  The copy's reset point is a snapshot of the strategy as it is at copy
  //  time. Copying a library that has no strategy (e.g. a
  //  default-constructed one) yields a library without a strategy instead of
  //  dereferencing a null pointer.
  EnumerateLibraryBase(const EnumerateLibraryBase &rhs)
      : m_rxn(rhs.m_rxn),
        m_enumerator(rhs.m_enumerator ? rhs.m_enumerator->copy() : 0),
        m_initialEnumerator(m_enumerator ? m_enumerator->copy() : 0) {}

  virtual ~EnumerateLibraryBase() {}

  //! Are there any enumerations left?
  virtual operator bool() const {
    PRECONDITION(m_enumerator.get(), "Null enumeration strategy");
    return static_cast<bool>(*m_enumerator);
  }

  //! reset the enumeration to the beginning.
  //  Does nothing when no initial strategy is stored.
  void reset() {
    if (m_initialEnumerator.get()) {
      m_enumerator.reset(m_initialEnumerator->copy());
    }
  }

  //! returns the underlying chemical reaction
  const ChemicalReaction &getReaction() const { return m_rxn; }

  //! return the current enumeration strategy
  const EnumerationStrategyBase &getEnumerator() {
    PRECONDITION(m_enumerator.get(), "Null Enumerator");
    return *m_enumerator;
  }

  //! get the next set of products (see runReactants for details)
  //  This returns a vector of a vector of molecules.
  //  Each result vector corresponds to a product template.
  //  i.e.
  //     res = library.next();
  //     res[0] are the results for library.getReaction().getProducts()[0]
  virtual std::vector<MOL_SPTR_VECT> next() = 0;

  //! get the next set of products as smiles
  //  This returns a vector of a vector of strings.
  //  Each result vector corresponds to a product template.
  virtual std::vector<std::vector<std::string> > nextSmiles();

  //! Get the current position into the reagent vectors
  //   Use getState/setState to save/restart the enumeration
  //   from this position.
  const EnumerationTypes::RGROUPS &getPosition() const;

  //! Get the current state of the enumerator
  //   This is the position of the enumerator and the enumerator's
  //   state that can be used to restart enumerating
  //   from this position.
  std::string getState() const;

  //! Set the current state of the enumerator
  //   Restart the enumerator from this position.
  void setState(const std::string &);

  //! Reset the enumerator to the beginning
  void resetState();

  //! serializes (pickles) to a stream
  virtual void toStream(std::ostream &ss) const = 0;

  //! returns a string with a serialized (pickled) representation
  virtual std::string Serialize() const {
    std::stringstream ss;
    toStream(ss);
    return ss.str();
  }

  //! initializes from a stream pickle
  virtual void initFromStream(std::istream &ss) = 0;

  //! initializes from a string pickle
  virtual void initFromString(const std::string &text) {
    std::stringstream ss(text);
    initFromStream(ss);
  }

 private:
#ifdef RDK_USE_BOOST_SERIALIZATION
  friend class boost::serialization::access;

  // Archive layout: reaction pickle, current strategy, then the initial
  // strategy as a pickled string.
  template <class Archive>
  void save(Archive &ar, const unsigned int) const {
    std::string pickle;
    ReactionPickler::pickleReaction(m_rxn, pickle);
    ar &pickle;
    ar &m_enumerator;
    // we handle the m_initialEnumerator from a string
    //  for backwards compatibility with an unreleased
    //  version
    EnumerationStrategyPickler::pickle(m_initialEnumerator, pickle);
    ar &pickle;
  }

  // Reads back the layout written by save().
  template <class Archive>
  void load(Archive &ar, const unsigned int /*version*/) {
    std::string pickle;
    ar &pickle;
    ReactionPickler::reactionFromPickle(pickle, m_rxn);

    ar &m_enumerator;
    ar &pickle;
    m_initialEnumerator = EnumerationStrategyPickler::fromPickle(pickle);
  }

  BOOST_SERIALIZATION_SPLIT_MEMBER();
#endif
};
// EnumerateLibraryBase has pure virtual members (next, toStream,
// initFromStream), so declare it abstract to Boost.Serialization.
#ifdef RDK_USE_BOOST_SERIALIZATION
BOOST_SERIALIZATION_ASSUME_ABSTRACT(EnumerateLibraryBase)
#endif
}
#endif

View File

@@ -0,0 +1,58 @@
//
// Copyright (c) 2015, Novartis Institutes for BioMedical Research Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following
// disclaimer in the documentation and/or other materials provided
// with the distribution.
// * Neither the name of Novartis Institutes for BioMedical Research Inc.
// nor the names of its contributors may be used to endorse or promote
// products derived from this software without specific prior written
// permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
#ifndef ENUMERATETYPES_H
#define ENUMERATETYPES_H
#include <GraphMol/RDKitBase.h>
namespace RDKit {
namespace EnumerationTypes {
//! BBS - Helper typedef for holding building blocks for reactions
//! holds vectors of reagents for each reactant in a Reaction
typedef std::vector<MOL_SPTR_VECT> BBS;
//! RGROUPS Helper typedef for indexing into the BBS vector
//! - The indices into the BBS molecule list to create a product
//! Example
//! RGROUPS groups;
//! groups.push_back(10);
//! groups.push_back(5);
//!
//! Will create a product from the following building blocks:
//! MOL_SPTR_VECT building_blocks;
//! building_blocks.push_back( BBS[0][groups[0]] );
//! building_blocks.push_back( BBS[1][groups[1]] );
//! rxn.runReactants( building_blocks );
typedef std::vector<boost::uint64_t> RGROUPS;
}
}
#endif

View File

@@ -0,0 +1,110 @@
//
// Copyright (c) 2015, Novartis Institutes for BioMedical Research Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following
// disclaimer in the documentation and/or other materials provided
// with the distribution.
// * Neither the name of Novartis Institutes for BioMedical Research Inc.
// nor the names of its contributors may be used to endorse or promote
// products derived from this software without specific prior written
// permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
#include "EnumerationPickler.h"
#include "CartesianProduct.h"
#include "RandomSample.h"
#include "RandomSampleAllBBs.h"
#ifdef RDK_USE_BOOST_SERIALIZATION
#include <RDGeneral/BoostStartInclude.h>
#include <boost/archive/text_oarchive.hpp>
#include <boost/archive/text_iarchive.hpp>
#include <boost/serialization/shared_ptr.hpp>
#include <RDGeneral/BoostEndInclude.h>
#endif
namespace RDKit {
std::string GetClass(const EnumerationStrategyBase *en) {
if (dynamic_cast<const CartesianProductStrategy *>(en)) return "-->cartesian";
if (dynamic_cast<const RandomSampleStrategy *>(en)) return "-->random";
if (dynamic_cast<const RandomSampleAllBBsStrategy *>(en))
return "-->randombbs";
return "Unknown!";
}
namespace EnumerationStrategyPickler {
//! Serializes \c enumerator into the stream \c ss as a boost text archive.
//! When RDK_USE_BOOST_SERIALIZATION is not defined this always trips the
//! "BOOST SERIALIZATION NOT INSTALLED" precondition instead.
void pickle(const boost::shared_ptr<EnumerationStrategyBase> &enumerator,
            std::ostream &ss) {
#ifndef RDK_USE_BOOST_SERIALIZATION
  RDUNUSED_PARAM(ss);
  RDUNUSED_PARAM(enumerator);
  PRECONDITION(0, "BOOST SERIALIZATION NOT INSTALLED");
#else
  boost::archive::text_oarchive archive(ss);
  // operator<< and operator& are interchangeable on an output archive
  archive << enumerator;
#endif
}
//! Serializes \c enumerator and stores the resulting text archive in \c s,
//! replacing whatever \c s held before.
//! When RDK_USE_BOOST_SERIALIZATION is not defined this always trips the
//! "BOOST SERIALIZATION NOT INSTALLED" precondition instead.
void pickle(const boost::shared_ptr<EnumerationStrategyBase> &enumerator,
            std::string &s) {
#ifndef RDK_USE_BOOST_SERIALIZATION
  RDUNUSED_PARAM(s);
  RDUNUSED_PARAM(enumerator);
  PRECONDITION(0, "BOOST SERIALIZATION NOT INSTALLED");
#else
  // delegate to the stream overload and hand back the buffered text
  std::stringstream buffer;
  pickle(enumerator, buffer);
  s.assign(buffer.str());
#endif
}
//! Reconstructs an enumeration strategy from a boost text archive read
//! from the stream \c pickle.
/*! \return a shared pointer to the deserialized strategy.

    When RDK_USE_BOOST_SERIALIZATION is not defined this trips the
    "BOOST SERIALIZATION NOT INSTALLED" precondition.  The single return
    statement sits after the preprocessor branches so that every
    configuration has a well-formed return path (the original fell off the
    end of the function in the no-serialization build).
*/
boost::shared_ptr<EnumerationStrategyBase> fromPickle(std::istream &pickle) {
  boost::shared_ptr<EnumerationStrategyBase> enumerator;
#ifdef RDK_USE_BOOST_SERIALIZATION
  boost::archive::text_iarchive ar(pickle);
  ar &enumerator;
#else
  RDUNUSED_PARAM(pickle);
  PRECONDITION(0, "BOOST SERIALIZATION NOT INSTALLED");
#endif
  return enumerator;
}
//! Reconstructs an enumeration strategy from a text archive held in the
//! string \c pickle by forwarding to the stream overload.
//! When RDK_USE_BOOST_SERIALIZATION is not defined this always trips the
//! "BOOST SERIALIZATION NOT INSTALLED" precondition instead.
boost::shared_ptr<EnumerationStrategyBase> fromPickle(
    const std::string &pickle) {
#ifndef RDK_USE_BOOST_SERIALIZATION
  RDUNUSED_PARAM(pickle);
  PRECONDITION(0, "BOOST SERIALIZATION NOT INSTALLED");
  return boost::shared_ptr<EnumerationStrategyBase>();
#else
  std::stringstream source(pickle);
  return fromPickle(source);
#endif
}
}
}

View File

@@ -0,0 +1,56 @@
//
// Copyright (c) 2015, Novartis Institutes for BioMedical Research Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following
// disclaimer in the documentation and/or other materials provided
// with the distribution.
// * Neither the name of Novartis Institutes for BioMedical Research Inc.
// nor the names of its contributors may be used to endorse or promote
// products derived from this software without specific prior written
// permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
#ifndef ENUMERATIONPICKLER_H
#define ENUMERATIONPICKLER_H
#include "EnumerationStrategyBase.h"
namespace RDKit {
namespace EnumerationStrategyPickler {
//! pickles an EnumerationStrategy and adds the results to a stream \c ss
void pickle(const boost::shared_ptr<EnumerationStrategyBase> &enumerator,
std::ostream &ss);
//! pickles an EnumerationStrategy and stores the results in the string \c s
void pickle(const boost::shared_ptr<EnumerationStrategyBase> &enumerator,
std::string &s);
//! constructs an EnumerationStrategy from a pickle read from a stream
//! Since an EnumerationStrategyBase is polymorphic, this must return
//! a shared pointer to the EnumerationStrategyBase
boost::shared_ptr<EnumerationStrategyBase> fromPickle(std::istream &pickle);
//! constructs an EnumerationStrategy from a pickle stored in a string
//! Since an EnumerationStrategyBase is polymorphic, this must return
//! a shared pointer to the EnumerationStrategyBase
boost::shared_ptr<EnumerationStrategyBase> fromPickle(
const std::string &pickle);
}
}
#endif

View File

@@ -0,0 +1,199 @@
//
// Copyright (c) 2015, Novartis Institutes for BioMedical Research Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following
// disclaimer in the documentation and/or other materials provided
// with the distribution.
// * Neither the name of Novartis Institutes for BioMedical Research Inc.
// nor the names of its contributors may be used to endorse or promote
// products derived from this software without specific prior written
// permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
#ifndef ENUMERATION_STRATEGY_H
#define ENUMERATION_STRATEGY_H
#include "EnumerateTypes.h"
#include "../Reaction.h"
#include <vector>
#include <RDGeneral/BoostStartInclude.h>
#include <boost/cstdint.hpp>
#include <boost/multiprecision/cpp_int.hpp>
#include <boost/serialization/assume_abstract.hpp>
#include <boost/serialization/vector.hpp>
#include <boost/serialization/shared_ptr.hpp>
#include <RDGeneral/BoostEndInclude.h>
#include <GraphMol/RDKitBase.h>
namespace RDKit {
//! class for flagging enumeration strategy errors
/*! The message is available through message() and, so that generic
    \c catch(const std::exception &) handlers can report it as well, through
    the standard what() override.
*/
class EnumerationStrategyException : public std::exception {
 public:
  //! \param msg error text; a null pointer is treated as an empty message
  EnumerationStrategyException(const char *msg) : _msg(msg ? msg : "") {}
  //! \param msg error text
  EnumerationStrategyException(const std::string &msg) : _msg(msg) {}
  //! \return the error text (valid for the lifetime of the exception)
  const char *message() const { return _msg.c_str(); }
  //! std::exception interface: same text as message()
  const char *what() const throw() { return _msg.c_str(); }
  ~EnumerationStrategyException() throw() {}

 private:
  std::string _msg;
};
//! Return the number of elements per input vector
/*! \param bbs vector<vector<T> >
    \result vector<uint64_t> holding, for each inner vector of \c bbs and in
            the same order, its number of elements
*/
template <class T>
EnumerationTypes::RGROUPS getSizesFromBBs(
    const std::vector<std::vector<T> > &bbs) {
  typedef typename std::vector<std::vector<T> >::const_iterator GroupIter;
  EnumerationTypes::RGROUPS counts;
  for (GroupIter group = bbs.begin(); group != bbs.end(); ++group) {
    counts.push_back(group->size());
  }
  return counts;
}
//! getSizesFromReactants
//! Helper function for enumeration, bbs are stored in a
//! std::vector< std::vector<boost::shared_ptr<ROMol> > >
//! (presumably returns the number of building blocks per reactant, like
//! getSizesFromBBs -- the definition is not in this header)
//
EnumerationTypes::RGROUPS getSizesFromReactants(const std::vector<MOL_SPTR_VECT> &bbs);
//! getReactantsFromRGroups
//! Helper function for enumeration, bbs are stored in a
//! std::vector< std::vector<boost::shared_ptr<ROMol> > >
//! \c rgroups holds one index per reactant into the matching entry of bbs
//
MOL_SPTR_VECT getReactantsFromRGroups(const std::vector<MOL_SPTR_VECT> &bbs,
const EnumerationTypes::RGROUPS &rgroups);
//! computeNumProducts
//! Returns the number of possible product combinations from
//! the given numbers of building blocks for each rgroup
//! or EnumerationStrategyBase::EnumerationOverflow if the
//! number will not fit into the machine's integer type.
//! n.b. An overflow simply means there are a lot of products
//! not that they cannot be enumerated
boost::uint64_t computeNumProducts(const EnumerationTypes::RGROUPS &sizes);
//! Base Class for enumeration strageties
//! Usage:
//! EnumerationStrategyBase must be initialized with both a reaction
//! and the building block (molecule) vector to be sampled.
//!
//! \verbatim
//! EnumerationStrategyBase &eb = ...
//! if(eb) { // can we get another entry
//! const std::vector<int> &v = eb.next();
//! v[0] // RGroup 0 position
//! v[1] // RGroup 1 position...
//! }
//! \endverbatim
class EnumerationStrategyBase {
protected:
EnumerationTypes::RGROUPS m_permutation; // where are we currently?
EnumerationTypes::RGROUPS m_permutationSizes; // m_permutationSizes num bbs per group
boost::uint64_t m_numPermutations; // total number of permutations for this group
// -1 if > ssize_t::max
public:
static const boost::uint64_t EnumerationOverflow = static_cast<boost::uint64_t>(-1);
EnumerationStrategyBase()
: m_permutation(), m_permutationSizes(), m_numPermutations() {}
virtual ~EnumerationStrategyBase() {}
virtual const char *type() const { return "EnumerationStrategyBase"; }
//! Initialize the enumerator based on the reaction and the
//! supplied building blocks
//! This is the standard API point.
void initialize(const ChemicalReaction &reaction,
const EnumerationTypes::BBS &building_blocks) {
// default initialization, may be overridden (sets the # reactants
// and computes the default # of permutations)
m_permutationSizes = getSizesFromBBs(building_blocks);
m_permutation.resize(m_permutationSizes.size());
m_numPermutations = computeNumProducts(m_permutationSizes);
std::fill(m_permutation.begin(), m_permutation.end(), 0);
initializeStrategy(reaction, building_blocks);
}
// ! Initialize derived class
// ! must exist, EnumerationStrategyBase structures are already initialized
virtual void initializeStrategy(const ChemicalReaction &reaction,
const EnumerationTypes::BBS &building_blocks) = 0;
//! returns true if there are more permutations left
//! random enumerators may always return true...
virtual operator bool() const = 0;
//! The current permutation {r1, r2, ...}
virtual const EnumerationTypes::RGROUPS &next() = 0;
//! copy the enumeration strategy complete with current state
virtual EnumerationStrategyBase *copy() const = 0;
//! The current position in the enumeration
const EnumerationTypes::RGROUPS &getPosition() const { return m_permutation; }
//! a result of EnumerationOverflow indicates that the number of
//! permutations is not computable with the current
//! rdlonglong size.
boost::uint64_t getNumPermutations() const { return m_numPermutations; }
//! Returns how many permutations have been processed by this strategy
virtual boost::uint64_t getPermutationIdx() const = 0;
//! Skip the specified number of permutations (useful for
//! resetting state to a known position)
bool skip(boost::uint64_t skipCount) {
for (boost::uint64_t i = 0; i < skipCount; ++i) next();
return true;
}
protected:
//! Initialize the internal data structures
//! i.e. RGROUPS = {10,40,50};
void internalInitialize(const EnumerationTypes::RGROUPS &rgroups) {
m_permutation.resize(rgroups.size());
m_permutationSizes = rgroups;
m_numPermutations = computeNumProducts(m_permutationSizes);
std::fill(m_permutation.begin(), m_permutation.end(), 0);
}
private:
friend class boost::serialization::access;
template <class Archive>
void serialize(Archive &ar, const unsigned int /*version*/) {
ar &m_permutation;
ar &m_permutationSizes;
ar &m_numPermutations;
}
};
BOOST_SERIALIZATION_ASSUME_ABSTRACT(EnumerationStrategyBase)
}
BOOST_CLASS_VERSION(RDKit::EnumerationStrategyBase, 1)
#endif

View File

@@ -0,0 +1,281 @@
//
// Copyright (c) 2016, Novartis Institutes for BioMedical Research Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following
// disclaimer in the documentation and/or other materials provided
// with the distribution.
// * Neither the name of Novartis Institutes for BioMedical Research Inc.
// nor the names of its contributors may be used to endorse or promote
// products derived from this software without specific prior written
// permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
#include "EvenSamplePairs.h"
#include <boost/format.hpp>
#include <stdint.h>
namespace RDKit {
using namespace EnumerationTypes;
// Based on an implementation from a correspondance with Bernd Rohde.
void EvenSamplePairsStrategy::initializeStrategy(const ChemicalReaction &,
const BBS &bbs) {
size_t npos = bbs.size();
used_count.resize(npos);
std::fill(used_count.begin(), used_count.end(), 0);
var_used.resize(npos);
for (size_t i = 0; i < npos; ++i) {
var_used[i].resize(m_permutationSizes[i]);
std::fill(var_used[i].begin(), var_used[i].end(), 0);
}
boost::uint64_t nmonomers = 0;
for (size_t i = 0; i < bbs.size(); ++i) nmonomers += m_permutationSizes[i];
pair_used.resize(nmonomers);
for (size_t i = 0; i < nmonomers; ++i) {
pair_used[i].resize(nmonomers);
std::fill(pair_used[i].begin(), pair_used[i].end(), 0);
}
pair_counts.resize(npos);
for (size_t i = 0; i < npos; i++) {
pair_counts[i].resize(npos);
std::fill(pair_counts[i].begin(), pair_counts[i].end(), 0);
}
/* Initialize random number generator */
/* Find modulus */
PRECONDITION(m_numPermutations >= 0,
"Number of permutations too large to Evenly sample");
for (M = 1; M < rdcast<size_t>(m_numPermutations); M = 2 * M)
;
/* Set factor */
a = 5;
b = 7;
// control of random number and heuristics
seed = 0;
m_numPermutationsProcessed = 0;
nslack = 0; // increase this to break evenness criteria
rejected_period = 0;
rejected_unique = 0;
rejected_slack_condition = 0;
rejected_bb_sampling_condition = 0;
selected.clear(); // clear the selected (unique) set
}
// Try to add the given encoded seed position into
//  the current set of return groups.  This checks to
//  see if the BBS are evenly sampled as pairs.  If
//  they currently are not, reject the selection.
//  This is fairly suboptimal for large collections
//  of building blocks and may take a while to
//  terminate...
//
// Returns true (and records the seed plus all usage statistics) when the
// selection is accepted, false when one of the evenness checks rejects it.
// Note that decode() overwrites m_permutation even for rejected seeds.
bool EvenSamplePairsStrategy::try_add(size_t seed) {
  const RGROUPS &digits = decode(seed);
  const RGROUPS &rgroups = m_permutationSizes;
  const size_t num_rgroups = m_permutationSizes.size();

  // 1) reject if the chosen building blocks were, in total, already used
  //    more often than the current slack allows
  size_t islack = 0;
  for (size_t i = 0; i < num_rgroups; ++i) {
    islack += var_used[i][digits[i]];
    if (islack > nslack) {
      // add better heuristic here??
      rejected_slack_condition += 1;
      return false;
    }
  }

  // 2) check that building block pairs get evenly sampled.
  //    The offsets are advanced in the loop headers so that they are also
  //    updated when the diagonal (j == i) is skipped; this keeps the
  //    indices identical to the ones used by the bookkeeping loop below.
  //    NOTE(review): only the slack of the last already-used pair survives
  //    the loop (kept as in the original) -- confirm this is intended.
  islack = 0;
  size_t ioffset = 0;
  for (size_t i = 0; i < num_rgroups; ioffset += rgroups[i], ++i) {
    size_t joffset = 0;
    for (size_t j = 0; j < num_rgroups; joffset += rgroups[j], ++j) {
      if (j == i) continue;
      const size_t ii = digits[i] + ioffset;
      const size_t jj = digits[j] + joffset;
      if (pair_used[ii][jj] > 0) {
        double numer = (double)pair_used[ii][jj];
        double denom = sqrt((double)(rgroups[i]) * (double)(rgroups[j]));
        islack = (int)(numer / denom);
      }
    }
  }

  if (islack > nslack) {
    rejected_bb_sampling_condition += 1;
    return false;
  }

  // 3) keep track of bb usage
  for (size_t i = 0; i < num_rgroups; ++i) {
    if (var_used[i][digits[i]] == 0) {
      used_count[i]++;
    }
    var_used[i][digits[i]] += 1;
    if (used_count[i] == rdcast<int64_t>(rgroups[i])) {
      // complete variable scan => initialize
      if (nslack > min_nslack && rgroups[i] > 1)  // cleared slack on i
        nslack = min_nslack;
      used_count[i] = 0;
      for (size_t j = 0; j < rgroups[i]; ++j) {
        // every entry is non-zero here, so the unsigned decrement is safe
        assert(var_used[i][j] > 0);
        var_used[i][j]--;
        if (var_used[i][j] > 0) used_count[i]++;
      }
    }  // end scan
  }

  // 4) keep track of BB Pair usage
  ioffset = 0;
  for (size_t i = 0; i < num_rgroups; ioffset += rgroups[i], ++i) {
    size_t joffset = 0;
    for (size_t j = 0; j < num_rgroups; joffset += rgroups[j], ++j) {
      if (j == i) {
        continue;
      }
      const size_t ii = digits[i] + ioffset;
      const size_t jj = digits[j] + joffset;
      if (pair_used[ii][jj] == 0) {
        pair_counts[i][j]++;
      }
      pair_used[ii][jj]++;
      if (pair_counts[i][j] >= rgroups[i] * rgroups[j]) {  // all pairs visited
        if (nslack > min_nslack && (rgroups[i] > 1 || rgroups[j] > 1)) {
          nslack = min_nslack;
        }
        // start a new round: lower every pair count by one and recount the
        // pairs that are still in use
        pair_counts[i][j] = 0;
        for (size_t bi = 0; bi < rgroups[i]; ++bi) {
          for (size_t bj = 0; bj < rgroups[j]; ++bj) {
            pair_used[ioffset + bi][joffset + bj]--;
            if (pair_used[ioffset + bi][joffset + bj] > 0) {
              pair_counts[i][j]++;
            }
          }
        }
      }
    }
  }

  selected.insert(seed);
  return true;
}
// Advance to the next accepted permutation and return it.
//
// Candidate seeds are produced by  seed = (seed * a + b) % M.  A candidate is
// skipped when it lies outside [0, numPermutations), when it was already
// selected, or when try_add() rejects it.  If a full period of M candidates
// yields nothing, the slack criteria are loosened and the scan is repeated.
//
// Throws EnumerationStrategyException("Ran out of molecules") once every
// permutation has been handed out.
const RGROUPS &EvenSamplePairsStrategy::next() {
  nslack = 0;
  const size_t numPermutations = rdcast<size_t>(m_numPermutations);

  while (m_numPermutationsProcessed < numPermutations) {
    for (size_t l = 0; l < M; ++l) {
      seed = ((seed * a + b) % M);
      // valid seeds are 0 .. numPermutations-1; seed == numPermutations
      // would decode to the same digits as seed 0 and yield a duplicate
      if (seed >= numPermutations) {
        rejected_period += 1;
        continue;
      }
      if (selected.find(seed) != selected.end()) {
        rejected_unique += 1;
        continue;
      }
      if (try_add(seed)) {
        m_numPermutationsProcessed++;
        return decode(seed);
      }
    }
    // a whole period without an acceptable candidate: loosen heuristic
    nslack += 1;
    min_nslack += 1;
  }
  throw EnumerationStrategyException("Ran out of molecules");
}
// Builds a textual report of the sampling state.
//
// BBSTAT section: one line per rgroup with more than one building block:
//   rgroup number (1-based), number of building blocks, processed
//   permutations per building block, then "count|n" entries meaning that
//   n building blocks currently have usage count `count`.
// PAIRSTAT section: the same histogram for every pair of rgroups (i < j)
//   that both have more than one building block.
// The report ends with the four rejection counters.
std::string EvenSamplePairsStrategy::stats() const {
  std::ostringstream report;
  const RGROUPS &nvars = m_permutationSizes;
  const size_t npos = m_permutationSizes.size();

  report << "#BEGIN# BBSTAT\n";
  for (size_t i = 0; i < npos; i++) {
    if (nvars[i] == 1) continue;

    // highest usage count inside this rgroup
    size_t maxcount = 0;
    for (size_t j = 0; j < nvars[i]; j++) {
      if (var_used[i][j] > maxcount) maxcount = var_used[i][j];
    }

    report << boost::format("%lu\t%lu\t%6.2f") % (i + 1) % nvars[i] %
                  ((double)m_numPermutationsProcessed / nvars[i]);

    // histogram: how many building blocks were used exactly `level` times
    for (size_t level = 0; level <= maxcount; level++) {
      size_t n = 0;
      for (size_t j = 0; j < nvars[i]; j++) {
        if (var_used[i][j] == level) n++;
      }
      if (n > 0) report << boost::format("\t%lu|%lu") % level % n;
    }
    report << std::endl;
  }
  report << "#END# BBSTAT\n";

  report << "#BEGIN# PAIRSTAT\n";
  size_t ioffset = 0;
  for (size_t i = 0; i < npos; ioffset += nvars[i], i++) {
    if (nvars[i] == 1) continue;

    // only pairs with j > i are reported; the global offset of rgroup i+1
    // is the offset of rgroup i plus its size
    size_t joffset = ioffset + nvars[i];
    for (size_t j = i + 1; j < npos; joffset += nvars[j], j++) {
      if (nvars[j] == 1) continue;

      // highest usage count among the pairs of rgroups i and j
      size_t maxcount = 0;
      for (size_t ii = 0; ii < nvars[i]; ii++) {
        for (size_t jj = 0; jj < nvars[j]; jj++) {
          const size_t uses = pair_used[ii + ioffset][jj + joffset];
          if (maxcount < uses) maxcount = uses;
        }
      }

      report << boost::format("%lu\t%lu\t%lu\t%lu\t%6.2f") % (i + 1) %
                    (j + 1) % nvars[i] % nvars[j] %
                    ((double)m_numPermutationsProcessed /
                     (nvars[i] * nvars[j]));

      // histogram: how many pairs were used exactly `level` times
      for (size_t level = 0; level <= maxcount; level++) {
        int n = 0;
        for (size_t ii = 0; ii < nvars[i]; ii++) {
          for (size_t jj = 0; jj < nvars[j]; jj++) {
            if (level == pair_used[ii + ioffset][jj + joffset]) n++;
          }
        }
        if (n > 0) report << boost::format("\t%ld|%d") % level % n;
      }
      report << "\n";
    }
  }
  report << "#END# PAIRSTAT\n";

  report << "Rejected Period: " << rejected_period << std::endl;
  report << "Rejected (dupes): " << rejected_unique << std::endl;
  report << "Rejected Slack Conditions: " << rejected_slack_condition
         << std::endl;
  report << "Rejected Pair Sampling: " << rejected_bb_sampling_condition
         << std::endl;
  return report.str();
}
}

View File

@@ -0,0 +1,193 @@
//
// Copyright (c) 2016, Novartis Institutes for BioMedical Research Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following
// disclaimer in the documentation and/or other materials provided
// with the distribution.
// * Neither the name of Novartis Institutes for BioMedical Research Inc.
// nor the names of its contributors may be used to endorse or promote
// products derived from this software without specific prior written
// permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
#ifndef RGROUP_EVEN_SAMPLE_H
#define RGROUP_EVEN_SAMPLE_H
#include "EnumerationStrategyBase.h"
#ifdef RDK_USE_BOOST_SERIALIZATION
#include <boost/serialization/set.hpp>
#endif
#include <stdint.h>
namespace RDKit {
//! EvenSamplePairsStrategy
/*! Randomly sample Pairs evenly from a collection of building blocks
  This is a good strategy for choosing a relatively small selection
  of building blocks from a larger set.  As the amount of work needed
  to retrieve the next evenly sampled building block grows with the
  number of samples, this method performs progressively worse as the
  number of samples gets larger.

  basic usage:

  \verbatim
  std::vector<MOL_SPTR_VECT> bbs;
  bbs.push_back( bbs_for_reactants_1 );
  bbs.push_back( bbs_for_reactants_2 );

  EvenSamplePairsStrategy rgroups;
  rgroups.initialize(rxn, bbs);
  for(size_t i=0; i<num_samples && rgroups; ++i) {
    MOL_SPTR_VECT rvect = getReactantsFromRGroups(bbs, rgroups.next());
    std::vector<MOL_SPTR_VECT> lprops = rxn.RunReactants(rvect);
    ...
  }
  \endverbatim

  See EnumerationStrategyBase for more details.
*/
class EvenSamplePairsStrategy : public EnumerationStrategyBase {
  // number of permutations handed out so far
  boost::uint64_t m_numPermutationsProcessed;

  // bookkeeping tables used by the evenness checks in try_add()
  std::vector<int64_t> used_count;
  std::vector<std::vector<size_t> > var_used;
  std::vector<std::vector<size_t> > pair_used;
  std::vector<std::vector<size_t> > pair_counts;
  // seeds that have already been accepted
  std::set<size_t> selected;

  size_t seed;  // last seed for permutation (starts at 0)
  // random number stuff: next seed is (seed * a + b) % M
  size_t M;
  size_t a;
  size_t b;
  size_t nslack;
  size_t min_nslack;
  // rejection statistics reported by stats()
  size_t rejected_period;
  size_t rejected_unique;
  size_t rejected_slack_condition;
  size_t rejected_bb_sampling_condition;

 public:
  //! Creates an empty strategy with every member value-initialized;
  //! call initialize() before sampling.
  EvenSamplePairsStrategy()
      : EnumerationStrategyBase(),
        m_numPermutationsProcessed(),
        used_count(),
        var_used(),
        pair_used(),
        pair_counts(),
        selected(),
        seed(),
        M(),
        a(),
        b(),
        nslack(),
        min_nslack(),
        rejected_period(),
        rejected_unique(),
        rejected_slack_condition(),
        rejected_bb_sampling_condition() {}

  //! Member-wise copy, including the complete sampling state.
  EvenSamplePairsStrategy(const EvenSamplePairsStrategy &rhs)
      : EnumerationStrategyBase(rhs),
        m_numPermutationsProcessed(rhs.m_numPermutationsProcessed),
        used_count(rhs.used_count),
        var_used(rhs.var_used),
        pair_used(rhs.pair_used),
        pair_counts(rhs.pair_counts),
        selected(rhs.selected),
        seed(rhs.seed),
        M(rhs.M),
        a(rhs.a),
        b(rhs.b),
        nslack(rhs.nslack),
        min_nslack(rhs.min_nslack),
        rejected_period(rhs.rejected_period),
        rejected_unique(rhs.rejected_unique),
        rejected_slack_condition(rhs.rejected_slack_condition),
        rejected_bb_sampling_condition(rhs.rejected_bb_sampling_condition) {}

  //! name of this strategy class
  virtual const char *type() const { return "EvenSamplePairsStrategy"; }

  using EnumerationStrategyBase::initialize;

  //! Resets the sampling state for the given building blocks.
  virtual void initializeStrategy(const ChemicalReaction &,
                                  const EnumerationTypes::BBS &);

  //! Advance to, and return, the next permutation {r1, r2, ...}
  virtual const EnumerationTypes::RGROUPS &next();

  //! Number of permutations handed out so far
  virtual boost::uint64_t getPermutationIdx() const {
    return m_numPermutationsProcessed;
  }

  //! Always reports true.
  virtual operator bool() const { return true; }

  //! Heap-allocated copy including the current state; the caller owns it.
  EnumerationStrategyBase *copy() const {
    return new EvenSamplePairsStrategy(*this);
  }

  //! Human-readable summary of the sampling statistics
  std::string stats() const;

 private:
  friend class boost::serialization::access;

  // decode a packed integer into an RGroup selection.
  // The seed is read as a mixed-radix number whose last rgroup is the least
  // significant digit; the result is written into (and returned as)
  // m_permutation.
  const EnumerationTypes::RGROUPS &decode(size_t seed) {
    for (size_t pos = m_permutationSizes.size(); pos-- > 0;) {
      m_permutation[pos] = seed % m_permutationSizes[pos];
      seed /= m_permutationSizes[pos];
    }
    return m_permutation;
  }

  bool try_add(size_t seed);

 public:
#ifdef RDK_USE_BOOST_SERIALIZATION
  //! boost serialization hook; the member order defines the archive layout
  template <class Archive>
  void serialize(Archive &ar, const unsigned int /*version*/) {
    // invoke serialization of the base class
    ar &boost::serialization::base_object<EnumerationStrategyBase>(*this);
    ar &m_numPermutationsProcessed;
    ar &used_count;
    ar &var_used;
    ar &pair_used;
    ar &pair_counts;
    ar &selected;
    ar &seed;
    ar &M;
    ar &a;
    ar &b;
    ar &nslack;
    ar &min_nslack;
    ar &rejected_period;
    ar &rejected_unique;
    ar &rejected_slack_condition;
    ar &rejected_bb_sampling_condition;
  }
#endif
};
}
BOOST_CLASS_VERSION(RDKit::EvenSamplePairsStrategy, 1)
#endif

View File

@@ -0,0 +1,162 @@
//
// Copyright (c) 2015, Novartis Institutes for BioMedical Research Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following
// disclaimer in the documentation and/or other materials provided
// with the distribution.
// * Neither the name of Novartis Institutes for BioMedical Research Inc.
// nor the names of its contributors may be used to endorse or promote
// products derived from this software without specific prior written
// permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
#ifndef RGROUP_RANDOM_SAMPLE_H
#define RGROUP_RANDOM_SAMPLE_H
#include "EnumerationStrategyBase.h"
#include <boost/random.hpp>
#include <boost/random/uniform_int_distribution.hpp>
#include <sstream>
namespace RDKit {
//! This is a class for fully randomly sampling reagents.
// Note that this enumerator never halts.
/*!
basic usage:
\verbatim
std::vector<MOL_SPTR_VECT> bbs;
bbs.push_back( bbs_for_reactants_1 );
bbs.push_back( bbs_for_reactants_2 );
RandomSampleStrategy rgroups;
rgroups.initialize(rxn, bbs);
for(size_t i=0; i<num_samples && rgroups; ++i) {
MOL_SPTR_VECT rvect = getReactantsFromRGroups(bbs, rgroups.next());
std::vector<MOL_SPTR_VECT> lprops = rxn.RunReactants(rvect);
...
}
\endverbatim
See EnumerationStrategyBase for more details and usage.
*/
class RandomSampleStrategy : public EnumerationStrategyBase {
boost::uint64_t m_numPermutationsProcessed;
boost::minstd_rand m_rng;
std::vector<boost::random::uniform_int_distribution<> > m_distributions;
public:
RandomSampleStrategy()
: EnumerationStrategyBase(),
m_numPermutationsProcessed(),
m_rng(),
m_distributions() {
for (size_t i = 0; i < m_permutation.size(); ++i) {
m_distributions.push_back(
boost::random::uniform_int_distribution<>(0, m_permutation[i] - 1));
}
}
using EnumerationStrategyBase::initialize;
virtual void initializeStrategy(const ChemicalReaction &, const EnumerationTypes::BBS &) {
m_distributions.clear();
for (size_t i = 0; i < m_permutationSizes.size(); ++i) {
m_distributions.push_back(boost::random::uniform_int_distribution<>(
0, m_permutationSizes[i] - 1));
}
m_numPermutationsProcessed = 0;
}
virtual const char *type() const { return "RandomSampleStrategy"; }
//! The current permutation {r1, r2, ...}
virtual const EnumerationTypes::RGROUPS &next() {
for (size_t i = 0; i < m_permutation.size(); ++i) {
m_permutation[i] = m_distributions[i](m_rng);
}
++m_numPermutationsProcessed;
return m_permutation;
}
virtual boost::uint64_t getPermutationIdx() const {
return m_numPermutationsProcessed; }
virtual operator bool() const { return true; }
EnumerationStrategyBase *copy() const {
return new RandomSampleStrategy(*this);
}
private:
#ifdef RDK_USE_BOOST_SERIALIZATION
friend class boost::serialization::access;
template <class Archive>
void save(Archive &ar, const unsigned int /*version*/) const {
// invoke serialization of the base class
ar << boost::serialization::base_object<const EnumerationStrategyBase>(
*this);
ar << m_numPermutationsProcessed;
std::stringstream random;
random << m_rng;
std::string s = random.str();
ar << s;
}
template <class Archive>
void load(Archive &ar, const unsigned int /*version*/) {
// invoke serialization of the base class
ar >> boost::serialization::base_object<EnumerationStrategyBase>(*this);
ar >> m_numPermutationsProcessed;
std::string s;
ar >> s;
std::stringstream random(s);
random >> m_rng;
// reset the uniform distributions
m_distributions.clear();
for (size_t i = 0; i < m_permutationSizes.size(); ++i) {
m_distributions.push_back(boost::random::uniform_int_distribution<>(
0, m_permutationSizes[i] - 1));
}
}
//! Serialization entry point: forwards to save() or load() depending on
//! the archive direction.
template <class Archive>
void serialize(Archive &archive, const unsigned int file_version) {
  boost::serialization::split_member(archive, *this, file_version);
}
#endif
};
}
#ifdef RDK_USE_BOOST_SERIALIZATION
BOOST_CLASS_VERSION(RDKit::RandomSampleStrategy, 1)
#endif
#endif

View File

@@ -0,0 +1,186 @@
//
// Copyright (c) 2015, Novartis Institutes for BioMedical Research Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following
// disclaimer in the documentation and/or other materials provided
// with the distribution.
// * Neither the name of Novartis Institutes for BioMedical Research Inc.
// nor the names of its contributors may be used to endorse or promote
// products derived from this software without specific prior written
// permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
#ifndef RGROUP_RANDOM_SAMPLE_ALLBBS_H
#define RGROUP_RANDOM_SAMPLE_ALLBBS_H
#include "EnumerationStrategyBase.h"
#include <boost/random.hpp>
#include <boost/random/uniform_int_distribution.hpp>
#include <sstream>
namespace RDKit {
//! RandomSampleAllBBsStrategy
//! Randomly sample rgroup indices
//! This is a class for randomly enumerating reagents that ensures all reagents
//! are sampled.
/*!
basic usage:
\verbatim
std::vector<MOL_SPTR_VECT> bbs;
bbs.push_back( bbs_for_reactants_1 );
bbs.push_back( bbs_for_reactants_2 );
RandomSampleAllBBsStrategy rgroups;
rgroups.initialize(rxn, bbs);
for(size_t i=0; i<num_samples && rgroups; ++i) {
MOL_SPTR_VECT rvect = getReactantsFromRGroups(bbs, rgroups.next());
std::vector<MOL_SPTR_VECT> lprops = rxn.RunReactants(rvect);
...
}
\endverbatim
See EnumerationStrategyBase for more details and usage.
*/
class RandomSampleAllBBsStrategy : public EnumerationStrategyBase {
  // number of permutations handed out by next() since initialization
  boost::uint64_t m_numPermutationsProcessed;
  // number of lockstep increments performed since the last random draw
  size_t m_offset;
  // size of the largest reagent group (0 when there are no groups)
  size_t m_maxoffset;
  boost::minstd_rand m_rng;
  // one index distribution over [0, size-1] per reagent group
  std::vector<boost::random::uniform_int_distribution<> > m_distributions;

 public:
  //! Default constructor.
  //! Creates one distribution per entry currently held in m_permutation;
  //! the distributions are rebuilt from the group sizes by
  //! initializeStrategy().
  RandomSampleAllBBsStrategy()
      : EnumerationStrategyBase(),
        m_numPermutationsProcessed(0),
        m_offset(0),
        m_maxoffset(0),
        m_rng(),
        m_distributions() {
    for (size_t i = 0; i < m_permutation.size(); ++i) {
      m_distributions.push_back(
          boost::random::uniform_int_distribution<>(0, m_permutation[i] - 1));
    }
  }

  using EnumerationStrategyBase::initialize;

  //! Resets the sampling state from m_permutationSizes: sizes the current
  //! permutation, records the largest group size and rebuilds the per-group
  //! index distributions.
  virtual void initializeStrategy(const ChemicalReaction &,
                                  const EnumerationTypes::BBS &) {
    m_distributions.clear();
    m_permutation.resize(m_permutationSizes.size());
    m_offset = 0;

    // std::max_element returns end() for an empty range; dereferencing it
    // would be undefined behaviour, so handle "no reagent groups" explicitly.
    m_maxoffset = 0;
    if (!m_permutationSizes.empty()) {
      m_maxoffset = *std::max_element(m_permutationSizes.begin(),
                                      m_permutationSizes.end());
    }

    for (size_t i = 0; i < m_permutationSizes.size(); ++i) {
      m_distributions.push_back(boost::random::uniform_int_distribution<>(
          0, m_permutationSizes[i] - 1));
    }
    m_numPermutationsProcessed = 0;
  }

  //! Name identifying this strategy.
  virtual const char *type() const { return "RandomSampleAllBBsStrategy"; }

  //! The current permutation {r1, r2, ...}
  /*!
    Once m_offset has reached the largest group size a fresh random
    permutation is drawn and the offset restarts at zero; otherwise every
    index is advanced by one (wrapping around its group size), so the calls
    between two random draws step through every member of every group.
  */
  virtual const EnumerationTypes::RGROUPS &next() {
    if (m_offset >= m_maxoffset) {
      for (size_t i = 0; i < m_permutation.size(); ++i) {
        m_permutation[i] = m_distributions[i](m_rng);
      }
      m_offset = 0;
    } else {
      for (size_t i = 0; i < m_permutation.size(); ++i) {
        m_permutation[i] = (m_permutation[i] + 1) % m_permutationSizes[i];
      }
      ++m_offset;
    }
    ++m_numPermutationsProcessed;
    return m_permutation;
  }

  //! Number of permutations returned by next() since initialization.
  virtual boost::uint64_t getPermutationIdx() const {
    return m_numPermutationsProcessed;
  }

  //! Always true: this strategy never reports itself as exhausted.
  virtual operator bool() const { return true; }

  //! Returns a heap-allocated copy of this strategy; the caller owns it.
  EnumerationStrategyBase *copy() const {
    return new RandomSampleAllBBsStrategy(*this);
  }

 private:
#ifdef RDK_USE_BOOST_SERIALIZATION
  friend class boost::serialization::access;

  //! Archives, in order: base class, processed counter, textual RNG state,
  //! m_offset and m_maxoffset.
  template <class Archive>
  void save(Archive &ar, const unsigned int /*version*/) const {
    // invoke serialization of the base class
    ar << boost::serialization::base_object<const EnumerationStrategyBase>(
        *this);
    ar << m_numPermutationsProcessed;
    std::stringstream random;
    random << m_rng;
    std::string s = random.str();
    ar << s;
    ar << m_offset;
    ar << m_maxoffset;
  }

  //! Restores what save() wrote (same order) and rebuilds the index
  //! distributions, which are not archived, from m_permutationSizes.
  template <class Archive>
  void load(Archive &ar, const unsigned int /*version*/) {
    // invoke serialization of the base class
    ar >> boost::serialization::base_object<EnumerationStrategyBase>(*this);
    ar >> m_numPermutationsProcessed;
    std::string s;
    ar >> s;
    std::stringstream random(s);
    random >> m_rng;
    ar >> m_offset;
    ar >> m_maxoffset;
    // reset the uniform distributions
    m_distributions.clear();
    for (size_t i = 0; i < m_permutationSizes.size(); ++i) {
      m_distributions.push_back(boost::random::uniform_int_distribution<>(
          0, m_permutationSizes[i] - 1));
    }
  }

  //! Serialization entry point: dispatches to save() or load().
  template <class Archive>
  void serialize(Archive &ar, const unsigned int file_version) {
    boost::serialization::split_member(ar, *this, file_version);
  }
#endif
};
}
#ifdef RDK_USE_BOOST_SERIALIZATION
BOOST_CLASS_VERSION(RDKit::RandomSampleAllBBsStrategy, 1)
#endif
#endif

View File

@@ -0,0 +1,299 @@
//
// Copyright (c) 2015, Novartis Institutes for BioMedical Research Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following
// disclaimer in the documentation and/or other materials provided
// with the distribution.
// * Neither the name of Novartis Institutes for BioMedical Research Inc.
// nor the names of its contributors may be used to endorse or promote
// products derived from this software without specific prior written
// permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
#include <RDGeneral/utils.h>
#include <GraphMol/RDKitBase.h>
#include <GraphMol/RDKitQueries.h>
#include <GraphMol/SmilesParse/SmilesParse.h>
#include <GraphMol/FileParsers/MolSupplier.h>
#include <GraphMol/ChemReactions/Enumerate/CartesianProduct.h>
#include <GraphMol/ChemReactions/Enumerate/EvenSamplePairs.h>
#include <GraphMol/ChemReactions/Enumerate/RandomSample.h>
#include <GraphMol/ChemReactions/Enumerate/RandomSampleAllBBs.h>
#include <GraphMol/ChemReactions/Enumerate/Enumerate.h>
#include <GraphMol/ChemReactions/ReactionParser.h>
#include <GraphMol/ChemReactions/ReactionUtils.h>
#include <GraphMol/ChemReactions/SanitizeRxn.h>
#ifdef RDK_USE_BOOST_SERIALIZATION
#include <RDGeneral/BoostStartInclude.h>
#include <boost/archive/text_oarchive.hpp>
#include <boost/archive/text_iarchive.hpp>
#include <RDGeneral/BoostEndInclude.h>
#endif
using namespace RDKit;
#ifdef RDK_USE_BOOST_SERIALIZATION
// for each starting point check to see that the archive
// starts at the same point
// Round-trips a copy of the strategy through a text archive before every
// step and checks that the restored strategy, the working copy and the
// caller's strategy all produce the same permutation for `len` steps.
void pickleTest(EnumerationStrategyBase &en, size_t len) {
  typedef boost::shared_ptr<EnumerationStrategyBase> StrategyPtr;

  StrategyPtr base(en.copy());
  TEST_ASSERT(std::string(base->type()) == std::string(en.type()));

  size_t step = 0;
  while (step < len) {
    std::stringstream buffer;
    {
      // write the working copy out
      boost::archive::text_oarchive writer(buffer);
      writer << base;
    }
    StrategyPtr restored;
    {
      // and read it back into a brand new object
      boost::archive::text_iarchive reader(buffer);
      reader >> restored;
    }
    TEST_ASSERT(std::string(base->type()) == std::string(restored->type()));
    TEST_ASSERT(base->next() == restored->next());
    TEST_ASSERT(base->getPosition() == en.next());
    ++step;
  }
}
#endif
void testSamplers() {
EnumerationTypes::BBS bbs;
bbs.resize(3);
for (int i = 0; i < 10; ++i)
bbs[0].push_back(boost::shared_ptr<ROMol>(SmilesToMol("C=CCN=C=S")));
for (int i = 0; i < 5; ++i)
bbs[1].push_back(boost::shared_ptr<ROMol>(SmilesToMol("NCc1ncc(Cl)cc1Br")));
for (int i = 0; i < 6; ++i)
bbs[2].push_back(
boost::shared_ptr<ROMol>(SmilesToMol("NCCCc1ncc(Cl)cc1Br")));
ChemicalReaction rxn;
CartesianProductStrategy cart;
cart.initialize(rxn, bbs);
RandomSampleStrategy rand;
rand.initialize(rxn, bbs);
RandomSampleAllBBsStrategy randBBs;
randBBs.initialize(rxn, bbs);
EvenSamplePairsStrategy even;
even.initialize(rxn, bbs);
std::vector<boost::shared_ptr<EnumerationStrategyBase> > enumerators;
enumerators.push_back(
boost::shared_ptr<EnumerationStrategyBase>(cart.copy()));
enumerators.push_back(
boost::shared_ptr<EnumerationStrategyBase>(rand.copy()));
enumerators.push_back(
boost::shared_ptr<EnumerationStrategyBase>(randBBs.copy()));
enumerators.push_back(
boost::shared_ptr<EnumerationStrategyBase>(even.copy()));
#ifdef RDK_USE_BOOST_SERIALIZATION
for (size_t i = 0; i < enumerators.size(); ++i) {
TEST_ASSERT(enumerators[i]->getNumPermutations() == 10 * 5 * 6);
pickleTest(*enumerators[i], 10 * 5 * 6);
}
#endif
// for(auto&& i: enumerators) {
// TEST_ASSERT(i->getNumPermutations() == 10*5*6);
//}
}
void testEvenSamplers() {
EnumerationTypes::BBS bbs;
bbs.resize(3);
unsigned long R1 = 6000;
unsigned long R2 = 500;
unsigned long R3 = 10000;
for (unsigned long i = 0; i < R1; ++i)
bbs[0].push_back(boost::shared_ptr<ROMol>(SmilesToMol("C=CCN=C=S")));
for (unsigned long i = 0; i < R2; ++i)
bbs[1].push_back(boost::shared_ptr<ROMol>(SmilesToMol("NCc1ncc(Cl)cc1Br")));
for (unsigned long i = 0; i < R3; ++i)
bbs[2].push_back(
boost::shared_ptr<ROMol>(SmilesToMol("NCCCc1ncc(Cl)cc1Br")));
ChemicalReaction rxn;
EvenSamplePairsStrategy even;
even.initialize(rxn, bbs);
std::cout << even.getNumPermutations() << " " << R1 * R2 * R3 << std::endl;
TEST_ASSERT(even.getNumPermutations() == R1 * R2 * R3);
for (size_t i = 0; i < 5000; ++i) {
even.next();
}
even.stats();
}
const char *smiresults[] = {
"C=CCNC(=S)NCc1ncc(Cl)cc1Br", "CC=CCNC(=S)NCc1ncc(Cl)cc1Br",
"C=CCNC(=S)NCCc1ncc(Cl)cc1Br", "CC=CCNC(=S)NCCc1ncc(Cl)cc1Br",
"C=CCNC(=S)NCCCc1ncc(Cl)cc1Br", "CC=CCNC(=S)NCCCc1ncc(Cl)cc1Br"};
// Enumerates a 2 x 3 library and checks the product SMILES against
// smiresults, both on the first pass and again after resetState().  When
// serialization is available it also checks that a randomly sampled
// library survives a pickle round trip and can be reset afterwards.
void testEnumerations() {
  EnumerationTypes::BBS bbs;
  bbs.resize(2);

  bbs[0].push_back(boost::shared_ptr<ROMol>(SmilesToMol("C=CCN=C=S")));
  bbs[0].push_back(boost::shared_ptr<ROMol>(SmilesToMol("CC=CCN=C=S")));

  bbs[1].push_back(boost::shared_ptr<ROMol>(SmilesToMol("NCc1ncc(Cl)cc1Br")));
  bbs[1].push_back(boost::shared_ptr<ROMol>(SmilesToMol("NCCc1ncc(Cl)cc1Br")));
  bbs[1].push_back(boost::shared_ptr<ROMol>(SmilesToMol("NCCCc1ncc(Cl)cc1Br")));

  // Owned by a smart pointer so the reaction is released even when a failing
  // TEST_ASSERT leaves this function early.
  boost::shared_ptr<ChemicalReaction> rxn(RxnSmartsToChemicalReaction(
      "[N;$(N-[#6]):3]=[C;$(C=S):1].[N;$(N[#6]);!$(N=*);!$([N-]);!$(N#*);"
      "!$([ND3]);!$([ND4]);!$(N[O,N]);!$(N[C,S]=[S,O,N]):2]>>[N:3]-[C:1]-[N+0:"
      "2]"));
  TEST_ASSERT(rxn);

  // number of expected products, derived from the reference table itself
  const size_t numExpected = sizeof(smiresults) / sizeof(smiresults[0]);

  {
    EnumerateLibrary en(*rxn, bbs);
    // pass 0: fresh library; pass 1: the same library after resetState()
    for (unsigned int pass = 0; pass < 2; ++pass) {
      if (pass) {
        // tests reset
        en.resetState();
      }
      size_t i = 0;
      for (; (bool)en; ++i) {
        // bound check comes BEFORE smiresults is indexed so that an
        // over-long enumeration fails the test instead of reading past the
        // end of the table
        TEST_ASSERT(i < numExpected);
        std::vector<std::vector<std::string> > res = en.nextSmiles();
        TEST_ASSERT(res.size() == 1);
        TEST_ASSERT(res[0].size() == 1);
        TEST_ASSERT(res[0][0] == smiresults[i]);
      }
      TEST_ASSERT(i == numExpected);
    }
  }

#ifdef RDK_USE_BOOST_SERIALIZATION
  {
    boost::shared_ptr<EnumerateLibrary> en(
        new EnumerateLibrary(*rxn, bbs, RandomSampleStrategy()));

    // reference: the first ten results of the freshly created library
    std::vector<std::vector<std::vector<std::string> > > smir;
    for (size_t j = 0; j < 10; ++j) {
      std::vector<std::vector<std::string> > smiles = en->nextSmiles();
      smir.push_back(smiles);
    }

    en->resetState();

    for (size_t i = 0; i < 1000; ++i) {
      // pickle and unpickle
      std::stringstream ss;
      {
        boost::archive::text_oarchive ar(ss);
        ar &en;
      }
      boost::shared_ptr<EnumerateLibrary> copy;
      {
        boost::archive::text_iarchive ar(ss);
        ar &copy;
      }

      // original and restored library must continue identically
      for (size_t j = 0; j < 10; ++j) {
        TEST_ASSERT(en->nextSmiles() == copy->nextSmiles());
      }

      // and the restored library must reset to the initial state
      copy->resetState();
      for (size_t j = 0; j < 10; ++j) {
        TEST_ASSERT(smir[j] == copy->nextSmiles());
      }
    }
  }
#endif
}
const char *rxndata = "$RXN\nUntitled Document-1\n ChemDraw10291618492D\n\n 3 1\n$MOL\n\n\n\n 2 1 0 0 0 0 0 0 0 0999 V2000\n 0.4125 0.0000 0.0000 N 0 0 0 0 0 0 0 0 0 3 0 0\n -0.4125 0.0000 0.0000 R2 0 0 0 0 0 0 0 0 0 2 0 0\n 1 2 1 0 0\nM END\n$MOL\n\n\n\n 2 1 0 0 0 0 0 0 0 0999 V2000\n -0.4125 0.0000 0.0000 R1 0 0 0 0 0 0 0 0 0 1 0 0\n 0.4125 0.0000 0.0000 Cl 0 0 0 0 0 0 0 0 0 0 0 0\n 1 2 1 0 0\nM END\n$MOL\n\n\n\n 2 1 0 0 0 0 0 0 0 0999 V2000\n 0.4125 0.0000 0.0000 N 0 0 0 0 0 0 0 0 0 5 0 0\n -0.4125 0.0000 0.0000 R4 0 0 0 0 0 0 0 0 0 4 0 0\n 1 2 1 0 0\nM END\n$MOL\n\n\n\n 14 15 0 0 0 0 0 0 0 0999 V2000\n 0.5072 -0.5166 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n 0.5072 0.3084 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n 1.2949 -0.7616 0.0000 N 0 0 0 0 0 0 0 0 0 0 0 0\n 1.7817 -0.0880 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n 1.2967 0.5794 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n 1.5558 -1.5443 0.0000 R1 0 0 0 0 0 0 0 0 0 1 0 0\n -0.2073 0.7208 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n -0.9218 0.3083 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n -0.9217 -0.5167 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n -0.2073 -0.9292 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n -1.6362 0.7208 0.0000 N 0 0 0 0 0 0 0 0 0 3 0 0\n 1.5452 1.3661 0.0000 N 0 0 0 0 0 0 0 0 0 5 0 0\n 2.3507 1.5443 0.0000 R4 0 0 0 0 0 0 0 0 0 4 0 0\n -2.3507 0.3083 0.0000 R2 0 0 0 0 0 0 0 0 0 2 0 0\n 1 2 2 0 0\n 1 3 1 0 0\n 3 4 1 0 0\n 4 5 1 0 0\n 5 2 1 0 0\n 3 6 1 0 0\n 2 7 1 0 0\n 7 8 2 0 0\n 8 9 1 0 0\n 9 10 2 0 0\n 10 1 1 0 0\n 8 11 1 0 0\n 12 13 1 0 0\n 11 14 1 0 0\n 12 5 1 0 0\nM END\n";
void testInsaneEnumerations() {
EnumerationTypes::BBS bbs;
bbs.resize(3);
ChemicalReaction *rxn2 = RxnBlockToChemicalReaction(rxndata);
//RxnOps::sanitizeRxn(*rxn2, MolOps::AdjustQueryParameters());
MatchVectType tvect;
bbs[0].push_back(boost::shared_ptr<ROMol>(SmilesToMol("CCNCC")));
bbs[0].push_back(boost::shared_ptr<ROMol>(SmilesToMol("NCC")));
std::cerr << "0,0 " << (int)SubstructMatch(*bbs[0][0].get(), *rxn2->getReactants()[0].get(), tvect) << std::endl;
std::cerr << "0,1 " << (int)SubstructMatch(*bbs[0][1].get(), *rxn2->getReactants()[0].get(), tvect) << std::endl;
bbs[1].push_back(boost::shared_ptr<ROMol>(SmilesToMol("ClC1CCC1")));
bbs[1].push_back(boost::shared_ptr<ROMol>(SmilesToMol("ClC1CCC1Cl")));
std::cerr << "1,0 " << (int)SubstructMatch(*bbs[1][0].get(), *rxn2->getReactants()[1].get(), tvect) << std::endl;
std::cerr << "1,1 " << (int)SubstructMatch(*bbs[1][1].get(), *rxn2->getReactants()[1].get(), tvect) << std::endl;
bbs[2].push_back(boost::shared_ptr<ROMol>(SmilesToMol("CCNCC")));
bbs[2].push_back(boost::shared_ptr<ROMol>(SmilesToMol("NCC")));
std::cerr << "2,0 " << (int)SubstructMatch(*bbs[2][0].get(), *rxn2->getReactants()[2].get(), tvect) << std::endl;
std::cerr << "2,1 " << (int)SubstructMatch(*bbs[2][1].get(), *rxn2->getReactants()[2].get(), tvect) << std::endl;
{
ChemicalReaction *rxn = RxnBlockToChemicalReaction(rxndata);
RxnOps::sanitizeRxn(*rxn, MolOps::AdjustQueryParameters());
std::cerr << ChemicalReactionToRxnBlock(*rxn) << std::endl;
EnumerationParams ThereCanBeOnlyOne;
ThereCanBeOnlyOne.reagentMaxMatchCount = 1;
EnumerationTypes::BBS bbs2 = removeNonmatchingReagents(
*rxn, bbs,
ThereCanBeOnlyOne);
TEST_ASSERT(bbs2[0].size() == 1);
TEST_ASSERT(bbs2[1].size() == 1);
TEST_ASSERT(bbs2[2].size() == 1);
delete rxn;
}
delete rxn2;
}
// Test driver.  A first argument starting with "-l" is recognised as the
// "long tests" switch, but nothing below consults it at the moment.
int main(int argc, char *argv[]) {
  RDLog::InitLogs();

  const bool doLong = (argc > 1) && (strncmp(argv[1], "-l", 2) == 0);
  (void)doLong;  // parsed for command-line compatibility only

  // The following tests are currently disabled:
  //   testSamplers();
  //   testEvenSamplers();
  //   testEnumerations();
  testInsaneEnumerations();
  return 0;
}

View File

@@ -89,6 +89,7 @@ bool preprocessReaction(ChemicalReaction &rxn,
const std::map<std::string, ROMOL_SPTR> &queries,
const std::string &propName) {
rxn.setImplicitPropertiesFlag(true);
rxn.initReactantMatchers();
if (rxn.validate(numWarnings, numErrors)) {
addRecursiveQueriesToReaction(rxn,

View File

@@ -0,0 +1,391 @@
//
// Copyright (c) 2015, Novartis Institutes for BioMedical Research Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following
// disclaimer in the documentation and/or other materials provided
// with the distribution.
// * Neither the name of Novartis Institutes for BioMedical Research Inc.
// nor the names of its contributors may be used to endorse or promote
// products derived from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
#include "SanitizeRxn.h"
#include <GraphMol/RDKitBase.h>
#include <GraphMol/QueryAtom.h>
namespace RDKit {
namespace RxnOps {
// molAtomMapNumber ==> int
// molFileRLabel ==> unsigned int
namespace {
template<class T>
T getMaxProp(ChemicalReaction &rxn, const std::string &prop) {
T max_atom = (T)0;
for(MOL_SPTR_VECT::iterator it = rxn.beginReactantTemplates();
it != rxn.endReactantTemplates();
++it) {
for (ROMol::AtomIterator atIt = (*it)->beginAtoms();
atIt != (*it)->endAtoms();
++atIt) {
Atom *atom = (*atIt);
T map;
if (atom->getPropIfPresent<T>(prop, map)) {
if (map > max_atom)
max_atom = map;
}
}
}
for(MOL_SPTR_VECT::iterator it = rxn.beginAgentTemplates();
it != rxn.endAgentTemplates();
++it) {
for (ROMol::AtomIterator atIt = (*it)->beginAtoms();
atIt != (*it)->endAtoms();
++atIt) {
Atom *atom = (*atIt);
T map;
if (atom->getPropIfPresent<T>(prop, map)) {
if (map > max_atom)
max_atom = map;
}
}
}
for(MOL_SPTR_VECT::iterator it = rxn.beginProductTemplates();
it != rxn.endProductTemplates();
++it) {
for (ROMol::AtomIterator atIt = (*it)->beginAtoms();
atIt != (*it)->endAtoms();
++atIt) {
Atom *atom = (*atIt);
T map;
if (atom->getPropIfPresent<T>(prop, map)) {
if (map > max_atom)
max_atom = map;
}
}
}
return max_atom;
}
// Snapshot of the labelling information of one template atom: the R-group
// label, atom map number, isotope and dummy label are read once at
// construction and cached in the members below.
struct AtomInfo {
Atom * atom;
// index of the template (within its reactant/product list) owning the atom
unsigned int templateIdx;
// cached _MolFileRLabel property, 0 when the atom has none
unsigned int rlabel;
// cached molAtomMapNumber property, 0 when the atom has none
int atomMap;
// cached isotope of the atom at construction time
int isotope;
// cached dummyLabel property, empty when the atom has none
std::string dummyLabel;
AtomInfo(Atom *at, unsigned int templateIdx) :
atom(at), templateIdx(templateIdx), rlabel(0), atomMap(0),
isotope(at->getIsotope()), dummyLabel() {
// each lookup leaves the member at its default when the property is absent
atom->getPropIfPresent(common_properties::_MolFileRLabel, rlabel);
atom->getPropIfPresent(common_properties::molAtomMapNumber, atomMap);
atom->getPropIfPresent(common_properties::dummyLabel, dummyLabel);
//std::cerr << atom->getIdx() << " : " << atom->getAtomicNum() << " " <<
// " rgroup: " << rlabel << " atomMap " << atomMap << " isotope " << isotope <<
// " label " << dummyLabel <<
// std::endl;
}
// True for a dummy atom (atomic number 0) that had no R-group label when
// this AtomInfo was built.
bool NeedsRLabel() {
return atom->getAtomicNum() == 0 && rlabel == 0;
}
// Best available R-group number from the cached values, in priority order:
// rlabel, isotope, atom map, then the number following the first character
// of the dummy label.  Returns 0 when nothing usable is found (including a
// dummy label whose tail is not an unsigned integer).
unsigned int bestGuessRLabel() {
if (rlabel) return rlabel;
if (isotope) return isotope;
if (atomMap) return atomMap;
if (dummyLabel.size()) {
try {
// skip the leading character and parse the rest as a number
return boost::lexical_cast<unsigned int>(
dummyLabel.substr(1,dummyLabel.size()-1));
} catch (...) {
return 0;
}
}
return 0;
}
// Replaces the atom in its owning molecule with a QueryAtom copy that
// carries the given R-group label, the dummy label "R<rlabel>", the isotope
// (only for 0 < rlabel < 999) and a null query; `atom` is then re-pointed at
// the replacement.
// The parameter shadows the member of the same name: the cached members
// (rlabel, isotope, dummyLabel) are NOT updated here, so bestGuessRLabel()
// keeps returning the value computed from the construction-time snapshot.
// The dynamic_cast requires the owning molecule to be an RWMol.
void setRLabel(unsigned int rlabel) {
PRECONDITION(atom, "Internal error in SanitizeRxn - null atom");
RWMol &mol = dynamic_cast<RWMol&>(atom->getOwningMol());
QueryAtom qatom(*atom);
qatom.setProp(common_properties::_MolFileRLabel, rlabel);
std::string dLabel = "R" + boost::lexical_cast<std::string>(rlabel);
qatom.setProp(common_properties::dummyLabel, dLabel);
if (rlabel > 0 && rlabel < 999) {
qatom.setIsotope(rlabel);
}
qatom.setQuery(makeAtomNullQuery());
unsigned int idx = atom->getIdx();
mol.replaceAtom(idx, &qatom);
// re-fetch the pointer from the molecule after the replacement
atom = mol.getAtomWithIdx(idx);
}
// Sets the molAtomMapNumber property on the atom; the cached atomMap member
// is left unchanged.
void setAtomMap(int map) {
atom->setProp(common_properties::molAtomMapNumber, map);
}
};
// Formats "<error> for reactant idx: <template index> atom: <atom index>".
std::string makeReactantErrorMessage(const std::string &error,
                                     const AtomInfo &at) {
  std::ostringstream message;
  message << error;
  message << " for reactant idx: " << at.templateIdx;
  message << " atom: " << at.atom->getIdx();
  return message.str();
}
// Formats "<error> for product idx: <template index> atom: <atom index>".
std::string makeProductErrorMessage(const std::string &error,
                                    const AtomInfo &at) {
  std::ostringstream message;
  message << error;
  message << " for product idx: " << at.templateIdx;
  message << " atom: " << at.atom->getIdx();
  return message.str();
}
}
// if we have query atoms without rlabels, make proper rlabels if possible
// ensure that every rlabel in the reactant has one in the product
// Gives proper R-group labels to dummy atoms that lack one.
//
// Every dummy atom (atomic number 0) without an R-group label is collected
// from the reactant and product templates.  Each such reactant atom is then
// paired with the first not-yet-used product atom that has the same
// best-guess label and the same atom map number (0 == "no map" on both
// sides is accepted).  Both atoms of a pair receive the label
// max_rlabel + guess, and, when they carried no atom map, the map number
// max_atom_map + guess.
//
// Throws RxnSanitizeException when the numbers of unlabelled reactant and
// product atoms differ, when no label can be deduced for a reactant atom,
// or when a reactant atom has no matching product atom.
void fixRGroups(ChemicalReaction &rxn) {
  std::vector<AtomInfo> reactantAtomsToFix;
  std::vector<AtomInfo> productAtomsToFix;

  unsigned int templateIdx = 0;
  for (MOL_SPTR_VECT::iterator it = rxn.beginReactantTemplates();
       it != rxn.endReactantTemplates(); ++it, ++templateIdx) {
    for (ROMol::AtomIterator atIt = (*it)->beginAtoms();
         atIt != (*it)->endAtoms(); ++atIt) {
      Atom *atom = (*atIt);
      AtomInfo at(atom, templateIdx);
      if (at.NeedsRLabel()) reactantAtomsToFix.push_back(at);
    }
  }

  templateIdx = 0;
  for (MOL_SPTR_VECT::iterator it = rxn.beginProductTemplates();
       it != rxn.endProductTemplates(); ++it, ++templateIdx) {
    for (ROMol::AtomIterator atIt = (*it)->beginAtoms();
         atIt != (*it)->endAtoms(); ++atIt) {
      Atom *atom = (*atIt);
      AtomInfo at(atom, templateIdx);
      if (at.NeedsRLabel()) productAtomsToFix.push_back(at);
    }
  }

  // nothing to repair
  if (reactantAtomsToFix.empty() && productAtomsToFix.empty()) return;

  if (reactantAtomsToFix.size() != productAtomsToFix.size()) {
    std::ostringstream str;
    str << "Mismatched rlabels: " << reactantAtomsToFix.size()
        << " unmapped reactant rlabels, " << productAtomsToFix.size()
        << " unmapped product rlabels";
    throw RxnSanitizeException(str.str());
  }

  // new labels / maps are offset by the current maxima so that they cannot
  // collide with ones already present in the reaction
  unsigned int max_rlabel =
      getMaxProp<unsigned int>(rxn, common_properties::_MolFileRLabel);
  int max_atom_map = getMaxProp<int>(rxn, common_properties::molAtomMapNumber);

  BOOST_FOREACH (AtomInfo &rat, reactantAtomsToFix) {
    // bestGuessRLabel() only reads the values cached when the AtomInfo was
    // built, so it is computed once per reactant atom
    const unsigned int bestGuess = rat.bestGuessRLabel();
    if (!bestGuess) {
      throw RxnSanitizeException(
          makeReactantErrorMessage("Could not deduce RLabel", rat));
    }

    bool found = false;
    BOOST_FOREACH (AtomInfo &pat, productAtomsToFix) {
      if (!pat.atom) continue;  // already paired with an earlier reactant atom
      if (bestGuess != pat.bestGuessRLabel()) continue;
      // if the atomMaps don't match, this is bad, no atomMap is ok(==0)
      if (rat.atomMap != pat.atomMap) continue;

      found = true;
      rat.setRLabel(max_rlabel + bestGuess);
      pat.setRLabel(max_rlabel + bestGuess);
      if (!rat.atomMap) {  // set atom mapping as well
        rat.setAtomMap(max_atom_map + bestGuess);
        pat.setAtomMap(max_atom_map + bestGuess);
      }
      pat.atom = NULL;  // don't match again
      break;
    }

    if (!found) {
      throw RxnSanitizeException(
          makeReactantErrorMessage("Could not find RLabel mapping", rat));
    }
  }
}
// Gives atom map numbers to R-group atoms that lack one.
// Reactant atoms with an R-group label but no atom map receive the map
// number rlabel + (largest atom map currently in the reaction); product
// atoms with the same R-group label then receive the same map number.
// Throws RxnSanitizeException when two unmapped reactant atoms share an
// R-group label, or when a product R-group atom already carries an atom map
// that differs from the one recorded for its label.
void fixAtomMaps(ChemicalReaction &rxn) {
int max_atom_map = getMaxProp<int>(
rxn,
common_properties::molAtomMapNumber);
// rlabel -> atom map number newly assigned on the reactant side
std::map<unsigned int, int> potential_mappings;
unsigned int templateIdx = 0;
for(MOL_SPTR_VECT::iterator it = rxn.beginReactantTemplates();
it != rxn.endReactantTemplates();
++it, ++templateIdx) {
for (ROMol::AtomIterator atIt = (*it)->beginAtoms();
atIt != (*it)->endAtoms();
++atIt) {
Atom *atom = (*atIt);
AtomInfo at(atom, templateIdx);
// only labelled atoms without an atom map need a new map number
if(at.rlabel && !at.atomMap) {
if(potential_mappings.find(at.rlabel) != potential_mappings.end()) {
throw RxnSanitizeException(std::string("Duplicated RLabels"));
}
// offset by the current maximum so the new number is not already in use
int map = potential_mappings[at.rlabel] = rdcast<int>(at.rlabel)+max_atom_map;
at.setAtomMap(map);
}
}
}
if (!potential_mappings.size())
return; // everything is ok!
templateIdx = 0;
for(MOL_SPTR_VECT::iterator it = rxn.beginProductTemplates();
it != rxn.endProductTemplates();
++it, ++templateIdx) {
for (ROMol::AtomIterator atIt = (*it)->beginAtoms();
atIt != (*it)->endAtoms();
++atIt) {
Atom *atom = (*atIt);
AtomInfo at(atom, templateIdx);
if(at.rlabel) {
// NOTE(review): operator[] inserts 0 for a label that was not recorded in
// the reactant loop, so an unmapped product atom with such a label gets its
// atom map set to 0 -- confirm that this is intended.
if(!at.atomMap) {
at.setAtomMap(potential_mappings[at.rlabel]);
} else {
// NOTE(review): a label whose reactant atom was already mapped is not in
// potential_mappings either, so an already-mapped product atom compares
// against 0 here and this branch throws -- verify against callers that mix
// mapped and unmapped R-groups.
if (at.atomMap != potential_mappings[at.rlabel]) {
throw RxnSanitizeException(makeProductErrorMessage(
"RLabel is mapped in product but not in reactant", at));
}
}
}
}
}
}
// might throw mol sanitization exception??? wrap in RxnSanitize?
// Runs the SANITIZE_SETAROMATICITY sanitization step on every reactant
// template; each template is expected to be an RWMol.
void fixReactantTemplateAromaticity(ChemicalReaction &rxn) {
  unsigned int failedOp;
  MOL_SPTR_VECT::iterator tmpl = rxn.beginReactantTemplates();
  while (tmpl != rxn.endReactantTemplates()) {
    RWMol *editable = dynamic_cast<RWMol *>(tmpl->get());
    if (!editable) {
      PRECONDITION(editable, "Oops, not really a RWMol?");
    } else {
      sanitizeMol(*editable, failedOp, MolOps::SANITIZE_SETAROMATICITY);
    }
    ++tmpl;
  }
}
// Calls MolOps::mergeQueryHs (with mergeUnmappedOnly == true) on every
// reactant template; each template is expected to be an RWMol.
void fixHs(ChemicalReaction &rxn) {
  const bool mergeUnmappedOnly = true;
  MOL_SPTR_VECT::iterator tmpl = rxn.beginReactantTemplates();
  while (tmpl != rxn.endReactantTemplates()) {
    RWMol *editable = dynamic_cast<RWMol *>(tmpl->get());
    if (!editable) {
      PRECONDITION(editable, "Oops, not really a RWMol?");
    } else {
      MolOps::mergeQueryHs(*editable, mergeUnmappedOnly);
    }
    ++tmpl;
  }
}
// Applies adjustQueryProperties with `params` to every reactant template.
// Does nothing unless params requests a degree or ring-count adjustment.
void adjustTemplates(ChemicalReaction &rxn,
                     const MolOps::AdjustQueryParameters &params) {
  const bool wantsAdjustment = params.adjustDegree || params.adjustRingCount;
  if (!wantsAdjustment) {
    return;
  }

  MOL_SPTR_VECT::iterator tmpl = rxn.beginReactantTemplates();
  while (tmpl != rxn.endReactantTemplates()) {
    RWMol *editable = dynamic_cast<RWMol *>(tmpl->get());
    if (!editable) {
      PRECONDITION(editable, "Oops, not really a RWMol?");
    } else {
      adjustQueryProperties(*editable, &params);
    }
    ++tmpl;
  }
}
// Runs the sanitization steps selected by the bits of `ops`, in the order
// R-group names, atom maps, reactant adjustment, query-H merging.
// operationsThatFailed is set to the flag of a step just before that step
// runs, so it identifies the step in progress if one of them throws; it is
// SANITIZE_NONE when every requested step completed.
void sanitizeRxn(ChemicalReaction &rxn,
                 unsigned int &operationsThatFailed,
                 unsigned int ops,
                 const MolOps::AdjustQueryParameters &params)
{
  operationsThatFailed = SANITIZE_NONE;

  if ((ops & SANITIZE_RGROUP_NAMES) != 0) {
    operationsThatFailed = SANITIZE_RGROUP_NAMES;
    fixRGroups(rxn);
  }

  if ((ops & SANITIZE_ATOM_MAPS) != 0) {
    operationsThatFailed = SANITIZE_ATOM_MAPS;
    fixAtomMaps(rxn);
  }

  if ((ops & SANITIZE_ADJUST_REACTANTS) != 0) {
    operationsThatFailed = SANITIZE_ADJUST_REACTANTS;
    adjustTemplates(rxn, params);
  }

  if ((ops & SANITIZE_MERGEHS) != 0) {
    operationsThatFailed = SANITIZE_MERGEHS;
    fixHs(rxn);
  }

  // every requested step ran to completion
  operationsThatFailed = SANITIZE_NONE;
}
// Convenience overload: runs every sanitization step (SANITIZE_ALL) and
// discards the failed-operation indicator.
void sanitizeRxn(ChemicalReaction &rxn,
                 const MolOps::AdjustQueryParameters &params) {
  unsigned int failedOperation = 0;
  sanitizeRxn(rxn, failedOperation, SANITIZE_ALL, params);
}
}
}

View File

@@ -0,0 +1,150 @@
//
// Copyright (c) 2015, Novartis Institutes for BioMedical Research Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following
// disclaimer in the documentation and/or other materials provided
// with the distribution.
// * Neither the name of Novartis Institutes for BioMedical Research Inc.
// nor the names of its contributors may be used to endorse or promote
// products derived from this software without specific prior written
// permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
#ifndef RDKIT_SANITIZERXN_H
#define RDKIT_SANITIZERXN_H
#include "Reaction.h"
#include <GraphMol/MolOps.h>
#include <string>
#include <exception>
namespace RDKit {
//! class for flagging sanitization errors
class RxnSanitizeException : public std::exception {
 public:
  //! Builds the exception from a C string message (copied).
  RxnSanitizeException(const char *msg) : _msg(msg) {}
  //! Builds the exception from a std::string message (copied).
  RxnSanitizeException(const std::string &msg) : _msg(msg) {}
  //! The message passed at construction; valid while the exception lives.
  const char *message() const { return _msg.c_str(); }
  //! Same text as message(), exposed through the std::exception interface
  //! so that handlers catching std::exception report the real reason
  //! instead of the implementation's generic text.
  const char *what() const throw() { return _msg.c_str(); }
  ~RxnSanitizeException() throw() {}

 private:
  std::string _msg;
};
namespace RxnOps {
//! Any dummy atom with a map but no RGroup label should be an RGroup
//! in RDKit's view of a reaction.
//! See if these atoms can be salvaged into RGroups.
//! Throws RxnSanitizeException when the unlabelled dummy atoms of the
//! reactants and products cannot be paired up.
void fixRGroups(ChemicalReaction &rxn);
//! If atom maps are not defined on rgroups, attempt to deduce them from the RGroup
//! labels, or add new ones if possible.
//! Throws RxnSanitizeException on duplicated or inconsistently mapped RGroup labels.
void fixAtomMaps(ChemicalReaction &rxn);
//! Adjusts the reactant templates to properly match reagents.
//! Nothing is changed unless \c params enables the degree or the ring-count
//! adjustment.
void adjustTemplates(ChemicalReaction &rxn, const MolOps::AdjustQueryParameters &params);
//! Merges query Hs into the reactant templates if appropriate
void fixHs(ChemicalReaction &rxn);
// Default adjustment parameters for matching reagents:
// no degree or ring-count adjustment, dummies are left as they are,
// aromatization is requested.
inline const MolOps::AdjustQueryParameters DefaultRxnAdjustParams() {
  MolOps::AdjustQueryParameters defaults;

  // degree handling
  defaults.adjustDegree = false;
  defaults.adjustDegreeFlags = MolOps::ADJUST_IGNOREALL;

  // ring-count handling
  defaults.adjustRingCount = false;
  defaults.adjustRingCountFlags = MolOps::ADJUST_IGNOREALL;

  // everything else
  defaults.makeDummiesQueries = false;
  defaults.aromatizeIfPossible = true;

  return defaults;
}
// Default adjustment parameters for ChemDraw style matching of reagents:
// degree adjustment is enabled (ignoring dummies), ring-count adjustment is
// off, dummies are left as they are, aromatization is requested.
inline const MolOps::AdjustQueryParameters ChemDrawRxnAdjustParams() {
  MolOps::AdjustQueryParameters chemDraw;

  // degree handling
  chemDraw.adjustDegree = true;
  chemDraw.adjustDegreeFlags = MolOps::ADJUST_IGNOREDUMMIES;

  // ring-count handling
  chemDraw.adjustRingCount = false;
  chemDraw.adjustRingCountFlags = MolOps::ADJUST_IGNORENONE;

  // everything else
  chemDraw.makeDummiesQueries = false;
  chemDraw.aromatizeIfPossible = true;

  return chemDraw;
}
//! Bit flags selecting the steps carried out by sanitizeRxn(); the same
//! values are used to report the step that was in progress on failure.
typedef enum {
// no operation
SANITIZE_NONE = 0x0,
// label unlabelled dummy atoms as RGroups (fixRGroups)
SANITIZE_RGROUP_NAMES = 0x1,
// give atom maps to RGroup atoms (fixAtomMaps)
SANITIZE_ATOM_MAPS = 0x2,
// adjust the reactant templates (adjustTemplates)
SANITIZE_ADJUST_REACTANTS = 0x4,
// merge query hydrogens (fixHs)
SANITIZE_MERGEHS = 0x8,
// every operation
SANITIZE_ALL = 0xFFFFFFFF
} SanitizeRxnFlags;
//! \brief carries out a collection of tasks for cleaning up a reaction and
//! ensuring that it makes "chemical sense" in the context of RDKit reactions
/*!
   This function calls the following in sequence
     -# RxnOps::fixRGroups()
     -# RxnOps::fixAtomMaps()
     -# RxnOps::adjustTemplates()
     -# RxnOps::fixHs()

   \param rxn : the ChemicalReaction to be cleaned

   \param operationsThatFailed : the first (if any) sanitization operation
                                 that fails is set here.
                                 The values are taken from the
                                 \c SanitizeRxnFlags enum.
                                 On success, the value is
                                 \c SANITIZE_NONE

   \param sanitizeOps : the bits here are used to set which sanitization
                        operations are carried out. The elements of the
                        \c SanitizeRxnFlags enum define the operations.

   \param params : query adjustment parameters; defaults to
                   DefaultRxnAdjustParams(). Presumably forwarded to
                   RxnOps::adjustTemplates() - confirm in the implementation.

   <b>Notes:</b>
    - This attempts to fix known issues with certain reaction drawers.
      HOWEVER, if any flag is returned in operationsThatFailed,
      the validity of the reaction may still be suspect.
    - Aromaticity can be tricky when starting with Kekule structures that
      have query features. Aromaticity works well for non-query rings;
      however, certain structures (substitutions on Kekule rings that should
      really be aromatic) may not have enough information.
*/
void sanitizeRxn(ChemicalReaction &rxn,
unsigned int &operationsThatFailed,
unsigned int sanitizeOps = SANITIZE_ALL,
const MolOps::AdjustQueryParameters &params = DefaultRxnAdjustParams());
//! \overload
//! Variant that takes only the reaction and the adjustment parameters.
void sanitizeRxn(ChemicalReaction &rxn,
const MolOps::AdjustQueryParameters &params = DefaultRxnAdjustParams());
}
}
#endif

View File

@@ -1,9 +1,15 @@
# Python wrapper module for the ChemReactions library.
# Only the current LINK_LIBRARIES list is kept: the superseded list closed the
# call early and left the new one dangling as a stray statement. FilterCatalog
# is listed once, ahead of the libraries it is followed by in the original
# ordering. ${Boost_SERIALIZATION_LIBRARY} goes last and is deliberately
# unquoted so that an empty value expands to no argument at all.
rdkit_python_extension(rdChemReactions
                       Enumerate.cpp
                       rdChemReactions.cpp
                       DEST Chem
                       LINK_LIBRARIES
                       ChemReactions FilterCatalog ChemTransforms Descriptors
                       Fingerprints Subgraphs DataStructs Depictor FileParsers
                       SmilesParse SubstructMatch GraphMol Catalogs RDGeneral
                       RDGeometryLib RDBoost ${Boost_SERIALIZATION_LIBRARY})

# Python-level tests for the wrapper, one per test script.
add_pytest(pyChemReactions
           ${CMAKE_CURRENT_SOURCE_DIR}/testReactionWrapper.py)

add_pytest(pyChemReactionEnumerations
           ${CMAKE_CURRENT_SOURCE_DIR}/testEnumerations.py)

add_pytest(pyChemReactionSanitize
           ${CMAKE_CURRENT_SOURCE_DIR}/testSanitize.py)

View File

@@ -0,0 +1,435 @@
//
// Copyright (c) 2015, Novartis Institutes for BioMedical Research Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following
// disclaimer in the documentation and/or other materials provided
// with the distribution.
// * Neither the name of Novartis Institutes for BioMedical Research Inc.
// nor the names of its contributors may be used to endorse or promote
// products derived from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
#include <boost/python.hpp>
#include <RDBoost/Wrap.h>
#include <GraphMol/ChemReactions/Enumerate/RandomSample.h>
#include <GraphMol/ChemReactions/Enumerate/RandomSampleAllBBs.h>
#include <GraphMol/ChemReactions/Enumerate/EvenSamplePairs.h>
#include <GraphMol/ChemReactions/Enumerate/Enumerate.h>
#include <boost/python/stl_iterator.hpp>
namespace python = boost::python;
namespace RDKit {
template<class T>
std::vector<RDKit::MOL_SPTR_VECT> ConvertToVect(T bbs) {
std::vector<RDKit::MOL_SPTR_VECT> vect;
size_t num_bbs = python::extract<unsigned int>(bbs.attr("__len__")());
vect.resize(num_bbs);
for(size_t i=0; i<num_bbs; ++i) {
unsigned int len1 = python::extract<unsigned int>(bbs[i].attr("__len__")());
RDKit::MOL_SPTR_VECT &reacts = vect[i];
reacts.reserve(len1);
for(unsigned int j=0;j<len1;++j){
RDKit::ROMOL_SPTR mol = python::extract<RDKit::ROMOL_SPTR>(bbs[i][j]);
if(mol)
reacts.push_back(mol);
else {
throw_value_error("reaction called with non molecule reactant");
}
}
}
return vect;
}
//! Truth value of a library for Python: the result of converting the
//! library object itself to bool.
bool EnumerateLibraryBase__nonzero__(RDKit::EnumerateLibraryBase *base) {
  const bool truthValue = static_cast<bool>(*base);
  return truthValue;
}
//! Truth value of an enumeration strategy for Python: the result of
//! converting the strategy object itself to bool.
bool EnumerationStrategyBase__nonzero__(RDKit::EnumerationStrategyBase *base) {
  const bool truthValue = static_cast<bool>(*base);
  return truthValue;
}
//! Returns its argument unchanged; bound as __iter__ so that an object
//! acts as its own iterator.
inline python::object pass_through(python::object const& o) { return o; }
//! Python iterator step for a library.
/*!
  Raises StopIteration when the library converts to false. Otherwise it
  fetches the next products with the GIL released and returns them as a new
  tuple of tuples of molecules (outer index matches the outer vector returned
  by next(), inner index the molecules within it).
*/
PyObject *EnumerateLibraryBase__next__(RDKit::EnumerateLibraryBase *base) {
  const bool exhausted = !static_cast<bool>(*base);
  if (exhausted) {
    PyErr_SetString(PyExc_StopIteration, "Enumerations exhausted");
    boost::python::throw_error_already_set();
  }

  std::vector<RDKit::MOL_SPTR_VECT> products;
  {
    // the GIL is released only for the duration of this scope
    NOGIL releaseGil;
    products = base->next();
  }

  PyObject *outer = PyTuple_New(products.size());
  for (unsigned int row = 0; row < products.size(); ++row) {
    PyObject *inner = PyTuple_New(products[row].size());
    for (unsigned int col = 0; col < products[row].size(); ++col) {
      PyObject *pyMol =
          python::converter::shared_ptr_to_python(products[row][col]);
      // PyTuple_SetItem takes ownership of pyMol
      PyTuple_SetItem(inner, col, pyMol);
    }
    PyTuple_SetItem(outer, row, inner);
  }
  return outer;
}
//! Serializes the library and hands the result to Python as a bytes object
//! (length-delimited, so embedded NUL characters are preserved).
python::object EnumerateLibraryBase_Serialize(const EnumerateLibraryBase &en) {
  const std::string pickled = en.Serialize();
  PyObject *rawBytes =
      PyBytes_FromStringAndSize(pickled.c_str(), pickled.length());
  return python::object(python::handle<>(rawBytes));
}
//! Thin subclass of EnumerateLibrary whose constructors accept the reagents
//! as a Python list or tuple and convert them with ConvertToVect() before
//! forwarding to the corresponding EnumerateLibrary constructor.
class EnumerateLibraryWrap : public RDKit::EnumerateLibrary {
public:
//! default constructor, forwards to the EnumerateLibrary default constructor
EnumerateLibraryWrap() : RDKit::EnumerateLibrary() {}
//! reaction + reagents given as a Python list
EnumerateLibraryWrap(const RDKit::ChemicalReaction &rxn, python::list ob,
const EnumerationParams & params = EnumerationParams()
) :
RDKit::EnumerateLibrary(rxn, ConvertToVect(ob), params) {
}
//! reaction + reagents given as a Python tuple
EnumerateLibraryWrap(const RDKit::ChemicalReaction &rxn, python::tuple ob,
const EnumerationParams & params = EnumerationParams()
) :
RDKit::EnumerateLibrary(rxn, ConvertToVect(ob), params) {
}
//! reaction + reagents (Python list) + explicit enumeration strategy
EnumerateLibraryWrap(const RDKit::ChemicalReaction &rxn, python::list ob,
const EnumerationStrategyBase &enumerator,
const EnumerationParams & params = EnumerationParams()
) :
RDKit::EnumerateLibrary(rxn, ConvertToVect(ob), enumerator, params) {
}
//! reaction + reagents (Python tuple) + explicit enumeration strategy
EnumerateLibraryWrap(const RDKit::ChemicalReaction &rxn, python::tuple ob,
const EnumerationStrategyBase &enumerator,
const EnumerationParams & params = EnumerationParams()) :
RDKit::EnumerateLibrary(rxn, ConvertToVect(ob), enumerator, params) {
}
};
namespace {
//! Copies the items of a Python iterable into a std::vector<T>, using
//! boost::python::stl_input_iterator to walk the iterable.
// NOTE(review): no caller is visible in this part of the file - confirm it
// is still needed.
template< typename T >
inline
std::vector< T > to_std_vector( const python::object& iterable )
{
return std::vector< T >( python::stl_input_iterator< T >( iterable ),
python::stl_input_iterator< T >( ) );
}
}
//! Initializes an enumeration strategy from a reaction and a Python list of
//! reagent lists; exposed to Python as EnumerationStrategyBase.Initialize.
void ToBBS(EnumerationStrategyBase &rgroup, ChemicalReaction &rxn, python::list ob) {
  const std::vector<RDKit::MOL_SPTR_VECT> buildingBlocks = ConvertToVect(ob);
  rgroup.initialize(rxn, buildingBlocks);
}
// Container types registered with boost::python by enumeration_wrapper::wrap()
// (as "VectSizeT", "VectorOfStringVectors" and "VectMolVect" respectively).
typedef std::vector<uint64_t> VectSizeT;
typedef std::vector<std::vector<std::string> > VectStringVect;
typedef std::vector<MOL_SPTR_VECT > VectMolVect;
//! Registers the reaction enumeration classes and helper functions with
//! boost::python. wrap() is meant to be called once, from module init.
struct enumeration_wrapper {
  static void wrap() {
    std::string docString;

    // Expose the std::vector typedefs as indexable Python containers.
    python::class_<VectStringVect>("VectorOfStringVectors")
        .def(python::vector_indexing_suite<VectStringVect, false>());

    python::class_<VectSizeT>("VectSizeT")
        .def(python::vector_indexing_suite<VectSizeT, false>());

    python::class_<VectMolVect>("VectMolVect")
        .def(python::vector_indexing_suite<VectMolVect, false>());

    // ---- EnumerateLibraryBase -------------------------------------------
    // Both the Python 2 (next / __nonzero__) and the Python 3
    // (__next__ / __bool__) protocol names are registered.
    python::class_<RDKit::EnumerateLibraryBase, RDKit::EnumerateLibraryBase *,
                   RDKit::EnumerateLibraryBase &, boost::noncopyable>(
        "EnumerateLibraryBase", python::no_init)
        .def("__nonzero__", &EnumerateLibraryBase__nonzero__)
        .def("__bool__", &EnumerateLibraryBase__nonzero__)
        .def("__iter__", &pass_through)
        .def("next", &EnumerateLibraryBase__next__,
             "Return the next products from the enumeration as a tuple of "
             "tuples of molecules.")
        .def("__next__", &EnumerateLibraryBase__next__,
             "Return the next products from the enumeration as a tuple of "
             "tuples of molecules.")
        .def("nextSmiles", &RDKit::EnumerateLibraryBase::nextSmiles,
             "Return the next smiles string from the enumeration.")
        .def("Serialize", &EnumerateLibraryBase_Serialize,
             "Serialize the library to a binary string.\n"
             "Note that the position in the library is serialized as well.  "
             "Care should\n"
             "be taken when serializing.  See GetState/SetState for position "
             "manipulation.")
        .def("InitFromString", &RDKit::EnumerateLibraryBase::initFromString,
             python::arg("data"),
             "Initialize the library from a binary string")
        .def("GetPosition", &RDKit::EnumerateLibraryBase::getPosition,
             "Returns the current enumeration position into the reagent "
             "vectors",
             python::return_internal_reference<
                 1, python::with_custodian_and_ward_postcall<0, 1> >())
        .def("GetState", &RDKit::EnumerateLibraryBase::getState,
             "Returns the current enumeration state (position) of the "
             "library.\n"
             "This position can be used to restart the library from a known "
             "position")
        .def("SetState", &RDKit::EnumerateLibraryBase::setState,
             python::arg("state"),
             "Sets the enumeration state (position) of the library.")
        .def("ResetState", &RDKit::EnumerateLibraryBase::resetState,
             "Resets the enumeration state (position) of the library to the "
             "start.")
        .def("GetReaction", &RDKit::EnumerateLibraryBase::getReaction,
             "Returns the chemical reaction for this library",
             python::return_internal_reference<
                 1, python::with_custodian_and_ward_postcall<0, 1> >())
        .def("GetEnumerator", &RDKit::EnumerateLibraryBase::getEnumerator,
             "Returns the enumeration strategy for the current library",
             python::return_internal_reference<
                 1, python::with_custodian_and_ward_postcall<0, 1> >());

    // ---- EnumerationParams ----------------------------------------------
    docString =
        "EnumerationParams\n"
        "Controls some aspects of how the enumeration is performed.\n"
        "Options:\n"
        "  reagentMaxMatchCount [ default Infinite ]\n"
        "    This specifies how many times the reactant template can match a "
        "reagent.\n"
        "\n"
        "  sanePartialProducts [default false]\n"
        "    If true, forces all products of the reagent plus the product "
        "templates\n"
        "    pass chemical sanitization.  Note that if the product template "
        "itself\n"
        "    does not pass sanitization, then none of the products will.\n";
    python::class_<RDKit::EnumerationParams, RDKit::EnumerationParams *,
                   RDKit::EnumerationParams &>(
        "EnumerationParams", docString.c_str(), python::init<>())
        .def_readwrite("reagentMaxMatchCount",
                       &RDKit::EnumerationParams::reagentMaxMatchCount)
        .def_readwrite("sanePartialProducts",
                       &RDKit::EnumerationParams::sanePartialProducts);

    // ---- EnumerateLibrary -----------------------------------------------
    // The examples below are written so they can be pasted into Python:
    // loop bodies are indented and every name used is defined.
    docString =
        "EnumerateLibrary\n"
        "This class allows easy enumeration of reactions.  Simply provide a "
        "reaction\n"
        "and a set of reagents and you are off to the races.\n"
        "\n"
        "EnumerateLibrary follows the python enumerator protocol, for "
        "example:\n"
        "\n"
        "  library = EnumerateLibrary(rxn, bbs)\n"
        "  for products in library:\n"
        "     ... do something with the product\n"
        "\n"
        "It is useful to sanitize reactions before hand:\n"
        "\n"
        "  SanitizeRxn(rxn)\n"
        "  library = EnumerateLibrary(rxn, bbs)\n"
        "\n"
        "If ChemDraw style reaction semantics are preferred, you can apply\n"
        "the ChemDraw parameters:\n"
        "\n"
        "  SanitizeRxn(rxn, params=GetChemDrawRxnAdjustParams())\n"
        "\n"
        "For one, this enforces only matching RGroups and assumes all atoms\n"
        "have fully satisfied valences.\n"
        "\n"
        "Each product has the same output as applying a set of reagents to\n"
        "the libraries reaction.\n"
        "\n"
        "This can be a bit confusing as each product can have multiple "
        "molecules\n"
        "generated.  The returned data structure is as follows:\n"
        "\n"
        "   [ [products1], [products2],... ]\n"
        "Where products1 are the molecule products for the reactions first "
        "product\n"
        "template and products2 are the molecule products for the second "
        "product\n"
        "template.  Since each reactant can match more than once, there may "
        "be\n"
        "multiple product molecules for each template.\n"
        "\n"
        "  for result in library:\n"
        "    for results_for_product_template in result:\n"
        "      for mol in results_for_product_template:\n"
        "        Chem.MolToSmiles(mol) # finally have a molecule!\n"
        "\n"
        "For sufficiently large libraries, using this iteration strategy is "
        "not\n"
        "recommended as the library may contain more products than atoms in "
        "the\n"
        "universe.  To help with this, you can supply an enumeration "
        "strategy.\n"
        "The default strategy is a CartesianProductStrategy which "
        "enumerates\n"
        "everything.  RandomSampleStrategy randomly samples the products but\n"
        "this strategy never terminates, however, python supplies "
        "itertools:\n"
        "\n"
        "  import itertools\n"
        "  library = EnumerateLibrary(rxn, bbs, "
        "rdChemReactions.RandomSampleStrategy())\n"
        "  for result in itertools.islice(library, 1000):\n"
        "     # do something with the first 1000 samples\n"
        "\n"
        "  for result in itertools.islice(library, 1000):\n"
        "     # do something with the next 1000 samples\n"
        "\n"
        "Libraries are also serializable, including their current state:\n"
        "\n"
        "  s = library.Serialize()\n"
        "  library2 = EnumerateLibrary()\n"
        "  library2.InitFromString(s)\n"
        "  for result in itertools.islice(library2, 1000):\n"
        "     # do something with the next 1000 samples\n";
    python::class_<EnumerateLibraryWrap, EnumerateLibraryWrap *,
                   EnumerateLibraryWrap &,
                   python::bases<RDKit::EnumerateLibraryBase> >(
        "EnumerateLibrary", docString.c_str(), python::init<>())
        .def(python::init<const RDKit::ChemicalReaction &, python::list,
                          python::optional<const RDKit::EnumerationParams &> >(
            python::args("rxn", "reagents", "params")))
        .def(python::init<const RDKit::ChemicalReaction &, python::tuple,
                          python::optional<const RDKit::EnumerationParams &> >(
            python::args("rxn", "reagents", "params")))
        .def(python::init<const RDKit::ChemicalReaction &, python::list,
                          const RDKit::EnumerationStrategyBase &,
                          python::optional<const RDKit::EnumerationParams &> >(
            python::args("rxn", "reagents", "enumerator", "params")))
        .def(python::init<const RDKit::ChemicalReaction &, python::tuple,
                          const RDKit::EnumerationStrategyBase &,
                          python::optional<const RDKit::EnumerationParams &> >(
            python::args("rxn", "reagents", "enumerator", "params")))
        .def("GetReagents", &RDKit::EnumerateLibrary::getReagents,
             "Return the reagents used in this library.",
             python::return_internal_reference<
                 1, python::with_custodian_and_ward_postcall<0, 1> >());

    // ---- EnumerationStrategyBase ----------------------------------------
    python::class_<RDKit::EnumerationStrategyBase,
                   RDKit::EnumerationStrategyBase *,
                   RDKit::EnumerationStrategyBase &, boost::noncopyable>(
        "EnumerationStrategyBase", python::no_init)
        .def("__nonzero__", &EnumerationStrategyBase__nonzero__)
        .def("__bool__", &EnumerationStrategyBase__nonzero__)
        .def("Type", &EnumerationStrategyBase::type,
             "Returns the enumeration strategy type as a string.")
        .def("Skip", &EnumerationStrategyBase::skip,
             python::args("skipCount"),
             "Skip the next Nth results. note: this may be an expensive "
             "operation\n"
             "depending on the enumeration strategy used. It is recommended "
             "to use\n"
             "the enumerator state to advance to a known position")
        .def("__copy__", python::pure_virtual(&EnumerationStrategyBase::copy),
             python::return_value_policy<python::manage_new_object>())
        .def("GetNumPermutations",
             &EnumerationStrategyBase::getNumPermutations,
             "Returns the total number of results for this enumeration "
             "strategy.\n"
             "Note that some strategies are effectively infinite.")
        .def("GetPosition", &EnumerationStrategyBase::getPosition,
             "Return the current indices into the arrays of reagents",
             python::return_internal_reference<
                 1, python::with_custodian_and_ward_postcall<0, 1> >())
        .def("next", python::pure_virtual(&EnumerationStrategyBase::next),
             "Return the next indices into the arrays of reagents",
             python::return_internal_reference<
                 1, python::with_custodian_and_ward_postcall<0, 1> >())
        .def("__next__", python::pure_virtual(&EnumerationStrategyBase::next),
             "Return the next indices into the arrays of reagents",
             python::return_internal_reference<
                 1, python::with_custodian_and_ward_postcall<0, 1> >())
        .def("Initialize", ToBBS,
             "Initialize the strategy from a reaction and a list of reagent "
             "lists.");

    // ---- concrete strategies --------------------------------------------
    docString =
        "CartesianProductStrategy produces a standard walk through all "
        "possible\n"
        "reagent combinations:\n"
        "\n"
        "(0,0,0), (1,0,0), (2,0,0) ...\n";
    python::class_<RDKit::CartesianProductStrategy,
                   RDKit::CartesianProductStrategy *,
                   RDKit::CartesianProductStrategy &,
                   python::bases<EnumerationStrategyBase> >(
        "CartesianProductStrategy", docString.c_str(), python::init<>())
        .def("__copy__", &RDKit::CartesianProductStrategy::copy,
             python::return_value_policy<python::manage_new_object>());

    docString =
        "RandomSampleStrategy simply randomly samples from the reagent "
        "sets.\n"
        "Note that this strategy never halts and can produce duplicates.";
    python::class_<RDKit::RandomSampleStrategy, RDKit::RandomSampleStrategy *,
                   RDKit::RandomSampleStrategy &,
                   python::bases<EnumerationStrategyBase> >(
        "RandomSampleStrategy", docString.c_str(), python::init<>())
        .def("__copy__", &RDKit::RandomSampleStrategy::copy,
             python::return_value_policy<python::manage_new_object>());

    docString =
        "RandomSampleAllBBsStrategy randomly samples from the reagent sets\n"
        "with the constraint that all building blocks are sampled as early "
        "as possible.\n"
        "Note that this strategy never halts and can produce duplicates.";
    python::class_<RDKit::RandomSampleAllBBsStrategy,
                   RDKit::RandomSampleAllBBsStrategy *,
                   RDKit::RandomSampleAllBBsStrategy &,
                   python::bases<EnumerationStrategyBase> >(
        "RandomSampleAllBBsStrategy", docString.c_str(), python::init<>())
        .def("__copy__", &RDKit::RandomSampleAllBBsStrategy::copy,
             python::return_value_policy<python::manage_new_object>());

    docString =
        "Randomly sample Pairs evenly from a collection of building blocks\n"
        "This is a good strategy for choosing a relatively small selection\n"
        "of building blocks from a larger set.  As the amount of work "
        "needed\n"
        "to retrieve the next evenly sample building block grows with the\n"
        "number of samples, this method performs progressively worse as the\n"
        "number of samples gets larger.\n"
        "See EnumerationStrategyBase for more details.\n";
    python::class_<RDKit::EvenSamplePairsStrategy,
                   RDKit::EvenSamplePairsStrategy *,
                   RDKit::EvenSamplePairsStrategy &,
                   python::bases<EnumerationStrategyBase> >(
        "EvenSamplePairsStrategy", docString.c_str(), python::init<>())
        .def("__copy__", &RDKit::EvenSamplePairsStrategy::copy,
             python::return_value_policy<python::manage_new_object>())
        .def("Stats", &RDKit::EvenSamplePairsStrategy::stats,
             "Return a statistics log of the pairs used in the current "
             "enumeration.");

    python::def("EnumerateLibraryCanSerialize", EnumerateLibraryCanSerialize,
                "Returns True if the EnumerateLibrary is serializable "
                "(requires boost serialization)");
  }
};
}// end of namespace
//! Free-function entry point that registers the enumeration wrappers by
//! calling RDKit::enumeration_wrapper::wrap().
void wrap_enumeration() {
RDKit::enumeration_wrapper::wrap();
}

View File

@@ -36,6 +36,7 @@
#include <GraphMol/ChemReactions/ReactionParser.h>
#include <GraphMol/ChemReactions/ReactionRunner.h>
#include <GraphMol/ChemReactions/PreprocessRxn.h>
#include <GraphMol/ChemReactions/SanitizeRxn.h>
#include <GraphMol/Depictor/DepictUtils.h>
#include <GraphMol/FilterCatalog/FunctionalGroupHierarchy.h>
@@ -328,7 +329,7 @@ python::object AddRecursiveQueriesToReaction(ChemicalReaction &self,
python::object PreprocessReaction(ChemicalReaction &reaction,
python::dict queryDict,
std::string propName) {
// transform dictionary into map
std::map<std::string, ROMOL_SPTR> queries;
unsigned int size = python::extract<unsigned int>(queryDict.keys().attr("__len__")());
@@ -353,11 +354,11 @@ python::object PreprocessReaction(ChemicalReaction &reaction,
reaction.validate(nWarn, nError);
std::vector<
std::vector<std::pair<unsigned int,std::string> > > labels;
if (!nError) {
preprocessReaction(reaction, nWarn, nError, labels, queries, propName);
}
// transform labels into python::tuple(python::tuple(python::tuple))
python::list reactantLabels;
for (unsigned int i = 0; i < labels.size(); ++i) {
@@ -374,6 +375,30 @@ python::object PreprocessReaction(ChemicalReaction &reaction,
python::tuple(reactantLabels));
}
// Integer type used for the sanitizeOps argument of the Python-facing
// sanitizeReaction(): signed int when RDK_32BIT_BUILD is defined, unsigned
// int otherwise.
// NOTE(review): presumably a workaround for integer conversion problems on
// 32-bit builds - confirm before changing.
#ifdef RDK_32BIT_BUILD
typedef int sanitize_ops;
#else
typedef unsigned int sanitize_ops;
#endif
//! Python-facing wrapper around RxnOps::sanitizeRxn().
/*!
  \param rxn         : the reaction to sanitize
  \param sanitizeOps : bit flags selecting the operations to carry out
  \param params      : query adjustment parameters passed through unchanged
  \param catchErrors : when true, any exception thrown by sanitizeRxn() is
                       swallowed; when false it is rethrown to the caller

  \return the value sanitizeRxn() stored in its operationsThatFailed output,
          cast to SanitizeRxnFlags (0 if it was never written)
*/
RxnOps::SanitizeRxnFlags sanitizeReaction(
    ChemicalReaction &rxn,
    sanitize_ops sanitizeOps,
    const MolOps::AdjustQueryParameters &params,
    bool catchErrors) {
  unsigned int failedOps = 0;
  try {
    RxnOps::sanitizeRxn(rxn, failedOps, sanitizeOps, params);
  } catch (...) {
    if (catchErrors) {
      // deliberately suppressed: the caller asked for a status code instead
    } else {
      throw;
    }
  }
  return static_cast<RxnOps::SanitizeRxnFlags>(failedOps);
}
}
void wrap_enumeration();
BOOST_PYTHON_MODULE(rdChemReactions) {
python::scope().attr("__doc__") =
@@ -661,7 +686,7 @@ of the replacements argument.",
"Caution: This is an expert-user function which will change a property (molInversionFlag) of your products.\
This function is called by default using the RXN or SMARTS parser for reactions and should really only be called if reactions have been constructed some other way.\
The function updates the stereochemistry of the product by considering 4 different cases: inversion, retention, removal, and introduction");
python::def(
"ReduceProductToSideChains", RDKit::reduceProductToSideChains,
(python::arg("product"), python::arg("addDummyAtoms") = true),
@@ -669,7 +694,7 @@ of the replacements argument.",
The output is a molecule with attached wildcards indicating where the product was attached.\
The dummy atom has the same reaction-map number as the product atom (if available).",
python::return_value_policy<python::manage_new_object>());
python::def("RemoveMappingNumbersFromReactions",
RDKit::removeMappingNumbersFromReactions,
(python::arg("reaction")),
@@ -774,10 +799,36 @@ Sample Usage:\n\
True\n\
";
python::def("PreprocessReaction", PreprocessReaction,
python::def("PreprocessReaction", RDKit::PreprocessReaction,
(python::arg("reaction"),
python::arg("queries")=python::dict(),
python::arg("propName")=common_properties::molFileValue),
python::arg("propName")=RDKit::common_properties::molFileValue),
docString.c_str());
}
python::enum_<RDKit::RxnOps::SanitizeRxnFlags>("SanitizeFlags")
.value("SANITIZE_NONE", RDKit::RxnOps::SANITIZE_NONE)
.value("SANITIZE_ATOM_MAPS", RDKit::RxnOps::SANITIZE_ATOM_MAPS)
.value("SANITIZE_RGROUP_NAMES", RDKit::RxnOps::SANITIZE_RGROUP_NAMES)
.value("SANITIZE_ADJUST_REACTANTS", RDKit::RxnOps::SANITIZE_ADJUST_REACTANTS)
.value("SANITIZE_MERGEHS", RDKit::RxnOps::SANITIZE_MERGEHS)
.value("SANITIZE_ALL", RDKit::RxnOps::SANITIZE_ALL)
.export_values();
;
python::def("GetDefaultAdjustParams", RDKit::RxnOps::DefaultRxnAdjustParams,
"Returns the default adjustment parameters for reactant templates");
python::def("GetChemDrawRxnAdjustParams", RDKit::RxnOps::ChemDrawRxnAdjustParams,
"Returns the chemdraw style adjustment parameters for reactant templates");
std::string docstring = "feed me";
python::def(
"SanitizeRxn", RDKit::sanitizeReaction,
(python::arg("rxn"), python::arg("sanitizeOps") = rdcast<unsigned int>(RDKit::RxnOps::SANITIZE_ALL),
python::arg("params") = RDKit::RxnOps::DefaultRxnAdjustParams(),
python::arg("catchErrors") = false),
docString.c_str());
wrap_enumeration();
}

View File

@@ -0,0 +1,654 @@
# Copyright (c) 2015, Novartis Institutes for BioMedical Research Inc.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above
# copyright notice, this list of conditions and the following
# disclaimer in the documentation and/or other materials provided
# with the distribution.
# * Neither the name of Novartis Institutes for BioMedical Research Inc.
# nor the names of its contributors may be used to endorse or promote
# products derived from this software without specific prior written
# permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
from __future__ import print_function
import unittest
import os,sys, copy
from rdkit.six.moves import cPickle
from rdkit import rdBase
from rdkit import Chem
from rdkit.Chem import AllChem,rdChemReactions
from rdkit import Geometry
from rdkit import RDConfig
import itertools, time
import numpy as np
def log(s):
  """Send `s` to Chem.LogErrorMsg, prefixed with '== '."""
  message = "== " + s
  Chem.LogErrorMsg(message)
class TestCase(unittest.TestCase) :
def setUp(self):
  """Record the ChemReactions test-data directory as self.dataDir."""
  pathParts = (RDConfig.RDBaseDir, 'Code', 'GraphMol', 'ChemReactions', 'testData')
  self.dataDir = os.path.join(*pathParts)
def testCartesianProduct(self):
  """CartesianProductStrategy must visit every reagent combination exactly once."""
  log("testCartesianProduct")
  rxn = rdChemReactions.ChemicalReaction()
  rgroups = [[Chem.MolFromSmiles("C")] * 10,
             [Chem.MolFromSmiles("N")] * 5,
             [Chem.MolFromSmiles("O")] * 6]

  cartProd = rdChemReactions.CartesianProductStrategy()
  cartProd.Initialize(rxn, rgroups)
  self.assertEqual(cartProd.GetNumPermutations(), 10 * 5 * 6)

  groups = []
  print(cartProd.__bool__())
  # the strategy evaluates to False once it has run out of combinations
  while cartProd:
    groups.append(tuple(cartProd.next()))
  self.assertEqual(len(groups), 10 * 5 * 6)

  # see if we are equal to the Python implementation
  g = list(itertools.product(list(range(10)), list(range(5)), list(range(6))))
  self.assertEqual(set(g), set(groups))

  # the strategy must support copy.copy (__copy__)
  copy.copy(cartProd)
def testRandomSample(self):
  """Smoke test for RandomSampleStrategy: permutation count, sampling and copy."""
  log("testRandomSample")
  rgroups = [[Chem.MolFromSmiles("C")] * 10,
             [Chem.MolFromSmiles("N")] * 5,
             [Chem.MolFromSmiles("O")] * 6]
  rxn = rdChemReactions.ChemicalReaction()

  randProd = rdChemReactions.RandomSampleStrategy()
  randProd.Initialize(rxn, rgroups)
  self.assertEqual(randProd.GetNumPermutations(), 10 * 5 * 6)
  groups = []
  for i in range(10 * 5 * 6):
    groups.append(tuple(randProd.next()))
  # informational only: random sampling may repeat combinations
  print(len(set(groups)), "out of", 10 * 5 * 6)

  randProd = rdChemReactions.RandomSampleStrategy()
  randProd.Initialize(rxn, rgroups)
  self.assertEqual(randProd.GetNumPermutations(), 10 * 5 * 6)
  groups = []
  for i in range(10):
    groups.append(tuple(randProd.next()))

  # report how many distinct reagents of each set were hit in 10 draws
  for i in range(3):
    print(i, len(set([g[i] for g in groups])), "out of", [10, 5, 6][i])

  # the strategy must support copy.copy (__copy__)
  copy.copy(randProd)
def testRandomSampleAllBBs(self):
  """RandomSampleAllBBsStrategy must use every building block within 10 draws."""
  log("testRandomSampleAllBBs")
  rxn = rdChemReactions.ChemicalReaction()
  rgroups = [[Chem.MolFromSmiles("C")] * 10,
             [Chem.MolFromSmiles("N")] * 5,
             [Chem.MolFromSmiles("O")] * 6]

  randProd = rdChemReactions.RandomSampleAllBBsStrategy()
  randProd.Initialize(rxn, rgroups)
  self.assertEqual(randProd.GetNumPermutations(), 10 * 5 * 6)
  groups = []
  for i in range(10 * 5 * 6):
    groups.append(tuple(randProd.next()))
  # informational only: random sampling may repeat combinations
  print(len(set(groups)), "out of", 10 * 5 * 6)

  randProd = rdChemReactions.RandomSampleAllBBsStrategy()
  randProd.Initialize(rxn, rgroups)
  self.assertEqual(randProd.GetNumPermutations(), 10 * 5 * 6)
  groups = []
  for i in range(10):
    groups.append(tuple(randProd.next()))

  # 10 draws is the size of the largest reagent set, so every reagent of
  # every set has to have been used by now
  for i in range(3):
    print(i, len(set([g[i] for g in groups])), "out of", [10, 5, 6][i])
    self.assertEqual(len(set([g[i] for g in groups])), [10, 5, 6][i])

  # the strategy must support copy.copy (__copy__)
  copy.copy(randProd)
def testTimings(self):
  """Print how long each strategy takes to Skip() ten million results."""
  log("testTimings")
  rxn = rdChemReactions.ChemicalReaction()
  rgroups = [[Chem.MolFromSmiles(smiles)] * size
             for smiles, size in (("C", 17000), ("N", 50000), ("O", 4000))]

  strategies = [
    rdChemReactions.CartesianProductStrategy(),
    rdChemReactions.RandomSampleStrategy(),
    rdChemReactions.RandomSampleAllBBsStrategy(),
  ]
  num = 10000000
  for r in strategies:
    r.Initialize(rxn, rgroups)
    started = time.time()
    r.Skip(num)
    finished = time.time()
    print("%s Skipped %s in %s seconds"%(r, num, finished - started))
def testEvenPairsSampling(self):
  """EvenSamplePairsStrategy should use every pair of reagents about equally often.

  For 100, 1000 and 500 samples from three sets of 10 reagents, the median
  number of times a (set i, set j) reagent pair is used is checked for each of
  the three pairings of sets.
  """
  rxn = rdChemReactions.ChemicalReaction()
  rgroups = [[Chem.MolFromSmiles("C")] * 10,
             [Chem.MolFromSmiles("N")] * 10,
             [Chem.MolFromSmiles("O")] * 10]

  def samplePairs(numSamples):
    """Draw numSamples results from a fresh strategy.

    Returns (strategy, pairs01, pairs02, pairs12) where each pairsXY maps an
    index pair to the number of times it was drawn.
    """
    strategy = rdChemReactions.EvenSamplePairsStrategy()
    strategy.Initialize(rxn, rgroups)
    pairs01 = {}
    pairs12 = {}
    pairs02 = {}
    for _ in range(numSamples):
      v = strategy.next()
      p01 = (v[0], v[1])
      p12 = (v[1], v[2])
      p02 = (v[0], v[2])
      # each histogram is keyed by its own pair
      pairs01[p01] = pairs01.get(p01, 0) + 1
      pairs12[p12] = pairs12.get(p12, 0) + 1
      pairs02[p02] = pairs02.get(p02, 0) + 1
    return strategy, pairs01, pairs02, pairs12

  # try 100 samples: each pair should be used roughly once
  strategy, pairs01, pairs02, pairs12 = samplePairs(100)
  self.assertEqual(np.median(list(pairs01.values())), 1.0)
  self.assertEqual(np.median(list(pairs02.values())), 1.0)
  self.assertEqual(np.median(list(pairs12.values())), 1.0)

  # now try 1000: each pair should be used roughly 10 times
  strategy, pairs01, pairs02, pairs12 = samplePairs(1000)
  self.assertTrue(9 <= np.median(list(pairs01.values())) <= 11)
  self.assertTrue(9 <= np.median(list(pairs02.values())) <= 11)
  self.assertTrue(9 <= np.median(list(pairs12.values())) <= 11)

  # now try 500: each pair should be used roughly 5 times
  strategy, pairs01, pairs02, pairs12 = samplePairs(500)
  self.assertTrue(4 <= np.median(list(pairs01.values())) <= 6)
  self.assertTrue(4 <= np.median(list(pairs02.values())) <= 6)
  self.assertTrue(4 <= np.median(list(pairs12.values())) <= 6)

  self.assertTrue("PAIRSTAT" in strategy.Stats())
def testEnumerateLibrary(self):
  # Enumerates a 2 x 3 reagent library with the default strategy and checks
  #  - the reaction and reagents reported back by the enumerator,
  #  - the product SMILES and the position reported at every step,
  #  - (when serialization is available) a round-tripped enumerator, one
  #    restored from the stored "enumeration.pickle" file, and one restored
  #    from a state saved part-way through the enumeration,
  #  - the nextSmiles() interface.
  log("testEnumerateLibrary")
  smirks_thiourea = "[N;$(N-[#6]):3]=[C;$(C=S):1].[N;$(N[#6]);!$(N=*);!$([N-]);!$(N#*);!$([ND3]);!$([ND4]);!$(N[O,N]);!$(N[C,S]=[S,O,N]):2]>>[N:3]-[C:1]-[N+0:2]"
  rxn = rdChemReactions.ReactionFromSmarts(smirks_thiourea)
  reagents = [
    [Chem.MolFromSmiles('C=CCN=C=S'), Chem.MolFromSmiles('CC=CCN=C=S')],
    [Chem.MolFromSmiles('NCc1ncc(Cl)cc1Br'),
     Chem.MolFromSmiles('NCCc1ncc(Cl)cc1Br'),
     Chem.MolFromSmiles('NCCCc1ncc(Cl)cc1Br'),
    ]
  ]

  enumerator = rdChemReactions.EnumerateLibrary(rxn, reagents)
  self.assertTrue(enumerator)

  # need to initialize the reaction before getting the binary serialization
  rxn.Initialize()
  self.assertEqual(rxn.ToBinary(), enumerator.GetReaction().ToBinary())

  # the enumerator must hand back the same building blocks it was given
  bbs = enumerator.GetReagents()
  for i in range(len(bbs)):
    for j in range(len(bbs[i])):
      self.assertEqual(Chem.MolToSmiles(reagents[i][j]), Chem.MolToSmiles(bbs[i][j]))

  smiresults = ['C=CCNC(=S)NCc1ncc(Cl)cc1Br',
                'CC=CCNC(=S)NCc1ncc(Cl)cc1Br',
                'C=CCNC(=S)NCCc1ncc(Cl)cc1Br',
                'CC=CCNC(=S)NCCc1ncc(Cl)cc1Br',
                'C=CCNC(=S)NCCCc1ncc(Cl)cc1Br',
                'CC=CCNC(=S)NCCCc1ncc(Cl)cc1Br']
  results = [Chem.MolToSmiles(Chem.MolFromSmiles(smi)) for smi in smiresults]
  enumerators = [enumerator]

  # add serialized enumerators as well for testing if possible
  if rdChemReactions.EnumerateLibraryCanSerialize():
    pickle = enumerator.Serialize()
    enumerator2 = rdChemReactions.EnumerateLibrary()
    enumerator2.InitFromString(pickle)

    # make sure old pickles work; the file is closed as soon as it is read
    enumerator3 = rdChemReactions.EnumerateLibrary()
    with open(os.path.join(self.dataDir, "enumeration.pickle"), 'rb') as inf:
      enumerator3.InitFromString(inf.read())

    print("==", enumerator.GetEnumerator().Type(), enumerator2.GetEnumerator().Type())
    self.assertEqual(enumerator.GetEnumerator().Type(), enumerator2.GetEnumerator().Type())
    enumerators.append(enumerator2)
    enumerators.append(enumerator3)

  # check for fully sampled and deterministic ordering in final index values
  expected_positions = [[0, 0], [1, 0], [0, 1], [1, 1], [0, 2], [1, 2]]

  out = []
  for en in enumerators:
    # reset so an enumerator that yields nothing fails the count check below
    # instead of inheriting the index left over from the previous enumerator
    i = 0
    positions = []
    for i, prods in enumerate(en):
      positions.append(list(en.GetPosition()))
      for mols in prods:
        self.assertEqual(len(mols), 1)
        smi = Chem.MolToSmiles(mols[0])
        if en is enumerator:
          out.append(smi)
        self.assertEqual(smi, results[i])

      if en is enumerator and i == 1 and rdChemReactions.EnumerateLibraryCanSerialize():
        # save the state not at the start
        pickle_at_2 = enumerator.Serialize()
    self.assertEqual(i, 5)
    self.assertEqual(positions, expected_positions)

  if rdChemReactions.EnumerateLibraryCanSerialize():
    # see if we can restore the enumeration from the middle
    out3 = []
    enumerator3 = rdChemReactions.EnumerateLibrary()
    enumerator3.InitFromString(pickle_at_2)
    for prods in enumerator3:
      for mols in prods:
        self.assertEqual(len(mols), 1)
        smi = Chem.MolToSmiles(mols[0])
        out3.append(smi)
    self.assertEqual(out[2:], out3)

  # test smiles interface
  enumerator = rdChemReactions.EnumerateLibrary(rxn, reagents)
  i = 0
  while enumerator:
    for mols in enumerator.nextSmiles():
      self.assertEqual(len(mols), 1)
      self.assertEqual(mols[0], results[i])
    i += 1
  self.assertEqual(i, 6)
def testRandomEnumerateLibrary(self):
  # Samples a 2 x 3 reagent library with RandomSampleStrategy and checks that
  #  - every expected product is eventually produced,
  #  - (when serialization is available) a round-tripped enumerator yields
  #    the same products as the original, and an enumerator restored from a
  #    state saved after two samples continues with the same products.
  log("testRandomEnumerateLibrary")
  smirks_thiourea = "[N;$(N-[#6]):3]=[C;$(C=S):1].[N;$(N[#6]);!$(N=*);!$([N-]);!$(N#*);!$([ND3]);!$([ND4]);!$(N[O,N]);!$(N[C,S]=[S,O,N]):2]>>[N:3]-[C:1]-[N+0:2]"
  rxn = rdChemReactions.ReactionFromSmarts(smirks_thiourea)
  reagents = [
    [Chem.MolFromSmiles('C=CCN=C=S'), Chem.MolFromSmiles('CC=CCN=C=S')],
    [Chem.MolFromSmiles('NCc1ncc(Cl)cc1Br'),
     Chem.MolFromSmiles('NCCc1ncc(Cl)cc1Br'),
     Chem.MolFromSmiles('NCCCc1ncc(Cl)cc1Br'),
    ]
  ]

  enumerator = rdChemReactions.EnumerateLibrary(rxn, reagents,
                                                rdChemReactions.RandomSampleStrategy())
  self.assertTrue(enumerator)

  smiresults = ['C=CCNC(=S)NCc1ncc(Cl)cc1Br',
                'CC=CCNC(=S)NCc1ncc(Cl)cc1Br',
                'C=CCNC(=S)NCCc1ncc(Cl)cc1Br',
                'CC=CCNC(=S)NCCc1ncc(Cl)cc1Br',
                'C=CCNC(=S)NCCCc1ncc(Cl)cc1Br',
                'CC=CCNC(=S)NCCCc1ncc(Cl)cc1Br']
  results = [Chem.MolToSmiles(Chem.MolFromSmiles(smi)) for smi in smiresults]

  enumerator = rdChemReactions.EnumerateLibrary(rxn, reagents,
                                                rdChemReactions.RandomSampleStrategy())
  iteren = iter(enumerator)
  res = set()

  # keep sampling until every expected product has been seen
  count = 0
  while res != set(results):
    count += 1
    if count > 100000:
      print("Unable to find enumerate set with 100,000 random samples!", file=sys.stderr)
      # res differs from the expected set here, so this fails the test
      self.assertEqual(res, set(results))
    prod = iteren.next()
    for mols in prod:
      smi1 = Chem.MolToSmiles(mols[0])
      res.add(smi1)

  if rdChemReactions.EnumerateLibraryCanSerialize():
    enumerator = rdChemReactions.EnumerateLibrary(rxn, reagents,
                                                  rdChemReactions.RandomSampleStrategy())
    pickle = enumerator.Serialize()
    enumerator2 = rdChemReactions.EnumerateLibrary()
    enumerator2.InitFromString(pickle)
    self.assertEqual(enumerator.GetEnumerator().Type(), enumerator2.GetEnumerator().Type())

    # the original and the round-tripped copy must produce the same samples
    iteren = iter(enumerator)
    iteren2 = iter(enumerator2)
    outsmiles = []
    for i in range(10):
      prods1 = iteren.next()
      prods2 = iteren2.next()
      self.assertEqual(len(prods1), len(prods2))
      for mols1, mols2 in zip(prods1, prods2):
        self.assertEqual(len(mols1), 1)
        smi1 = Chem.MolToSmiles(mols1[0])
        self.assertEqual(smi1, Chem.MolToSmiles(mols2[0]))
        outsmiles.append(smi1)

      if i == 1:
        pickle_at_2 = enumerator.Serialize()

    # make sure we can pickle the state as well
    enumerator3 = rdChemReactions.EnumerateLibrary()
    enumerator3.InitFromString(pickle_at_2)
    iteren3 = iter(enumerator3)
    outsmiles2 = []
    for i in range(8):
      prods3 = iteren3.next()
      for mols3 in prods3:
        self.assertEqual(len(mols3), 1)
        # the restored products are checked against the original run below
        outsmiles2.append(Chem.MolToSmiles(mols3[0]))

    self.assertEqual(outsmiles2, outsmiles[2:])
def testRandomEnumerateAllBBsLibrary(self):
  # Samples a 2 x 3 reagent library with RandomSampleAllBBsStrategy and checks
  #  - that the positions reported over the first samples cover every
  #    building block of each reagent,
  #  - (when serialization is available) that a round-tripped enumerator and
  #    one restored from a state saved after two samples reproduce the
  #    products of the original.
  log("testRandomEnumerateAllBBsLibrary")
  smirks_thiourea = "[N;$(N-[#6]):3]=[C;$(C=S):1].[N;$(N[#6]);!$(N=*);!$([N-]);!$(N#*);!$([ND3]);!$([ND4]);!$(N[O,N]);!$(N[C,S]=[S,O,N]):2]>>[N:3]-[C:1]-[N+0:2]"
  rxn = rdChemReactions.ReactionFromSmarts(smirks_thiourea)
  reagents = [
    [Chem.MolFromSmiles('C=CCN=C=S'), Chem.MolFromSmiles('CC=CCN=C=S')],
    [Chem.MolFromSmiles('NCc1ncc(Cl)cc1Br'),
     Chem.MolFromSmiles('NCCc1ncc(Cl)cc1Br'),
     Chem.MolFromSmiles('NCCCc1ncc(Cl)cc1Br'),
    ]
  ]

  enumerator = rdChemReactions.EnumerateLibrary(rxn, reagents,
                                                rdChemReactions.RandomSampleAllBBsStrategy())
  self.assertTrue(enumerator)

  # test the BB sampling here
  strategy = iter(enumerator)
  r1 = set()
  r2 = set()
  for nsamples in (1, 2, 3):
    strategy.next()
    groups = strategy.GetPosition()
    print("**", list(groups), file=sys.stderr)
    r1.add(groups[0])
    r2.add(groups[1])
    if nsamples == 2:
      # both building blocks of the first reagent are seen after two samples
      self.assertEqual(r1, set([0, 1]))
  # all three building blocks of the second reagent are seen after three samples
  self.assertEqual(r2, set([0, 1, 2]))

  if rdChemReactions.EnumerateLibraryCanSerialize():
    enumerator = rdChemReactions.EnumerateLibrary(rxn, reagents,
                                                  rdChemReactions.RandomSampleAllBBsStrategy())
    self.assertTrue(enumerator)

    pickle = enumerator.Serialize()
    enumerator2 = rdChemReactions.EnumerateLibrary()
    enumerator2.InitFromString(pickle)
    self.assertEqual(enumerator.GetEnumerator().Type(), enumerator2.GetEnumerator().Type())

    # the original and the round-tripped copy must produce the same samples
    iteren = iter(enumerator)
    iteren2 = iter(enumerator2)
    outsmiles = []
    for i in range(10):
      prods1 = iteren.next()
      prods2 = iteren2.next()
      self.assertEqual(len(prods1), len(prods2))
      for mols1, mols2 in zip(prods1, prods2):
        self.assertEqual(len(mols1), 1)
        smi1 = Chem.MolToSmiles(mols1[0])
        self.assertEqual(smi1, Chem.MolToSmiles(mols2[0]))
        outsmiles.append(smi1)

      if i == 1:
        pickle_at_2 = enumerator.Serialize()

    # make sure we can pickle the state as well
    enumerator3 = rdChemReactions.EnumerateLibrary()
    enumerator3.InitFromString(pickle_at_2)
    self.assertEqual(enumerator.GetEnumerator().Type(), enumerator3.GetEnumerator().Type())

    iteren3 = iter(enumerator3)
    outsmiles2 = []
    for i in range(8):
      prods3 = iteren3.next()
      for mols3 in prods3:
        self.assertEqual(len(mols3), 1)
        # the restored products are checked against the original run below
        outsmiles2.append(Chem.MolToSmiles(mols3[0]))

    self.assertEqual(outsmiles2, outsmiles[2:])
def testRGroupState(self):
  # Checks that an enumerator's state can be captured with GetState() and
  # restored with SetState() for the default strategy, RandomSampleStrategy
  # and RandomSampleAllBBsStrategy, and that ResetState() after a Skip()
  # makes the default enumeration start over from the first product.
  # Skipped entirely when EnumerateLibrary serialization is not available.
  if not rdChemReactions.EnumerateLibraryCanSerialize():
    print("-- Skipping testRGroupState, serialization of EnumerateLibrary not enabled", file=sys.stderr)
    return

  log("testRGroupState")
  smirks_thiourea = "[N;$(N-[#6]):3]=[C;$(C=S):1].[N;$(N[#6]);!$(N=*);!$([N-]);!$(N#*);!$([ND3]);!$([ND4]);!$(N[O,N]);!$(N[C,S]=[S,O,N]):2]>>[N:3]-[C:1]-[N+0:2]"
  rxn = rdChemReactions.ReactionFromSmarts(smirks_thiourea)
  reagents = [
    [Chem.MolFromSmiles('C=CCN=C=S'), Chem.MolFromSmiles('CC=CCN=C=S')],
    [Chem.MolFromSmiles('NCc1ncc(Cl)cc1Br'),
     Chem.MolFromSmiles('NCCc1ncc(Cl)cc1Br'),
     Chem.MolFromSmiles('NCCCc1ncc(Cl)cc1Br'),
    ]
  ]

  def tostr(l):
    # convert the nested result of nextSmiles() into plain lists of str
    return [[str(x) for x in v] for v in l]

  # None stands for "use the enumerator's default strategy"; the strategy
  # objects are built lazily so each is created right before its enumerator
  strategy_factories = (None,
                        rdChemReactions.RandomSampleStrategy,
                        rdChemReactions.RandomSampleAllBBsStrategy)
  for factory in strategy_factories:
    if factory is None:
      enumerator = rdChemReactions.EnumerateLibrary(rxn, reagents)
    else:
      enumerator = rdChemReactions.EnumerateLibrary(rxn, reagents, factory())

    # take two products, rewind to the saved state, and expect the same two
    state = enumerator.GetState()
    p = enumerator.nextSmiles()
    p2 = enumerator.nextSmiles()
    enumerator.SetState(state)
    self.assertEqual(tostr(enumerator.nextSmiles()), tostr(p))
    self.assertEqual(tostr(enumerator.nextSmiles()), tostr(p2))

  enumerator = rdChemReactions.EnumerateLibrary(rxn, reagents)
  smiresults = ['C=CCNC(=S)NCc1ncc(Cl)cc1Br',
                'CC=CCNC(=S)NCc1ncc(Cl)cc1Br',
                'C=CCNC(=S)NCCc1ncc(Cl)cc1Br',
                'CC=CCNC(=S)NCCc1ncc(Cl)cc1Br',
                'C=CCNC(=S)NCCCc1ncc(Cl)cc1Br',
                'CC=CCNC(=S)NCCCc1ncc(Cl)cc1Br']
  smiresults = [Chem.MolToSmiles(Chem.MolFromSmiles(smi)) for smi in smiresults]

  # move the underlying strategy forward, then reset: the full product list
  # must still come out in order
  enumerator.GetEnumerator().Skip(10)
  enumerator.ResetState()

  results = []
  for result in enumerator:
    for prodSet in result:
      for mol in prodSet:
        results.append(Chem.MolToSmiles(mol))

  self.assertEqual(results, smiresults)
def testRemovingBadMatches(self):
  # Passes the two reagent lists in swapped order (plus two plain alkanes) so
  # that no building block sits in a slot whose reactant template it matches,
  # and expects the enumeration to produce nothing.
  log("testRemoveBadMatches")
  smirks_thiourea = "[N;$(N-[#6]):3]=[C;$(C=S):1].[N;$(N[#6]);!$(N=*);!$([N-]);!$(N#*);!$([ND3]);!$([ND4]);!$(N[O,N]);!$(N[C,S]=[S,O,N]):2]>>[N:3]-[C:1]-[N+0:2]"
  rxn = rdChemReactions.ReactionFromSmarts(smirks_thiourea)

  # invert matches so nothing matches
  reagents = [
    [Chem.MolFromSmiles('NCc1ncc(Cl)cc1Br'),
     Chem.MolFromSmiles('NCCc1ncc(Cl)cc1Br'),
     Chem.MolFromSmiles('NCCCc1ncc(Cl)cc1Br'),
    ],
    [Chem.MolFromSmiles('C=CCN=C=S'),
     Chem.MolFromSmiles('CC=CCN=C=S'),
     Chem.MolFromSmiles('CCC'),
     Chem.MolFromSmiles('CCCCC'),
    ],
  ]

  enumerator = rdChemReactions.EnumerateLibrary(rxn, reagents)
  self.assertEqual([], list(enumerator))
def testRemoveInsaneReagents(self):
  # Checks how reaction sanitization and the enumeration parameters decide
  # which building blocks EnumerateLibrary keeps for each reactant:
  #  1. before SanitizeRxn no building block matches its reactant template,
  #  2. after the default SanitizeRxn every building block matches,
  #  3. with the ChemDraw adjustment parameters only some match,
  #  4. with reagentMaxMatchCount = 1 a single building block is kept per
  #     reactant for both sanitization variants.
  rxndata = "$RXN\nUntitled Document-1\n ChemDraw10291618492D\n\n 3 1\n$MOL\n\n\n\n 2 1 0 0 0 0 0 0 0 0999 V2000\n 0.4125 0.0000 0.0000 N 0 0 0 0 0 0 0 0 0 3 0 0\n -0.4125 0.0000 0.0000 R2 0 0 0 0 0 0 0 0 0 2 0 0\n 1 2 1 0 0\nM END\n$MOL\n\n\n\n 2 1 0 0 0 0 0 0 0 0999 V2000\n -0.4125 0.0000 0.0000 R1 0 0 0 0 0 0 0 0 0 1 0 0\n 0.4125 0.0000 0.0000 Cl 0 0 0 0 0 0 0 0 0 0 0 0\n 1 2 1 0 0\nM END\n$MOL\n\n\n\n 2 1 0 0 0 0 0 0 0 0999 V2000\n 0.4125 0.0000 0.0000 N 0 0 0 0 0 0 0 0 0 5 0 0\n -0.4125 0.0000 0.0000 R4 0 0 0 0 0 0 0 0 0 4 0 0\n 1 2 1 0 0\nM END\n$MOL\n\n\n\n 14 15 0 0 0 0 0 0 0 0999 V2000\n 0.5072 -0.5166 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n 0.5072 0.3084 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n 1.2949 -0.7616 0.0000 N 0 0 0 0 0 0 0 0 0 0 0 0\n 1.7817 -0.0880 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n 1.2967 0.5794 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n 1.5558 -1.5443 0.0000 R1 0 0 0 0 0 0 0 0 0 1 0 0\n -0.2073 0.7208 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n -0.9218 0.3083 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n -0.9217 -0.5167 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n -0.2073 -0.9292 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n -1.6362 0.7208 0.0000 N 0 0 0 0 0 0 0 0 0 3 0 0\n 1.5452 1.3661 0.0000 N 0 0 0 0 0 0 0 0 0 5 0 0\n 2.3507 1.5443 0.0000 R4 0 0 0 0 0 0 0 0 0 4 0 0\n -2.3507 0.3083 0.0000 R2 0 0 0 0 0 0 0 0 0 2 0 0\n 1 2 2 0 0\n 1 3 1 0 0\n 3 4 1 0 0\n 4 5 1 0 0\n 5 2 1 0 0\n 3 6 1 0 0\n 2 7 1 0 0\n 7 8 2 0 0\n 8 9 1 0 0\n 9 10 2 0 0\n 10 1 1 0 0\n 8 11 1 0 0\n 12 13 1 0 0\n 11 14 1 0 0\n 12 5 1 0 0\nM END\n"
  rxn = AllChem.ReactionFromRxnBlock(rxndata)

  # one list of candidate building blocks per reactant template
  r1 = [Chem.MolFromSmiles("CCNCC"),
        Chem.MolFromSmiles("NCC"),
       ]
  r2 = [Chem.MolFromSmiles("ClC1CCCC1"),
        Chem.MolFromSmiles("ClC1CCCC1Cl"),
       ]
  r3 = [Chem.MolFromSmiles("CCNCC"),
        Chem.MolFromSmiles("NCC"),
       ]
  bbs = [r1, r2, r3]

  # nothing matches!
  for i, reagent in enumerate(rxn.GetReactants()):
    for bb in bbs[i]:
      self.assertFalse(bb.HasSubstructMatch(reagent))

  # everything matches - yay sanitization!
  rdChemReactions.SanitizeRxn(rxn)
  for i, reagent in enumerate(rxn.GetReactants()):
    for bb in bbs[i]:
      self.assertTrue(bb.HasSubstructMatch(reagent))

  en = rdChemReactions.EnumerateLibrary(rxn, bbs)
  self.assertEqual(len(en.GetReagents()[0]), 2)
  self.assertEqual(len(en.GetReagents()[1]), 2)
  self.assertEqual(len(en.GetReagents()[2]), 2)

  #####################################################################################
  # Match only at rgroups (ChemDraw style)
  rxn = AllChem.ReactionFromRxnBlock(rxndata)
  expected_matches = [[False, True], [True, True], [False, True]]
  rdChemReactions.SanitizeRxn(rxn, params=rdChemReactions.GetChemDrawRxnAdjustParams())
  for i, (reagent, expected) in enumerate(zip(rxn.GetReactants(), expected_matches)):
    # test every building block of slot i against that slot's reactant
    # template and compare the whole match pattern with the expectation
    match = [bb.HasSubstructMatch(reagent) for bb in bbs[i]]
    self.assertEqual(match, expected)

  # Now try EnumerateLibrary
  en = rdChemReactions.EnumerateLibrary(rxn, bbs)
  self.assertEqual(len(en.GetReagents()[0]), 1)
  self.assertEqual(len(en.GetReagents()[1]), 2)
  self.assertEqual(len(en.GetReagents()[2]), 1)

  #####################################################################################
  # now set the removal options to only make one product per reagent set
  rxn = AllChem.ReactionFromRxnBlock(rxndata)
  rdChemReactions.SanitizeRxn(rxn)
  opts = rdChemReactions.EnumerationParams()
  opts.reagentMaxMatchCount = 1
  en = rdChemReactions.EnumerateLibrary(rxn, bbs, params=opts)
  self.assertEqual(len(en.GetReagents()[0]), 1)
  self.assertEqual(len(en.GetReagents()[1]), 1)
  self.assertEqual(len(en.GetReagents()[2]), 1)

  #####################################################################################
  # now set the removal options to only make one product per reagent set,
  # this time sanitizing with the ChemDraw adjustment parameters
  rxn = AllChem.ReactionFromRxnBlock(rxndata)
  rdChemReactions.SanitizeRxn(rxn,
                              params=rdChemReactions.GetChemDrawRxnAdjustParams())
  opts = rdChemReactions.EnumerationParams()
  opts.reagentMaxMatchCount = 1
  en = rdChemReactions.EnumerateLibrary(rxn, bbs, params=opts)
  self.assertEqual(len(en.GetReagents()[0]), 1)
  self.assertEqual(len(en.GetReagents()[1]), 1)
  self.assertEqual(len(en.GetReagents()[2]), 1)
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,239 @@
# Copyright (c) 2015, Novartis Institutes for BioMedical Research Inc.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above
# copyright notice, this list of conditions and the following
# disclaimer in the documentation and/or other materials provided
# with the distribution.
# * Neither the name of Novartis Institutes for BioMedical Research Inc.
# nor the names of its contributors may be used to endorse or promote
# products derived from this software without specific prior written
# permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
from __future__ import print_function
import unittest
import os,sys
from rdkit.six.moves import cPickle
from rdkit import rdBase
from rdkit import Chem
from rdkit.Chem import rdChemReactions, AllChem
from rdkit import Geometry
from rdkit import RDConfig
import itertools, time
test_data = [("good", '''$RXN
ISIS 052820091627
2 1
$MOL
-ISIS- 05280916272D
2 1 0 0 0 0 0 0 0 0999 V2000
-3.2730 -7.0542 0.0000 Br 0 0 0 0 0 0 0 0 0 0 0 0
-3.9875 -7.4667 0.0000 R# 0 0 0 0 0 0 0 0 0 1 0 0
1 2 1 0 0 0 0
V 1 halogen.bromine.aromatic
M RGP 1 2 1
M END
$MOL
-ISIS- 05280916272D
4 3 0 0 0 0 0 0 0 0999 V2000
3.4375 -7.7917 0.0000 R# 0 0 0 0 0 0 0 0 0 2 0 0
4.1520 -7.3792 0.0000 B 0 0 0 0 0 0 0 0 0 0 0 0
4.1520 -6.5542 0.0000 O 0 0 0 0 0 0 0 0 0 0 0 0
4.8664 -7.7917 0.0000 O 0 0 0 0 0 0 0 0 0 0 0 0
2 3 1 0 0 0 0
1 2 1 0 0 0 0
2 4 1 0 0 0 0
V 2 boronicacid
M RGP 1 1 2
M END
$MOL
-ISIS- 05280916272D
2 1 0 0 0 0 0 0 0 0999 V2000
11.2667 -7.3417 0.0000 R# 0 0 0 0 0 0 0 0 0 1 0 0
11.9811 -6.9292 0.0000 R# 0 0 0 0 0 0 0 0 0 2 0 0
1 2 1 0 0 0 0
M RGP 2 1 1 2 2
M END'''),
("bad", '''$RXN
ISIS 052820091627
2 1
$MOL
-ISIS- 05280916272D
2 1 0 0 0 0 0 0 0 0999 V2000
-3.2730 -7.0542 0.0000 Br 0 0 0 0 0 0 0 0 0 0 0 0
-3.9875 -7.4667 0.0000 R# 0 0 0 0 0 0 0 0 0 0 0 0
1 2 1 0 0 0 0
V 1 halogen.bromine.aromatic
M RGP 1 2 1
M END
$MOL
-ISIS- 05280916272D
4 3 0 0 0 0 0 0 0 0999 V2000
3.4375 -7.7917 0.0000 R# 0 0 0 0 0 0 0 0 0 0 0 0
4.1520 -7.3792 0.0000 B 0 0 0 0 0 0 0 0 0 0 0 0
4.1520 -6.5542 0.0000 O 0 0 0 0 0 0 0 0 0 0 0 0
4.8664 -7.7917 0.0000 O 0 0 0 0 0 0 0 0 0 0 0 0
2 3 1 0 0 0 0
1 2 1 0 0 0 0
2 4 1 0 0 0 0
V 2 boronicacid
M RGP 1 1 2
M END
$MOL
-ISIS- 05280916272D
2 1 0 0 0 0 0 0 0 0999 V2000
11.2667 -7.3417 0.0000 R# 0 0 0 0 0 0 0 0 0 0 0 0
11.9811 -6.9292 0.0000 R# 0 0 0 0 0 0 0 0 0 0 0 0
1 2 1 0 0 0 0
M RGP 2 1 1 2 2
M END'''),
# chemdraw style
("bad", '''$RXN
ISIS 052820091627
2 1
$MOL
-ISIS- 05280916272D
2 1 0 0 0 0 0 0 0 0999 V2000
-3.2730 -7.0542 0.0000 Br 0 0 0 0 0 0 0 0 0 0 0 0
-3.9875 -7.4667 0.0000 R1 0 0 0 0 0 0 0 0 0 0 0 0
1 2 1 0 0 0 0
V 1 halogen.bromine.aromatic
M END
$MOL
-ISIS- 05280916272D
4 3 0 0 0 0 0 0 0 0999 V2000
3.4375 -7.7917 0.0000 R2 0 0 0 0 0 0 0 0 0 0 0 0
4.1520 -7.3792 0.0000 B 0 0 0 0 0 0 0 0 0 0 0 0
4.1520 -6.5542 0.0000 O 0 0 0 0 0 0 0 0 0 0 0 0
4.8664 -7.7917 0.0000 O 0 0 0 0 0 0 0 0 0 0 0 0
2 3 1 0 0 0 0
1 2 1 0 0 0 0
2 4 1 0 0 0 0
V 2 boronicacid
M END
$MOL
-ISIS- 05280916272D
2 1 0 0 0 0 0 0 0 0999 V2000
11.2667 -7.3417 0.0000 R1 0 0 0 0 0 0 0 0 0 0 0 0
11.9811 -6.9292 0.0000 R2 0 0 0 0 0 0 0 0 0 0 0 0
1 2 1 0 0 0 0
M END'''),
("fail", '''$RXN
ISIS 052820091627
2 1
$MOL
-ISIS- 05280916272D
2 1 0 0 0 0 0 0 0 0999 V2000
-3.2730 -7.0542 0.0000 Br 0 0 0 0 0 0 0 0 0 0 0 0
-3.9875 -7.4667 0.0000 R1 0 0 0 0 0 0 0 0 0 0 0 0
1 2 1 0 0 0 0
V 1 halogen.bromine.aromatic
M END
$MOL
-ISIS- 05280916272D
4 3 0 0 0 0 0 0 0 0999 V2000
3.4375 -7.7917 0.0000 R3 0 0 0 0 0 0 0 0 0 0 0 0
4.1520 -7.3792 0.0000 B 0 0 0 0 0 0 0 0 0 0 0 0
4.1520 -6.5542 0.0000 O 0 0 0 0 0 0 0 0 0 0 0 0
4.8664 -7.7917 0.0000 O 0 0 0 0 0 0 0 0 0 0 0 0
2 3 1 0 0 0 0
1 2 1 0 0 0 0
2 4 1 0 0 0 0
V 2 boronicacid
M END
$MOL
-ISIS- 05280916272D
2 1 0 0 0 0 0 0 0 0999 V2000
11.2667 -7.3417 0.0000 R1 0 0 0 0 0 0 0 0 0 0 0 0
11.9811 -6.9292 0.0000 R2 0 0 0 0 0 0 0 0 0 0 0 0
1 2 1 0 0 0 0
M END'''),
]
good_res = (0,0,2,1,(((0, 'halogen.bromine.aromatic'),), ((1, 'boronicacid'),)))
bad_res = (3,0,2,1,(((0, 'halogen.bromine.aromatic'),), ((1, 'boronicacid'),)))
class TestCase(unittest.TestCase):
  # Tests for rdChemReactions.SanitizeRxn driven by the (status, rxn block)
  # pairs in the module-level test_data list.

  def test_sanitize(self):
    # For every entry in test_data:
    #  - PreprocessReaction on the untouched reaction must give good_res for
    #    "good" blocks and bad_res for "bad" blocks,
    #  - after SanitizeRxn, PreprocessReaction must give good_res for "good"
    #    and "bad" blocks, while "fail" blocks are expected to fail somewhere
    #    in the sanitize/preprocess/compare sequence.
    for status, block in test_data:
      print("*"*44)
      rxna = AllChem.ReactionFromRxnBlock(block)
      rxnb = AllChem.ReactionFromRxnBlock(block)
      rxna.Initialize()
      res = rdChemReactions.PreprocessReaction(rxna)
      print(AllChem.ReactionToRxnBlock(rxna))
      if status == "good":
        self.assertEqual(res, good_res)
      elif status == "bad":
        self.assertEqual(res, bad_res)
      print (">"*44)
      rxnb.Initialize()
      try:
        rdChemReactions.SanitizeRxn(rxnb)
        res = rdChemReactions.PreprocessReaction(rxnb)
        print(AllChem.ReactionToRxnBlock(rxnb))
        self.assertEqual(res, good_res)
      except Exception:
        print ("$RXN Failed")
        # only blocks marked "fail" are allowed to fail; anything else is a
        # genuine test failure and must not be swallowed
        if status != "fail":
          raise
      else:
        # a block marked "fail" that sanitizes cleanly is itself an error
        self.assertNotEqual(status, "fail")
if __name__ == '__main__':
unittest.main()

View File

@@ -1,11 +1,15 @@
import sys
tests = [("python", "testReactionWrapper.py", {}), ]
tests=[
("python", "testReactionWrapper.py",{}),
("python", "testEnumerations.py",{}),
]
longTests = []
longTests=[
]
if __name__ == '__main__':
if __name__=='__main__':
import sys
from rdkit import TestRunner
failed, tests = TestRunner.RunScript('test_list.py', 0, 1)
failed,tests = TestRunner.RunScript('test_list.py',0,1)
sys.exit(len(failed))

View File

@@ -5943,6 +5943,8 @@ void testCopyConstructor() {
removeMappingNumbersFromReactions(*rxn_new);
std::string smi2 = ChemicalReactionToRxnSmiles(*rxn);
std::string new_smi = ChemicalReactionToRxnSmiles(*rxn_new);
std::cerr << "smi1 " << smi1 << std::endl;
std::cerr << "smi2 " << smi2 << std::endl;
TEST_ASSERT(smi1 == smi2);
TEST_ASSERT(smi2 != new_smi);
TEST_ASSERT(new_smi == "CCC(N)(O)Cl>>CC(C)(N)O.Cl");

File diff suppressed because it is too large Load Diff

View File

@@ -116,10 +116,19 @@ class FilterMatcherBase
virtual bool hasMatch(const ROMol &mol) const = 0;
//------------------------------------
//! Clone
//! Clone - deprecated
// Clones the current FilterMatcherBase into one that
// can be passed around safely.
virtual boost::shared_ptr<FilterMatcherBase> Clone() const = 0;
virtual boost::shared_ptr<FilterMatcherBase> Clone() const {
BOOST_LOG(rdWarningLog) << "FilterMatcherBase::Clone is deprecated, use copy instead" << std::endl;
return copy();
}
//------------------------------------
//! copy
// copies the current FilterMatcherBase into one that
// can be passed around safely.
virtual boost::shared_ptr<FilterMatcherBase> copy() const = 0;
private:
#ifdef RDK_USE_BOOST_SERIALIZATION

View File

@@ -91,7 +91,7 @@ bool SmartsMatcher::getMatches(const ROMol &mol,
if (d_min_count == 1 && d_max_count == UINT_MAX) {
RDKit::MatchVectType match;
onPatExists = RDKit::SubstructMatch(mol, *d_pattern.get(), match);
if (onPatExists) matchVect.push_back(FilterMatch(Clone(), match));
if (onPatExists) matchVect.push_back(FilterMatch(copy(), match));
} else { // need to count
const bool uniquify = true;
unsigned int count =
@@ -99,7 +99,7 @@ bool SmartsMatcher::getMatches(const ROMol &mol,
onPatExists = (count >= d_min_count &&
(d_max_count == UINT_MAX || count <= d_max_count));
if (onPatExists) {
boost::shared_ptr<FilterMatcherBase> clone = Clone();
boost::shared_ptr<FilterMatcherBase> clone = copy();
for (size_t i = 0; i < matches.size(); ++i) {
matchVect.push_back(FilterMatch(clone, matches[i]));
}

View File

@@ -58,7 +58,7 @@ class And : public FilterMatcherBase {
//! True if arg1 and arg2 FilterMatchers are true
And(const FilterMatcherBase &arg1, const FilterMatcherBase &arg2)
: FilterMatcherBase("And"), arg1(arg1.Clone()), arg2(arg2.Clone()) {}
: FilterMatcherBase("And"), arg1(arg1.copy()), arg2(arg2.copy()) {}
And(const boost::shared_ptr<FilterMatcherBase> &arg1,
const boost::shared_ptr<FilterMatcherBase> &arg2)
@@ -93,7 +93,7 @@ class And : public FilterMatcherBase {
return false;
}
boost::shared_ptr<FilterMatcherBase> Clone() const {
boost::shared_ptr<FilterMatcherBase> copy() const {
return boost::shared_ptr<FilterMatcherBase>(new And(*this));
}
@@ -122,7 +122,7 @@ class Or : public FilterMatcherBase {
//! Constructs or Ander
//! true if arg1 or arg2 are true
Or(const FilterMatcherBase &arg1, const FilterMatcherBase &arg2)
: FilterMatcherBase("Or"), arg1(arg1.Clone()), arg2(arg2.Clone()) {}
: FilterMatcherBase("Or"), arg1(arg1.copy()), arg2(arg2.copy()) {}
Or(const boost::shared_ptr<FilterMatcherBase> &arg1,
const boost::shared_ptr<FilterMatcherBase> &arg2)
@@ -154,7 +154,7 @@ class Or : public FilterMatcherBase {
return res1 || res2;
}
boost::shared_ptr<FilterMatcherBase> Clone() const {
boost::shared_ptr<FilterMatcherBase> copy() const {
return boost::shared_ptr<FilterMatcherBase>(new Or(*this));
}
@@ -182,7 +182,7 @@ class Not : public FilterMatcherBase {
// from getMatches since a false internal match matches
// nothing!
Not(const FilterMatcherBase &arg1)
: FilterMatcherBase("Not"), arg1(arg1.Clone()) {}
: FilterMatcherBase("Not"), arg1(arg1.copy()) {}
Not(const boost::shared_ptr<FilterMatcherBase> &arg1)
: FilterMatcherBase("Not"), arg1(arg1) {}
@@ -208,7 +208,7 @@ class Not : public FilterMatcherBase {
return !arg1->getMatches(mol, matchVect);
}
boost::shared_ptr<FilterMatcherBase> Clone() const {
boost::shared_ptr<FilterMatcherBase> copy() const {
return boost::shared_ptr<FilterMatcherBase>(new Not(*this));
}
@@ -321,7 +321,7 @@ class SmartsMatcher : public FilterMatcherBase {
virtual bool getMatches(const ROMol &mol,
std::vector<FilterMatch> &matchVect) const;
virtual bool hasMatch(const ROMol &mol) const;
virtual boost::shared_ptr<FilterMatcherBase> Clone() const {
virtual boost::shared_ptr<FilterMatcherBase> copy() const {
return boost::shared_ptr<FilterMatcherBase>(new SmartsMatcher(*this));
}
@@ -403,7 +403,7 @@ class ExclusionList : public FilterMatcherBase {
void addPattern(const FilterMatcherBase &base) {
PRECONDITION(base.isValid(), "Invalid FilterMatcherBase");
d_offPatterns.push_back(base.Clone());
d_offPatterns.push_back(base.copy());
}
void setExclusionPatterns(
@@ -433,7 +433,7 @@ class ExclusionList : public FilterMatcherBase {
return result;
}
virtual boost::shared_ptr<FilterMatcherBase> Clone() const {
virtual boost::shared_ptr<FilterMatcherBase> copy() const {
return boost::shared_ptr<FilterMatcherBase>(new ExclusionList(*this));
}
@@ -469,7 +469,7 @@ public:
*/
FilterHierarchyMatcher(const FilterMatcherBase &matcher) :
FilterMatcherBase(),
d_matcher(matcher.Clone()) {
d_matcher(matcher.copy()) {
}
//! Return the name for this node (from the underlying FilterMatcherBase)
@@ -491,7 +491,7 @@ public:
*/
void setPattern(const FilterMatcherBase & matcher) {
PRECONDITION(matcher.isValid(), "Adding invalid patterns is not allowed.");
d_matcher = matcher.Clone();
d_matcher = matcher.copy();
PRECONDITION(getName() == d_matcher->getName(), "Opps");
}
@@ -527,8 +527,8 @@ public:
return getMatches(mol, temp);
}
//! Clones the FilterHierarchyMatcher into a FilterMatcherBase
virtual boost::shared_ptr<FilterMatcherBase> Clone() const {
//! copys the FilterHierarchyMatcher into a FilterMatcherBase
virtual boost::shared_ptr<FilterMatcherBase> copy() const {
return boost::shared_ptr<FilterMatcherBase>(new FilterHierarchyMatcher(*this));
}
private:

View File

@@ -71,7 +71,7 @@ void SetOffPatterns(ExclusionList &fc, boost::python::object list) {
std::vector<boost::shared_ptr<FilterMatcherBase> > temp;
for (; begin != end; ++begin) {
temp.push_back((*begin)->Clone());
temp.push_back((*begin)->copy());
}
fc.setExclusionPatterns(temp);
}
@@ -147,7 +147,7 @@ class PythonFilterMatch : public FilterMatcherBase {
functor(self),
incref(false){};
// ONLY CALLED FROM C++ from the Clone operation
// ONLY CALLED FROM C++ from the copy operation
PythonFilterMatch(const PythonFilterMatch &rhs)
: FilterMatcherBase(rhs), functor(rhs.functor), incref(true) {
python::incref(functor);
@@ -174,7 +174,7 @@ class PythonFilterMatch : public FilterMatcherBase {
return python::call_method<bool>(functor, "HasMatch", boost::ref(mol));
}
virtual boost::shared_ptr<FilterMatcherBase> Clone() const {
virtual boost::shared_ptr<FilterMatcherBase> copy() const {
return boost::shared_ptr<FilterMatcherBase>(new PythonFilterMatch(*this));
}
};

49
Contrib/Glare/README.txt Normal file
View File

@@ -0,0 +1,49 @@
Glare Algorithm.
Implementation of
GLARE: A New Approach for Filtering Large Reagent Lists in
Combinatorial Library Design Using Product Properties
Jean-Francois Truchon* and Christopher I. Bayly
http://pubs.acs.org/doi/pdf/10.1021/ci0504871
Usage:
# somehow make sidechains1/2 with props [mw, alogp, tpsa]
r1 = RGroups(sidechains1)
r2 = RGroups(sidechains2)
lib = Library([r1, r2])
props = [
Property("mw", 0, 0, 500),
Property("alogp", 1, -2.4, 5),
Property("tpsa", 2, 0, 90)
]
glare = Glare()
glare.optimize(lib, props)
Notes:
Some nomenclature:
A Library is made of RGroups
RGroups are a collection of sidechains (the paper uses Fragments)
that can populate the rgroup position.
We desire to optimize the Library so that we have a good chance
of making the desired products.
From the testing code, using Fake data:
r1 = RGroups(makeFakeSidechains("aldehydes", num=1000))
r2 = RGroups(makeFakeSidechains("boronic_acids", num=1500))
libs = Library([r1,r2])
props = [
Property("mw", propIdx=0, minValue=0, maxValue=500),
Property("alogp", propIdx=1, minValue=-2.4, maxValue=5),
Property("tpsa", propIdx=2, minValue=0, maxValue=90)
]
glare = Glare()
# optimize the library...
glare.optimize(libs, props)

444
Contrib/Glare/glare.py Executable file
View File

@@ -0,0 +1,444 @@
from __future__ import print_function
import random, operator, itertools, math
"""
Glare Algorithm
Some nomenclature:
A Libary is made of RGroups
RGroups are a collection of sidechains (the paper uses Fragments)
that can populate the rgroup position.
We desire to optimize the Library so that we have a good chance
of making the desired products.
Example From the testing code, using Fake data:
r1 = RGroups(makeFakeSidechains("aldehydes", num=1000))
r2 = RGroups(makeFakeSidechains("boronic_acids", num=1500))
lib = Library([r1,r2])
props = [
Property("mw", propIdx=0, minValue=0, maxValue=500),
Property("alogp", propIdx=1, minValue=-2.4, maxValue=5),
Property("tpsa", propIdx=2, minValue=0, maxValue=90)
]
glare = Glare()
# optimize the library...
glare.optimize(lib, props)
"""
class Property:
    """An acceptable value range for one numeric property of a product.

    The product value is the scaffold offset plus the values stored at
    ``propIdx`` in the ``props`` vector of every sidechain.
    """

    def __init__(self, name, propIdx, minValue, maxValue, scaffoldoffset=0.0):
        """Create a Property.

        name: the name of the property
        propIdx: position of this property in each sidechain's ``props`` vector
        minValue: smallest acceptable product value (inclusive)
        maxValue: largest acceptable product value (inclusive)
        scaffoldoffset: value added once per product (defaults to 0)
        """
        self.name = name
        self.propIdx = propIdx
        self.minValue = minValue
        self.maxValue = maxValue
        self.offset = scaffoldoffset

    def evaluate(self, sidechains):
        """Return True when the combined property value is within range.

        sidechains: iterable of objects exposing a ``props`` sequence; the
        entry at ``self.propIdx`` of each one is added to the scaffold offset.
        """
        idx = self.propIdx
        total = self.offset
        for chain in sidechains:
            total += chain.props[idx]
        # both bounds are inclusive
        return self.minValue <= total and total <= self.maxValue
class Sidechain:
    """A named sidechain/fragment together with its property vector."""

    def __init__(self, name, props, goodCount=0):
        """Create a Sidechain.

        name: identifier of the sidechain
        props: the property vector (indexed by ``Property.propIdx``)
        goodCount: initial value of the ``good_count`` counter
        """
        self.name = name
        self.props = props
        self.good_count = goodCount  # shared variable
        self.dropped = False  # shared variable

    def __str__(self):
        fields = (self.name, self.props, self.good_count)
        return "Sidechain %s(%s, goodCount=%s)" % fields

    def __repr__(self):
        fields = (self.name, self.props, self.good_count)
        return "Sidechain(%r, %r, %s)" % fields
class RGroups:
    """Holds the collection of sidechains available at one RGroup position."""

    def __init__(self, sidechains):
        """sidechains -> RGroups

        sidechains: the list of Sidechains that make up the potential
          sidechains at this rgroup position.  The list is kept by
          reference and is shuffled/sorted in place by other methods.
        """
        self.sidechains = sidechains
        self.rejected = []  # sidechains removed by prune()
        self.initial_size = len(sidechains)

    def count(self):
        """Returns the number of sidechains currently held"""
        return len(self.sidechains)

    def randomize(self):
        """Randomly shuffles the sidechains and resets their good counts"""
        random.shuffle(self.sidechains)
        for s in self.sidechains:
            s.good_count = 0

    def effectiveness(self):
        """-> return the current effectiveness of this collection

        effectiveness is the number of sidechains left divided by the
        initial number of sidechains.
        """
        return len(self.sidechains) / float(self.initial_size)

    def chunk_size(self, num_chunks):
        """num_chunks -> return the number of sidechains in each chunk
        if the sidechains are split into num_chunks chunks (rounded up)"""
        return int(math.ceil(float(len(self.sidechains)) / num_chunks))

    def chunk(self, chunk_idx, num_chunks):
        """chunk_idx, num_chunks -> RGroups

        return the chunk_idx-th chunk given num_chunks total chunks.
        The result may be empty when the slice falls past the end of
        the sidechain list.
        """
        assert chunk_idx >= 0 and chunk_idx < num_chunks, "%s %s" % (
            chunk_idx, num_chunks)
        n = self.chunk_size(num_chunks)
        return RGroups(self.sidechains[chunk_idx * n:(chunk_idx + 1) * n])

    def prune(self, fractionToKeep):
        """fractionToKeep -> Sort the sidechains from the most often
        found in good products to the least, and keep the best
        fractionToKeep fraction (0 < fractionToKeep <= 1).

        The sidechains that are cut are appended to ``self.rejected``.
        """
        assert 0 < fractionToKeep <= 1.0, "fractionToKeep: %s" % fractionToKeep
        # key/reverse works on both Python 2 and 3 (a cmp function does
        # not); the sort is stable, so ties keep their current order.
        self.sidechains.sort(key=lambda s: s.good_count, reverse=True)
        fragment_index = int(len(self.sidechains) * fractionToKeep + 0.5)
        # update rejected set
        self.rejected += self.sidechains[fragment_index:]
        self.sidechains = self.sidechains[:fragment_index]
class Library:
    """A library is a collection of RGroups that need to be combinatorially
    combined"""

    def __init__(self, rgroups):
        """rgroups -> Initialize the Library.

        rgroups: the list of possible RGroups that is combinatorially
          combined to make the library"""
        self.rgroups = rgroups

    def isValid(self):
        """If we have an empty set for any rgroup, return False"""
        for rg in self.rgroups:
            if len(rg.sidechains) == 0:
                return False
        return True

    def randomize(self):
        """randomize the order of the sidechains"""
        for rg in self.rgroups:
            rg.randomize()

    def getSidechainsPerPartition(self, total_num_partitions_per_rgroup):
        """total_num_partitions_per_rgroup -> [value for rgroup1,
                                               value for rgroup2, ...]

        Each rgroup starts from count // total_num_partitions_per_rgroup
        (at least 1).  Taken in ascending order, every value except the
        largest is rounded down to a multiple of the preceding
        (unrounded) value, and the largest is replaced by the
        second-largest unrounded value.  The result is a list of ints in
        the original rgroup order; chunk() passes each entry as the
        ``num_chunks`` argument for the matching rgroup.
        """
        # floor division keeps the sizes integral on Python 3 as well
        sizes = [(libIdx, max(rg.count() // total_num_partitions_per_rgroup, 1))
                 for libIdx, rg in enumerate(self.rgroups)]

        # "optimally" apportion the partitions according to
        # the glare paper, see Appendix eq (8) and (9)
        # sort by size (stable, so ties keep rgroup order)
        sizes.sort(key=lambda entry: entry[1])
        last_size = 1
        opt_sizes = []
        for libIdx, current_size in sizes[:-1]:
            opt_sizes.append((libIdx,
                              current_size - (current_size % last_size)))
            last_size = current_size

        # From the Glare paper:
        # the last library size is set equal to the second to last
        # From Table 3, it is easy to understand that, if the fourth dimension
        # was split in 24 instead of 12, a factor of 2 would be gained from the
        # reduced size of the sublibraries. However, twice as many sublibraries
        # would be needed, and the net speedup would be null, hence, the
        # decision to set p4=p3. (p4 here is the last library)
        libIdx = sizes[-1][0]
        opt_sizes.append((libIdx, last_size))

        # back to the original library order
        opt_sizes.sort()
        return [size for libIdx, size in opt_sizes]

    def chunk(self, num_partitions):
        """num_partitions -> [Library(..), Library(...)]

        Return new libraries that are chunks of this one.
        These are the libraries that get sampled to see if
        sidechains participate in good products.

        With p_k the per-rgroup values from getSidechainsPerPartition,
        max(p_k) sub-libraries are attempted; sub-library i takes chunk
        (i % p_k) of rgroup k.  Sub-libraries containing an empty rgroup
        are skipped.
        """
        partitions = self.getSidechainsPerPartition(num_partitions)
        library_sets = []
        for i in range(max(partitions)):
            partitioned_rgroups = [
                self.rgroups[lib_index].chunk(chunk_idx=i % num_chunks,
                                              num_chunks=num_chunks)
                for lib_index, num_chunks in enumerate(partitions)
            ]
            lib = Library(partitioned_rgroups)
            if lib.isValid():
                library_sets.append(lib)
        return library_sets

    def effectiveness(self):
        """-> returns the average effectiveness of the rgroups"""
        total = 0.0
        for rg in self.rgroups:
            total += rg.effectiveness()
        return total / len(self.rgroups)

    def evaluate(self, props):
        """props -> num_good_enumerations, total_enumerations

        props: a list of Property evaluators for the fragments.

        Enumerates every combination of one sidechain per rgroup.  A
        combination is good when every property accepts it; each
        sidechain in a good combination has its good_count incremented.
        Returns (0, 0) when there is nothing to enumerate.
        """
        frags = [rg.sidechains for rg in self.rgroups]
        good = 0
        total = 0
        for frag in itertools.product(*frags):
            total += 1
            # all() stops at the first failing property
            if all(p.evaluate(frag) for p in props):
                good += 1
                for sidechain in frag:
                    sidechain.good_count += 1
        return good, total
class Glare:
    """Glare Algorithm.  Implementation of

    GLARE: A New Approach for Filtering Large Reagent Lists in
      Combinatorial Library Design Using Product Properties
    Jean-Francois Truchon* and Christopher I. Bayly
    http://pubs.acs.org/doi/pdf/10.1021/ci0504871

    Usage:
      # somehow make sidechains1/2 with props [mw, alogp, tpsa]
      r1 = RGroups(sidechains1)
      r2 = RGroups(sidechains2)
      lib = Library([r1, r2])
      props = [
        Property("mw", 0, 0, 500),
        Property("alogp", 1, -2.4, 5),
        Property("tpsa", 2, 0, 90)
      ]
      glare = Glare()
      glare.optimize(lib, props)
    """

    def __init__(self,
                 desiredFinalGoodness=0.95,
                 maxIterations=100,
                 rgroupScale=6.0,  # None if no scaling
                 initialFraction=None,  # None=auto, otherwise a percentage
                 numPartitions=16):
        """Store the optimization parameters.

        desiredFinalGoodness: goodness (good/total enumerations, 0..1) at
          which optimize() stops
        maxIterations: upper bound on the number of pruning iterations
        rgroupScale: steepness of the per-rgroup sigmoid scaling of the
          rejected fraction; None disables the scaling
        initialFraction: percentage (0-100) of sidechains to keep in the
          first iteration; None chooses it automatically
        numPartitions: forwarded to Library.chunk on every iteration
        """
        self.fractionGood = self.desiredFinalGoodness = desiredFinalGoodness
        self.maxIterations = maxIterations
        self.rgroupScale = rgroupScale
        if initialFraction is not None:
            # stored internally as a 0..1 fraction
            self.initialFraction = initialFraction / 100.
        else:
            self.initialFraction = initialFraction
        self.numPartitions = numPartitions

    def _sizeReport(self, library):
        """Format the rgroup sizes and their product as 'a x b = n'."""
        counts = [len(rg.sidechains) for rg in library.rgroups]
        # explicit loop instead of reduce(), which is not a builtin on
        # Python 3
        product = 1
        for c in counts:
            product *= c
        return "%s = %s" % (" x ".join([str(c) for c in counts]), product)

    def optimize(self, library, props):
        """library, props

        Given a Library and the list of Property evaluators,
        optimize the library.

        The library is modified in place by removing building blocks
        (sidechains) that are not likely to pass the property
        criteria.  A progress report is printed for every iteration.
        Returns None; stops when the goodness reaches
        desiredFinalGoodness (within 0.001), when the goodness is
        effectively zero, when nothing can be enumerated any more, or
        after maxIterations iterations.
        """
        # attempt to generate report like glare application
        print("------- PARAMETERS: --------------")
        print("GOOODNESS THRESHOLD : %s%%" % (self.desiredFinalGoodness * 100))
        print("MIN PARTITION SIZE : %s" % self.numPartitions)
        if self.initialFraction is None or self.initialFraction > 0.999:
            print("INITIAL FRACTION TO KEEP : AUTOMATIC")
        else:
            print("INITAL FRACTION TO KEEP : %s%%" % (self.initialFraction * 100))
        print("Actual SIZE : %s" % self._sizeReport(library))

        running_total = 0.0
        Gt = self.desiredFinalGoodness
        for iteration in range(1, self.maxIterations + 1):
            # chunk the total library into smaller more manageable sets
            # and run combinatorial analysis on the sub libraries.
            # each of these records the number of times a sidechain is used
            # in a successful enumeration which is then used to prune the
            # library at the end
            for rg in library.rgroups:
                rg.randomize()

            good = total = 0.0
            # for each chunk, do the combinatorial check to see
            # if reagents make good products
            for chunk in library.chunk(self.numPartitions):
                g, t = chunk.evaluate(props)
                good += g
                total += t

            if not total:
                # no valid sub-library is left to enumerate (some rgroup is
                # empty); goodness is undefined, so stop instead of
                # dividing by zero
                return

            running_total += total
            Gi = good / total  # current goodness

            if Gi < 1e-12:
                # nothing good at all; the stopping test below ends the run
                fraction = 0.0
            elif iteration == 1:
                G0 = Gi  # Goodness at first iteration
                # the first time, use the initialFraction or a "good enough"
                # value
                if self.initialFraction is not None:
                    fraction = K0 = self.initialFraction
                else:
                    # auto choose the fraction based on the current good
                    # percentage and the desired one
                    fraction = K0 = min(-1.1 * (Gt - G0) + 1.2, 0.9)
            else:
                # later iterations: gradually eliminate reagents, slowing
                # down as the goodness approaches the target
                # see equation (5) in reference
                if abs(Gt - G0) < 1e-4:
                    Ki = 1.0
                else:
                    Ki = (1.0 - K0) * (Gi - G0) / (Gt - G0) + K0
                fraction = min(1.0, Ki)

            # prune the library to keep the highest occurring sidechains
            max_size = float(max([len(rg.sidechains) for rg in library.rgroups]))
            for rg in library.rgroups:
                scale = 1.0
                if self.rgroupScale is not None:
                    # scale differently sized rgroups via equation (6) in
                    # the paper: smaller rgroups reject a smaller fraction
                    relative_size = len(rg.sidechains) / max_size
                    scale = 1.0 / (1.0 + math.exp(-self.rgroupScale *
                                                  (relative_size - 0.5)))
                fraction_to_reject = (1.0 - fraction) * scale
                # keep the best fraction...
                rg.prune(1.0 - fraction_to_reject)

            print("-------------- ITERATION : %s ----------------------" % iteration)
            print("GOODNESS : %s%%" % (Gi * 100))
            print("NUMBER EVAL : %s" % (total))
            print("CUMUL EVAL : %s" % (running_total))
            print("KEPT IN STEP : %s%%" % (fraction * 100.))
            print("Actual SIZE : %s" % self._sizeReport(library))
            print("EFFECTIVENESS : %s%%" % (library.effectiveness() * 100.))

            # stopping criteria
            if Gi < 1e-12:
                return
            if abs(Gi - self.desiredFinalGoodness) < 0.001 or \
               Gi > self.desiredFinalGoodness:
                return
######################################################################
# testing codes
def makeFakeProps():
    """Draw a random [mw, alogp, tpsa] integer vector for testing."""
    # inclusive (low, high) bounds, drawn in the order mw, alogp, tpsa
    bounds = ((10, 500), (-10, 10), (0, 180))
    return [random.randint(low, high) for low, high in bounds]
def makeFakeSidechains(lib, num):
    """Build ``num`` Sidechains named ``<lib>_<index>`` with random props."""
    return [Sidechain(lib + "_" + str(index), makeFakeProps())
            for index in range(num)]
def testGlare():
    """Run GLARE once on two randomly generated reagent lists."""
    aldehydes = RGroups(makeFakeSidechains("aldehydes", 1000))
    boronic_acids = RGroups(makeFakeSidechains("boronic_acids", 1500))
    # (name, property index, minimum, maximum, scaffold offset)
    windows = (
        ("mw", 0, 0, 500, 230.1419),
        ("alogp", 1, -2.4, 5, 2.212749),
        ("tpsa", 2, 0, 90, 24.5),
    )
    filters = [Property(*spec) for spec in windows]
    Glare().optimize(Library([aldehydes, boronic_acids]), filters)
# Run the fake-data demonstration only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    testGlare()