From fa89438358d651825f97da67242dfd1325f5564f Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Sat, 5 Nov 2016 09:42:52 -0400 Subject: [PATCH] Dev/reaction enumeration (#1111) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Adds C++ Enumeration Engine to the RDKit * Adds Sanitization helpers, wrappers and tests * Clang format * Remove unused enumerationStateOnly flag * Fixes docStrings to current API * Adds doc strings * Removes RGroupPosition, adds getPosition to EnumerationBase * Fixes readability. * Adds EnumerateLibraryBase::reset and getReaction * Added getReagents method to EnumerateLibrary * Make the tests have the same naming * Need to save the initial state for resetting. * Stupid case-insensitive file systems * Moves ResetState to EnumerateLibraryBase * Adds removeNonmatchingReagents helper * Renames currentPosition to getPosition * Adds Enumeration Toolkit tutorial * Fixes Python3 serialization and enumerators * Verified to run on python2 and 3 * Fixes integer issues on windows * The number of enumeration should be unsigned. * Adds deserialization constructor * Moves boost_serialization to the end * Deprecates Clone in favor of copy * Update tests to use copy.copy not Clone * Move RGROUPS and BBS into an EnumerationTypes namespace * Make sure old pickles work * Adds pickle for backwards compatibility * Moves to uint64_t from size_t for public api * Whups, accidentally used the binary archiver. * Commits boost 1.55 serialization * Makes serialization turnoffable Like Filter Catalog * Fixes tests when serialization not available. Adds more enumeration strategy tests * Fixes a syntax error on some versions of python * Fixes sanitizeRxn to actually make proper RGroup atoms * Updates SanitizeRXN python API * Updates Enumeration API to a parameter class - fixes reagent removal * Adds a mess of tests * Change stats to return a string. * Exposes EvenPairSamplingStrategy Stats to python * Fixes a crash bug in SanitizeRxn * Adds better testing of the even pair sampling * Fixes namespace * One more try to fix gcc * Enum classes are c++11 and a microsoft extension. * Fix typo * Fixes np.median for python3 * Fixes atom iterators * Adds virtual tags to derived virtual functions (for clarity) * Fixes size comparison issues * Adds doc string * Small cleanup (has no effect since flags aren’t used) * fixes crash bug on windows * get the tests working on windows * Updates tutorial * Adds Glare implementation to Contrib --- Code/GraphMol/ChemReactions/CMakeLists.txt | 40 +- .../Enumerate/CartesianProduct.h | 145 +++ .../ChemReactions/Enumerate/Enumerate.cpp | 259 ++++ .../ChemReactions/Enumerate/Enumerate.h | 183 +++ .../ChemReactions/Enumerate/EnumerateBase.h | 200 +++ .../ChemReactions/Enumerate/EnumerateTypes.h | 58 + .../Enumerate/EnumerationPickler.cpp | 110 ++ .../Enumerate/EnumerationPickler.h | 56 + .../Enumerate/EnumerationStrategyBase.h | 199 +++ .../Enumerate/EvenSamplePairs.cpp | 281 +++++ .../ChemReactions/Enumerate/EvenSamplePairs.h | 193 +++ .../ChemReactions/Enumerate/RandomSample.h | 162 +++ .../Enumerate/RandomSampleAllBBs.h | 186 +++ .../ChemReactions/Enumerate/testEnumerate.cpp | 299 +++++ Code/GraphMol/ChemReactions/PreprocessRxn.cpp | 1 + Code/GraphMol/ChemReactions/SanitizeRxn.cpp | 391 ++++++ Code/GraphMol/ChemReactions/SanitizeRxn.h | 150 +++ .../ChemReactions/Wrap/CMakeLists.txt | 8 +- .../GraphMol/ChemReactions/Wrap/Enumerate.cpp | 435 +++++++ .../ChemReactions/Wrap/rdChemReactions.cpp | 67 +- .../ChemReactions/Wrap/testEnumerations.py | 654 ++++++++++ .../ChemReactions/Wrap/testSanitize.py | 239 ++++ Code/GraphMol/ChemReactions/Wrap/test_list.py | 12 +- .../ChemReactions/testData/enumeration.pickle | Bin 0 -> 4601 bytes Code/GraphMol/ChemReactions/testReaction.cpp | 2 + .../tutorial/EnumerationToolkit.ipynb | 1095 +++++++++++++++++ .../FilterCatalog/FilterMatcherBase.h | 13 +- .../GraphMol/FilterCatalog/FilterMatchers.cpp | 4 +- Code/GraphMol/FilterCatalog/FilterMatchers.h | 26 +- .../FilterCatalog/Wrap/FilterCatalog.cpp | 6 +- Contrib/Glare/README.txt | 49 + Contrib/Glare/glare.py | 444 +++++++ 32 files changed, 5927 insertions(+), 40 deletions(-) create mode 100644 Code/GraphMol/ChemReactions/Enumerate/CartesianProduct.h create mode 100644 Code/GraphMol/ChemReactions/Enumerate/Enumerate.cpp create mode 100644 Code/GraphMol/ChemReactions/Enumerate/Enumerate.h create mode 100644 Code/GraphMol/ChemReactions/Enumerate/EnumerateBase.h create mode 100644 Code/GraphMol/ChemReactions/Enumerate/EnumerateTypes.h create mode 100644 Code/GraphMol/ChemReactions/Enumerate/EnumerationPickler.cpp create mode 100644 Code/GraphMol/ChemReactions/Enumerate/EnumerationPickler.h create mode 100644 Code/GraphMol/ChemReactions/Enumerate/EnumerationStrategyBase.h create mode 100644 Code/GraphMol/ChemReactions/Enumerate/EvenSamplePairs.cpp create mode 100644 Code/GraphMol/ChemReactions/Enumerate/EvenSamplePairs.h create mode 100644 Code/GraphMol/ChemReactions/Enumerate/RandomSample.h create mode 100644 Code/GraphMol/ChemReactions/Enumerate/RandomSampleAllBBs.h create mode 100644 Code/GraphMol/ChemReactions/Enumerate/testEnumerate.cpp create mode 100644 Code/GraphMol/ChemReactions/SanitizeRxn.cpp create mode 100644 Code/GraphMol/ChemReactions/SanitizeRxn.h create mode 100644 Code/GraphMol/ChemReactions/Wrap/Enumerate.cpp create mode 100644 Code/GraphMol/ChemReactions/Wrap/testEnumerations.py create mode 100644 Code/GraphMol/ChemReactions/Wrap/testSanitize.py create mode 100644 Code/GraphMol/ChemReactions/testData/enumeration.pickle create mode 100644 Code/GraphMol/ChemReactions/tutorial/EnumerationToolkit.ipynb create mode 100644 Contrib/Glare/README.txt create mode 100755 Contrib/Glare/glare.py diff --git a/Code/GraphMol/ChemReactions/CMakeLists.txt b/Code/GraphMol/ChemReactions/CMakeLists.txt index c8cde5951..4ce36225e 100644 --- a/Code/GraphMol/ChemReactions/CMakeLists.txt +++ b/Code/GraphMol/ChemReactions/CMakeLists.txt @@ -1,21 +1,47 @@ +if(RDK_USE_BOOST_SERIALIZATION AND Boost_SERIALIZATION_LIBRARY) + ADD_DEFINITIONS("-DRDK_USE_BOOST_SERIALIZATION") +else() + message("== Making EnumerateLibrary without boost Serialization support") +endif() + rdkit_library(ChemReactions Reaction.cpp MDLParser.cpp DaylightParser.cpp ReactionPickler.cpp - ReactionWriter.cpp ReactionDepict.cpp ReactionFingerprints.cpp ReactionUtils.cpp MoleculeParser.cpp ReactionRunner.cpp PreprocessRxn.cpp - LINK_LIBRARIES FilterCatalog Descriptors Fingerprints DataStructs Depictor FileParsers SubstructMatch ChemTransforms) + ReactionWriter.cpp ReactionDepict.cpp ReactionFingerprints.cpp ReactionUtils.cpp MoleculeParser.cpp ReactionRunner.cpp PreprocessRxn.cpp SanitizeRxn.cpp + Enumerate/Enumerate.cpp + Enumerate/EnumerationPickler.cpp + Enumerate/EvenSamplePairs.cpp + + LINK_LIBRARIES + FilterCatalog Descriptors Fingerprints DataStructs Depictor + FileParsers SubstructMatch ChemTransforms ${Boost_SERIALIZATION_LIBRARY}) rdkit_headers(Reaction.h ReactionParser.h ReactionPickler.h ReactionFingerprints.h ReactionUtils.h - ReactionRunner.h PreprocessRxn.h DEST GraphMol/ChemReactions) + ReactionRunner.h + PreprocessRxn.h + SanitizeRxn.h + Enumerate/Enumerate.h + Enumerate/EnumerateBase.h + Enumerate/EnumerationPickler.h + Enumerate/EnumerationStrategyBase.h + Enumerate/CartesianProduct.h + Enumerate/RandomSample.h + Enumerate/RandomSampleAllBBs.h + DEST GraphMol/ChemReactions) rdkit_test(testReaction testReaction.cpp LINK_LIBRARIES -ChemReactions FilterCatalog ChemTransforms Descriptors Fingerprints Subgraphs DataStructs Depictor FileParsers SmilesParse SubstructMatch -GraphMol RDGeneral RDGeometryLib ) +ChemReactions ChemTransforms Descriptors Fingerprints Subgraphs DataStructs Depictor FileParsers SmilesParse SubstructMatch +GraphMol RDGeneral RDGeometryLib ${Boost_SERIALIZATION_LIBRARY} ) rdkit_test(testReactionFingerprints testReactionFingerprints.cpp LINK_LIBRARIES -ChemReactions FilterCatalog Descriptors Fingerprints Subgraphs DataStructs ChemTransforms Depictor FileParsers SmilesParse SubstructMatch -GraphMol RDGeneral RDGeometryLib ) +ChemReactions Descriptors Fingerprints Subgraphs DataStructs ChemTransforms Depictor FileParsers SmilesParse SubstructMatch +GraphMol RDGeneral RDGeometryLib ${Boost_SERIALIZATION_LIBRARY} ) + +rdkit_test(testEnumeration Enumerate/testEnumerate.cpp LINK_LIBRARIES +ChemReactions ChemTransforms Descriptors Fingerprints Subgraphs DataStructs Depictor FileParsers SmilesParse SubstructMatch +GraphMol RDGeneral RDGeometryLib ${Boost_SERIALIZATION_LIBRARY} ) add_subdirectory(Wrap) diff --git a/Code/GraphMol/ChemReactions/Enumerate/CartesianProduct.h b/Code/GraphMol/ChemReactions/Enumerate/CartesianProduct.h new file mode 100644 index 000000000..9b6397f8e --- /dev/null +++ b/Code/GraphMol/ChemReactions/Enumerate/CartesianProduct.h @@ -0,0 +1,145 @@ +// +// Copyright (c) 2015, Novartis Institutes for BioMedical Research Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// * Neither the name of Novartis Institutes for BioMedical Research Inc. +// nor the names of its contributors may be used to endorse or promote +// products derived from this software without specific prior written +// permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// + +#ifndef CARTESIANPRODUCT_H +#define CARTESIANPRODUCT_H + +#include "EnumerationStrategyBase.h" + +namespace RDKit { +//! This is a class for enumerating reagents using Cartesian Products of +// reagents. +/*! + CartesianProductStrategy produces a standard walk through all possible + reagent combinations: + + (0,0,0), (1,0,0), (2,0,0) ... + + basic usage: + + \verbatim + std::vector bbs; + bbs.push_back( bbs_for_reactants_1 ); + bbs.push_back( bbs_for_reactants_2 ); + + RGRUOPS num_bbs; + num_bbs.push_back(bbs[0].size()); + num_bbs.push_back(bbs[1].size()); + + CartesianProductStrategy rgroups(num_bbs); + for(size_t i=0; i lprops = rxn.RunReactants(rvect); + ... + } + \endverbatim + +See EnumerationStrategyBase for more details and usage. +*/ + +class CartesianProductStrategy : public EnumerationStrategyBase { + size_t m_numPermutationsProcessed; + + public: + CartesianProductStrategy() + : EnumerationStrategyBase(), m_numPermutationsProcessed() {} + + using EnumerationStrategyBase::initialize; + + virtual void initializeStrategy(const ChemicalReaction &, const EnumerationTypes::BBS &) { + m_numPermutationsProcessed = 0; + } + + virtual const char *type() const { return "CartesianProductStrategy"; } + + //! The current permutation {r1, r2, ...} + virtual const EnumerationTypes::RGROUPS &next() { + if (m_numPermutationsProcessed) { + increment(); + } else + ++m_numPermutationsProcessed; + + return m_permutation; + } + + virtual boost::uint64_t getPermutationIdx() const { + return m_numPermutationsProcessed; } + + virtual operator bool() const { return hasNext(); } + + EnumerationStrategyBase *copy() const { + return new CartesianProductStrategy(*this); + } + + private: + void increment() { + next(0); + ++m_numPermutationsProcessed; + } + + bool hasNext() const { + // Fix me -> use multiprecision int here??? + if (m_numPermutations == EnumerationStrategyBase::EnumerationOverflow || + m_numPermutationsProcessed < rdcast(m_numPermutations)) { + return true; + } else { + return false; + } + } + + void next(size_t rowToIncrement) { + if (!hasNext()) return; + m_permutation[rowToIncrement] += 1; + size_t max_index_of_row = m_permutationSizes[rowToIncrement] - 1; + if (m_permutation[rowToIncrement] > max_index_of_row) { + m_permutation[rowToIncrement] = 0; + next(rowToIncrement + 1); + } + } + + private: +#ifdef RDK_USE_BOOST_SERIALIZATION + friend class boost::serialization::access; + template + void serialize(Archive &ar, const unsigned int /*version*/) { + ar &boost::serialization::base_object(*this); + ar &m_numPermutationsProcessed; + } +#endif +}; +} + +#ifdef RDK_USE_BOOST_SERIALIZATION +BOOST_CLASS_VERSION(RDKit::CartesianProductStrategy, 1) +#endif + +#endif diff --git a/Code/GraphMol/ChemReactions/Enumerate/Enumerate.cpp b/Code/GraphMol/ChemReactions/Enumerate/Enumerate.cpp new file mode 100644 index 000000000..8aa746036 --- /dev/null +++ b/Code/GraphMol/ChemReactions/Enumerate/Enumerate.cpp @@ -0,0 +1,259 @@ +// +// Copyright (c) 2015, Novartis Institutes for BioMedical Research Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// * Neither the name of Novartis Institutes for BioMedical Research Inc. +// nor the names of its contributors may be used to endorse or promote +// products derived from this software without specific prior written +// permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// + +#include "Enumerate.h" +#include "CartesianProduct.h" +#include "RandomSample.h" +#include "RandomSampleAllBBs.h" +#include "EvenSamplePairs.h" +#include "../ReactionPickler.h" +#include +#include + +// Since we are exporting the classes for serialization, +// we should declare the archives types used here +#ifdef RDK_USE_BOOST_SERIALIZATION +#include +#include +#include +#include +#include +#include + +BOOST_CLASS_EXPORT(RDKit::EnumerationStrategyBase); +BOOST_CLASS_EXPORT(RDKit::CartesianProductStrategy); +BOOST_CLASS_EXPORT(RDKit::RandomSampleStrategy); +BOOST_CLASS_EXPORT(RDKit::RandomSampleAllBBsStrategy); +BOOST_CLASS_EXPORT(RDKit::EvenSamplePairsStrategy); +BOOST_CLASS_EXPORT(RDKit::EnumerateLibrary); +#endif + +namespace RDKit { +using namespace EnumerationTypes; + +const RGROUPS &EnumerateLibraryBase::getPosition() const { + return m_enumerator->getPosition(); +} + +std::string EnumerateLibraryBase::getState() const { + PRECONDITION(m_enumerator.get(), "Null Enumerator"); + std::string state; + EnumerationStrategyPickler::pickle(m_enumerator, state); + return state; +} + +void EnumerateLibraryBase::setState(const std::string &state) { + m_enumerator = EnumerationStrategyPickler::fromPickle(state); +} + +void EnumerateLibraryBase::resetState() { + PRECONDITION(m_initialEnumerator.get(), + "Unset initial enumerator"); + m_enumerator.reset(m_initialEnumerator->copy()); +} + +std::vector > EnumerateLibraryBase::nextSmiles() { + std::vector > result; + std::vector mols = next(); + const bool doisomeric = true; + result.resize(mols.size()); + for (size_t i = 0; i < mols.size(); ++i) { + result[i].resize(mols[i].size()); + for (size_t j = 0; j < mols[i].size(); ++j) { + if (mols[i][j].get()) result[i][j] = MolToSmiles(*mols[i][j], doisomeric); + } + } + return result; +} + +namespace { +size_t countMatches( const ROMol& bb, const ROMol& query, int maxMatches) { + std::vector matches; + const bool uniquify = true; + const bool useChirality = true; + const bool useQueryQueryMatches = false; + + SubstructMatch(bb, query, matches, + uniquify, true, useChirality, useQueryQueryMatches, + maxMatches+1); + return matches.size(); +} +} +BBS removeNonmatchingReagents(const ChemicalReaction &rxn, BBS bbs, + const EnumerationParams ¶ms) { + PRECONDITION(bbs.size() <= rxn.getNumReactantTemplates(), + "Number of Reagents not compatible with reaction templates"); + BBS result; + result.resize(bbs.size()); + + for(size_t reactant_idx=0; reactant_idx < bbs.size(); ++reactant_idx) { + size_t removedCount = 0; + const unsigned int maxMatches = (params.reagentMaxMatchCount == INT_MAX) ? + 0 : rdcast(params.reagentMaxMatchCount); + + ROMOL_SPTR reactantTemplate = rxn.getReactants()[reactant_idx]; + for(size_t reagent_idx = 0; reagent_idx < bbs[reactant_idx].size(); ++reagent_idx) { + ROMOL_SPTR mol = bbs[reactant_idx][reagent_idx]; + size_t matches = countMatches(*mol.get(), *reactantTemplate.get(), maxMatches); + + bool removeReagent = false; + if(!matches || matches > rdcast(params.reagentMaxMatchCount)) { + removeReagent = true; + } + + if(!removeReagent && params.sanePartialProducts) { + // see if we have any sane products in the results + std::vector partialProducts = rxn.runReactant(mol, reactant_idx); + for(size_t productTemplate_idx = 0; + productTemplate_idx < partialProducts.size(); + ++productTemplate_idx) { + int saneProducts = 0; + for(size_t product_idx = 0; + product_idx < partialProducts[productTemplate_idx].size(); + ++product_idx) { + try { + RWMol *m = dynamic_cast( + partialProducts[productTemplate_idx][product_idx].get()); + MolOps::sanitizeMol(*m); + saneProducts++; + } catch (...) { + } + } + + if (!saneProducts) { + // if any product template has no sane products, we bail + removeReagent = true; + break; + } + } + } + + if(removeReagent) + removedCount++; + else + result[reactant_idx].push_back(mol); + } + + + if(removedCount) { + BOOST_LOG(rdInfoLog) << "Removed " << removedCount << + " non matching reagents at template " << reactant_idx << std::endl; + } + } + return result; +} + +EnumerateLibrary::EnumerateLibrary(const ChemicalReaction &rxn, const BBS &bbs, + const EnumerationParams ¶ms) + : EnumerateLibraryBase(rxn, new CartesianProductStrategy), + m_bbs(removeNonmatchingReagents(m_rxn, bbs, params)) { + m_enumerator->initialize(m_rxn, m_bbs); // getSizesFromBBs(bbs)); + m_initialEnumerator.reset(m_enumerator->copy()); +} + +EnumerateLibrary::EnumerateLibrary(const ChemicalReaction &rxn, const BBS &bbs, + const EnumerationStrategyBase &enumerator, + const EnumerationParams ¶ms) + : EnumerateLibraryBase(rxn), + m_bbs(removeNonmatchingReagents(m_rxn, bbs, params)) { + m_enumerator.reset(enumerator.copy()); + m_enumerator->initialize(m_rxn, m_bbs); + m_initialEnumerator.reset(m_enumerator->copy()); +} + +EnumerateLibrary::EnumerateLibrary(const EnumerateLibrary &rhs) + : EnumerateLibraryBase(rhs), m_bbs(rhs.m_bbs) {} + +std::vector EnumerateLibrary::next() { + PRECONDITION(static_cast(*this), "No more enumerations"); + const RGROUPS &reactantIndices = m_enumerator->next(); + MOL_SPTR_VECT reactants(m_bbs.size()); + + for (size_t i = 0; i < m_bbs.size(); ++i) { + reactants[i] = m_bbs[i][reactantIndices[i]]; + } + + return m_rxn.runReactants(reactants); +} + +void EnumerateLibrary::toStream(std::ostream &ss) const { +#ifdef RDK_USE_BOOST_SERIALIZATION + boost::archive::text_oarchive ar(ss); + ar << *this; +#else + PRECONDITION(0, "BOOST SERIALIZATION NOT INSTALLED"); +#endif +} + +void EnumerateLibrary::initFromStream(std::istream &ss) { +#ifdef RDK_USE_BOOST_SERIALIZATION + boost::archive::text_iarchive ar(ss); + ar >> *this; +#else + PRECONDITION(0, "BOOST SERIALIZATION NOT INSTALLED"); +#endif +} + +boost::uint64_t computeNumProducts(const RGROUPS &sizes) { + boost::multiprecision::cpp_int myint = 1; + + for (size_t i = 0; i < sizes.size(); ++i) { + myint *= sizes[i]; + } + + if (myint < std::numeric_limits::max()) + return myint.convert_to(); + else + return EnumerationStrategyBase::EnumerationOverflow; +} + +MOL_SPTR_VECT getReactantsFromRGroups(const std::vector &bbs, + const RGROUPS &rgroups) { + PRECONDITION(bbs.size() == rgroups.size(), + "BBS and RGROUPS must have the same # reactants"); + MOL_SPTR_VECT result; + result.reserve(bbs.size()); + for (size_t i = 0; i < bbs.size(); ++i) { + result.push_back(bbs[i][rgroups[i]]); + } + return result; +} + +bool EnumerateLibraryCanSerialize() { +#ifdef RDK_USE_BOOST_SERIALIZATION + return true; +#else + return false; +#endif +} + +} diff --git a/Code/GraphMol/ChemReactions/Enumerate/Enumerate.h b/Code/GraphMol/ChemReactions/Enumerate/Enumerate.h new file mode 100644 index 000000000..38800a961 --- /dev/null +++ b/Code/GraphMol/ChemReactions/Enumerate/Enumerate.h @@ -0,0 +1,183 @@ +// +// Copyright (c) 2015, Novartis Institutes for BioMedical Research Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// * Neither the name of Novartis Institutes for BioMedical Research Inc. +// nor the names of its contributors may be used to endorse or promote +// products derived from this software without specific prior written +// permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.n +// +#ifndef RDKIT_ENUMERATE_H +#define RDKIT_ENUMERATE_H +#include "EnumerateBase.h" + +namespace RDKit { + +//! This is a class for providing enumeration options that control +// how enumerations are performed. +/*! + Option + reagentMaxMatchCount [default INT_MAX] + This specifies how many times the reactant template can match a reagent. + + sanePartialProducts [default false] + If true, forces all products of the reagent plus the product templates\n\ + pass chemical sanitization. Note that if the product template itself\n\ + does not pass sanitization, then none of the products will. +*/ +struct EnumerationParams +{ + int reagentMaxMatchCount; + bool sanePartialProducts; + EnumerationParams() : + reagentMaxMatchCount(INT_MAX), sanePartialProducts(false) { + } + + EnumerationParams(const EnumerationParams &rhs) : + reagentMaxMatchCount(rhs.reagentMaxMatchCount), + sanePartialProducts(rhs.sanePartialProducts) { + } +}; + + +//! Helper function, remove reagents that are incompatible +// with the reaction. +// rxn must be sanitized, initialized and preprocessed. +// this happens automatically in EnumerateLibrary +EnumerationTypes::BBS removeNonmatchingReagents( + const ChemicalReaction &rxn, + EnumerationTypes::BBS bbs, + const EnumerationParams ¶ms=EnumerationParams()); + +//! This is a class for running reactions on sets of reagents. +/*! + This class is a fully self contained reaction engine that can be + serialized and restarted. For example, a million products can + be generated, the engine can be saved for later and reloaded + to retrieve the next million products. + + basic usage will be something like: + \verbatim + ChemicalReaction rxn = ... + BBS bbs(num_rgroups); + ... somehow LoadRGroups(bbs[0]); + ... somehow LoadRGroups(bbs[1]..); + ... + EnumerateLibrary enumerator(en, bbs); + for(; (bool)en; ++i) { + // This is the same as rxn.run_Reactants( reagents ); + std::vector products = en.next(); + ... + } + \endverbatim + + In general, reactions will enumerate to more products than desired, + a standard use is: + + \verbatim + for(int i=0;i products = en.next(); + ... + } + \endverbatim + */ + + +class EnumerateLibrary : public EnumerateLibraryBase { + EnumerationTypes::BBS m_bbs; + + public: + EnumerateLibrary() : EnumerateLibraryBase(), m_bbs() {} + EnumerateLibrary(const std::string &s) : EnumerateLibraryBase(), m_bbs() { + initFromString(s); + } + + EnumerateLibrary(const ChemicalReaction &rxn, + const EnumerationTypes::BBS &reagents, + const EnumerationParams & params = EnumerationParams()); + EnumerateLibrary(const ChemicalReaction &rxn, + const EnumerationTypes::BBS &reagents, + const EnumerationStrategyBase &enumerator, + const EnumerationParams & params = EnumerationParams()); + EnumerateLibrary(const EnumerateLibrary &rhs); + + //! Return the reagents used in the library + const EnumerationTypes::BBS &getReagents() const { return m_bbs; } + + //! Get the next product set + std::vector next(); + + void toStream(std::ostream &ss) const; + void initFromStream(std::istream &ss); + + private: +#ifdef RDK_USE_BOOST_SERIALIZATION + friend class boost::serialization::access; + template + void save(Archive &ar, const unsigned int /*version*/) const { + ar &boost::serialization::base_object(*this); + size_t sz = m_bbs.size(); + ar &sz; + + std::string pickle; + for (size_t i = 0; i < m_bbs.size(); ++i) { + sz = m_bbs[i].size(); + ar &sz; + for (size_t j = 0; j < m_bbs[i].size(); ++j) { + MolPickler::pickleMol(*m_bbs[i][j], pickle); + ar &pickle; + } + } + } + template + void load(Archive &ar, const unsigned int /*version*/) { + ar &boost::serialization::base_object(*this); + + size_t sz; + ar &sz; + + m_bbs.resize(sz); + + for (size_t i = 0; i < m_bbs.size(); ++i) { + ar &sz; + m_bbs[i].resize(sz); + std::string pickle; + for (size_t j = 0; j < m_bbs[i].size(); ++j) { + ar &pickle; + RWMol *mol = new RWMol(); + MolPickler::molFromPickle(pickle, *mol); + m_bbs[i][j].reset(mol); + } + } + } + + BOOST_SERIALIZATION_SPLIT_MEMBER(); +#endif +}; + +bool EnumerateLibraryCanSerialize(); + +} +#endif diff --git a/Code/GraphMol/ChemReactions/Enumerate/EnumerateBase.h b/Code/GraphMol/ChemReactions/Enumerate/EnumerateBase.h new file mode 100644 index 000000000..0c101555d --- /dev/null +++ b/Code/GraphMol/ChemReactions/Enumerate/EnumerateBase.h @@ -0,0 +1,200 @@ +// +// Copyright (c) 2015, Novartis Institutes for BioMedical Research Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// * Neither the name of Novartis Institutes for BioMedical Research Inc. +// nor the names of its contributors may be used to endorse or promote +// products derived from this software without specific prior written +// permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +#ifndef RDKIT_ENUMERATEBASE_H +#define RDKIT_ENUMERATEBASE_H + +#include +#include "EnumerateTypes.h" +#include "../Reaction.h" +#include "EnumerationPickler.h" + +#include "EnumerationStrategyBase.h" +#include "CartesianProduct.h" +#include "../ReactionPickler.h" +#include + +namespace RDKit { +//! Base class for enumerating chemical reactions from collections of +// building blocks and reagents. +/*! + basic usage: + + \verbatim + EnumerateLibraryBase &enumerator; + while (enumerator) { + MOL_SPTR_VECT res = enumerator.next(); + // do something with enumeration products here + } + \endverbatim + + See Reaction.h for more details on how ChemicalReactions are + used. +*/ +class EnumerateLibraryBase { + protected: + ChemicalReaction m_rxn; + boost::shared_ptr m_enumerator; + boost::shared_ptr m_initialEnumerator; + public: + //! default constructor +EnumerateLibraryBase() : m_rxn(), + m_enumerator(), + m_initialEnumerator() {} + + //! construct with a chemical reaction and an enumeration strategy + EnumerateLibraryBase(const ChemicalReaction &rxn, + EnumerationStrategyBase *enumerator = 0) + : m_rxn(rxn), + m_enumerator(enumerator ? enumerator : new CartesianProductStrategy), + m_initialEnumerator( m_enumerator->copy() ) + { + m_rxn.initReactantMatchers(); + } + + //! Copy constructor + EnumerateLibraryBase(const EnumerateLibraryBase &rhs) + : m_rxn(rhs.m_rxn), + m_enumerator(rhs.m_enumerator ? rhs.m_enumerator->copy() : 0), + m_initialEnumerator( m_enumerator->copy() ) {} + + virtual ~EnumerateLibraryBase() {} + + //! Are there any enumerations left? + virtual operator bool() const { + PRECONDITION(m_enumerator.get(), "Null enumeration strategy"); + return static_cast(*m_enumerator); + } + + //! reset the enumeration to the beginning. + void reset() { + if(m_initialEnumerator.get()) { + m_enumerator.reset(m_initialEnumerator->copy()); + } + } + + //! returns the underlying chemical reaction + const ChemicalReaction &getReaction() const { return m_rxn; } + + //! return the current enumeration strategy + const EnumerationStrategyBase &getEnumerator() { + PRECONDITION(m_enumerator.get(), "Null Enumerator"); + return *m_enumerator; + } + + //! get the next set of products (See run_Reactants) for details + // This returns a vector of a vector of molecules. + // Each result vector corresponds for a product template. + // i.e. + // res = library.next(); + // res[0] are the results for library.getReaction().getProdcts()[0] + virtual std::vector next() = 0; + + //! get the next set of products as smiles + // This returns a vector of a vector strings. + // Each result vector corresponds for a product template. + virtual std::vector > nextSmiles(); + + //! Get the current position into the reagent vectors + // Use getState/setState to save/restart the enumeration + // from this position. + const EnumerationTypes::RGROUPS &getPosition() const; + + //! Get the current state of the enumerator + // This is the position of the enumerator and the enumerators + // state that can be used to restart enumerating + // from this position. + std::string getState() const; + + //! Set the current state of the enumerator + // Restart the enumerator from this position. + void setState(const std::string &); + + //! Reset the enumerator to the beginning + void resetState(); + + + //! serializes (pickles) to a stream + virtual void toStream(std::ostream &ss) const = 0; + + //! returns a string with a serialized (pickled) representation + virtual std::string Serialize() const { + std::stringstream ss; + toStream(ss); + return ss.str(); + } + + //! initializes from a stream pickle + virtual void initFromStream(std::istream &ss) = 0; + + //! initializes from a string pickle + virtual void initFromString(const std::string &text) { + std::stringstream ss(text); + initFromStream(ss); + } + + private: +#ifdef RDK_USE_BOOST_SERIALIZATION + friend class boost::serialization::access; + template + void save(Archive &ar, const unsigned int) const { + std::string pickle; + ReactionPickler::pickleReaction(m_rxn, pickle); + ar &pickle; + ar &m_enumerator; + // we handle the m_initialEnumerator from a string + // for backwards compatibility with a unreleased + // version + EnumerationStrategyPickler::pickle(m_initialEnumerator, + pickle); + ar &pickle; + } + template + void load(Archive &ar, const unsigned int /*version*/) { + std::string pickle; + ar &pickle; + ReactionPickler::reactionFromPickle(pickle, m_rxn); + ar &m_enumerator; + ar &pickle; + m_initialEnumerator = \ + EnumerationStrategyPickler::fromPickle(pickle); + + } + + BOOST_SERIALIZATION_SPLIT_MEMBER(); +#endif +}; + +#ifdef RDK_USE_BOOST_SERIALIZATION +BOOST_SERIALIZATION_ASSUME_ABSTRACT(EnumerateLibraryBase) +#endif +} +#endif diff --git a/Code/GraphMol/ChemReactions/Enumerate/EnumerateTypes.h b/Code/GraphMol/ChemReactions/Enumerate/EnumerateTypes.h new file mode 100644 index 000000000..ba580d636 --- /dev/null +++ b/Code/GraphMol/ChemReactions/Enumerate/EnumerateTypes.h @@ -0,0 +1,58 @@ +// +// Copyright (c) 2015, Novartis Institutes for BioMedical Research Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// * Neither the name of Novartis Institutes for BioMedical Research Inc. +// nor the names of its contributors may be used to endorse or promote +// products derived from this software without specific prior written +// permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +#ifndef ENUMERATETYPES_H +#define ENUMERATETYPES_H + +#include + +namespace RDKit { +namespace EnumerationTypes { +//! BBS - Helper typedef for holding buliding blocks for reactions +//! holds vectors of reagents for each reactant in a Reaction +typedef std::vector BBS; + +//! RGROUPS Helper typedef for indexing into the BBS vector +//! - The indices into the BBS molecule list to create a product +//! Example +//! RGROUPS groups; +//! groups.push_back(10); +//! groups.push_back(5); +//! +//! Will create a product from the following building blocks: +//! MOL_SPTR_VECT building_blocks; +//! building_blocks.push_back( BBS[0][groups[0] ); +//! building_blocks.push_back( BBS[1][groups[1] ); +//! rxn.runReactants( building_blocks ); +typedef std::vector RGROUPS; +} +} +#endif diff --git a/Code/GraphMol/ChemReactions/Enumerate/EnumerationPickler.cpp b/Code/GraphMol/ChemReactions/Enumerate/EnumerationPickler.cpp new file mode 100644 index 000000000..b7c526bd6 --- /dev/null +++ b/Code/GraphMol/ChemReactions/Enumerate/EnumerationPickler.cpp @@ -0,0 +1,110 @@ +// +// Copyright (c) 2015, Novartis Institutes for BioMedical Research Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// * Neither the name of Novartis Institutes for BioMedical Research Inc. +// nor the names of its contributors may be used to endorse or promote +// products derived from this software without specific prior written +// permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// + +#include "EnumerationPickler.h" +#include "CartesianProduct.h" +#include "RandomSample.h" +#include "RandomSampleAllBBs.h" + + +#ifdef RDK_USE_BOOST_SERIALIZATION +#include +#include +#include +#include +#include +#endif + +namespace RDKit { + +std::string GetClass(const EnumerationStrategyBase *en) { + if (dynamic_cast(en)) return "-->cartesian"; + if (dynamic_cast(en)) return "-->random"; + if (dynamic_cast(en)) + return "-->randombbs"; + return "Unknown!"; +} + +namespace EnumerationStrategyPickler { + +void pickle(const boost::shared_ptr &enumerator, + std::ostream &ss) { +#ifdef RDK_USE_BOOST_SERIALIZATION + boost::archive::text_oarchive ar(ss); + ar &enumerator; +#else + RDUNUSED_PARAM(enumerator); + RDUNUSED_PARAM(ss); + PRECONDITION(0, "BOOST SERIALIZATION NOT INSTALLED"); +#endif +} + +void pickle(const boost::shared_ptr &enumerator, + std::string &s) { +#ifdef RDK_USE_BOOST_SERIALIZATION + std::stringstream ss; + pickle(enumerator, ss); + s = ss.str(); +#else + RDUNUSED_PARAM(enumerator); + RDUNUSED_PARAM(s); + PRECONDITION(0, "BOOST SERIALIZATION NOT INSTALLED"); +#endif +} + +boost::shared_ptr fromPickle(std::istream &pickle) { + boost::shared_ptr enumerator; +#ifdef RDK_USE_BOOST_SERIALIZATION + boost::archive::text_iarchive ar(pickle); + ar &enumerator; + return enumerator; +#else + RDUNUSED_PARAM(pickle); + PRECONDITION(0, "BOOST SERIALIZATION NOT INSTALLED"); +#endif + +} + +boost::shared_ptr fromPickle( + const std::string &pickle) { +#ifdef RDK_USE_BOOST_SERIALIZATION + std::stringstream ss(pickle); + return fromPickle(ss); +#else + RDUNUSED_PARAM(pickle); + PRECONDITION(0, "BOOST SERIALIZATION NOT INSTALLED"); + return boost::shared_ptr(); +#endif + +} +} +} diff --git a/Code/GraphMol/ChemReactions/Enumerate/EnumerationPickler.h b/Code/GraphMol/ChemReactions/Enumerate/EnumerationPickler.h new file mode 100644 index 000000000..c02f9f05c --- /dev/null +++ b/Code/GraphMol/ChemReactions/Enumerate/EnumerationPickler.h @@ -0,0 +1,56 @@ +// +// Copyright (c) 2015, Novartis Institutes for BioMedical Research Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// * Neither the name of Novartis Institutes for BioMedical Research Inc. +// nor the names of its contributors may be used to endorse or promote +// products derived from this software without specific prior written +// permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +#ifndef ENUMERATIONPICKLER_H +#define ENUMERATIONPICKLER_H + +#include "EnumerationStrategyBase.h" + +namespace RDKit { +namespace EnumerationStrategyPickler { +//! pickles a EnumerationStrategy and adds the results to a stream \c ss +void pickle(const boost::shared_ptr &enumerator, + std::ostream &ss); +void pickle(const boost::shared_ptr &enumerator, + std::string &s); + +//! constructs a EnumerationStrategy from a pickle stored in a string +//! Since an EnumerationStrategyBase is polymorphic, this must return +//! a shared pointer to the EnumerationStrategyBase +boost::shared_ptr fromPickle(std::istream &pickle); + +//! a pointer to the EnumerationStrategyBase +boost::shared_ptr fromPickle( + const std::string &pickle); +} +} + +#endif diff --git a/Code/GraphMol/ChemReactions/Enumerate/EnumerationStrategyBase.h b/Code/GraphMol/ChemReactions/Enumerate/EnumerationStrategyBase.h new file mode 100644 index 000000000..c7b3dc9b6 --- /dev/null +++ b/Code/GraphMol/ChemReactions/Enumerate/EnumerationStrategyBase.h @@ -0,0 +1,199 @@ +// +// Copyright (c) 2015, Novartis Institutes for BioMedical Research Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// * Neither the name of Novartis Institutes for BioMedical Research Inc. +// nor the names of its contributors may be used to endorse or promote +// products derived from this software without specific prior written +// permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +#ifndef ENUMERATION_STRATEGY_H +#define ENUMERATION_STRATEGY_H + +#include "EnumerateTypes.h" +#include "../Reaction.h" +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace RDKit { + +//! class for flagging enumeration strategy errors +class EnumerationStrategyException : public std::exception { + public: + EnumerationStrategyException(const char *msg) : _msg(msg){}; + EnumerationStrategyException(const std::string &msg) : _msg(msg){}; + const char *message() const { return _msg.c_str(); }; + ~EnumerationStrategyException() throw(){}; + + private: + std::string _msg; +}; + +//! Return the number of elements per input vector +/*! \param bbs vector > + + \result vector number of elements in each vector + */ +template +EnumerationTypes::RGROUPS getSizesFromBBs(const std::vector > &bbs) { + EnumerationTypes::RGROUPS sizes; + for (size_t i = 0; i < bbs.size(); ++i) sizes.push_back(bbs[i].size()); + return sizes; +} + +//! getSizesFromReactants +//! Helper function for enumeration, bbs are stored in a +//! std::vector< std::vector > +// +EnumerationTypes::RGROUPS getSizesFromReactants(const std::vector &bbs); + +//! getReactantsFromRGroups +//! Helper function for enumeration, bbs are stored in a +//! std::vector< std::vector > +// +MOL_SPTR_VECT getReactantsFromRGroups(const std::vector &bbs, + const EnumerationTypes::RGROUPS &rgroups); + +//! computeNumProducts +//! Returns the number of possible product combination from +//! The given numbers of building blocks for each rgroup +//! or EnumerationStrategyBase::EnumerationOverflow if the +//! number will not fit into the machines integer type. +//! n.b. An overflow simply means there are a lot of products +//! not that they cannot be enumerated +boost::uint64_t computeNumProducts(const EnumerationTypes::RGROUPS &sizes); + +//! Base Class for enumeration strageties +//! Usage: +//! EnumerationStrategyBase must be initialized with both a reaction +//! and the building block (molecule) vector to be sampled. +//! +//! \verbatim +//! EnumerationStrategyBase &eb = ... +//! if(eb) { // can we get another entry +//! const std::vector &v = eb.next(); +//! v[0] // RGroup 0 position +//! v[1] // RGroup 1 position... +//! } +//! \endverbatim + +class EnumerationStrategyBase { + protected: + EnumerationTypes::RGROUPS m_permutation; // where are we currently? + EnumerationTypes::RGROUPS m_permutationSizes; // m_permutationSizes num bbs per group + boost::uint64_t m_numPermutations; // total number of permutations for this group + // -1 if > ssize_t::max + public: + static const boost::uint64_t EnumerationOverflow = static_cast(-1); + EnumerationStrategyBase() + : m_permutation(), m_permutationSizes(), m_numPermutations() {} + + virtual ~EnumerationStrategyBase() {} + + virtual const char *type() const { return "EnumerationStrategyBase"; } + + //! Initialize the enumerator based on the reaction and the + //! supplied building blocks + //! This is the standard API point. + void initialize(const ChemicalReaction &reaction, + const EnumerationTypes::BBS &building_blocks) { + // default initialization, may be overridden (sets the # reactants + // and computes the default # of permutations) + m_permutationSizes = getSizesFromBBs(building_blocks); + m_permutation.resize(m_permutationSizes.size()); + + m_numPermutations = computeNumProducts(m_permutationSizes); + std::fill(m_permutation.begin(), m_permutation.end(), 0); + + initializeStrategy(reaction, building_blocks); + } + + // ! Initialize derived class + // ! must exist, EnumerationStrategyBase structures are already initialized + virtual void initializeStrategy(const ChemicalReaction &reaction, + const EnumerationTypes::BBS &building_blocks) = 0; + + //! returns true if there are more permutations left + //! random enumerators may always return true... + virtual operator bool() const = 0; + + //! The current permutation {r1, r2, ...} + virtual const EnumerationTypes::RGROUPS &next() = 0; + + //! copy the enumeration strategy complete with current state + virtual EnumerationStrategyBase *copy() const = 0; + + //! The current position in the enumeration + const EnumerationTypes::RGROUPS &getPosition() const { return m_permutation; } + + //! a result of EnumerationOverflow indicates that the number of + //! permutations is not computable with the current + //! rdlonglong size. + boost::uint64_t getNumPermutations() const { return m_numPermutations; } + + //! Returns how many permutations have been processed by this strategy + virtual boost::uint64_t getPermutationIdx() const = 0; + + //! Skip the specified number of permutations (useful for + //! resetting state to a known position) + bool skip(boost::uint64_t skipCount) { + for (boost::uint64_t i = 0; i < skipCount; ++i) next(); + return true; + } + + protected: + //! Initialize the internal data structures + //! i.e. RGROUPS = {10,40,50}; + void internalInitialize(const EnumerationTypes::RGROUPS &rgroups) { + m_permutation.resize(rgroups.size()); + m_permutationSizes = rgroups; + m_numPermutations = computeNumProducts(m_permutationSizes); + std::fill(m_permutation.begin(), m_permutation.end(), 0); + } + + private: + friend class boost::serialization::access; + template + void serialize(Archive &ar, const unsigned int /*version*/) { + ar &m_permutation; + ar &m_permutationSizes; + ar &m_numPermutations; + } +}; + +BOOST_SERIALIZATION_ASSUME_ABSTRACT(EnumerationStrategyBase) +} + +BOOST_CLASS_VERSION(RDKit::EnumerationStrategyBase, 1) + +#endif diff --git a/Code/GraphMol/ChemReactions/Enumerate/EvenSamplePairs.cpp b/Code/GraphMol/ChemReactions/Enumerate/EvenSamplePairs.cpp new file mode 100644 index 000000000..572d90f6e --- /dev/null +++ b/Code/GraphMol/ChemReactions/Enumerate/EvenSamplePairs.cpp @@ -0,0 +1,281 @@ +// +// Copyright (c) 2016, Novartis Institutes for BioMedical Research Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// * Neither the name of Novartis Institutes for BioMedical Research Inc. +// nor the names of its contributors may be used to endorse or promote +// products derived from this software without specific prior written +// permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +#include "EvenSamplePairs.h" +#include +#include + + +namespace RDKit { + +using namespace EnumerationTypes; +// Based on an implementation from a correspondance with Bernd Rohde. +void EvenSamplePairsStrategy::initializeStrategy(const ChemicalReaction &, + const BBS &bbs) { + size_t npos = bbs.size(); + used_count.resize(npos); + std::fill(used_count.begin(), used_count.end(), 0); + + var_used.resize(npos); + for (size_t i = 0; i < npos; ++i) { + var_used[i].resize(m_permutationSizes[i]); + std::fill(var_used[i].begin(), var_used[i].end(), 0); + } + + boost::uint64_t nmonomers = 0; + for (size_t i = 0; i < bbs.size(); ++i) nmonomers += m_permutationSizes[i]; + + pair_used.resize(nmonomers); + for (size_t i = 0; i < nmonomers; ++i) { + pair_used[i].resize(nmonomers); + std::fill(pair_used[i].begin(), pair_used[i].end(), 0); + } + + pair_counts.resize(npos); + for (size_t i = 0; i < npos; i++) { + pair_counts[i].resize(npos); + std::fill(pair_counts[i].begin(), pair_counts[i].end(), 0); + } + + /* Initialize random number generator */ + /* Find modulus */ + PRECONDITION(m_numPermutations >= 0, + "Number of permutations too large to Evenly sample"); + for (M = 1; M < rdcast(m_numPermutations); M = 2 * M) + ; + /* Set factor */ + a = 5; + b = 7; + + // control of random number and heuristics + seed = 0; + m_numPermutationsProcessed = 0; + nslack = 0; // increase this to break evenness criteria + rejected_period = 0; + rejected_unique = 0; + rejected_slack_condition = 0; + rejected_bb_sampling_condition = 0; + + selected.clear(); // clear the selected (unique) set +} + +// Try to add the given encoded seed position into +// the current set of return groups. This checks to +// see if the BBS are evenly sampled as pairs. If +// they currently are not, reject the selection. +// This is fairly suboptimal for large collections +// of building blocks and may take a while to +// terminate... +bool EvenSamplePairsStrategy::try_add(size_t seed) { + const RGROUPS &digits = decode(seed); + const RGROUPS &rgroups = m_permutationSizes; + size_t islack = 0; + size_t num_rgroups = m_permutationSizes.size(); + + for (size_t i = 0; i < num_rgroups; ++i) { + if (var_used[i][digits[i]]) islack += var_used[i][digits[i]]; + if (islack > nslack) { + // add better heuristic here?? + rejected_slack_condition += 1; + return false; + } + } + + islack = 0; + size_t ioffset = 0; + // check that building block pairs get evenly sampled + for (size_t i = 0; i < num_rgroups; ++i) { + size_t joffset = 0; + for (size_t j = 0; j < num_rgroups; ++j) { + if (j == i) continue; + size_t ii = digits[i] + ioffset; + size_t jj = digits[j] + joffset; + if (pair_used[ii][jj] > 0) { + double numer = (double)pair_used[ii][jj]; + double denom = sqrt((double)(rgroups[i]) * (double)(rgroups[j])); + islack = (int)(numer / denom); + } + joffset += rgroups[j]; + } + ioffset += rgroups[i]; + } + + if (islack > nslack) { + rejected_bb_sampling_condition += 1; + return false; + } + + // keep track of bb usage + for (size_t i = 0; i < num_rgroups; ++i) { + if (var_used[i][digits[i]] == 0) { + used_count[i]++; + } + var_used[i][digits[i]] += 1; + if (used_count[i] == rdcast(rgroups[i])) { + // complete variable scan => initialize + if (nslack > min_nslack && rgroups[i] > 1) // cleared slack on i + nslack = min_nslack; + + used_count[i] = 0; + for (size_t j = 0; j < rgroups[i]; ++j) { + var_used[i][j]--; + assert(var_used[i][j] >= 0); + if (var_used[i][j] > 0) used_count[i]++; + } + } // end scan + } + + // keep track of BB Pair usage + ioffset = 0; + for (size_t i = 0; i < num_rgroups; ioffset += rgroups[i], ++i) { + size_t joffset = 0; + for (size_t j = 0; j < num_rgroups; joffset += rgroups[j], ++j) { + if (j == i) { + continue; + } + size_t ii = digits[i] + ioffset; + size_t jj = digits[j] + joffset; + if (pair_used[ii][jj] == 0) { + pair_counts[i][j]++; + } + pair_used[ii][jj]++; + if (pair_counts[i][j] >= rgroups[i] * rgroups[j]) { // all pairs visited + if (nslack > min_nslack && (rgroups[i] > 1 || rgroups[j] > 1)) { + nslack = min_nslack; + } + pair_counts[i][j] = 0; + for (size_t ii = 0; ii < rgroups[i]; ++ii) { + for (size_t jj = 0; jj < rgroups[j]; ++jj) { + pair_used[ioffset + ii][joffset + jj]--; + if (pair_used[ioffset + ii][joffset + jj] > 0) { + pair_counts[i][j]++; + } + } + } + } + } + } + + selected.insert(seed); + return true; +} + +const RGROUPS &EvenSamplePairsStrategy::next() { + nslack = 0; + while (m_numPermutationsProcessed < rdcast(m_numPermutations)) { + bool added = false; + for (size_t l = 0; l < M; ++l) { + seed = ((seed * a + b) % M); + if (seed > rdcast(m_numPermutations)) { + rejected_period += 1; + continue; + } else if (selected.find(seed) != selected.end()) { + rejected_unique += 1; + continue; + } else if (try_add(seed)) { + m_numPermutationsProcessed++; + added = true; + return decode(seed); + } + } + + if (!added) { + // loosen heuristic + nslack += 1; + min_nslack += 1; + } + } + + throw EnumerationStrategyException("Ran out of molecules"); +} + +std::string EvenSamplePairsStrategy::stats() const { + std::ostringstream ss; + + size_t npos = m_permutationSizes.size(); + const RGROUPS &nvars = m_permutationSizes; + size_t i, l, j, ii, jj, ioffset, joffset; + ss << "#BEGIN# BBSTAT\n"; + for (i = 0; i < npos; i++) { + size_t maxcount = 0; + if (nvars[i] == 1) continue; + for (j = 0; j < nvars[i]; j++) + if (maxcount < var_used[i][j]) maxcount = var_used[i][j]; + + ss << boost::format("%lu\t%lu\t%6.2f") % (i + 1) % nvars[i] % + ((double)m_numPermutationsProcessed / nvars[i]); + + for (l = 0; l <= maxcount; l++) { + size_t n = 0; + for (j = 0; j < nvars[i]; j++) + if (var_used[i][j] == l) n++; + if (n > 0) ss << boost::format("\t%lu|%lu") % l % n; + } + ss << std::endl; + } + ss << "#END# BBSTAT\n"; + + ss << "#BEGIN# PAIRSTAT\n"; + for (i = 0, ioffset = 0; i < npos; ioffset += nvars[i], i++) { + if (nvars[i] == 1) continue; + for (j = 0, joffset = 0; j < npos; joffset += nvars[j], j++) { + size_t maxcount = 0; + if (nvars[j] == 1) continue; + if (j <= i) continue; + for (ii = 0; ii < nvars[i]; ii++) + for (jj = 0; jj < nvars[j]; jj++) + if (maxcount < pair_used[ii + ioffset][jj + joffset]) + maxcount = pair_used[ii + ioffset][jj + joffset]; + ss << boost::format("%lu\t%lu\t%lu\t%lu\t%6.2f") % (i + 1) % + (j + 1) % nvars[i] % nvars[j] % + ((double)m_numPermutationsProcessed / + (nvars[i] * nvars[j])); + for (l = 0; l <= maxcount; l++) { + int n = 0; + for (ii = 0; ii < nvars[i]; ii++) + for (jj = 0; jj < nvars[j]; jj++) + if (l == pair_used[ii + ioffset][jj + joffset]) n++; + if (n > 0) ss << boost::format("\t%ld|%d") % l % n; + } + ss << boost::format("\n"); + } + } + ss << "#END# PAIRSTAT\n"; + + ss << "Rejected Period: " << rejected_period << std::endl; + ss << "Rejected (dupes): " << rejected_unique << std::endl; + ss << "Rejected Slack Conditions: " << rejected_slack_condition + << std::endl; + ss << "Rejected Pair Sampling: " << rejected_bb_sampling_condition + << std::endl; + return ss.str(); +} +} diff --git a/Code/GraphMol/ChemReactions/Enumerate/EvenSamplePairs.h b/Code/GraphMol/ChemReactions/Enumerate/EvenSamplePairs.h new file mode 100644 index 000000000..e7e31e58c --- /dev/null +++ b/Code/GraphMol/ChemReactions/Enumerate/EvenSamplePairs.h @@ -0,0 +1,193 @@ +// +// Copyright (c) 2016, Novartis Institutes for BioMedical Research Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// * Neither the name of Novartis Institutes for BioMedical Research Inc. +// nor the names of its contributors may be used to endorse or promote +// products derived from this software without specific prior written +// permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// + +#ifndef RGROUP_EVEN_SAMPLE_H +#define RGROUP_EVEN_SAMPLE_H + +#include "EnumerationStrategyBase.h" +#ifdef RDK_USE_BOOST_SERIALIZATION +#include +#endif +#include + +namespace RDKit { +//! EvenSamplePairsStrategy +/*! Randomly sample Pairs evenly from a collection of building blocks + This is a good strategy for choosing a relatively small selection + of building blocks from a larger set. As the amount of work needed + to retrieve the next evenly sample building block grows with the + number of samples, this method performs progressively worse as the + number of samples gets larger. + + See EnumeartionStrategyBase for more details. +*/ + +class EvenSamplePairsStrategy : public EnumerationStrategyBase { + boost::uint64_t m_numPermutationsProcessed; + + std::vector used_count; + std::vector > var_used; + std::vector > pair_used; + std::vector > pair_counts; + std::set selected; + + size_t seed; // last seed for permutation (starts at 0) + size_t M, a, b; // random number stuff + size_t nslack, min_nslack; + size_t rejected_period, rejected_unique; + size_t rejected_slack_condition, rejected_bb_sampling_condition; + + public: + EvenSamplePairsStrategy() + : EnumerationStrategyBase(), + m_numPermutationsProcessed(), + used_count(), + var_used(), + pair_used(), + pair_counts(), + selected(), + seed(), + M(), + a(), + b(), + nslack(), + min_nslack(), + rejected_period(), + rejected_unique(), + rejected_slack_condition(), + rejected_bb_sampling_condition() {} + + EvenSamplePairsStrategy(const EvenSamplePairsStrategy &rhs) + : EnumerationStrategyBase(rhs), + m_numPermutationsProcessed(rhs.m_numPermutationsProcessed), + used_count(rhs.used_count), + var_used(rhs.var_used), + pair_used(rhs.pair_used), + pair_counts(rhs.pair_counts), + selected(rhs.selected), + seed(rhs.seed), + M(rhs.M), + a(rhs.a), + b(rhs.b), + nslack(rhs.nslack), + min_nslack(rhs.min_nslack), + rejected_period(rhs.rejected_period), + rejected_unique(rhs.rejected_unique), + rejected_slack_condition(rhs.rejected_slack_condition), + rejected_bb_sampling_condition(rhs.rejected_bb_sampling_condition) {} + + virtual const char *type() const { return "EvenSamplePairsStrategy"; } + + //! This is a class for enumerating RGroups using Cartesian Products of + //! reagents. + /*! + basic usage: + + \verbatim + std::vector bbs; + bbs.push_back( bbs_for_reactants_1 ); + bbs.push_back( bbs_for_reactants_2 ); + + EvenSamplePairsStrategy rgroups; + rgroups.initialize(rxn, bbs); + for(size_t i=0; i lprops = rxn.RunReactants(rvect); + ... + } + \endverbatim + */ + using EnumerationStrategyBase::initialize; + + virtual void initializeStrategy(const ChemicalReaction &, const EnumerationTypes::BBS &); + + //! The current permutation {r1, r2, ...} + virtual const EnumerationTypes::RGROUPS &next(); + + virtual boost::uint64_t getPermutationIdx() const { + return m_numPermutationsProcessed; } + + virtual operator bool() const { return true; } + + EnumerationStrategyBase *copy() const { + return new EvenSamplePairsStrategy(*this); + } + + std::string stats() const; + + private: + friend class boost::serialization::access; + + // decode a packed integer into an RGroup selection + const EnumerationTypes::RGROUPS &decode(size_t seed) { + for (int64_t j = m_permutationSizes.size() - 1; j >= 0; j--) { + m_permutation[j] = seed % m_permutationSizes[j]; + seed /= m_permutationSizes[j]; + } + return m_permutation; + } + + bool try_add(size_t seed); + + public: +#ifdef RDK_USE_BOOST_SERIALIZATION + template + void serialize(Archive &ar, const unsigned int /*version*/) { + // invoke serialization of the base class + ar &boost::serialization::base_object(*this); + ar &m_numPermutationsProcessed; + ar &used_count; + ar &var_used; + ar &pair_used; + ar &pair_counts; + ar &selected; + + ar &seed; + + ar &M; + ar &a; + ar &b; + + ar &nslack; + ar &min_nslack; + ar &rejected_period; + ar &rejected_unique; + ar &rejected_slack_condition; + ar &rejected_bb_sampling_condition; + } +#endif +}; +} + +BOOST_CLASS_VERSION(RDKit::EvenSamplePairsStrategy, 1) + +#endif diff --git a/Code/GraphMol/ChemReactions/Enumerate/RandomSample.h b/Code/GraphMol/ChemReactions/Enumerate/RandomSample.h new file mode 100644 index 000000000..bc31aec98 --- /dev/null +++ b/Code/GraphMol/ChemReactions/Enumerate/RandomSample.h @@ -0,0 +1,162 @@ +// +// Copyright (c) 2015, Novartis Institutes for BioMedical Research Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// * Neither the name of Novartis Institutes for BioMedical Research Inc. +// nor the names of its contributors may be used to endorse or promote +// products derived from this software without specific prior written +// permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// + +#ifndef RGROUP_RANDOM_SAMPLE_H +#define RGROUP_RANDOM_SAMPLE_H + +#include "EnumerationStrategyBase.h" +#include +#include +#include + +namespace RDKit { + +//! This is a class for fully randomly sampling reagents. +// Note that this enumerator never halts. +/*! + basic usage: + + \verbatim + std::vector bbs; + bbs.push_back( bbs_for_reactants_1 ); + bbs.push_back( bbs_for_reactants_2 ); + + RandomSampleStrategy rgroups; + rgroups.initialize(rxn, bbs); + for(size_t i=0; i lprops = rxn.RunReactants(rvect); + ... + } + \endverbatim + + See EnumerationStrategyBase for more details and usage. +*/ +class RandomSampleStrategy : public EnumerationStrategyBase { + boost::uint64_t m_numPermutationsProcessed; + boost::minstd_rand m_rng; + std::vector > m_distributions; + + public: + RandomSampleStrategy() + : EnumerationStrategyBase(), + m_numPermutationsProcessed(), + m_rng(), + m_distributions() { + for (size_t i = 0; i < m_permutation.size(); ++i) { + m_distributions.push_back( + boost::random::uniform_int_distribution<>(0, m_permutation[i] - 1)); + } + } + + using EnumerationStrategyBase::initialize; + + virtual void initializeStrategy(const ChemicalReaction &, const EnumerationTypes::BBS &) { + m_distributions.clear(); + for (size_t i = 0; i < m_permutationSizes.size(); ++i) { + m_distributions.push_back(boost::random::uniform_int_distribution<>( + 0, m_permutationSizes[i] - 1)); + } + + m_numPermutationsProcessed = 0; + } + + virtual const char *type() const { return "RandomSampleStrategy"; } + + //! The current permutation {r1, r2, ...} + virtual const EnumerationTypes::RGROUPS &next() { + for (size_t i = 0; i < m_permutation.size(); ++i) { + m_permutation[i] = m_distributions[i](m_rng); + } + + ++m_numPermutationsProcessed; + + return m_permutation; + } + + virtual boost::uint64_t getPermutationIdx() const { + return m_numPermutationsProcessed; } + + virtual operator bool() const { return true; } + + EnumerationStrategyBase *copy() const { + return new RandomSampleStrategy(*this); + } + + private: +#ifdef RDK_USE_BOOST_SERIALIZATION + friend class boost::serialization::access; + + template + void save(Archive &ar, const unsigned int /*version*/) const { + // invoke serialization of the base class + ar << boost::serialization::base_object( + *this); + ar << m_numPermutationsProcessed; + + std::stringstream random; + random << m_rng; + std::string s = random.str(); + ar << s; + } + + template + void load(Archive &ar, const unsigned int /*version*/) { + // invoke serialization of the base class + ar >> boost::serialization::base_object(*this); + ar >> m_numPermutationsProcessed; + std::string s; + ar >> s; + std::stringstream random(s); + random >> m_rng; + + // reset the uniform distributions + m_distributions.clear(); + for (size_t i = 0; i < m_permutationSizes.size(); ++i) { + m_distributions.push_back(boost::random::uniform_int_distribution<>( + 0, m_permutationSizes[i] - 1)); + } + } + + template + void serialize(Archive &ar, const unsigned int file_version) { + boost::serialization::split_member(ar, *this, file_version); + } +#endif +}; +} + +#ifdef RDK_USE_BOOST_SERIALIZATION +BOOST_CLASS_VERSION(RDKit::RandomSampleStrategy, 1) +#endif + +#endif diff --git a/Code/GraphMol/ChemReactions/Enumerate/RandomSampleAllBBs.h b/Code/GraphMol/ChemReactions/Enumerate/RandomSampleAllBBs.h new file mode 100644 index 000000000..7b33a16eb --- /dev/null +++ b/Code/GraphMol/ChemReactions/Enumerate/RandomSampleAllBBs.h @@ -0,0 +1,186 @@ +// +// Copyright (c) 2015, Novartis Institutes for BioMedical Research Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// * Neither the name of Novartis Institutes for BioMedical Research Inc. +// nor the names of its contributors may be used to endorse or promote +// products derived from this software without specific prior written +// permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// + +#ifndef RGROUP_RANDOM_SAMPLE_ALLBBS_H +#define RGROUP_RANDOM_SAMPLE_ALLBBS_H + +#include "EnumerationStrategyBase.h" +#include +#include +#include + +namespace RDKit { +//! RandomSampleAllBBsStrategy +//! Randomly sample rgroup indices + +//! This is a class for randomly enumerating reagents that ensures all reagents +// are sampled. +/*! + basic usage: + + \verbatim + std::vector bbs; + bbs.push_back( bbs_for_reactants_1 ); + bbs.push_back( bbs_for_reactants_2 ); + + RandomSampleAllBBsStrategy rgroups; + rgroups.initialize(rxn, bbs); + for(size_t i=0; i lprops = rxn.RunReactants(rvect); + ... + } + \endverbatim + + See EnumerationStrategyBase for more details and usage. +*/ + +class RandomSampleAllBBsStrategy : public EnumerationStrategyBase { + boost::uint64_t m_numPermutationsProcessed; + size_t m_offset; + size_t m_maxoffset; + + boost::minstd_rand m_rng; + std::vector > m_distributions; + + public: + RandomSampleAllBBsStrategy() + : EnumerationStrategyBase(), + m_numPermutationsProcessed(0), + m_offset(0), + m_maxoffset(0), + m_rng(), + m_distributions() { + for (size_t i = 0; i < m_permutation.size(); ++i) { + m_distributions.push_back( + boost::random::uniform_int_distribution<>(0, m_permutation[i] - 1)); + } + } + using EnumerationStrategyBase::initialize; + + void initializeStrategy(const ChemicalReaction &, const EnumerationTypes::BBS &) { + m_distributions.clear(); + m_permutation.resize(m_permutationSizes.size()); + m_permutationSizes = m_permutationSizes; + m_offset = 0; + m_maxoffset = + *std::max_element(m_permutationSizes.begin(), m_permutationSizes.end()); + for (size_t i = 0; i < m_permutationSizes.size(); ++i) { + m_distributions.push_back(boost::random::uniform_int_distribution<>( + 0, m_permutationSizes[i] - 1)); + } + + m_numPermutationsProcessed = 0; + } + + virtual const char *type() const { return "RandomSampleAllBBsStrategy"; } + + //! The current permutation {r1, r2, ...} + virtual const EnumerationTypes::RGROUPS &next() { + if (m_offset >= m_maxoffset) { + for (size_t i = 0; i < m_permutation.size(); ++i) { + m_permutation[i] = m_distributions[i](m_rng); + } + m_offset = 0; + } else { + for (size_t i = 0; i < m_permutation.size(); ++i) { + m_permutation[i] = (m_permutation[i] + 1) % m_permutationSizes[i]; + } + ++m_offset; + } + ++m_numPermutationsProcessed; + + return m_permutation; + } + + virtual boost::uint64_t getPermutationIdx() const { + return m_numPermutationsProcessed; } + + virtual operator bool() const { return true; } + + EnumerationStrategyBase *copy() const { + return new RandomSampleAllBBsStrategy(*this); + } + + private: +#ifdef RDK_USE_BOOST_SERIALIZATION + friend class boost::serialization::access; + + template + void save(Archive &ar, const unsigned int /*version*/) const { + // invoke serialization of the base class + ar << boost::serialization::base_object( + *this); + ar << m_numPermutationsProcessed; + + std::stringstream random; + random << m_rng; + std::string s = random.str(); + ar << s; + + ar << m_offset; + ar << m_maxoffset; + } + + template + void load(Archive &ar, const unsigned int /*version*/) { + // invoke serialization of the base class + ar >> boost::serialization::base_object(*this); + ar >> m_numPermutationsProcessed; + std::string s; + ar >> s; + std::stringstream random(s); + random >> m_rng; + ar >> m_offset; + ar >> m_maxoffset; + + // reset the uniform distributions + m_distributions.clear(); + for (size_t i = 0; i < m_permutationSizes.size(); ++i) { + m_distributions.push_back(boost::random::uniform_int_distribution<>( + 0, m_permutationSizes[i] - 1)); + } + } + + template + void serialize(Archive &ar, const unsigned int file_version) { + boost::serialization::split_member(ar, *this, file_version); + } +#endif +}; +} + +#ifdef RDK_USE_BOOST_SERIALIZATION +BOOST_CLASS_VERSION(RDKit::RandomSampleAllBBsStrategy, 1) +#endif + +#endif diff --git a/Code/GraphMol/ChemReactions/Enumerate/testEnumerate.cpp b/Code/GraphMol/ChemReactions/Enumerate/testEnumerate.cpp new file mode 100644 index 000000000..27fff7efb --- /dev/null +++ b/Code/GraphMol/ChemReactions/Enumerate/testEnumerate.cpp @@ -0,0 +1,299 @@ +// +// Copyright (c) 2015, Novartis Institutes for BioMedical Research Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// * Neither the name of Novartis Institutes for BioMedical Research Inc. +// nor the names of its contributors may be used to endorse or promote +// products derived from this software without specific prior written +// permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include + +#ifdef RDK_USE_BOOST_SERIALIZATION +#include +#include +#include +#include +#endif + +using namespace RDKit; + +#ifdef RDK_USE_BOOST_SERIALIZATION +// for each starting point check to see that the archive +// starts at the same point +void pickleTest(EnumerationStrategyBase &en, size_t len) { + boost::shared_ptr base(en.copy()); + TEST_ASSERT(std::string(base->type()) == std::string(en.type())); + + for (size_t i = 0; i < len; ++i) { + std::stringstream ss; + { + boost::archive::text_oarchive ar(ss); + ar &base; + } + boost::shared_ptr copy; + { + boost::archive::text_iarchive ar(ss); + ar © + } + TEST_ASSERT(std::string(base->type()) == std::string(copy->type())); + TEST_ASSERT(base->next() == copy->next()); + TEST_ASSERT(base->getPosition() == en.next()); + } +} +#endif + +void testSamplers() { + EnumerationTypes::BBS bbs; + bbs.resize(3); + for (int i = 0; i < 10; ++i) + bbs[0].push_back(boost::shared_ptr(SmilesToMol("C=CCN=C=S"))); + + for (int i = 0; i < 5; ++i) + bbs[1].push_back(boost::shared_ptr(SmilesToMol("NCc1ncc(Cl)cc1Br"))); + + for (int i = 0; i < 6; ++i) + bbs[2].push_back( + boost::shared_ptr(SmilesToMol("NCCCc1ncc(Cl)cc1Br"))); + + ChemicalReaction rxn; + CartesianProductStrategy cart; + cart.initialize(rxn, bbs); + RandomSampleStrategy rand; + rand.initialize(rxn, bbs); + RandomSampleAllBBsStrategy randBBs; + randBBs.initialize(rxn, bbs); + EvenSamplePairsStrategy even; + even.initialize(rxn, bbs); + std::vector > enumerators; + enumerators.push_back( + boost::shared_ptr(cart.copy())); + enumerators.push_back( + boost::shared_ptr(rand.copy())); + enumerators.push_back( + boost::shared_ptr(randBBs.copy())); + enumerators.push_back( + boost::shared_ptr(even.copy())); +#ifdef RDK_USE_BOOST_SERIALIZATION + for (size_t i = 0; i < enumerators.size(); ++i) { + TEST_ASSERT(enumerators[i]->getNumPermutations() == 10 * 5 * 6); + pickleTest(*enumerators[i], 10 * 5 * 6); + } +#endif + // for(auto&& i: enumerators) { + // TEST_ASSERT(i->getNumPermutations() == 10*5*6); + //} +} + +void testEvenSamplers() { + EnumerationTypes::BBS bbs; + bbs.resize(3); + unsigned long R1 = 6000; + unsigned long R2 = 500; + unsigned long R3 = 10000; + for (unsigned long i = 0; i < R1; ++i) + bbs[0].push_back(boost::shared_ptr(SmilesToMol("C=CCN=C=S"))); + + for (unsigned long i = 0; i < R2; ++i) + bbs[1].push_back(boost::shared_ptr(SmilesToMol("NCc1ncc(Cl)cc1Br"))); + + for (unsigned long i = 0; i < R3; ++i) + bbs[2].push_back( + boost::shared_ptr(SmilesToMol("NCCCc1ncc(Cl)cc1Br"))); + + ChemicalReaction rxn; + EvenSamplePairsStrategy even; + even.initialize(rxn, bbs); + std::cout << even.getNumPermutations() << " " << R1 * R2 * R3 << std::endl; + TEST_ASSERT(even.getNumPermutations() == R1 * R2 * R3); + + for (size_t i = 0; i < 5000; ++i) { + even.next(); + } + even.stats(); +} + +const char *smiresults[] = { + "C=CCNC(=S)NCc1ncc(Cl)cc1Br", "CC=CCNC(=S)NCc1ncc(Cl)cc1Br", + "C=CCNC(=S)NCCc1ncc(Cl)cc1Br", "CC=CCNC(=S)NCCc1ncc(Cl)cc1Br", + "C=CCNC(=S)NCCCc1ncc(Cl)cc1Br", "CC=CCNC(=S)NCCCc1ncc(Cl)cc1Br"}; + +void testEnumerations() { + EnumerationTypes::BBS bbs; + bbs.resize(2); + + bbs[0].push_back(boost::shared_ptr(SmilesToMol("C=CCN=C=S"))); + bbs[0].push_back(boost::shared_ptr(SmilesToMol("CC=CCN=C=S"))); + + bbs[1].push_back(boost::shared_ptr(SmilesToMol("NCc1ncc(Cl)cc1Br"))); + bbs[1].push_back(boost::shared_ptr(SmilesToMol("NCCc1ncc(Cl)cc1Br"))); + bbs[1].push_back(boost::shared_ptr(SmilesToMol("NCCCc1ncc(Cl)cc1Br"))); + + ChemicalReaction *rxn = RxnSmartsToChemicalReaction( + "[N;$(N-[#6]):3]=[C;$(C=S):1].[N;$(N[#6]);!$(N=*);!$([N-]);!$(N#*);" + "!$([ND3]);!$([ND4]);!$(N[O,N]);!$(N[C,S]=[S,O,N]):2]>>[N:3]-[C:1]-[N+0:" + "2]"); + + { + EnumerateLibrary en(*rxn, bbs); + size_t i = 0; + for (; (bool)en; ++i) { + std::vector > res = en.nextSmiles(); + TEST_ASSERT(res.size() == 1); + TEST_ASSERT(res[0].size() == 1); + TEST_ASSERT(res[0][0] == smiresults[i]); + TEST_ASSERT(i<=6); + } + TEST_ASSERT(i == 6); + // tests reset + en.resetState(); + i = 0; + for (; (bool)en; ++i) { + std::vector > res = en.nextSmiles(); + TEST_ASSERT(res.size() == 1); + TEST_ASSERT(res[0].size() == 1); + TEST_ASSERT(res[0][0] == smiresults[i]); + TEST_ASSERT(i<=6); + } + TEST_ASSERT(i == 6); + + } + +#ifdef RDK_USE_BOOST_SERIALIZATION + { + + boost::shared_ptr en( + new EnumerateLibrary(*rxn, bbs, RandomSampleStrategy())); + + std::vector > >smir; + for (size_t j = 0; j < 10; ++j) { + std::vector > smiles = en->nextSmiles(); + smir.push_back(smiles); + } + + en->resetState(); + + for (size_t i = 0; i < 1000; ++i) { + // pickle and unpickle + std::stringstream ss; + { + boost::archive::text_oarchive ar(ss); + ar &en; + } + boost::shared_ptr copy; + { + boost::archive::text_iarchive ar(ss); + ar © + } + + for (size_t j = 0; j < 10; ++j) { + TEST_ASSERT(en->nextSmiles() == copy->nextSmiles()); + } + + copy->resetState(); + for (size_t j = 0; j < 10; ++j) { + TEST_ASSERT(smir[j] == copy->nextSmiles()); + } + } + } +#endif + delete rxn; +} + +const char *rxndata = "$RXN\nUntitled Document-1\n ChemDraw10291618492D\n\n 3 1\n$MOL\n\n\n\n 2 1 0 0 0 0 0 0 0 0999 V2000\n 0.4125 0.0000 0.0000 N 0 0 0 0 0 0 0 0 0 3 0 0\n -0.4125 0.0000 0.0000 R2 0 0 0 0 0 0 0 0 0 2 0 0\n 1 2 1 0 0\nM END\n$MOL\n\n\n\n 2 1 0 0 0 0 0 0 0 0999 V2000\n -0.4125 0.0000 0.0000 R1 0 0 0 0 0 0 0 0 0 1 0 0\n 0.4125 0.0000 0.0000 Cl 0 0 0 0 0 0 0 0 0 0 0 0\n 1 2 1 0 0\nM END\n$MOL\n\n\n\n 2 1 0 0 0 0 0 0 0 0999 V2000\n 0.4125 0.0000 0.0000 N 0 0 0 0 0 0 0 0 0 5 0 0\n -0.4125 0.0000 0.0000 R4 0 0 0 0 0 0 0 0 0 4 0 0\n 1 2 1 0 0\nM END\n$MOL\n\n\n\n 14 15 0 0 0 0 0 0 0 0999 V2000\n 0.5072 -0.5166 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n 0.5072 0.3084 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n 1.2949 -0.7616 0.0000 N 0 0 0 0 0 0 0 0 0 0 0 0\n 1.7817 -0.0880 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n 1.2967 0.5794 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n 1.5558 -1.5443 0.0000 R1 0 0 0 0 0 0 0 0 0 1 0 0\n -0.2073 0.7208 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n -0.9218 0.3083 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n -0.9217 -0.5167 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n -0.2073 -0.9292 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n -1.6362 0.7208 0.0000 N 0 0 0 0 0 0 0 0 0 3 0 0\n 1.5452 1.3661 0.0000 N 0 0 0 0 0 0 0 0 0 5 0 0\n 2.3507 1.5443 0.0000 R4 0 0 0 0 0 0 0 0 0 4 0 0\n -2.3507 0.3083 0.0000 R2 0 0 0 0 0 0 0 0 0 2 0 0\n 1 2 2 0 0\n 1 3 1 0 0\n 3 4 1 0 0\n 4 5 1 0 0\n 5 2 1 0 0\n 3 6 1 0 0\n 2 7 1 0 0\n 7 8 2 0 0\n 8 9 1 0 0\n 9 10 2 0 0\n 10 1 1 0 0\n 8 11 1 0 0\n 12 13 1 0 0\n 11 14 1 0 0\n 12 5 1 0 0\nM END\n"; + +void testInsaneEnumerations() { + EnumerationTypes::BBS bbs; + bbs.resize(3); + + ChemicalReaction *rxn2 = RxnBlockToChemicalReaction(rxndata); + //RxnOps::sanitizeRxn(*rxn2, MolOps::AdjustQueryParameters()); + MatchVectType tvect; + + bbs[0].push_back(boost::shared_ptr(SmilesToMol("CCNCC"))); + bbs[0].push_back(boost::shared_ptr(SmilesToMol("NCC"))); + std::cerr << "0,0 " << (int)SubstructMatch(*bbs[0][0].get(), *rxn2->getReactants()[0].get(), tvect) << std::endl; + std::cerr << "0,1 " << (int)SubstructMatch(*bbs[0][1].get(), *rxn2->getReactants()[0].get(), tvect) << std::endl; + + bbs[1].push_back(boost::shared_ptr(SmilesToMol("ClC1CCC1"))); + bbs[1].push_back(boost::shared_ptr(SmilesToMol("ClC1CCC1Cl"))); + std::cerr << "1,0 " << (int)SubstructMatch(*bbs[1][0].get(), *rxn2->getReactants()[1].get(), tvect) << std::endl; + std::cerr << "1,1 " << (int)SubstructMatch(*bbs[1][1].get(), *rxn2->getReactants()[1].get(), tvect) << std::endl; + + bbs[2].push_back(boost::shared_ptr(SmilesToMol("CCNCC"))); + bbs[2].push_back(boost::shared_ptr(SmilesToMol("NCC"))); + std::cerr << "2,0 " << (int)SubstructMatch(*bbs[2][0].get(), *rxn2->getReactants()[2].get(), tvect) << std::endl; + std::cerr << "2,1 " << (int)SubstructMatch(*bbs[2][1].get(), *rxn2->getReactants()[2].get(), tvect) << std::endl; + + + { + ChemicalReaction *rxn = RxnBlockToChemicalReaction(rxndata); + RxnOps::sanitizeRxn(*rxn, MolOps::AdjustQueryParameters()); + std::cerr << ChemicalReactionToRxnBlock(*rxn) << std::endl; + EnumerationParams ThereCanBeOnlyOne; + ThereCanBeOnlyOne.reagentMaxMatchCount = 1; + EnumerationTypes::BBS bbs2 = removeNonmatchingReagents( + *rxn, bbs, + ThereCanBeOnlyOne); + TEST_ASSERT(bbs2[0].size() == 1); + TEST_ASSERT(bbs2[1].size() == 1); + TEST_ASSERT(bbs2[2].size() == 1); + + delete rxn; + } + delete rxn2; +} + +int main(int argc, char *argv[]) { + RDLog::InitLogs(); + bool doLong = false; + if (argc > 1) { + if (!strncmp(argv[1], "-l", 2)) { + doLong = true; + } + } + + /* + testSamplers(); + testEvenSamplers(); + testEnumerations(); + */ + testInsaneEnumerations(); +} diff --git a/Code/GraphMol/ChemReactions/PreprocessRxn.cpp b/Code/GraphMol/ChemReactions/PreprocessRxn.cpp index 089f5f6ba..2b55826b3 100644 --- a/Code/GraphMol/ChemReactions/PreprocessRxn.cpp +++ b/Code/GraphMol/ChemReactions/PreprocessRxn.cpp @@ -89,6 +89,7 @@ bool preprocessReaction(ChemicalReaction &rxn, const std::map &queries, const std::string &propName) { rxn.setImplicitPropertiesFlag(true); + rxn.initReactantMatchers(); if (rxn.validate(numWarnings, numErrors)) { addRecursiveQueriesToReaction(rxn, diff --git a/Code/GraphMol/ChemReactions/SanitizeRxn.cpp b/Code/GraphMol/ChemReactions/SanitizeRxn.cpp new file mode 100644 index 000000000..86c35176f --- /dev/null +++ b/Code/GraphMol/ChemReactions/SanitizeRxn.cpp @@ -0,0 +1,391 @@ +// +// Copyright (c) 2015, Novartis Institutes for BioMedical Research Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// * Neither the name of Novartis Institutes for BioMedical Research Inc. +// nor the names of its contributors may be used to endorse or promote +// products derived from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +#include "SanitizeRxn.h" +#include +#include + +namespace RDKit { +namespace RxnOps { + +// molAtomMapNumber ==> int +// molFileRLabel ==> unsigned int +namespace { +template +T getMaxProp(ChemicalReaction &rxn, const std::string &prop) { + T max_atom = (T)0; + for(MOL_SPTR_VECT::iterator it = rxn.beginReactantTemplates(); + it != rxn.endReactantTemplates(); + ++it) { + for (ROMol::AtomIterator atIt = (*it)->beginAtoms(); + atIt != (*it)->endAtoms(); + ++atIt) { + Atom *atom = (*atIt); + T map; + if (atom->getPropIfPresent(prop, map)) { + if (map > max_atom) + max_atom = map; + } + } + } + + for(MOL_SPTR_VECT::iterator it = rxn.beginAgentTemplates(); + it != rxn.endAgentTemplates(); + ++it) { + for (ROMol::AtomIterator atIt = (*it)->beginAtoms(); + atIt != (*it)->endAtoms(); + ++atIt) { + Atom *atom = (*atIt); + T map; + if (atom->getPropIfPresent(prop, map)) { + if (map > max_atom) + max_atom = map; + } + } + } + + for(MOL_SPTR_VECT::iterator it = rxn.beginProductTemplates(); + it != rxn.endProductTemplates(); + ++it) { + for (ROMol::AtomIterator atIt = (*it)->beginAtoms(); + atIt != (*it)->endAtoms(); + ++atIt) { + Atom *atom = (*atIt); + T map; + if (atom->getPropIfPresent(prop, map)) { + if (map > max_atom) + max_atom = map; + } + } + } + + return max_atom; +} + +struct AtomInfo { + Atom * atom; + unsigned int templateIdx; + unsigned int rlabel; + int atomMap; + int isotope; + std::string dummyLabel; + AtomInfo(Atom *at, unsigned int templateIdx) : + atom(at), templateIdx(templateIdx), rlabel(0), atomMap(0), + isotope(at->getIsotope()), dummyLabel() { + atom->getPropIfPresent(common_properties::_MolFileRLabel, rlabel); + atom->getPropIfPresent(common_properties::molAtomMapNumber, atomMap); + atom->getPropIfPresent(common_properties::dummyLabel, dummyLabel); + //std::cerr << atom->getIdx() << " : " << atom->getAtomicNum() << " " << + // " rgroup: " << rlabel << " atomMap " << atomMap << " isotope " << isotope << + // " label " << dummyLabel << + // std::endl; + } + + bool NeedsRLabel() { + return atom->getAtomicNum() == 0 && rlabel == 0; + } + + unsigned int bestGuessRLabel() { + if (rlabel) return rlabel; + if (isotope) return isotope; + if (atomMap) return atomMap; + if (dummyLabel.size()) { + try { + return boost::lexical_cast( + dummyLabel.substr(1,dummyLabel.size()-1)); + } catch (...) { + return 0; + } + } + return 0; + } + + void setRLabel(unsigned int rlabel) { + PRECONDITION(atom, "Internal error in SanitizeRxn - null atom"); + RWMol &mol = dynamic_cast(atom->getOwningMol()); + + QueryAtom qatom(*atom); + qatom.setProp(common_properties::_MolFileRLabel, rlabel); + std::string dLabel = "R" + boost::lexical_cast(rlabel); + qatom.setProp(common_properties::dummyLabel, dLabel); + if (rlabel > 0 && rlabel < 999) { + qatom.setIsotope(rlabel); + } + qatom.setQuery(makeAtomNullQuery()); + unsigned int idx = atom->getIdx(); + mol.replaceAtom(idx, &qatom); + atom = mol.getAtomWithIdx(idx); + } + + void setAtomMap(int map) { + atom->setProp(common_properties::molAtomMapNumber, map); + } +}; + +std::string makeReactantErrorMessage(const std::string &error, + const AtomInfo &at) { + std::ostringstream str; + str << error << " for reactant idx: " << at.templateIdx << + " atom: " << at.atom->getIdx(); + return str.str(); +} + +std::string makeProductErrorMessage(const std::string &error, + const AtomInfo &at) { + std::ostringstream str; + str << error << " for product idx: " << at.templateIdx << + " atom: " << at.atom->getIdx(); + return str.str(); +} +} + +// if we have query atoms without rlabels, make proper rlabels if possible +// ensure that every rlabel in the reactant has one in the product +void fixRGroups(ChemicalReaction &rxn) { + std::map remapped; + std::vector reactantAtomsToFix; + std::vector productAtomsToFix; + + unsigned int templateIdx = 0; + for(MOL_SPTR_VECT::iterator it = rxn.beginReactantTemplates(); + it != rxn.endReactantTemplates(); + ++it, ++templateIdx) { + for (ROMol::AtomIterator atIt = (*it)->beginAtoms(); + atIt != (*it)->endAtoms(); + ++atIt) { + Atom *atom = (*atIt); + AtomInfo at(atom, templateIdx); + if (at.NeedsRLabel()) + reactantAtomsToFix.push_back(at); + } + } + + templateIdx = 0; + for(MOL_SPTR_VECT::iterator it = rxn.beginProductTemplates(); + it != rxn.endProductTemplates(); + ++it, ++templateIdx) { + for (ROMol::AtomIterator atIt = (*it)->beginAtoms(); + atIt != (*it)->endAtoms(); + ++atIt) { + Atom *atom = (*atIt); + AtomInfo at(atom, templateIdx); + if (at.NeedsRLabel()) + productAtomsToFix.push_back(at); + } + } + + if (!reactantAtomsToFix.size() && !productAtomsToFix.size()) + return; + + if( reactantAtomsToFix.size() != productAtomsToFix.size() ) { + std::ostringstream str; + str << "Mismatched rlabels: " << + reactantAtomsToFix.size() << " unmapped reactant rlabels," << + productAtomsToFix.size() << " unmappped product rlabels" ; + throw RxnSanitizeException(str.str()); + } + + + unsigned int max_rlabel = getMaxProp(rxn, + common_properties::_MolFileRLabel); + int max_atom_map = getMaxProp(rxn, + common_properties::molAtomMapNumber); + + BOOST_FOREACH(AtomInfo &rat, reactantAtomsToFix) { + bool found = false; + unsigned int bestGuess = rat.bestGuessRLabel(); + if (!bestGuess) { + throw RxnSanitizeException(makeReactantErrorMessage( + "Could not deduce RLabel", rat)); + } + + BOOST_FOREACH(AtomInfo &pat, productAtomsToFix) { + if (!pat.atom) continue; + + if(rat.bestGuessRLabel() == pat.bestGuessRLabel()) { + // if the atomMaps don't match, this is bad, no atomMap is ok(==0) + if (rat.atomMap == pat.atomMap) { + found = true; + rat.setRLabel( max_rlabel + rat.bestGuessRLabel() ); + pat.setRLabel( max_rlabel + pat.bestGuessRLabel() ); + if (!rat.atomMap) { // set atom mapping as well + rat.setAtomMap(max_atom_map + rat.bestGuessRLabel()); + pat.setAtomMap(max_atom_map + rat.bestGuessRLabel()); + } + pat.atom = NULL; // don't match again + break; + } + } + } + + if(!found) { + throw RxnSanitizeException(makeReactantErrorMessage( + "Could not find RLabel mapping", rat)); + } + } + return; +} + +// if we have query atoms without rlabels, make proper rlabels if possible +// ensure that every rlabel in the reactant has one in the product + +void fixAtomMaps(ChemicalReaction &rxn) { + int max_atom_map = getMaxProp( + rxn, + common_properties::molAtomMapNumber); + std::map potential_mappings; + + unsigned int templateIdx = 0; + + for(MOL_SPTR_VECT::iterator it = rxn.beginReactantTemplates(); + it != rxn.endReactantTemplates(); + ++it, ++templateIdx) { + for (ROMol::AtomIterator atIt = (*it)->beginAtoms(); + atIt != (*it)->endAtoms(); + ++atIt) { + Atom *atom = (*atIt); + AtomInfo at(atom, templateIdx); + if(at.rlabel && !at.atomMap) { + if(potential_mappings.find(at.rlabel) != potential_mappings.end()) { + throw RxnSanitizeException(std::string("Duplicated RLabels")); + } + int map = potential_mappings[at.rlabel] = rdcast(at.rlabel)+max_atom_map; + at.setAtomMap(map); + } + } + } + + + if (!potential_mappings.size()) + return; // everything is ok! + + templateIdx = 0; + for(MOL_SPTR_VECT::iterator it = rxn.beginProductTemplates(); + it != rxn.endProductTemplates(); + ++it, ++templateIdx) { + for (ROMol::AtomIterator atIt = (*it)->beginAtoms(); + atIt != (*it)->endAtoms(); + ++atIt) { + Atom *atom = (*atIt); + AtomInfo at(atom, templateIdx); + if(at.rlabel) { + if(!at.atomMap) { + at.setAtomMap(potential_mappings[at.rlabel]); + } else { + if (at.atomMap != potential_mappings[at.rlabel]) { + throw RxnSanitizeException(makeProductErrorMessage( + "RLabel is mapped in product but not in reactant", at)); + } + } + } + } + } +} + +// might throw mol sanitization exception??? wrap in RxnSanitize? +void fixReactantTemplateAromaticity(ChemicalReaction &rxn) { + unsigned int ops; + for(MOL_SPTR_VECT::iterator it = rxn.beginReactantTemplates(); + it != rxn.endReactantTemplates(); + ++it) { + RWMol * rw = dynamic_cast(it->get()); + if (rw) + sanitizeMol(*rw, ops, MolOps::SANITIZE_SETAROMATICITY); + else + PRECONDITION(rw, "Oops, not really a RWMol?"); + } +} + +void fixHs(ChemicalReaction &rxn) { + const bool mergeUnmappedOnly = true; + for(MOL_SPTR_VECT::iterator it = rxn.beginReactantTemplates(); + it != rxn.endReactantTemplates(); + ++it) { + RWMol * rw = dynamic_cast(it->get()); + if (rw) + MolOps::mergeQueryHs(*rw, mergeUnmappedOnly); + else + PRECONDITION(rw, "Oops, not really a RWMol?"); + } +} + +void adjustTemplates(ChemicalReaction &rxn, + const MolOps::AdjustQueryParameters ¶ms) { + if(!params.adjustDegree && !params.adjustRingCount) { + return; + } + + for(MOL_SPTR_VECT::iterator it = rxn.beginReactantTemplates(); + it != rxn.endReactantTemplates(); + ++it) { + RWMol * rw = dynamic_cast(it->get()); + if (rw) + adjustQueryProperties(*rw, ¶ms); + else + PRECONDITION(rw, "Oops, not really a RWMol?"); + } +} +void sanitizeRxn(ChemicalReaction &rxn, + unsigned int &operationsThatFailed, + unsigned int ops, + const MolOps::AdjustQueryParameters ¶ms) +{ + operationsThatFailed = SANITIZE_NONE; + + if (ops & SANITIZE_RGROUP_NAMES) { + operationsThatFailed = SANITIZE_RGROUP_NAMES; + fixRGroups(rxn); + } + + if (ops & SANITIZE_ATOM_MAPS) { + operationsThatFailed = SANITIZE_ATOM_MAPS; + fixAtomMaps(rxn); + } + + if (ops & SANITIZE_ADJUST_REACTANTS) { + operationsThatFailed = SANITIZE_ADJUST_REACTANTS; + adjustTemplates(rxn, params); + } + if (ops & SANITIZE_MERGEHS) { + operationsThatFailed = SANITIZE_MERGEHS; + fixHs(rxn); + } + + operationsThatFailed = SANITIZE_NONE; +} + +void sanitizeRxn(ChemicalReaction &rxn, const MolOps::AdjustQueryParameters ¶ms) { + unsigned int ops = 0; + return sanitizeRxn(rxn, ops, SANITIZE_ALL, params); +} + + +} +} diff --git a/Code/GraphMol/ChemReactions/SanitizeRxn.h b/Code/GraphMol/ChemReactions/SanitizeRxn.h new file mode 100644 index 000000000..9893e4505 --- /dev/null +++ b/Code/GraphMol/ChemReactions/SanitizeRxn.h @@ -0,0 +1,150 @@ +// +// Copyright (c) 2015, Novartis Institutes for BioMedical Research Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// * Neither the name of Novartis Institutes for BioMedical Research Inc. +// nor the names of its contributors may be used to endorse or promote +// products derived from this software without specific prior written +// permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +#ifndef RDKIT_SANITIZERXN_H +#define RDKIT_SANITIZERXN_H + +#include "Reaction.h" +#include +#include +#include + +namespace RDKit { + +//! class for flagging sanitization errors +class RxnSanitizeException : public std::exception { + public: + RxnSanitizeException(const char *msg) : _msg(msg){}; + RxnSanitizeException(const std::string &msg) : _msg(msg){}; + const char *message() const { return _msg.c_str(); }; + ~RxnSanitizeException() throw(){}; + + private: + std::string _msg; +}; + + +namespace RxnOps { +//! Any dummy atom with a map but no RGroup label, should be an RGroup +//! in RDKit's view of a reaction. +//! See if these atoms can be salvaged into RGroups. +void fixRGroups(ChemicalReaction &rxn); + +//! If atom maps are not defined on rgroups, attempt to deduce them from the RGroup +//! labels, or add new ones if possible. +void fixAtomMaps(ChemicalReaction &rxn); + + +//! Adjusts the reactant templates to properly match reagents +void adjustTemplates(ChemicalReaction &rxn, const MolOps::AdjustQueryParameters ¶ms); + +//! merge query Hs if appropriate +void fixHs(ChemicalReaction &rxn); + +// Default adjustment parameters for matching reagents +inline const MolOps::AdjustQueryParameters DefaultRxnAdjustParams() { + MolOps::AdjustQueryParameters params; + params.adjustDegree = false; + params.adjustDegreeFlags = MolOps::ADJUST_IGNOREALL; + params.adjustRingCount = false; + params.adjustRingCountFlags = MolOps::ADJUST_IGNOREALL; + params.makeDummiesQueries = false; + params.aromatizeIfPossible = true; + return params; +} + +// Default adjustment parameters for ChemDraw style matching of reagents +inline const MolOps::AdjustQueryParameters ChemDrawRxnAdjustParams() { + MolOps::AdjustQueryParameters params; + params.adjustDegree = true; + params.adjustDegreeFlags = MolOps::ADJUST_IGNOREDUMMIES; + params.adjustRingCount = false; + params.adjustRingCountFlags = MolOps::ADJUST_IGNORENONE; + params.makeDummiesQueries = false; + params.aromatizeIfPossible = true; + return params; +} + +typedef enum { + SANITIZE_NONE = 0x0, + SANITIZE_RGROUP_NAMES = 0x1, + SANITIZE_ATOM_MAPS = 0x2, + SANITIZE_ADJUST_REACTANTS = 0x4, + SANITIZE_MERGEHS = 0x8, + SANITIZE_ALL = 0xFFFFFFFF +} SanitizeRxnFlags; + +//! \brief carries out a collection of tasks for cleaning up a reaction and +// ensuring +//! that it makes "chemical sense" in the context of RDKit reacitons +/*! + This functions calls the following in sequence + -# RxnOps::fixRGroups() + -# RxnOps::fixupAtomMaps() + -# RxnOps::fixupTemplateAromaticity() + -# RxnOps::mergeHs() + + \param rxn : the ChemicalReaction to be cleaned + + \param operationThatFailed : the first (if any) sanitization operation that + fails is set here. + The values are taken from the \c SanitizeFlags + enum. + On success, the value is \c + SanitizeFlags::SANITIZE_NONE + + \param sanitizeOps : the bits here are used to set which sanitization + operations are carried + out. The elements of the \c SanitizeFlags enum define + the operations. + + Notes: + - This attempts to fix known issues with certain reaction drawers. + HOWEVER, if any flag is returned in operationsPerformed, + the reaction may still be suspect to its validity. + - Aromaticity can be tricky when starting with Kekule structures that + have query features, aromaticity works well for non-query rings, however + certain structures (substitutions on Kekule rings that should really be + aromatic) may not have enough information. +*/ + +void sanitizeRxn(ChemicalReaction &rxn, + unsigned int &operationsThatFailed, + unsigned int sanitizeOps = SANITIZE_ALL, + const MolOps::AdjustQueryParameters ¶ms = DefaultRxnAdjustParams()); +//! \overload +void sanitizeRxn(ChemicalReaction &rxn, + const MolOps::AdjustQueryParameters ¶ms = DefaultRxnAdjustParams()); + +} +} + +#endif diff --git a/Code/GraphMol/ChemReactions/Wrap/CMakeLists.txt b/Code/GraphMol/ChemReactions/Wrap/CMakeLists.txt index bd2aad5b4..ea506369b 100644 --- a/Code/GraphMol/ChemReactions/Wrap/CMakeLists.txt +++ b/Code/GraphMol/ChemReactions/Wrap/CMakeLists.txt @@ -1,9 +1,15 @@ rdkit_python_extension(rdChemReactions + Enumerate.cpp rdChemReactions.cpp DEST Chem LINK_LIBRARIES -ChemReactions ChemTransforms Descriptors Fingerprints Subgraphs DataStructs Depictor FileParsers SmilesParse SubstructMatch GraphMol Catalogs FilterCatalog RDGeneral RDGeometryLib RDBoost) +ChemReactions FilterCatalog ChemTransforms Descriptors Fingerprints Subgraphs DataStructs Depictor FileParsers SmilesParse SubstructMatch GraphMol Catalogs FilterCatalog RDGeneral RDGeometryLib RDBoost ${Boost_SERIALIZATION_LIBRARY}) add_pytest(pyChemReactions ${CMAKE_CURRENT_SOURCE_DIR}/testReactionWrapper.py) +add_pytest(pyChemReactionEnumerations + ${CMAKE_CURRENT_SOURCE_DIR}/testEnumerations.py) + +add_pytest(pyChemReactionSanitize + ${CMAKE_CURRENT_SOURCE_DIR}/testSanitize.py) diff --git a/Code/GraphMol/ChemReactions/Wrap/Enumerate.cpp b/Code/GraphMol/ChemReactions/Wrap/Enumerate.cpp new file mode 100644 index 000000000..aa208fe25 --- /dev/null +++ b/Code/GraphMol/ChemReactions/Wrap/Enumerate.cpp @@ -0,0 +1,435 @@ +// +// Copyright (c) 2015, Novartis Institutes for BioMedical Research Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// * Neither the name of Novartis Institutues for BioMedical Research Inc. +// nor the names of its contributors may be used to endorse or promote +// products derived from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +#include +#include +#include +#include +#include +#include +#include + +namespace python = boost::python; + + +namespace RDKit { + +template +std::vector ConvertToVect(T bbs) { + std::vector vect; + size_t num_bbs = python::extract(bbs.attr("__len__")()); + vect.resize(num_bbs); + for(size_t i=0; i(bbs[i].attr("__len__")()); + RDKit::MOL_SPTR_VECT &reacts = vect[i]; + reacts.reserve(len1); + for(unsigned int j=0;j(bbs[i][j]); + if(mol) + reacts.push_back(mol); + else { + throw_value_error("reaction called with non molecule reactant"); + } + } + } + return vect; +} + + +bool EnumerateLibraryBase__nonzero__(RDKit::EnumerateLibraryBase *base) { + return static_cast(*base); +} +bool EnumerationStrategyBase__nonzero__(RDKit::EnumerationStrategyBase *base) { + return static_cast(*base); +} + +inline python::object pass_through(python::object const& o) { return o; } + +PyObject *EnumerateLibraryBase__next__(RDKit::EnumerateLibraryBase *base) { + if (!static_cast(*base)) { + PyErr_SetString(PyExc_StopIteration, "Enumerations exhausted"); + boost::python::throw_error_already_set(); + } + std::vector mols; + { + NOGIL gil; + mols = base->next(); + } + PyObject *res=PyTuple_New(mols.size()); + + for(unsigned int i=0;i(PyBytes_FromStringAndSize(res.c_str(), res.length()))); + return retval; +} + +class EnumerateLibraryWrap : public RDKit::EnumerateLibrary { +public: + EnumerateLibraryWrap() : RDKit::EnumerateLibrary() {} + EnumerateLibraryWrap(const RDKit::ChemicalReaction &rxn, python::list ob, + const EnumerationParams & params = EnumerationParams() + ) : + RDKit::EnumerateLibrary(rxn, ConvertToVect(ob), params) { + } + + EnumerateLibraryWrap(const RDKit::ChemicalReaction &rxn, python::tuple ob, + const EnumerationParams & params = EnumerationParams() + ) : + RDKit::EnumerateLibrary(rxn, ConvertToVect(ob), params) { + } + + EnumerateLibraryWrap(const RDKit::ChemicalReaction &rxn, python::list ob, + const EnumerationStrategyBase &enumerator, + const EnumerationParams & params = EnumerationParams() + ) : + RDKit::EnumerateLibrary(rxn, ConvertToVect(ob), enumerator, params) { + } + + EnumerateLibraryWrap(const RDKit::ChemicalReaction &rxn, python::tuple ob, + const EnumerationStrategyBase &enumerator, + const EnumerationParams & params = EnumerationParams()) : + RDKit::EnumerateLibrary(rxn, ConvertToVect(ob), enumerator, params) { + } + +}; + +namespace { + template< typename T > + inline + std::vector< T > to_std_vector( const python::object& iterable ) + { + return std::vector< T >( python::stl_input_iterator< T >( iterable ), + python::stl_input_iterator< T >( ) ); + } +} + +void ToBBS(EnumerationStrategyBase &rgroup, ChemicalReaction &rxn, python::list ob) { + rgroup.initialize(rxn, ConvertToVect(ob)); +} + +typedef std::vector VectSizeT; +typedef std::vector > VectStringVect; +typedef std::vector VectMolVect; + +struct enumeration_wrapper { + static void wrap() { + std::string docString; + python::class_("VectorOfStringVectors") + .def(python::vector_indexing_suite() ); + + python::class_("VectSizeT") + .def(python::vector_indexing_suite() ); + + python::class_("VectMolVect") + .def(python::vector_indexing_suite() ); + + python::class_( + "EnumerateLibraryBase", python::no_init) + .def("__nonzero__", &EnumerateLibraryBase__nonzero__) + .def("__bool__", &EnumerateLibraryBase__nonzero__) + .def("__iter__", &pass_through) + .def("next", &EnumerateLibraryBase__next__, + "Return the next molecule from the enumeration.") + .def("__next__", &EnumerateLibraryBase__next__, + "Return the next molecule from the enumeration.") + .def("nextSmiles", &RDKit::EnumerateLibraryBase::nextSmiles, + "Return the next smiles string from the enumeration.") + .def("Serialize", &EnumerateLibraryBase_Serialize, + "Serialize the library to a binary string.\n" + "Note that the position in the library is serialized as well. Care should\n" + "be taken when serializing. See GetState/SetState for position manipulation.") + .def("InitFromString", &RDKit::EnumerateLibraryBase::initFromString, + python::arg("data"), + "Inititialize the library from a binary string") + .def("GetPosition", &RDKit::EnumerateLibraryBase::getPosition, + "Returns the current enumeration position into the reagent vectors", + python::return_internal_reference< + 1, python::with_custodian_and_ward_postcall<0, 1> >()) + .def("GetState", &RDKit::EnumerateLibraryBase::getState, + "Returns the current enumeration state (position) of the library.\n" + "This position can be used to restart the library from a known position") + .def("SetState", &RDKit::EnumerateLibraryBase::setState, + python::arg("state"), + "Sets the enumeration state (position) of the library.") + .def("ResetState", &RDKit::EnumerateLibraryBase::resetState, + "Returns the current enumeration state (position) of the library to the start.") + .def("GetReaction", &RDKit::EnumerateLibraryBase::getReaction, + "Returns the chemical reaction for this library", + python::return_internal_reference< + 1, python::with_custodian_and_ward_postcall<0, 1> >()) + .def("GetEnumerator", &RDKit::EnumerateLibraryBase::getEnumerator, + "Returns the enumation strategy for the current library", + python::return_internal_reference< + 1, python::with_custodian_and_ward_postcall<0, 1> >()); + + docString = \ +"EnumerationParams\n\ +Controls some aspects of how the enumeration is performed.\n\ +Options:\n\ + reagentMaxMatchCount [ default Infinite ]\n\ + This specifies how many times the reactant template can match a reagent.\n\ +\n\ + sanePartialProducts [default false]\n\ + If true, forces all products of the reagent plus the product templates\n\ + pass chemical sanitization. Note that if the product template itself\n\ + does not pass sanitization, then none of the products will.\n\ +"; + + python::class_("EnumerationParams", + docString.c_str(), + python::init<>()) + .def_readwrite("reagentMaxMatchCount", + &RDKit::EnumerationParams::reagentMaxMatchCount) + .def_readwrite("sanePartialProducts", + &RDKit::EnumerationParams::sanePartialProducts); + + docString = \ +"EnumerateLibrary\n\ +This class allows easy enumeration of reactions. Simply provide a reaction\n\ +and a set of reagents and you are off the the races.\n\ +\n\ +EnumerateLibrary follows the python enumerator protocol, for example:\n\ +\n\ +library = EnumerateLibrary(rxn, bbs)\n\ +for products in library:\n\ + ... do something with the product\n\ +\n\ +It is useful to sanitize reactions before hand:\n\ +\n\ +SanitizeRxn(rxn)\n\ +library = EnumerateLibrary(rxn, bbs)\n\ +\n\ +If ChemDraw style reaction semantics are prefereed, you can apply\n\ +the ChemDraw parameters:\n\ +\n\ +SanitizeRxn(rxn, params=GetChemDrawRxnAdjustParams())\n\ +\n\ +For one, this enforces only matching RGroups and assumes all atoms\n\ +have fully satisfied valences.\n\ +\n\ +Each product has the same output as applying a set of reagents to\n\ +the libraries reaction.\n\ +\n\ +This can be a bit confusing as each product can have multiple molecules\n\ +generated. The returned data structure is as follows:\n\ +\n\ + [ [products1], [products2],... ]\n\ +Where products1 are the molecule products for the reactions first product\n\ +template and products2 are the molecule products for the second product\n\ +template. Since each reactant can match more than once, there may be\n\ +multiple product molecules for each template.\n\ +\n\ +for result in library:\n\ + for results_for_product_template in products:\n\ + for mol in results_for_product_template:\n\ + Chem.MolToSmiles(mol) # finally have a molecule!\n\ +\n\ +For sufficiently large libraries, using this iteration strategy is not\n\ +recommended as the library may contain more products than atoms in the\n\ +universe. To help with this, you can supply an enumeration strategy.\n\ +The default strategy is a CartesianProductStrategy which enumerates\n\ +everything. RandomSampleStrategy randomly samples the products but\n\ +this strategy never terminates, however, python supplies itertools:\n\ +\n\ +import itertools\n\ +library = EnumerateLibrary(rxn, bbs, rdChemReactions.RandomSampleStrategy())\n\ +for result in itertools.islice(libary, 1000):\n\ + # do something with the first 1000 samples\n\ +\n\ +for result in itertools.islice(libary, 1000):\n\ + # do something with the next 1000 samples\n\ +\n\ +Libraries are also serializable, including their current state:\n\ +\n\ +s = library.Serialize()\n\ +library2 = EnumerateLibrary()\n\ +library2.InitFromString(s)\n\ +for result in itertools.islice(libary2, 1000):\n\ + # do something with the next 1000 samples\n\ +"; + python::class_ >( + "EnumerateLibrary", docString.c_str(), + python::init<>()) + .def(python::init< + const RDKit::ChemicalReaction &, + python::list, + python::optional + >(python::args("rxn", "reagents", "params"))) + .def(python::init< + const RDKit::ChemicalReaction &, + python::tuple, + python::optional + >(python::args("rxn", "reagents", "params"))) + + .def(python::init + >(python::args( + "rxn", "reagents", "enumerator", "params"))) + .def(python::init + >(python::args( + "rxn", "reagents", "enumerator", "params"))) + + .def("GetReagents", &RDKit::EnumerateLibrary::getReagents, + "Return the reagents used in this library.", + python::return_internal_reference< + 1, python::with_custodian_and_ward_postcall<0, 1> >()) + ; + + //iterator_wrappers().wrap("EnumerateLibraryIterator"); + + python::class_( + "EnumerationStrategyBase", python::no_init) + .def("__nonzero__", &EnumerationStrategyBase__nonzero__) + .def("__bool__", &EnumerationStrategyBase__nonzero__) + .def("Type", &EnumerationStrategyBase::type, + "Returns the enumeration strategy type as a string.") + .def("Skip", &EnumerationStrategyBase::skip, + python::args("skipCount"), + "Skip the next Nth results. note: this may be an expensive " + "operation\n" + "depending on the enumeration strategy used. It is recommended to " + "use\n" + "the enumerator state to advance to a known position") + .def("__copy__", python::pure_virtual(&EnumerationStrategyBase::copy), + python::return_value_policy()) + .def("GetNumPermutations", &EnumerationStrategyBase::getNumPermutations, + "Returns the total number of results for this enumeration strategy.\n" + "Note that some strategies are effectively infinite.") + .def("GetPosition", &EnumerationStrategyBase::getPosition, + "Return the current indices into the arrays of reagents", + python::return_internal_reference< + 1, python::with_custodian_and_ward_postcall<0, 1> >()) + .def("next", python::pure_virtual(&EnumerationStrategyBase::next), + "Return the next indices into the arrays of reagents", + python::return_internal_reference< + 1, python::with_custodian_and_ward_postcall<0, 1> >()) + .def("__next__", python::pure_virtual(&EnumerationStrategyBase::next), + "Return the next indices into the arrays of reagents", + python::return_internal_reference< + 1, python::with_custodian_and_ward_postcall<0, 1> >()) + .def("Initialize", ToBBS); + + docString = "CartesianProductStrategy produces a standard walk through all possible\n" + "reagent combinations:\n" + "\n" + "(0,0,0), (1,0,0), (2,0,0) ...\n"; + + python::class_ >("CartesianProductStrategy", + docString.c_str(), + python::init<>()) + .def("__copy__", &RDKit::CartesianProductStrategy::copy, + python::return_value_policy()) + ; + + docString = "RandomSampleStrategy simply randomly samples from the reagent sets.\n" + "Note that this strategy never halts and can produce duplicates."; + python::class_ >("RandomSampleStrategy", + docString.c_str(), + python::init<>()) + .def("__copy__", &RDKit::RandomSampleStrategy::copy, + python::return_value_policy()) + ; + + docString = "RandomSampleAllBBsStrategy randomly samples from the reagent sets\n" + "with the constraint that all building blocks are samples as early as possible.\n" + "Note that this strategy never halts and can produce duplicates."; + python::class_ >("RandomSampleAllBBsStrategy", + docString.c_str(), + python::init<>()) + .def("__copy__", &RDKit::RandomSampleAllBBsStrategy::copy, + python::return_value_policy()) + ; + + docString = "Randomly sample Pairs evenly from a collection of building blocks\n" + "This is a good strategy for choosing a relatively small selection\n" + "of building blocks from a larger set. As the amount of work needed\n" + "to retrieve the next evenly sample building block grows with the\n" + "number of samples, this method performs progressively worse as the\n" + "number of samples gets larger.\n" + "See EnumerationStrategyBase for more details.\n"; + + python::class_ >("EvenSamplePairsStrategy", + docString.c_str(), + python::init<>()) + .def("__copy__", &RDKit::EvenSamplePairsStrategy::copy, + python::return_value_policy()) + .def("Stats", &RDKit::EvenSamplePairsStrategy::stats, + "Return the a statisics log of the pairs used in the current enumeration.") + ; + + python::def("EnumerateLibraryCanSerialize", EnumerateLibraryCanSerialize, + "Returns True if the EnumerateLibrary is serializable " + "(requires boost serialization"); + + } +}; + +}// end of namespace + +void wrap_enumeration() { + RDKit::enumeration_wrapper::wrap(); +} + diff --git a/Code/GraphMol/ChemReactions/Wrap/rdChemReactions.cpp b/Code/GraphMol/ChemReactions/Wrap/rdChemReactions.cpp index fa775f9ba..3bf45d068 100644 --- a/Code/GraphMol/ChemReactions/Wrap/rdChemReactions.cpp +++ b/Code/GraphMol/ChemReactions/Wrap/rdChemReactions.cpp @@ -36,6 +36,7 @@ #include #include #include +#include #include #include @@ -328,7 +329,7 @@ python::object AddRecursiveQueriesToReaction(ChemicalReaction &self, python::object PreprocessReaction(ChemicalReaction &reaction, python::dict queryDict, std::string propName) { - + // transform dictionary into map std::map queries; unsigned int size = python::extract(queryDict.keys().attr("__len__")()); @@ -353,11 +354,11 @@ python::object PreprocessReaction(ChemicalReaction &reaction, reaction.validate(nWarn, nError); std::vector< std::vector > > labels; - + if (!nError) { preprocessReaction(reaction, nWarn, nError, labels, queries, propName); } - + // transform labels into python::tuple(python::tuple(python::tuple)) python::list reactantLabels; for (unsigned int i = 0; i < labels.size(); ++i) { @@ -374,6 +375,30 @@ python::object PreprocessReaction(ChemicalReaction &reaction, python::tuple(reactantLabels)); } +#ifdef RDK_32BIT_BUILD +typedef int sanitize_ops; +#else +typedef unsigned int sanitize_ops; +#endif + +RxnOps::SanitizeRxnFlags sanitizeReaction( + ChemicalReaction &rxn, + sanitize_ops sanitizeOps, + const MolOps::AdjustQueryParameters ¶ms, + bool catchErrors) { + unsigned int operationsThatFailed = 0; + try { + RxnOps::sanitizeRxn(rxn, operationsThatFailed, sanitizeOps, params); + } catch(...) { + if (!catchErrors) + throw; + } + return static_cast(operationsThatFailed); +} +} + + +void wrap_enumeration(); BOOST_PYTHON_MODULE(rdChemReactions) { python::scope().attr("__doc__") = @@ -661,7 +686,7 @@ of the replacements argument.", "Caution: This is an expert-user function which will change a property (molInversionFlag) of your products.\ This function is called by default using the RXN or SMARTS parser for reactions and should really only be called if reactions have been constructed some other way.\ The function updates the stereochemistry of the product by considering 4 different cases: inversion, retention, removal, and introduction"); - + python::def( "ReduceProductToSideChains", RDKit::reduceProductToSideChains, (python::arg("product"), python::arg("addDummyAtoms") = true), @@ -669,7 +694,7 @@ of the replacements argument.", The output is a molecule with attached wildcards indicating where the product was attached.\ The dummy atom has the same reaction-map number as the product atom (if available).", python::return_value_policy()); - + python::def("RemoveMappingNumbersFromReactions", RDKit::removeMappingNumbersFromReactions, (python::arg("reaction")), @@ -774,10 +799,36 @@ Sample Usage:\n\ True\n\ "; - python::def("PreprocessReaction", PreprocessReaction, + python::def("PreprocessReaction", RDKit::PreprocessReaction, (python::arg("reaction"), python::arg("queries")=python::dict(), - python::arg("propName")=common_properties::molFileValue), + python::arg("propName")=RDKit::common_properties::molFileValue), docString.c_str()); -} + + python::enum_("SanitizeFlags") + .value("SANITIZE_NONE", RDKit::RxnOps::SANITIZE_NONE) + .value("SANITIZE_ATOM_MAPS", RDKit::RxnOps::SANITIZE_ATOM_MAPS) + .value("SANITIZE_RGROUP_NAMES", RDKit::RxnOps::SANITIZE_RGROUP_NAMES) + .value("SANITIZE_ADJUST_REACTANTS", RDKit::RxnOps::SANITIZE_ADJUST_REACTANTS) + .value("SANITIZE_MERGEHS", RDKit::RxnOps::SANITIZE_MERGEHS) + .value("SANITIZE_ALL", RDKit::RxnOps::SANITIZE_ALL) + .export_values(); + ; + + python::def("GetDefaultAdjustParams", RDKit::RxnOps::DefaultRxnAdjustParams, + "Returns the default adjustment parameters for reactant templates"); + + python::def("GetChemDrawRxnAdjustParams", RDKit::RxnOps::ChemDrawRxnAdjustParams, + "Returns the chemdraw style adjustment parameters for reactant templates"); + + std::string docstring = "feed me"; + python::def( + "SanitizeRxn", RDKit::sanitizeReaction, + (python::arg("rxn"), python::arg("sanitizeOps") = rdcast(RDKit::RxnOps::SANITIZE_ALL), + python::arg("params") = RDKit::RxnOps::DefaultRxnAdjustParams(), + python::arg("catchErrors") = false), + docString.c_str()); + + wrap_enumeration(); + } diff --git a/Code/GraphMol/ChemReactions/Wrap/testEnumerations.py b/Code/GraphMol/ChemReactions/Wrap/testEnumerations.py new file mode 100644 index 000000000..dcd15a48e --- /dev/null +++ b/Code/GraphMol/ChemReactions/Wrap/testEnumerations.py @@ -0,0 +1,654 @@ +# Copyright (c) 2015, Novartis Institutes for BioMedical Research Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided +# with the distribution. +# * Neither the name of Novartis Institutes for BioMedical Research Inc. +# nor the names of its contributors may be used to endorse or promote +# products derived from this software without specific prior written +# permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +from __future__ import print_function + +import unittest +import os,sys, copy + +from rdkit.six.moves import cPickle + +from rdkit import rdBase +from rdkit import Chem +from rdkit.Chem import AllChem,rdChemReactions +from rdkit import Geometry +from rdkit import RDConfig +import itertools, time +import numpy as np + +def log(s): + Chem.LogErrorMsg("== " + s) + +class TestCase(unittest.TestCase) : + def setUp(self): + self.dataDir = os.path.join(RDConfig.RDBaseDir,'Code','GraphMol','ChemReactions','testData') + + def testCartesianProduct(self): + log("testCartesianProduct") + rxn = rdChemReactions.ChemicalReaction(); + rgroups = [[Chem.MolFromSmiles("C")]*10, + [Chem.MolFromSmiles("N")]*5, + [Chem.MolFromSmiles("O")]*6] + + cartProd = rdChemReactions.CartesianProductStrategy() + cartProd.Initialize(rxn, rgroups) + self.assertEquals(cartProd.GetNumPermutations(), 10*5*6) + groups = [] + count = 0 + print (cartProd.__bool__()) + while cartProd: + groups.append(tuple(cartProd.next())) +# count += 1 +# assert count <= cartProd.GetNumPermutations() + self.assertEquals(len(groups), 10*5*6) + # see if we are equal to the Python implementation + g = list(itertools.product( list(range(10)), list(range(5)), list(range(6)) )) + self.assertEquals(set(g), set(groups)) + copy.copy(cartProd) + + def testRandomSample(self): + log("testRandomSample") + rgroups = [[Chem.MolFromSmiles("C")]*10, + [Chem.MolFromSmiles("N")]*5, + [Chem.MolFromSmiles("O")]*6] + rxn = rdChemReactions.ChemicalReaction(); + + randProd = rdChemReactions.RandomSampleStrategy() + randProd.Initialize(rxn, rgroups) + self.assertEquals(randProd.GetNumPermutations(), 10*5*6) + groups = [] + for i in range(10*5*6): + groups.append(tuple(randProd.next())) + print( len(set(groups)), "out of", 10*5*6 ) + + randProd = rdChemReactions.RandomSampleStrategy() + randProd.Initialize(rxn, rgroups) + self.assertEquals(randProd.GetNumPermutations(), 10*5*6) + groups = [] + for i in range(10): + groups.append(tuple(randProd.next())) + + for i in range(3): + print( i, len(set([g[i] for g in groups])), "out of", [10,5,6][i] ) + copy.copy(randProd) + + def testRandomSampleAllBBs(self): + log("testRandomSampleAllBBs") + rxn = rdChemReactions.ChemicalReaction(); + rgroups = [[Chem.MolFromSmiles("C")]*10, + [Chem.MolFromSmiles("N")]*5, + [Chem.MolFromSmiles("O")]*6] + + randProd = rdChemReactions.RandomSampleAllBBsStrategy() + randProd.Initialize(rxn, rgroups) + self.assertEquals(randProd.GetNumPermutations(), 10*5*6) + groups = [] + for i in range(10*5*6): + groups.append(tuple(randProd.next())) + + print( len(set(groups)), "out of", 10*5*6 ) + + randProd = rdChemReactions.RandomSampleAllBBsStrategy() + randProd.Initialize(rxn, rgroups) + self.assertEquals(randProd.GetNumPermutations(), 10*5*6) + groups = [] + for i in range(10): + groups.append(tuple(randProd.next())) + + for i in range(3): + print( i, len(set([g[i] for g in groups])), "out of", [10,5,6][i] ) + self.assertEquals(len(set([g[i] for g in groups])), [10,5,6][i]) + copy.copy(randProd) + + def testTimings(self): + log("testTimings") + rxn = rdChemReactions.ChemicalReaction(); + + rgroups = [[Chem.MolFromSmiles("C")]*17000, + [Chem.MolFromSmiles("N")]*50000, + [Chem.MolFromSmiles("O")]*4000] + cartProd = rdChemReactions.CartesianProductStrategy() + randProd = rdChemReactions.RandomSampleStrategy() + randAllBBs = rdChemReactions.RandomSampleAllBBsStrategy() + for r in [cartProd, randProd, randAllBBs]: + r.Initialize(rxn, rgroups) + num = 10000000 + t1 = time.time() + r.Skip(num) + t2 = time.time() + print("%s Skipped %s in %s seconds"%(r, num, t2-t1)) + + def testEvenPairsSampling(self): + rxn = rdChemReactions.ChemicalReaction(); + + rgroups = [[Chem.MolFromSmiles("C")]*10, + [Chem.MolFromSmiles("N")]*10, + [Chem.MolFromSmiles("O")]*10] + + rxn = rdChemReactions.ChemicalReaction(); + count = 0 + pairs01 = {} + pairs12 = {} + pairs02 = {} + + strategy = rdChemReactions.EvenSamplePairsStrategy() + strategy.Initialize(rxn, rgroups) + # try 100 samples + while count < 100: + v = strategy.next() + p01 = (v[0], v[1]) + p12 = (v[1], v[2]) + p02 = (v[0], v[2]) + pairs01[p01] = pairs01.get(p01, 0) + 1 + pairs12[p01] = pairs12.get(p12, 0) + 1 + pairs02[p01] = pairs02.get(p02, 0) + 1 + count += 1 + + # each pair should be used rougly once + self.assertEquals(np.median(list(pairs01.values())), 1.0) + self.assertEquals(np.median(list(pairs02.values())), 1.0) + self.assertEquals(np.median(list(pairs12.values())), 1.0) + + # now try 1000 + pairs01 = {} + pairs12 = {} + pairs02 = {} + strategy = rdChemReactions.EvenSamplePairsStrategy() + strategy.Initialize(rxn, rgroups) + count = 0 + while count < 1000: + v = strategy.next() + p01 = (v[0], v[1]) + p12 = (v[1], v[2]) + p02 = (v[0], v[2]) + pairs01[p01] = pairs01.get(p01, 0) + 1 + pairs12[p01] = pairs12.get(p12, 0) + 1 + pairs02[p01] = pairs02.get(p02, 0) + 1 + count += 1 + + # each pair should be used roughly 10 times + self.assertTrue( 9 <= np.median(list(pairs01.values())) <= 11) + self.assertTrue( 9 <= np.median(list(pairs02.values())) <= 11) + self.assertTrue( 9 <= np.median(list(pairs12.values())) <= 11) + + # now try 500 + pairs01 = {} + pairs12 = {} + pairs02 = {} + strategy = rdChemReactions.EvenSamplePairsStrategy() + strategy.Initialize(rxn, rgroups) + count = 0 + while count < 500: + v = strategy.next() + p01 = (v[0], v[1]) + p12 = (v[1], v[2]) + p02 = (v[0], v[2]) + pairs01[p01] = pairs01.get(p01, 0) + 1 + pairs12[p01] = pairs12.get(p12, 0) + 1 + pairs02[p01] = pairs02.get(p02, 0) + 1 + count += 1 + + # each pair should be used roughly 5 times + self.assertTrue( 4 <= np.median(list(pairs01.values())) <= 6) + self.assertTrue( 4 <= np.median(list(pairs02.values())) <= 6) + self.assertTrue( 4 <= np.median(list(pairs12.values())) <= 6) + + + self.assertTrue("PAIRSTAT" in strategy.Stats()) + + def testEnumerateLibrary(self): + log("testEnumerateLibrary") + smirks_thiourea = "[N;$(N-[#6]):3]=[C;$(C=S):1].[N;$(N[#6]);!$(N=*);!$([N-]);!$(N#*);!$([ND3]);!$([ND4]);!$(N[O,N]);!$(N[C,S]=[S,O,N]):2]>>[N:3]-[C:1]-[N+0:2]" + rxn = rdChemReactions.ReactionFromSmarts(smirks_thiourea) + reagents = [ + [Chem.MolFromSmiles('C=CCN=C=S'), Chem.MolFromSmiles('CC=CCN=C=S')], + [Chem.MolFromSmiles('NCc1ncc(Cl)cc1Br'), + Chem.MolFromSmiles('NCCc1ncc(Cl)cc1Br'), + Chem.MolFromSmiles('NCCCc1ncc(Cl)cc1Br'), + ] + ] + + enumerator = rdChemReactions.EnumerateLibrary(rxn, reagents) + self.assertTrue(enumerator) + + # need to initialize the reaction before getting the binary serialization + rxn.Initialize() + self.assertEquals(rxn.ToBinary(), enumerator.GetReaction().ToBinary()) + + bbs = enumerator.GetReagents() + for i in range(len(bbs)): + for j in range(len(bbs[i])): + self.assertTrue(Chem.MolToSmiles(reagents[i][j]) == Chem.MolToSmiles(bbs[i][j])) + + smiresults = ['C=CCNC(=S)NCc1ncc(Cl)cc1Br', + 'CC=CCNC(=S)NCc1ncc(Cl)cc1Br', + 'C=CCNC(=S)NCCc1ncc(Cl)cc1Br', + 'CC=CCNC(=S)NCCc1ncc(Cl)cc1Br', + 'C=CCNC(=S)NCCCc1ncc(Cl)cc1Br', + 'CC=CCNC(=S)NCCCc1ncc(Cl)cc1Br'] + results = [Chem.MolToSmiles(Chem.MolFromSmiles(smi)) for smi in smiresults] + + enumerators = [enumerator] + + # add serialized enumerators as well for testing if possible + if rdChemReactions.EnumerateLibraryCanSerialize(): + pickle = enumerator.Serialize() + enumerator2 = rdChemReactions.EnumerateLibrary() + enumerator2.InitFromString(pickle) + + # make sure old pickles work + enumerator3 = rdChemReactions.EnumerateLibrary() + enumerator3.InitFromString(open(os.path.join(self.dataDir, "enumeration.pickle"), 'rb').read()) + + print("==", enumerator.GetEnumerator().Type(), enumerator2.GetEnumerator().Type()) + self.assertEquals(enumerator.GetEnumerator().Type(), enumerator2.GetEnumerator().Type()) + enumerators.append(enumerator2) + enumerators.append(enumerator3) + + # check for fully sampled and deterministic ordering in final index values + expected_positions = [[0, 0],[1, 0],[0, 1],[1, 1],[0, 2],[1, 2]] + + out = [] + for en in enumerators: + i = 0 + positions = [] + for i, prods in enumerate(en): + positions.append( list(en.GetPosition()) ) + for mols in prods: + self.assertEquals(len(mols), 1) + smi = Chem.MolToSmiles(mols[0]) + if en is enumerator: + out.append(smi) + self.assertEquals(smi, results[i]) + + if en is enumerator and i == 1 and rdChemReactions.EnumerateLibraryCanSerialize(): + # save the state not at the start + pickle_at_2 = enumerator.Serialize() + self.assertEquals(i, 5) + self.assertEquals(positions, expected_positions) + + if rdChemReactions.EnumerateLibraryCanSerialize(): + # see if we can restore the enumeration from the middle + out3 = [] + enumerator3 = rdChemReactions.EnumerateLibrary() + enumerator3.InitFromString(pickle_at_2) + for prods in enumerator3: + for mols in prods: + self.assertEquals(len(mols), 1) + smi = Chem.MolToSmiles(mols[0]) + out3.append(smi) + + self.assertEquals(out[2:], out3) + # test smiles interface + enumerator = rdChemReactions.EnumerateLibrary(rxn, reagents) + i = 0 + while enumerator: + for mols in enumerator.nextSmiles(): + self.assertEquals(len(mols), 1) + self.assertEquals(mols[0], results[i]) + i += 1 + self.assertEquals(i, 6) + + def testRandomEnumerateLibrary(self): + log("testRandomEnumerateLibrary") + smirks_thiourea = "[N;$(N-[#6]):3]=[C;$(C=S):1].[N;$(N[#6]);!$(N=*);!$([N-]);!$(N#*);!$([ND3]);!$([ND4]);!$(N[O,N]);!$(N[C,S]=[S,O,N]):2]>>[N:3]-[C:1]-[N+0:2]" + rxn = rdChemReactions.ReactionFromSmarts(smirks_thiourea) + reagents = [ + [Chem.MolFromSmiles('C=CCN=C=S'), Chem.MolFromSmiles('CC=CCN=C=S')], + [Chem.MolFromSmiles('NCc1ncc(Cl)cc1Br'), + Chem.MolFromSmiles('NCCc1ncc(Cl)cc1Br'), + Chem.MolFromSmiles('NCCCc1ncc(Cl)cc1Br'), + ] + ] + + enumerator = rdChemReactions.EnumerateLibrary(rxn, reagents, + rdChemReactions.RandomSampleStrategy()) + self.assertTrue(enumerator) + smiresults = ['C=CCNC(=S)NCc1ncc(Cl)cc1Br', + 'CC=CCNC(=S)NCc1ncc(Cl)cc1Br', + 'C=CCNC(=S)NCCc1ncc(Cl)cc1Br', + 'CC=CCNC(=S)NCCc1ncc(Cl)cc1Br', + 'C=CCNC(=S)NCCCc1ncc(Cl)cc1Br', + 'CC=CCNC(=S)NCCCc1ncc(Cl)cc1Br'] + results = [Chem.MolToSmiles(Chem.MolFromSmiles(smi)) for smi in smiresults] + + enumerator = rdChemReactions.EnumerateLibrary(rxn, reagents, + rdChemReactions.RandomSampleStrategy()) + iteren = iter(enumerator) + res = set() + count = 0 + while res != set(results): + count += 1 + if count > 100000: + print("Unable to find enumerate set with 100,000 random samples!", file=sys.stderr) + self.assertEquals(res,set(results)) + + prod = iteren.next() + for mols in prod: + smi1 = Chem.MolToSmiles(mols[0]) + res.add(smi1) + + if rdChemReactions.EnumerateLibraryCanSerialize(): + enumerator = rdChemReactions.EnumerateLibrary(rxn, reagents, + rdChemReactions.RandomSampleStrategy()) + pickle = enumerator.Serialize() + enumerator2 = rdChemReactions.EnumerateLibrary() + enumerator2.InitFromString(pickle) + + self.assertEquals(enumerator.GetEnumerator().Type(), enumerator2.GetEnumerator().Type()) + + iteren = iter(enumerator) + iteren2 = iter(enumerator2) + + outsmiles = [] + for i in range(10): + prods1 = iteren.next() + prods2 = iteren2.next() + self.assertEquals(len(prods1), len(prods2)) + for mols1, mols2 in zip(prods1, prods2): + self.assertEquals(len(mols1), 1) + smi1 = Chem.MolToSmiles(mols1[0]) + self.assertEquals(smi1, Chem.MolToSmiles(mols2[0])) + outsmiles.append(smi1) + + if i == 1: + pickle_at_2 = enumerator.Serialize() + + # make sure we can pickle the state as well + enumerator3 = rdChemReactions.EnumerateLibrary() + enumerator3.InitFromString(pickle_at_2) + iteren3 = iter(enumerator3) + outsmiles2 = [] + for i in range(8): + prods3 = iteren3.next() + for mols3 in prods3: + self.assertEquals(len(mols3), 1) + smi1 = Chem.MolToSmiles(mols3[0]) + self.assertEquals(smi1, Chem.MolToSmiles(mols3[0])) + outsmiles2.append(smi1) + + self.assertEquals(outsmiles2, outsmiles[2:]) + + def testRandomEnumerateAllBBsLibrary(self): + log("testRandomEnumerateAllBBsLibrary") + smirks_thiourea = "[N;$(N-[#6]):3]=[C;$(C=S):1].[N;$(N[#6]);!$(N=*);!$([N-]);!$(N#*);!$([ND3]);!$([ND4]);!$(N[O,N]);!$(N[C,S]=[S,O,N]):2]>>[N:3]-[C:1]-[N+0:2]" + rxn = rdChemReactions.ReactionFromSmarts(smirks_thiourea) + reagents = [ + [Chem.MolFromSmiles('C=CCN=C=S'), Chem.MolFromSmiles('CC=CCN=C=S')], + [Chem.MolFromSmiles('NCc1ncc(Cl)cc1Br'), + Chem.MolFromSmiles('NCCc1ncc(Cl)cc1Br'), + Chem.MolFromSmiles('NCCCc1ncc(Cl)cc1Br'), + ] + ] + enumerator = rdChemReactions.EnumerateLibrary(rxn, reagents, + rdChemReactions.RandomSampleAllBBsStrategy()) + self.assertTrue(enumerator) + + # test the BB sampling here + strategy = iter(enumerator) + r1 = set() + r2 = set() + strategy.next() + groups = strategy.GetPosition() + print("**", list(groups), file=sys.stderr) + r1.add(groups[0]) + r2.add(groups[1]) + strategy.next() + groups = strategy.GetPosition() + print("**", list(groups),file=sys.stderr) + r1.add(groups[0]) + r2.add(groups[1]) + self.assertEquals(r1, set([0,1])) # two bbs at reagent one all sampled at one iteration + strategy.next() + groups = strategy.GetPosition() + print("**", list(groups),file=sys.stderr) + r1.add(groups[0]) + r2.add(groups[1]) + self.assertEquals(r2, set([0,1,2])) # three bbs at reagent one all sampled in three iterations + + smiresults = ['C=CCNC(=S)NCc1ncc(Cl)cc1Br', + 'CC=CCNC(=S)NCc1ncc(Cl)cc1Br', + 'C=CCNC(=S)NCCc1ncc(Cl)cc1Br', + 'CC=CCNC(=S)NCCc1ncc(Cl)cc1Br', + 'C=CCNC(=S)NCCCc1ncc(Cl)cc1Br', + 'CC=CCNC(=S)NCCCc1ncc(Cl)cc1Br'] + results = [Chem.MolToSmiles(Chem.MolFromSmiles(smi)) for smi in smiresults] + + + if rdChemReactions.EnumerateLibraryCanSerialize(): + enumerator = rdChemReactions.EnumerateLibrary(rxn, reagents, + rdChemReactions.RandomSampleAllBBsStrategy()) + self.assertTrue(enumerator) + + pickle = enumerator.Serialize() + enumerator2 = rdChemReactions.EnumerateLibrary() + enumerator2.InitFromString(pickle) + + self.assertEquals(enumerator.GetEnumerator().Type(), enumerator2.GetEnumerator().Type()) + iteren = iter(enumerator) + iteren2 = iter(enumerator2) + + outsmiles = [] + for i in range(10): + prods1 = iteren.next() + prods2 = iteren2.next() + self.assertEquals(len(prods1), len(prods2)) + for mols1, mols2 in zip(prods1, prods2): + self.assertEquals(len(mols1), 1) + smi1 = Chem.MolToSmiles(mols1[0]) + self.assertEquals(smi1, Chem.MolToSmiles(mols2[0])) + outsmiles.append(smi1) + + if i == 1: + pickle_at_2 = enumerator.Serialize() + + # make sure we can pickle the state as well + enumerator3 = rdChemReactions.EnumerateLibrary() + enumerator3.InitFromString(pickle_at_2) + self.assertEquals(enumerator.GetEnumerator().Type(), enumerator3.GetEnumerator().Type()) + + iteren3 = iter(enumerator3) + outsmiles2 = [] + for i in range(8): + prods3 = iteren3.next() + for mols3 in prods3: + self.assertEquals(len(mols3), 1) + smi1 = Chem.MolToSmiles(mols3[0]) + self.assertEquals(smi1, Chem.MolToSmiles(mols3[0])) + outsmiles2.append(smi1) + + self.assertEquals(outsmiles2, outsmiles[2:]) + + + def testRGroupState(self): + if not rdChemReactions.EnumerateLibraryCanSerialize(): + print("-- Skipping testRGroupState, serialization of EnumerateLibrary not enabled", file=sys.stderr) + return + + log("testRGroupState") + smirks_thiourea = "[N;$(N-[#6]):3]=[C;$(C=S):1].[N;$(N[#6]);!$(N=*);!$([N-]);!$(N#*);!$([ND3]);!$([ND4]);!$(N[O,N]);!$(N[C,S]=[S,O,N]):2]>>[N:3]-[C:1]-[N+0:2]" + rxn = rdChemReactions.ReactionFromSmarts(smirks_thiourea) + reagents = [ + [Chem.MolFromSmiles('C=CCN=C=S'), Chem.MolFromSmiles('CC=CCN=C=S')], + [Chem.MolFromSmiles('NCc1ncc(Cl)cc1Br'), + Chem.MolFromSmiles('NCCc1ncc(Cl)cc1Br'), + Chem.MolFromSmiles('NCCCc1ncc(Cl)cc1Br'), + ] + ] + + def tostr(l): + return [[str(x) for x in v] for v in l] + enumerator = rdChemReactions.EnumerateLibrary(rxn, reagents) + state = enumerator.GetState() + p = enumerator.nextSmiles() + p2 = enumerator.nextSmiles() + enumerator.SetState(state) + self.assertEquals(tostr(enumerator.nextSmiles()), tostr(p)) + self.assertEquals(tostr(enumerator.nextSmiles()), tostr(p2)) + + enumerator = rdChemReactions.EnumerateLibrary(rxn, reagents, + rdChemReactions.RandomSampleStrategy()) + + state = enumerator.GetState() + p = enumerator.nextSmiles() + p2 = enumerator.nextSmiles() + enumerator.SetState(state) + self.assertEquals(tostr(enumerator.nextSmiles()), tostr(p)) + self.assertEquals(tostr(enumerator.nextSmiles()), tostr(p2)) + + enumerator = rdChemReactions.EnumerateLibrary(rxn, reagents, + rdChemReactions.RandomSampleAllBBsStrategy()) + state = enumerator.GetState() + p = enumerator.nextSmiles() + p2 = enumerator.nextSmiles() + enumerator.SetState(state) + self.assertEquals(tostr(enumerator.nextSmiles()), tostr(p)) + self.assertEquals(tostr(enumerator.nextSmiles()), tostr(p2)) + + + enumerator = rdChemReactions.EnumerateLibrary(rxn, reagents) + smiresults = ['C=CCNC(=S)NCc1ncc(Cl)cc1Br', + 'CC=CCNC(=S)NCc1ncc(Cl)cc1Br', + 'C=CCNC(=S)NCCc1ncc(Cl)cc1Br', + 'CC=CCNC(=S)NCCc1ncc(Cl)cc1Br', + 'C=CCNC(=S)NCCCc1ncc(Cl)cc1Br', + 'CC=CCNC(=S)NCCCc1ncc(Cl)cc1Br'] + smiresults = [Chem.MolToSmiles(Chem.MolFromSmiles(smi)) for smi in smiresults] + enumerator.GetEnumerator().Skip(10) + enumerator.ResetState() + + results = [] + for result in enumerator: + for prodSet in result: + for mol in prodSet: + results.append( Chem.MolToSmiles(mol) ) + + self.assertEquals(results, smiresults) + + def testRemovingBadMatches(self): + log("testRemoveBadMatches") + smirks_thiourea = "[N;$(N-[#6]):3]=[C;$(C=S):1].[N;$(N[#6]);!$(N=*);!$([N-]);!$(N#*);!$([ND3]);!$([ND4]);!$(N[O,N]);!$(N[C,S]=[S,O,N]):2]>>[N:3]-[C:1]-[N+0:2]" + + rxn = rdChemReactions.ReactionFromSmarts(smirks_thiourea) + # invert matches so nothing matches + reagents = [ + [Chem.MolFromSmiles('NCc1ncc(Cl)cc1Br'), + Chem.MolFromSmiles('NCCc1ncc(Cl)cc1Br'), + Chem.MolFromSmiles('NCCCc1ncc(Cl)cc1Br'), + ], + + [Chem.MolFromSmiles('C=CCN=C=S'), + Chem.MolFromSmiles('CC=CCN=C=S'), + Chem.MolFromSmiles('CCC'), + Chem.MolFromSmiles('CCCCC'), + ], + ] + + enumerator = rdChemReactions.EnumerateLibrary(rxn, reagents) + self.assertEquals([], list(enumerator)) + + def testRemoveInsaneReagents(self): + rxndata = "$RXN\nUntitled Document-1\n ChemDraw10291618492D\n\n 3 1\n$MOL\n\n\n\n 2 1 0 0 0 0 0 0 0 0999 V2000\n 0.4125 0.0000 0.0000 N 0 0 0 0 0 0 0 0 0 3 0 0\n -0.4125 0.0000 0.0000 R2 0 0 0 0 0 0 0 0 0 2 0 0\n 1 2 1 0 0\nM END\n$MOL\n\n\n\n 2 1 0 0 0 0 0 0 0 0999 V2000\n -0.4125 0.0000 0.0000 R1 0 0 0 0 0 0 0 0 0 1 0 0\n 0.4125 0.0000 0.0000 Cl 0 0 0 0 0 0 0 0 0 0 0 0\n 1 2 1 0 0\nM END\n$MOL\n\n\n\n 2 1 0 0 0 0 0 0 0 0999 V2000\n 0.4125 0.0000 0.0000 N 0 0 0 0 0 0 0 0 0 5 0 0\n -0.4125 0.0000 0.0000 R4 0 0 0 0 0 0 0 0 0 4 0 0\n 1 2 1 0 0\nM END\n$MOL\n\n\n\n 14 15 0 0 0 0 0 0 0 0999 V2000\n 0.5072 -0.5166 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n 0.5072 0.3084 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n 1.2949 -0.7616 0.0000 N 0 0 0 0 0 0 0 0 0 0 0 0\n 1.7817 -0.0880 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n 1.2967 0.5794 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n 1.5558 -1.5443 0.0000 R1 0 0 0 0 0 0 0 0 0 1 0 0\n -0.2073 0.7208 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n -0.9218 0.3083 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n -0.9217 -0.5167 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n -0.2073 -0.9292 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n -1.6362 0.7208 0.0000 N 0 0 0 0 0 0 0 0 0 3 0 0\n 1.5452 1.3661 0.0000 N 0 0 0 0 0 0 0 0 0 5 0 0\n 2.3507 1.5443 0.0000 R4 0 0 0 0 0 0 0 0 0 4 0 0\n -2.3507 0.3083 0.0000 R2 0 0 0 0 0 0 0 0 0 2 0 0\n 1 2 2 0 0\n 1 3 1 0 0\n 3 4 1 0 0\n 4 5 1 0 0\n 5 2 1 0 0\n 3 6 1 0 0\n 2 7 1 0 0\n 7 8 2 0 0\n 8 9 1 0 0\n 9 10 2 0 0\n 10 1 1 0 0\n 8 11 1 0 0\n 12 13 1 0 0\n 11 14 1 0 0\n 12 5 1 0 0\nM END\n"; + + rxn = AllChem.ReactionFromRxnBlock(rxndata) + bbs = [] + r1 = [ Chem.MolFromSmiles("CCNCC"), + Chem.MolFromSmiles("NCC"), + ] + r2 = [ Chem.MolFromSmiles("ClC1CCCC1"), + Chem.MolFromSmiles("ClC1CCCC1Cl"), + ] + r3 = [ Chem.MolFromSmiles("CCNCC"), + Chem.MolFromSmiles("NCC"), + ] + bbs = [r1, r2, r3] + + # nothing matches! + for i,reagent in enumerate(rxn.GetReactants()): + for bb in bbs[i]: + self.assertFalse(bb.HasSubstructMatch(reagent)) + + # everything matches - yay sanitization! + rdChemReactions.SanitizeRxn(rxn) + for i,reagent in enumerate(rxn.GetReactants()): + for bb in bbs[i]: + self.assertTrue(bb.HasSubstructMatch(reagent)) + + en = rdChemReactions.EnumerateLibrary(rxn, bbs) + self.assertTrue(len(en.GetReagents()[0]) == 2) + self.assertTrue(len(en.GetReagents()[1]) == 2) + self.assertTrue(len(en.GetReagents()[2]) == 2) + + ##################################################################################### + # Match only at rgroups (ChemDraw style) + rxn = AllChem.ReactionFromRxnBlock(rxndata) + expected_matches = [[False,True], [True,True],[False, True] ] + rdChemReactions.SanitizeRxn(rxn, params=rdChemReactions.GetChemDrawRxnAdjustParams()) + for i,(reagent, expected) in enumerate(zip(rxn.GetReactants(), expected_matches)): + match = [bb.HasSubstructMatch(reagent) for reagent in bbs[i]] + self.assertTrue(match, expected) + + # Now try EnumerateLibrary + en = rdChemReactions.EnumerateLibrary(rxn, bbs) + self.assertTrue(len(en.GetReagents()[0]) == 1) + self.assertTrue(len(en.GetReagents()[1]) == 2) + self.assertTrue(len(en.GetReagents()[2]) == 1) + + + ##################################################################################### + # now set the removal options ot only make one product per reagent set + rxn = AllChem.ReactionFromRxnBlock(rxndata) + rdChemReactions.SanitizeRxn(rxn) + + opts = rdChemReactions.EnumerationParams() + opts.reagentMaxMatchCount = 1 + en = rdChemReactions.EnumerateLibrary(rxn, bbs, params=opts) + self.assertTrue(len(en.GetReagents()[0]) == 1) + self.assertTrue(len(en.GetReagents()[1]) == 1) + self.assertTrue(len(en.GetReagents()[2]) == 1) + + ##################################################################################### + # now set the removal options ot only make one product per reagent set + # but wt + rxn = AllChem.ReactionFromRxnBlock(rxndata) + rdChemReactions.SanitizeRxn(rxn, + params=rdChemReactions.GetChemDrawRxnAdjustParams()) + + + opts = rdChemReactions.EnumerationParams() + opts.reagentMaxMatchCount = 1 + en = rdChemReactions.EnumerateLibrary(rxn, bbs, params=opts) + self.assertTrue(len(en.GetReagents()[0]) == 1) + self.assertTrue(len(en.GetReagents()[1]) == 1) + self.assertTrue(len(en.GetReagents()[2]) == 1) + + +if __name__ == '__main__': + unittest.main() diff --git a/Code/GraphMol/ChemReactions/Wrap/testSanitize.py b/Code/GraphMol/ChemReactions/Wrap/testSanitize.py new file mode 100644 index 000000000..3b07088c7 --- /dev/null +++ b/Code/GraphMol/ChemReactions/Wrap/testSanitize.py @@ -0,0 +1,239 @@ +# Copyright (c) 2015, Novartis Institutes for BioMedical Research Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided +# with the distribution. +# * Neither the name of Novartis Institutes for BioMedical Research Inc. +# nor the names of its contributors may be used to endorse or promote +# products derived from this software without specific prior written +# permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +from __future__ import print_function + +import unittest +import os,sys + +from rdkit.six.moves import cPickle + +from rdkit import rdBase +from rdkit import Chem +from rdkit.Chem import rdChemReactions, AllChem +from rdkit import Geometry +from rdkit import RDConfig +import itertools, time + +test_data = [("good", '''$RXN + + ISIS 052820091627 + + 2 1 +$MOL + + -ISIS- 05280916272D + + 2 1 0 0 0 0 0 0 0 0999 V2000 + -3.2730 -7.0542 0.0000 Br 0 0 0 0 0 0 0 0 0 0 0 0 + -3.9875 -7.4667 0.0000 R# 0 0 0 0 0 0 0 0 0 1 0 0 + 1 2 1 0 0 0 0 +V 1 halogen.bromine.aromatic +M RGP 1 2 1 +M END +$MOL + + -ISIS- 05280916272D + + 4 3 0 0 0 0 0 0 0 0999 V2000 + 3.4375 -7.7917 0.0000 R# 0 0 0 0 0 0 0 0 0 2 0 0 + 4.1520 -7.3792 0.0000 B 0 0 0 0 0 0 0 0 0 0 0 0 + 4.1520 -6.5542 0.0000 O 0 0 0 0 0 0 0 0 0 0 0 0 + 4.8664 -7.7917 0.0000 O 0 0 0 0 0 0 0 0 0 0 0 0 + 2 3 1 0 0 0 0 + 1 2 1 0 0 0 0 + 2 4 1 0 0 0 0 +V 2 boronicacid +M RGP 1 1 2 +M END +$MOL + + -ISIS- 05280916272D + + 2 1 0 0 0 0 0 0 0 0999 V2000 + 11.2667 -7.3417 0.0000 R# 0 0 0 0 0 0 0 0 0 1 0 0 + 11.9811 -6.9292 0.0000 R# 0 0 0 0 0 0 0 0 0 2 0 0 + 1 2 1 0 0 0 0 +M RGP 2 1 1 2 2 +M END'''), + +("bad", '''$RXN + + ISIS 052820091627 + + 2 1 +$MOL + + -ISIS- 05280916272D + + 2 1 0 0 0 0 0 0 0 0999 V2000 + -3.2730 -7.0542 0.0000 Br 0 0 0 0 0 0 0 0 0 0 0 0 + -3.9875 -7.4667 0.0000 R# 0 0 0 0 0 0 0 0 0 0 0 0 + 1 2 1 0 0 0 0 +V 1 halogen.bromine.aromatic +M RGP 1 2 1 +M END +$MOL + + -ISIS- 05280916272D + + 4 3 0 0 0 0 0 0 0 0999 V2000 + 3.4375 -7.7917 0.0000 R# 0 0 0 0 0 0 0 0 0 0 0 0 + 4.1520 -7.3792 0.0000 B 0 0 0 0 0 0 0 0 0 0 0 0 + 4.1520 -6.5542 0.0000 O 0 0 0 0 0 0 0 0 0 0 0 0 + 4.8664 -7.7917 0.0000 O 0 0 0 0 0 0 0 0 0 0 0 0 + 2 3 1 0 0 0 0 + 1 2 1 0 0 0 0 + 2 4 1 0 0 0 0 +V 2 boronicacid +M RGP 1 1 2 +M END +$MOL + + -ISIS- 05280916272D + + 2 1 0 0 0 0 0 0 0 0999 V2000 + 11.2667 -7.3417 0.0000 R# 0 0 0 0 0 0 0 0 0 0 0 0 + 11.9811 -6.9292 0.0000 R# 0 0 0 0 0 0 0 0 0 0 0 0 + 1 2 1 0 0 0 0 +M RGP 2 1 1 2 2 +M END'''), +# chemdraw style +("bad", '''$RXN + + ISIS 052820091627 + + 2 1 +$MOL + + -ISIS- 05280916272D + + 2 1 0 0 0 0 0 0 0 0999 V2000 + -3.2730 -7.0542 0.0000 Br 0 0 0 0 0 0 0 0 0 0 0 0 + -3.9875 -7.4667 0.0000 R1 0 0 0 0 0 0 0 0 0 0 0 0 + 1 2 1 0 0 0 0 +V 1 halogen.bromine.aromatic +M END +$MOL + + -ISIS- 05280916272D + + 4 3 0 0 0 0 0 0 0 0999 V2000 + 3.4375 -7.7917 0.0000 R2 0 0 0 0 0 0 0 0 0 0 0 0 + 4.1520 -7.3792 0.0000 B 0 0 0 0 0 0 0 0 0 0 0 0 + 4.1520 -6.5542 0.0000 O 0 0 0 0 0 0 0 0 0 0 0 0 + 4.8664 -7.7917 0.0000 O 0 0 0 0 0 0 0 0 0 0 0 0 + 2 3 1 0 0 0 0 + 1 2 1 0 0 0 0 + 2 4 1 0 0 0 0 +V 2 boronicacid +M END +$MOL + + -ISIS- 05280916272D + + 2 1 0 0 0 0 0 0 0 0999 V2000 + 11.2667 -7.3417 0.0000 R1 0 0 0 0 0 0 0 0 0 0 0 0 + 11.9811 -6.9292 0.0000 R2 0 0 0 0 0 0 0 0 0 0 0 0 + 1 2 1 0 0 0 0 +M END'''), +("fail", '''$RXN + + ISIS 052820091627 + + 2 1 +$MOL + + -ISIS- 05280916272D + + 2 1 0 0 0 0 0 0 0 0999 V2000 + -3.2730 -7.0542 0.0000 Br 0 0 0 0 0 0 0 0 0 0 0 0 + -3.9875 -7.4667 0.0000 R1 0 0 0 0 0 0 0 0 0 0 0 0 + 1 2 1 0 0 0 0 +V 1 halogen.bromine.aromatic +M END +$MOL + + -ISIS- 05280916272D + + 4 3 0 0 0 0 0 0 0 0999 V2000 + 3.4375 -7.7917 0.0000 R3 0 0 0 0 0 0 0 0 0 0 0 0 + 4.1520 -7.3792 0.0000 B 0 0 0 0 0 0 0 0 0 0 0 0 + 4.1520 -6.5542 0.0000 O 0 0 0 0 0 0 0 0 0 0 0 0 + 4.8664 -7.7917 0.0000 O 0 0 0 0 0 0 0 0 0 0 0 0 + 2 3 1 0 0 0 0 + 1 2 1 0 0 0 0 + 2 4 1 0 0 0 0 +V 2 boronicacid +M END +$MOL + + -ISIS- 05280916272D + + 2 1 0 0 0 0 0 0 0 0999 V2000 + 11.2667 -7.3417 0.0000 R1 0 0 0 0 0 0 0 0 0 0 0 0 + 11.9811 -6.9292 0.0000 R2 0 0 0 0 0 0 0 0 0 0 0 0 + 1 2 1 0 0 0 0 +M END'''), +] + +good_res = (0,0,2,1,(((0, 'halogen.bromine.aromatic'),), ((1, 'boronicacid'),))) +bad_res = (3,0,2,1,(((0, 'halogen.bromine.aromatic'),), ((1, 'boronicacid'),))) + +class TestCase(unittest.TestCase) : + def test_sanitize(self): + for status, block in test_data: + print("*"*44) + rxna = AllChem.ReactionFromRxnBlock(block) + rxnb = AllChem.ReactionFromRxnBlock(block) + rxna.Initialize() + res = rdChemReactions.PreprocessReaction(rxna) + print(AllChem.ReactionToRxnBlock(rxna)) + if status == "good": + self.assertEquals(res, good_res) + elif status == "bad": + self.assertEquals(res, bad_res) + print (">"*44) + rxnb.Initialize() + try: + rdChemReactions.SanitizeRxn(rxnb) + res = rdChemReactions.PreprocessReaction(rxnb) + print(AllChem.ReactionToRxnBlock(rxnb)) + self.assertEquals(res, good_res) + assert not status == "fail" + except: + print ("$RXN Failed") + if status == "fail": + continue + + + +if __name__ == '__main__': + unittest.main() diff --git a/Code/GraphMol/ChemReactions/Wrap/test_list.py b/Code/GraphMol/ChemReactions/Wrap/test_list.py index 3984a3ea3..e54208bf7 100644 --- a/Code/GraphMol/ChemReactions/Wrap/test_list.py +++ b/Code/GraphMol/ChemReactions/Wrap/test_list.py @@ -1,11 +1,15 @@ import sys -tests = [("python", "testReactionWrapper.py", {}), ] +tests=[ + ("python", "testReactionWrapper.py",{}), + ("python", "testEnumerations.py",{}), + ] -longTests = [] +longTests=[ + ] -if __name__ == '__main__': +if __name__=='__main__': import sys from rdkit import TestRunner - failed, tests = TestRunner.RunScript('test_list.py', 0, 1) + failed,tests = TestRunner.RunScript('test_list.py',0,1) sys.exit(len(failed)) diff --git a/Code/GraphMol/ChemReactions/testData/enumeration.pickle b/Code/GraphMol/ChemReactions/testData/enumeration.pickle new file mode 100644 index 0000000000000000000000000000000000000000..328e43d06b405d93e0e3f0199f1c4d0a4ea5be89 GIT binary patch literal 4601 zcmds4-EPw`6n31%`B@Ozm{?mX$i&4ea?`Z1O}*OKKtf0x8ZSW9MoXlwozfHmS9>8| zfk)vLcmy~nS-LcMO-k1daOFDwiNDXEk57DBEo^@ER|t&3+u~d66_5e`4GlcL^iilS1?Oa z3f5M*S*0o1W{KI%iEaf8LRl*`uWM?3Am^2?Lb;(lqe)Gz(2A#JT1JgyWdGqgL2F9R z$&)(0_R!=yjET`qcAf7T9^I~MAZAqsYUsd@#{GUHaR#NVb}s%IE6pFVX-izhhVW$K zN?s4cGq-=>yWy#~S^EhI)yJi!CvGn8neRlN`yy=ZYqS^h9z#U-kTM=)F8;ToJu*V= zUwu072i+h#^iD$$)X$LrGxhJ3y%Y=o7ut?$R?!ck;>RKQp~6FiryNt^g{5$&(k?x@ zR@{lZM61JN3m?TBJ+g4ym``>bjcIY&%U?U-5;H||s#3VUrEv;p`e#i}prt;5c7Z&oZW%zdJAV_2AsaIrEjaKfAKbD2V@MPL4+GT|vs(F^E8mBb0U6f(^(J z96DCpYCG1+;oBhUbPn7w@cozXBg(_5K%(eiet)VT(;olZES5q zeO54vF9K3I9E@4HI68o~0Bk3YW>FLcavCS#4$6=(1V(%)MM^IpL}>W{Efl)2Pg%YHO57_$piMx#3ce%X^cw{ zR%RLKw&iTMxZpZWiy8cO0uF0Upx9o#L`g|Xk?o~iE<09<1uqJ}wLeYTK?hRlep_=ckThN=;j`o_AzsH(`4m}<1s=F(gNB+ML|H%%8dE@kT*Od%|0>l>O* zG*}BT7B!X0Qp$MuIhQfUADCw_UIKI2dXyQ7qs~OpkhpK?D@2E70eMN+6ecI+^>CC(C)(N)O.Cl"); diff --git a/Code/GraphMol/ChemReactions/tutorial/EnumerationToolkit.ipynb b/Code/GraphMol/ChemReactions/tutorial/EnumerationToolkit.ipynb new file mode 100644 index 000000000..4671ce46e --- /dev/null +++ b/Code/GraphMol/ChemReactions/tutorial/EnumerationToolkit.ipynb @@ -0,0 +1,1095 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "RDKit Enumeration Toolkit \n", + "=========================\n", + "\n", + "RDKit Reaction Enumeration Toolkit tutorial.\n", + "\n", + "Here you will learn how to enumerate reactions with various building blocks." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from __future__ import print_function\n", + "from rdkit.Chem import AllChem\n", + "from rdkit.Chem import rdChemReactions\n", + "from rdkit.Chem.AllChem import ReactionFromRxnBlock, ReactionToRxnBlock\n", + "from rdkit.Chem.Draw import IPythonConsole\n", + "IPythonConsole.ipython_useSVG=True" + ] + }, + { + "cell_type": "raw", + "metadata": {}, + "source": [ + "The first thing we need is a reaction. Lets do a simple enumeration here, a halogen and a primary amine." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "rxn_data = \"\"\"$RXN\n", + "\n", + " ISIS 090220091539\n", + "\n", + " 2 1\n", + "$MOL\n", + "\n", + " -ISIS- 09020915392D\n", + "\n", + " 2 1 1 0 0 0 0 0 0 0999 V2000\n", + " -2.0744 0.1939 0.0000 L 0 0 0 0 0 0 0 0 0 0 0 0\n", + " -2.5440 -0.1592 0.0000 R# 0 0 0 0 0 0 0 0 0 1 0 0\n", + " 1 2 1 0 0 0 0\n", + " 1 F 2 17 35\n", + "V 1 halogen\n", + "M RGP 1 2 1\n", + "M ALS 1 2 F Cl Br \n", + "M END\n", + "$MOL\n", + "\n", + " -ISIS- 09020915392D\n", + "\n", + " 2 1 0 0 0 0 0 0 0 0999 V2000\n", + " 2.8375 -0.2500 0.0000 R# 0 0 0 0 0 0 0 0 0 2 0 0\n", + " 3.3463 0.0438 0.0000 N 0 0 0 0 0 0 0 0 0 3 0 0\n", + " 1 2 1 0 0 0 0\n", + "V 2 amine.primary\n", + "M RGP 1 1 2\n", + "M END\n", + "$MOL\n", + "\n", + " -ISIS- 09020915392D\n", + "\n", + " 3 2 0 0 0 0 0 0 0 0999 V2000\n", + " 13.5792 0.0292 0.0000 N 0 0 0 0 0 0 0 0 0 3 0 0\n", + " 14.0880 0.3229 0.0000 R# 0 0 0 0 0 0 0 0 0 1 0 0\n", + " 13.0704 0.3229 0.0000 R# 0 0 0 0 0 0 0 0 0 2 0 0\n", + " 1 2 1 0 0 0 0\n", + " 1 3 1 0 0 0 0\n", + "M RGP 2 2 1 3 2\n", + "M END\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAlgAAACWCAYAAAACG/YxAAAFd0lEQVR4nO3dbXLaOgCGUelOd9QF\nWllT15E1uT9y3RjjL+AlfPicmcwUI1KmBPtBVtza931fAACI+e/RTwAA4N0ILACAMIEFABAmsAAA\nwgQWAECYwAIACBNYAABhAgsAIExgAQCECSwAgDCBBQAQJrAAAMIEFgBAmMACAAgTWAAAYQILACBM\nYAEAhAksAIAwgQUAECawAADCBBYAQJjAAgAIE1gAAGECCwAgTGABAIQJLACAMIEFABAmsAAAwgQW\nAECYwAIACBNYAABhAgsAIExgAQCECSwAgDCBBQAQJrAAAMIEFgBAmMACAAgTWAAAYQILACBMYAEA\nhAksAIAwgQUAECawAADCBBYAQJjAAgAIE1gAAGECCwAgTGABAIQJLACAMIEFABAmsAAAwgQWAECY\nwAIACBNYAABhAgsAIExgAQCECSwAgDCBBQAQJrAAAMIEFgBAmMACAAgTWAAAYQILACBMYAHwUmqt\nj34KsElgAfBS+r4XWTw9gQXAyxFZPDuBdYX6WU++pvfBPbTWHv0U4KmIrNd0lJes9n3fP/pJvJL6\nWUv/uz/b1v3pSmtt9n64RK2ljN+VQ1h9fLSy9G4d77C8ozmaWmtxKFs39wFtum18e+0D3dq4tb9n\n2E/1/de2PR8a9457Rr8e/QReyVI8DXEFKbWW0nWtlDIOrOWx42PL9Da8u2EmS2Stm4uhYds0ZJbC\nZs+46e3zsLr8ub4igXWD8Q8mpK3NWI05poDIutXe49glx7shrLqunR0vt77POxxfrcG60lDtr/zi\n83zGn/T2jJtuM3vFkVmTlTE3S7Vn3PS+vr9sf/Rux1MzWFcQVqSNw2r657kd1No2kcWRmclatne9\n1LXjpuMvnYV6t+OqwLpQ/aylb9645KwF0VpkAfNE1rytWalbFp4vfe93i6ZLOEW409rlF1yagVvc\negxwRuRcrdXXwb+GnwPmrS1WX/tnW3rc3kXxR2IGa4fpbw9Og8plGRhb2sks3V7b+bTWSteVUuvy\nGM6ZuaBWM1jXmi5VKGV+Jr21tuvyMV13n+f57A4fWFsHwyGuxhW+FlTdn66U3+EnycuZC6rxeoSl\nT47Ljzn9VDne2Y13hsNtODJxdZ3xfqjrvm5P9zvD9u9xrZTyfXs8dngJhv3X1kvybrNdh7/Q6NK5\n5LnFeVszDXvG8f62fqb2jL9lHByZuLqP+v+1+aYzVnvC6ZJx78QarA0OftzT3GzWnnHAOXF1X19L\nFpb3RXNrt+qBLx9z+FOEt3DQY8namqvxtmvHAafE1c/4Pm3Yhi3/7nP5mFMCq+w7GO55HAy2ZqXM\njEKOuPpZX+s+h1OFrcytwcIpwlKKq7JzX3sXtW89Djgnrh7r46PNbneFDIF1xkEN4DWIq8eZ/vYy\n55wihAdYOi29tehd/MMXcfV408hy+ZhTAusGZruYs7Tmau3+S8fBkYmrn7f233ktjTn6S3T462AB\nAKRZgwUAECawAADCBBYAQJjAAgAIE1gAAGECCwAgTGABAIQJLACAMIEFABAmsAAAwgQWAECYwAIA\nCBNYAABhAgsAIExgAQCECSwAgDCBBQAQJrAAAMIEFgBAmMACAAgTWAAAYQILACBMYAEAhAksAIAw\ngQUAECawAADCBBYAQJjAAgAIE1gAAGECCwAgTGABAIQJLACAMIEFABAmsAAAwgQWAECYwAIACBNY\nAABhAgsAIExgAQCECSwAgDCBBQAQJrAAAMIEFgBAmMACAAgTWAAAYQILACBMYAEAhAksAIAwgQUA\nECawAADCBBYAQJjAAgAIE1gAAGECCwAgTGABAIQJLACAMIEFABAmsAAAwgQWAECYwAIACBNYAABh\nAgsAIExgAQCE/QWlzQdz1MzSTQAAAABJRU5ErkJggg==\n", + "text/plain": [ + "" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "rxn = ReactionFromRxnBlock(rxn_data)\n", + "rxn" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Sanitizing Reaction Blocks\n", + "==========================\n", + "\n", + "Reaction blocks come from many different sketchers, and some don't follow the MDL conventions very well. It is always a good idea to sanitize your reaction blocks first. This is also true for Smiles reactions if they are in kekule form." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "rdkit.Chem.rdChemReactions.SanitizeFlags.SANITIZE_NONE" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "AllChem.SanitizeRxn(rxn)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Preprocessing Reaction Blocks\n", + "=============================\n", + "\n", + "You will note that there are some special annotations in the reaction block:\n", + " \n", + " V 1 halogen\n", + " V 2 amine.primary\n", + " \n", + "These allows us to specify functional groups with very specific smarts patterns. \n", + "These smarts patterns are preloaded into the RDKit, but require the use of PreprocessReactions\n", + "to embed the patterns." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of warnings: 0\n", + "Number of preprocessing errors: 0\n", + "Number of reactants in reaction: 2\n", + "Number of products in reaction: 1\n", + "Preprocess labels added: (((0, 'halogen'),), ((1, 'amine.primary'),))\n" + ] + } + ], + "source": [ + "rxn.Initialize()\n", + "nWarn, nError, nReactants, nProducts, labels = AllChem.PreprocessReaction(rxn)\n", + "print (\"Number of warnings:\", nWarn)\n", + "print (\"Number of preprocessing errors:\", nError)\n", + "print (\"Number of reactants in reaction:\", nReactants)\n", + "print (\"Number of products in reaction:\", nProducts)\n", + "print (\"Preprocess labels added:\", labels)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "So now, this scaffold will only match the specified halogens and a primary amine. Let's get some!" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--2016-11-05 09:31:09-- http://www.sigmaaldrich.com/content/dam/sigma-aldrich/docs/Aldrich/General_Information/1/sdf-benzylic-primary-amines.sdf\n", + "Resolving usca-proxy01.na.novartis.net... 160.62.237.221\n", + "Connecting to usca-proxy01.na.novartis.net|160.62.237.221|:2011... connected.\n", + "Proxy request sent, awaiting response... 200 OK\n", + "Length: 165130 (161K) [chemical/x-mdl-sdfile]\n", + "Saving to: 'amines.sdf'\n", + "\n", + "amines.sdf 100%[=====================>] 161.26K 759KB/s in 0.2s \n", + "\n", + "2016-11-05 09:31:09 (759 KB/s) - 'amines.sdf' saved [165130/165130]\n", + "\n" + ] + } + ], + "source": [ + "!wget http://www.sigmaaldrich.com/content/dam/sigma-aldrich/docs/Aldrich/General_Information/1/sdf-benzylic-primary-amines.sdf -O amines.sdf" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--2016-11-05 09:31:09-- http://www.sigmaaldrich.com/content/dam/sigma-aldrich/docs/Aldrich/General_Information/1/sdf-alkyl-halides.sdf\n", + "Resolving usca-proxy01.na.novartis.net... 160.62.237.221\n", + "Connecting to usca-proxy01.na.novartis.net|160.62.237.221|:2011... connected.\n", + "Proxy request sent, awaiting response... 200 OK\n", + "Length: 149722 (146K) [chemical/x-mdl-sdfile]\n", + "Saving to: 'halides.sdf'\n", + "\n", + "halides.sdf 100%[=====================>] 146.21K 634KB/s in 0.2s \n", + "\n", + "2016-11-05 09:31:10 (634 KB/s) - 'halides.sdf' saved [149722/149722]\n", + "\n" + ] + } + ], + "source": [ + "!wget http://www.sigmaaldrich.com/content/dam/sigma-aldrich/docs/Aldrich/General_Information/1/sdf-alkyl-halides.sdf -O halides.sdf" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "reagents = [\n", + " [x for x in AllChem.SDMolSupplier(\"halides.sdf\")],\n", + " [x for x in AllChem.SDMolSupplier(\"amines.sdf\")]\n", + " ]" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "number of reagents per template: [149, 131]\n" + ] + } + ], + "source": [ + "print (\"number of reagents per template:\", [len(x) for x in reagents])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Basic Usage\n", + "===========\n", + "\n", + "Creating a library for enumeration\n", + "----------------------------------\n", + "\n", + "Using the enumerator is simple, simply supply the desired reaction and reagents. The library filters away non-matching reagents by default. The RDKit will log any removed reagents to the info log." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "RDKit INFO: [09:31:10] Removed 37 non matching reagents at template 0\n" + ] + } + ], + "source": [ + "library = rdChemReactions.EnumerateLibrary(rxn, reagents)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If you only want each reactant to match once ( and hence only produce one product per reactant set ) you can adjust the parameters:" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "RDKit INFO: [09:31:10] Removed 38 non matching reagents at template 0\n", + "RDKit INFO: [09:31:10] Removed 11 non matching reagents at template 1\n" + ] + } + ], + "source": [ + "params = rdChemReactions.EnumerationParams()\n", + "params.reagentMaxMatchCount = 1\n", + "library = rdChemReactions.EnumerateLibrary(rxn, reagents, params=params)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Enumerating the library\n", + "-----------------------\n", + "\n", + "A library has an enumerator that determines what reagents are selected for purposes of enumeration.\n", + "The default enumerator is a CartesianProduct enumerator, which is a fancy way of saying enumerate everything. You can get hold this enumerator by using the **GetEnumerator** method." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "enumerator = library.GetEnumerator()\n", + "print (enumerator)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Possible number of permutations: 13320\n" + ] + } + ], + "source": [ + "print (\"Possible number of permutations:\", enumerator.GetNumPermutations())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Understanding results of enumerations\n", + "-------------------------------------\n", + "\n", + "Each enumeration result may contain multiple resulting molecules. Consider a reaction setup as follows:\n", + "\n", + " A + B >> C + D\n", + " \n", + "There may be multiple result molecules for a number of reasons:\n", + "\n", + " 1. The reactant templates (A and B) match a reagent multiple times.\n", + " Each match has to analyzed to form a new product. Hence,\n", + " the result has to be a vector of products.\n", + " 2. There me be multiple product templates, i.e. C+D as shown above\n", + " where C and D are two different result templates. These are\n", + " output in a result as follows:\n", + " \n", + " result = enumerator.next()\n", + " \n", + " result == [ [results_from_product_template1], \n", + " [results_from_product_template2], ... ]\n", + " \n", + " result[0] == [results_from_product_template1]\n", + " result[1] == [results_from_Product_template2]\n", + " \n", + "\n", + " \n", + "Because there may be multiple product templates specified with\n", + "potentially multiple matches, iterating through the results to\n", + "get to the final molecules isa bit complicated and requires three loops. Here we use:\n", + "\n", + " * **result** for the result of reacting one set of reagents\n", + " * **productSet** for the products for a given product template\n", + " * **mol** the actual product\n", + " \n", + "In many reactions, this will result in a single molecule, but the\n", + "datastructures have to handle the full set of results:\n", + " \n", + "```\n", + " for result in enumerator:\n", + " for productSet in results:\n", + " for mol in productSet:\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of result sets 13320\n", + "Number of result molecules 13320\n" + ] + } + ], + "source": [ + "count = 0\n", + "totalMols = 0\n", + "for results in library:\n", + " for productSet in results:\n", + " for mol in productSet:\n", + " totalMols += 1\n", + " count += 1\n", + "print(\"Number of result sets\", count)\n", + "print(\"Number of result molecules\", totalMols)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "*Note: the productSet above may be empty if one of the current reagents did\n", + "not match the reaction!*" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "*Note: the number of permutations is not the same as the number of molecules. There may be more or less depending on how many times the reagent matched the template, or if the reagent matched\n", + "at all.*" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "How does the enumerator work?\n", + "=============================\n", + "\n", + "As mentioned, you can make a copy of the current enumeration scheme using the **GetEnumerator** method. Lets make a copy of this enumerator by **copying** it using copy.copy(..), this makes a copy so we don't change the state of the Library." + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "import copy\n", + "enumerator = copy.copy(library.GetEnumerator())\n", + "print(enumerator)\n", + "test_enumerator = copy.copy(enumerator)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's play with this enumerator.\n", + "\n", + "First: let's understand what the position means (this is the same as **library.GetPosition**)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[110L, 119L]" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "list(test_enumerator.GetPosition())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "What this means is make the product from reagents[0][111] and reagents[1][130]" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAcIAAACWCAIAAADCEh9HAAADPUlEQVR4nO3dQW4iMRRF0dDqHYX9\nr6BZEz2IFEVJQMBz2d/2OUOkSDUgV68ohE/X6/UNgFf9GX0BAHOTUYCIjAJEZBQgIqMAERkFiMgo\nQERGASIyChCRUYCIjAJEZBQgIqMAERkFiMgoQERGASJ/R1/Avs6Xy7dX/r2/D7kSICGjI+kmLEBG\ni/q5VT9dzuc7f+hUGOhMRou6N1TvhvJ0Oikp9CSjI32dnG7wYVIyOtIR6bxerwYp9OQLTwARGV3Q\nxyAdfRWwCxldk5JCNz5EW5ZPSKEPa3RZBin0IaMAERldmUEKHcjo4pQUjiajABEZXZ9BCoeSUYCI\njG7BIIXjyChAREZ3YZDCQWR0I0oKR5BRgIiM7sUgheZkFCDit9R21HyQehexMxmlAb9tys7c1ANE\nZJQGPLliZzIKEJFR2jBI2ZaMAkRklGYMUvYkowARGaUlg5QNyShAREZpzCBlNzJKe0rKVmQUICKj\nHMIgZR8yChCRUY5ikLIJGQWIyCgHMkjZgYwCRGSUYxmkLE9GASJOIqMHZ5GyMBllSs4ipQ439QAR\nGWVKnlxRh4wCRGSUWRmkFCGjABEZZWIGKRXIKHNTUoaTUYCIjDI9g5SxZBQgIqOswCBlIBkFiMgo\nizBIGUVGASIyyjoMUoaQUZaipPQnowARGWU1BimdyShAREZZkEFKTzIKEHG8IstqO0j9p3CLjMJD\nHOnMLW7qASIyCg/x2IpbZBQgIqPwKIOUX8koPEFJ+UlGASIyCs8xSPlGRgEiMgpPM0j5SkbhFUrK\nJxkFiMgovMgg5YOMAkRkFF5nkPImowAhGYWIQYqMQkpJNyejABEZhQYM0p3JKEDEKV3QjLNI9ySj\nUJSzSGfhph4gIqNQlMdWs5BRgIiMQl0G6RRkFEpT0vpkFCAio1CdQVqcjAJEZBQmYJBWJqMwByUt\nS0YBIjIK0zBIa5JRgIiMwkwM0oJkFCAiozAZg7QaGYX5KGkpMgoQkVGYkkFah4wCRGQUZmWQFiGj\nMDFHh1YgowARGQWIyChAREYBIjIKEJFRgIiMAkRkFCAiowARGQWIyChAREYBIjIKEJFRgIiMAkRk\nFCAiowARGQWI/AeU+fmfzKiTyQAAAABJRU5ErkJggg==\n", + "image/svg+xml": [ + "\n", + "\n", + " \n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "F\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "reagents[0][111]" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAcIAAACWCAIAAADCEh9HAAAD1klEQVR4nO3dwU7bUBRFUVzx4fnz\ndJC2orSgluN778vzWkIMEBIeRJvjxHGO+/3+AsBXfZs+AIDnJqMAERkFiMgoQERGASIyChCRUYCI\njAJEZBQgIqMAERkFiMgoQERGASIyChCRUYCIjAJEZJST3W63X9/hCg53v+d0t9tNRrkOa5ST3X6a\nPhBoYo0CRKxRgIiMAkRkFCAiowARGQWIyChAREYBIjIKEJFROnhTExuTUTp4eygbk1GAiIzSxCBl\nVzIKEJFR+hikbMmN8mh1HB5y7MYapdX9fj+OY/oo4EwyChCRUboZpGxGRhmgpOxERgEiXjZlzLmD\n1COZKa/TB8BFnX7lk0upmOKkHiAiowyoWI5etmKKjAJEZJRudU9iGqSMkFFW8V8FlEvWIaO0+miK\n/u9E/Wh4GqT0k1H69FyTpKQ0k1HmfS2vcskiZJQmnZfHKyydZJRhSV7lkhXIKB3636mpsLSRUSbl\neZVLxsko5aZuGqKw9JBRxpyVV7lkloxS66zr7T/nanwGySiFVrgHqJJSTUYZ4EZ57ERGqbLCFH1Q\nWErJKN3cKI/NyCgl1pmiDwpLHRmlVXVe5ZJ+Msr5VpuiDwpLERnlfB8Fa/Z+o2vGnQ3IKH06b5TX\n84fgRUYpstoZtClKHRkFiMgoVdYZpKYopWSUQiuUVEOpJqMAERml1uwgNUVpIKMAERml3NQgNUXp\nIaN0WOG1Jigio+zJFKWNjNKkc5BqKJ1epw+Aq8vbqpjM8k+bVg070RSlmZN6gIiM0qr6GVJTlH4y\nSre6kmooI2QUICKjDKgYpKYoU2QUIOIfOGPOHaQeyUyRUTbhpJ4pTuoBIjLKJtxEiikyChCRUfZh\nkDJCRtmKktJPRgEiMspuDFKayShAREbZkEFKJxllT0pKGxkFiMgo2zJI6SGjABEZZWcGKQ1kFCAi\no2zOIKWajLI/JaWUjAJEZJRLMEipI6MAERnlKgxSisgoV+GjQykiowARGeUSTFHqyChAREbZnylK\nKRnlCjSUQjLK5o7jxRKllIwCRGSUnZmiNJBRtqWh9JBRgIiMsidTlDYyyp40lDYyChCRUYCIjAJE\nZBQgIqMAERkFiMgoC/nzo5Le/uQ4fnz94+9DDxnlOTwup398fV5SaCajPIePLqd/V1XoJ6M8Ge/y\nZDWv0wcAv/l8Wv61oY9Bqq1MkVHW8q6G754G/fzUXkkZ4aSe56CSLEtG2YTXmpjipJ6n8baSf12m\nSsoIn98NEHFSDxCRUYCIjAJEZBQgIqMAERkFiMgoQERGASIyChCRUYCIjAJEZBQg8h0jx3spAcHh\ngQAAAABJRU5ErkJggg==\n", + "image/svg+xml": [ + "\n", + "\n", + " \n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "*\n", + "NH2\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "reagents[1][130]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This also appears to be the last product. So lets' start over." + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "RDKit INFO: [09:31:11] Removed 38 non matching reagents at template 0\n", + "RDKit INFO: [09:31:11] Removed 11 non matching reagents at template 1\n" + ] + }, + { + "data": { + "text/plain": [ + "[0L, 0L]" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "library = rdChemReactions.EnumerateLibrary(rxn, reagents, params=params)\n", + "test_enumerator = copy.copy(library.GetEnumerator())\n", + "list(test_enumerator.GetPosition())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can Skip to the 100th result" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[99L, 0L]\n" + ] + } + ], + "source": [ + "test_enumerator.Skip(100)\n", + "pos = list(test_enumerator.GetPosition())\n", + "print(pos)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAcIAAACWCAIAAADCEh9HAAACjklEQVR4nO3cMWrDQBBA0WzIjZz7\n3yBn2hQBkziosL7Bu/i9SrgQKuzPSCM85pxvAJz1/uwLANibjAIkMgqQyChAIqMAiYwCJDIKkMgo\nQCKjAImMAiQyCpDIKEAiowCJjAIkMgqQyChAIqMAiYwCJDIKkMgoQCKjAImMAiQyCpDIKEAiowCJ\njAIkMgqQyChAIqMAiYwCJDIKkMgoQCKjAImMAiQyCpDIKEAiowCJjAIkMgqQyChAIqMAiYwCJDIK\nkMgoQCKjAImMAiQyCpDIKEAiowCJjAIkMgqQyChAIqMAiYwCJDIKkMgoQCKjAImMAiQyCpDIKEAi\nowDJx7MvgF2Nr3E9npd5/fB6DC9CRjnjJpdH9fyd2j8+D888pwqzGRnlbv+jeTSBHk6mx6kcYygp\ne/FsFCCRUdYy5xzj4FEALElGARIZZTkGUvYio9xtXubNCv5wIw8vwKaeM25K+vB3RX8GUit7tuCb\nyrqUlC24qQdIZJR12TWxBRkFSGSUpRlIWZ+MAiQyyuoMpCxORgESGWUDBlJWJqPsQUlZlowCJDLK\nNgykrElGARIZZScGUhYkowCJPyJjP48dSP0EiGQUIHFTD5DIKEAiowCJjAIkMgqQyChAIqMAiYwC\nJDIKkMgoQCKjAImMAiQyCpDIKEAiowCJjAIkMgqQyChAIqMAiYwCJDIKkMgoQCKjAImMAiQyCpDI\nKEAiowCJjAIkMgqQyChAIqMAiYwCJDIKkMgoQCKjAImMAiQyCpDIKEAiowCJjAIkMgqQyChAIqMA\niYwCJDIKkMgoQCKjAImMAiQyCpDIKEAiowCJjAIkMgqQyChAIqMAiYwCJDIKkMgoQCKjAImMAiQy\nCpDIKEDyDQfuWIMgl0pDAAAAAElFTkSuQmCC\n", + "image/svg+xml": [ + "\n", + "\n", + " \n", + "\n", + "\n", + "\n", + "\n", + "Cl\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "reagents[0][pos[0]]" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAcIAAACWCAIAAADCEh9HAAADEElEQVR4nO3dQU5UURRFUcs4CubE\nnHBMOiYdxrdBYkAtArUr+ffGtXp0Kq8Bu877VOByHMcnAG71+ewDAOwmowCJjAIkMgqQyChAIqMA\niYwCJDIKkMgoQCKjAImMAiQyCpDIKEAiowCJjAIkMgqQyChAIqMAiYwCJDIKkMgoQCKjAImMAiQy\nCpDIKEAiowCJjAIkMgqQyChAIqMAiYwCJDIKkMgoQCKjAImMAiQyCpDIKEAiowCJjAIkMgqQyCiD\nXC6Xs48AHyajDHIch5KyjowCJDLKLAYp68goQCKjjGOQsouMMpGSsoiMAiQyylAGKVvIKEAio8xl\nkLKCjAIkMspoBinzySjTKSnDyShAIqMsYJAymYwCJDLKDgYpY8koaygpM8koQCKjbGKQMpCMAiSX\n4zjOPgN8zH0HqR8Boi9nHwA+5nLx3s8sLvUAiYyyiSnKQDIKkMgoa5iizCSj7PBGQ32SlHPJKLuZ\nqJxORllAK5lMRllMXplARplOKxlORtlKXhlCRhlNK5lPRpnr7Q85yStDyChAIqMMZYqyhYyyjIYy\njYwykVayiIwyjus8u8goQCKjzGKKso7/xcSHPT0+vPry24+zTgITyCi3eJnOp8eHayX9I7i/ff3+\n89orm6KsI6PczXNPX1b1al6vv4hcso5no9zTG8u00FYms0a5xcvb+qsLfm7ocRx/R1NDmUxGucU7\nn43C/8ClnnGeB+nZp4D3klGAxKWeW1x7Nnov/3xCCjP5TmUuJWUFl3qAREaZy++aWEFGARIZZTSD\nlPlkFCCRUaYzSBlORgESGWUBg5TJZJQdlJSxZBQgkVHWMEiZSUYBEhllE4OUgWQUIJFRljFImUZG\n2ccfIWUUGQVIZBQgkVGAREYBEhkFSGQUIJFRgERGARIZBUhkFCCRUYBERgESGQVIZBQgkVGAREYB\nEhkFSGQUIJFRgERGARIZBUhkFCCRUYBERgESGQVIZBQgkVGAREYBEhkFSGQUIJFRgERGARIZBUhk\nFCCRUYBERgESGQVIZBQgkVGAREYBkl/kes0Ni0lSPQAAAABJRU5ErkJggg==\n", + "image/svg+xml": [ + "\n", + "\n", + " \n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "Br\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "reagents[0][pos[1]]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's advance by one here and see what happens. It's no surprise that for the CartesianProduct strategy the first index is increased by one." + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[100L, 0L]\n" + ] + } + ], + "source": [ + "pos = test_enumerator.next()\n", + "print(list(pos))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Enumeration States\n", + "==================\n", + "\n", + "Enumerations have states as well, so you can come back later using **GetState** and **SetState**\n", + "\n", + "**GetState** returns a text string so you can save this pretty much anywhere you like.\n", + "\n", + "Let's skip to the 100th sample and save both the state and the product at this step." + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "State is:\n", + " '22 serialization::archive 12 0 1 1 31 RDKit::CartesianProductStrategy 1 1\\n0 0 1 2 0 99 0 2 0 111 120 13320 100\\n'\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "RDKit INFO: [09:31:11] Removed 38 non matching reagents at template 0\n", + "RDKit INFO: [09:31:11] Removed 11 non matching reagents at template 1\n" + ] + } + ], + "source": [ + "library = rdChemReactions.EnumerateLibrary(rxn, reagents, params=params)\n", + "# skip the first 100 molecules\n", + "library.GetEnumerator().Skip(100)\n", + "# get the state\n", + "\n", + "state = library.GetState()\n", + "print(\"State is:\\n\", repr(state))\n", + "\n", + "result = library.next()\n", + "for productSet in result:\n", + " for mol in productSet:\n", + " smiles = AllChem.MolToSmiles(mol)\n", + " break" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now when we go back to this state, the next molecule should be the one we just saved." + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CCNCc1cccc(c1)B1OC(C)(C)C(C)(C)O1 == CCNCc1cccc(c1)B1OC(C)(C)C(C)(C)O1 !\n" + ] + } + ], + "source": [ + "library.SetState(state)\n", + "result = library.next()\n", + "for productSet in result:\n", + " for mol in productSet:\n", + " assert AllChem.MolToSmiles(mol) == smiles\n", + " print(AllChem.MolToSmiles(mol), \"==\", smiles, \"!\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Resetting the enumeration back to the beginning\n", + "===============================================\n", + "\n", + "To go back to the beginning, use **Reset**, for a CartesianProductStrategy this should revert back to [0,0] for indexing these reagents.\n", + "\n", + "This is useful because the state of the library is saved when the library\n", + "is serialized. See **Pickling Libraries** below." + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[0L, 0L]\n" + ] + } + ], + "source": [ + "library.ResetState()\n", + "print(list(library.GetPosition()))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Pickling Libraries\n", + "==================\n", + "\n", + "The whole library, including all reagents and the current enumeration state reagents is saved when the library is serialized." + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "s = library.Serialize() # XXX bug need default arg" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "library2 = rdChemReactions.EnumerateLibrary()\n", + "library2.InitFromString(s)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And the libraries are in lock step." + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Result library1 CC(C)=C(C)NCc1cccc(c1)B1OC(C)(C)C(C)(C)O1\n", + "Result library2 CC(C)=C(C)NCc1cccc(c1)B1OC(C)(C)C(C)(C)O1\n", + "Result library1 C=C(C)NCc1cccc(c1)B1OC(C)(C)C(C)(C)O1\n", + "Result library2 C=C(C)NCc1cccc(c1)B1OC(C)(C)C(C)(C)O1\n", + "Result library1 CC=C(C)NCc1cccc(c1)B1OC(C)(C)C(C)(C)O1\n", + "Result library2 CC=C(C)NCc1cccc(c1)B1OC(C)(C)C(C)(C)O1\n", + "Result library1 CC=C(C)NCc1cccc(c1)B1OC(C)(C)C(C)(C)O1\n", + "Result library2 CC=C(C)NCc1cccc(c1)B1OC(C)(C)C(C)(C)O1\n", + "Result library1 CCC(C)(C)NCc1cccc(c1)B1OC(C)(C)C(C)(C)O1\n", + "Result library2 CCC(C)(C)NCc1cccc(c1)B1OC(C)(C)C(C)(C)O1\n", + "Result library1 CC(C)(C)NCc1cccc(c1)B1OC(C)(C)C(C)(C)O1\n", + "Result library2 CC(C)(C)NCc1cccc(c1)B1OC(C)(C)C(C)(C)O1\n", + "Result library1 CCCCCCCCCCC(C)NCc1cccc(c1)B1OC(C)(C)C(C)(C)O1\n", + "Result library2 CCCCCCCCCCC(C)NCc1cccc(c1)B1OC(C)(C)C(C)(C)O1\n", + "Result library1 CCCCCC(C)NCc1cccc(c1)B1OC(C)(C)C(C)(C)O1\n", + "Result library2 CCCCCC(C)NCc1cccc(c1)B1OC(C)(C)C(C)(C)O1\n", + "Result library1 CCCC(C)NCc1cccc(c1)B1OC(C)(C)C(C)(C)O1\n", + "Result library2 CCCC(C)NCc1cccc(c1)B1OC(C)(C)C(C)(C)O1\n", + "Result library1 CCC(CC)NCc1cccc(c1)B1OC(C)(C)C(C)(C)O1\n", + "Result library2 CCC(CC)NCc1cccc(c1)B1OC(C)(C)C(C)(C)O1\n" + ] + } + ], + "source": [ + "for i in range(10):\n", + " result = library.next()\n", + " for productSet in result:\n", + " for mol in productSet:\n", + " print(\"Result library1\", AllChem.MolToSmiles(mol))\n", + " result = library2.next()\n", + " for productSet in result:\n", + " for mol in productSet:\n", + " print(\"Result library2\", AllChem.MolToSmiles(mol)) " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "*Note: Don't forget that the enumeration state can be saved independently and applied to a serialized library. Note that you will need to be careful to ensure that the enumeration state actually came from the library you are applying it against!*" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Additional Enumeration Strategies\n", + "==================================" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "*rdChemReactions.RandomSampleStrategy* - randomly sample from the building blocks\n", + "\n", + "*rdChemReactions.RandomSampleAllBBsStrategy* - randomly sample, but force using all reagents\n", + "\n", + "*rdChemReactions.EvenSamplePairs* - evenly sample pairs of building blocks (use only for generation of small libraries)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "anaconda-cloud": {}, + "kernelspec": { + "display_name": "Python [Root]", + "language": "python", + "name": "Python [Root]" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.11" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/Code/GraphMol/FilterCatalog/FilterMatcherBase.h b/Code/GraphMol/FilterCatalog/FilterMatcherBase.h index 7019e8a03..fed5da06f 100644 --- a/Code/GraphMol/FilterCatalog/FilterMatcherBase.h +++ b/Code/GraphMol/FilterCatalog/FilterMatcherBase.h @@ -116,10 +116,19 @@ class FilterMatcherBase virtual bool hasMatch(const ROMol &mol) const = 0; //------------------------------------ - //! Clone + //! Clone - deprecated // Clones the current FilterMatcherBase into one that // can be passed around safely. - virtual boost::shared_ptr Clone() const = 0; + virtual boost::shared_ptr Clone() const { + BOOST_LOG(rdWarningLog) << "FilterMatcherBase::Clone is deprecated, use copy instead" << std::endl; + return copy(); + } + + //------------------------------------ + //! copy + // copies the current FilterMatcherBase into one that + // can be passed around safely. + virtual boost::shared_ptr copy() const = 0; private: #ifdef RDK_USE_BOOST_SERIALIZATION diff --git a/Code/GraphMol/FilterCatalog/FilterMatchers.cpp b/Code/GraphMol/FilterCatalog/FilterMatchers.cpp index 7258e0259..7a684f170 100644 --- a/Code/GraphMol/FilterCatalog/FilterMatchers.cpp +++ b/Code/GraphMol/FilterCatalog/FilterMatchers.cpp @@ -91,7 +91,7 @@ bool SmartsMatcher::getMatches(const ROMol &mol, if (d_min_count == 1 && d_max_count == UINT_MAX) { RDKit::MatchVectType match; onPatExists = RDKit::SubstructMatch(mol, *d_pattern.get(), match); - if (onPatExists) matchVect.push_back(FilterMatch(Clone(), match)); + if (onPatExists) matchVect.push_back(FilterMatch(copy(), match)); } else { // need to count const bool uniquify = true; unsigned int count = @@ -99,7 +99,7 @@ bool SmartsMatcher::getMatches(const ROMol &mol, onPatExists = (count >= d_min_count && (d_max_count == UINT_MAX || count <= d_max_count)); if (onPatExists) { - boost::shared_ptr clone = Clone(); + boost::shared_ptr clone = copy(); for (size_t i = 0; i < matches.size(); ++i) { matchVect.push_back(FilterMatch(clone, matches[i])); } diff --git a/Code/GraphMol/FilterCatalog/FilterMatchers.h b/Code/GraphMol/FilterCatalog/FilterMatchers.h index 0c9382215..1bfacf9fd 100644 --- a/Code/GraphMol/FilterCatalog/FilterMatchers.h +++ b/Code/GraphMol/FilterCatalog/FilterMatchers.h @@ -58,7 +58,7 @@ class And : public FilterMatcherBase { //! True if arg1 and arg2 FilterMatchers are true And(const FilterMatcherBase &arg1, const FilterMatcherBase &arg2) - : FilterMatcherBase("And"), arg1(arg1.Clone()), arg2(arg2.Clone()) {} + : FilterMatcherBase("And"), arg1(arg1.copy()), arg2(arg2.copy()) {} And(const boost::shared_ptr &arg1, const boost::shared_ptr &arg2) @@ -93,7 +93,7 @@ class And : public FilterMatcherBase { return false; } - boost::shared_ptr Clone() const { + boost::shared_ptr copy() const { return boost::shared_ptr(new And(*this)); } @@ -122,7 +122,7 @@ class Or : public FilterMatcherBase { //! Constructs or Ander //! true if arg1 or arg2 are true Or(const FilterMatcherBase &arg1, const FilterMatcherBase &arg2) - : FilterMatcherBase("Or"), arg1(arg1.Clone()), arg2(arg2.Clone()) {} + : FilterMatcherBase("Or"), arg1(arg1.copy()), arg2(arg2.copy()) {} Or(const boost::shared_ptr &arg1, const boost::shared_ptr &arg2) @@ -154,7 +154,7 @@ class Or : public FilterMatcherBase { return res1 || res2; } - boost::shared_ptr Clone() const { + boost::shared_ptr copy() const { return boost::shared_ptr(new Or(*this)); } @@ -182,7 +182,7 @@ class Not : public FilterMatcherBase { // from getMatches since a false internal match matches // nothing! Not(const FilterMatcherBase &arg1) - : FilterMatcherBase("Not"), arg1(arg1.Clone()) {} + : FilterMatcherBase("Not"), arg1(arg1.copy()) {} Not(const boost::shared_ptr &arg1) : FilterMatcherBase("Not"), arg1(arg1) {} @@ -208,7 +208,7 @@ class Not : public FilterMatcherBase { return !arg1->getMatches(mol, matchVect); } - boost::shared_ptr Clone() const { + boost::shared_ptr copy() const { return boost::shared_ptr(new Not(*this)); } @@ -321,7 +321,7 @@ class SmartsMatcher : public FilterMatcherBase { virtual bool getMatches(const ROMol &mol, std::vector &matchVect) const; virtual bool hasMatch(const ROMol &mol) const; - virtual boost::shared_ptr Clone() const { + virtual boost::shared_ptr copy() const { return boost::shared_ptr(new SmartsMatcher(*this)); } @@ -403,7 +403,7 @@ class ExclusionList : public FilterMatcherBase { void addPattern(const FilterMatcherBase &base) { PRECONDITION(base.isValid(), "Invalid FilterMatcherBase"); - d_offPatterns.push_back(base.Clone()); + d_offPatterns.push_back(base.copy()); } void setExclusionPatterns( @@ -433,7 +433,7 @@ class ExclusionList : public FilterMatcherBase { return result; } - virtual boost::shared_ptr Clone() const { + virtual boost::shared_ptr copy() const { return boost::shared_ptr(new ExclusionList(*this)); } @@ -469,7 +469,7 @@ public: */ FilterHierarchyMatcher(const FilterMatcherBase &matcher) : FilterMatcherBase(), - d_matcher(matcher.Clone()) { + d_matcher(matcher.copy()) { } //! Return the name for this node (from the underlying FilterMatcherBase) @@ -491,7 +491,7 @@ public: */ void setPattern(const FilterMatcherBase & matcher) { PRECONDITION(matcher.isValid(), "Adding invalid patterns is not allowed."); - d_matcher = matcher.Clone(); + d_matcher = matcher.copy(); PRECONDITION(getName() == d_matcher->getName(), "Opps"); } @@ -527,8 +527,8 @@ public: return getMatches(mol, temp); } - //! Clones the FilterHierarchyMatcher into a FilterMatcherBase - virtual boost::shared_ptr Clone() const { + //! copys the FilterHierarchyMatcher into a FilterMatcherBase + virtual boost::shared_ptr copy() const { return boost::shared_ptr(new FilterHierarchyMatcher(*this)); } private: diff --git a/Code/GraphMol/FilterCatalog/Wrap/FilterCatalog.cpp b/Code/GraphMol/FilterCatalog/Wrap/FilterCatalog.cpp index 99bf832eb..5543b3828 100644 --- a/Code/GraphMol/FilterCatalog/Wrap/FilterCatalog.cpp +++ b/Code/GraphMol/FilterCatalog/Wrap/FilterCatalog.cpp @@ -71,7 +71,7 @@ void SetOffPatterns(ExclusionList &fc, boost::python::object list) { std::vector > temp; for (; begin != end; ++begin) { - temp.push_back((*begin)->Clone()); + temp.push_back((*begin)->copy()); } fc.setExclusionPatterns(temp); } @@ -147,7 +147,7 @@ class PythonFilterMatch : public FilterMatcherBase { functor(self), incref(false){}; - // ONLY CALLED FROM C++ from the Clone operation + // ONLY CALLED FROM C++ from the copy operation PythonFilterMatch(const PythonFilterMatch &rhs) : FilterMatcherBase(rhs), functor(rhs.functor), incref(true) { python::incref(functor); @@ -174,7 +174,7 @@ class PythonFilterMatch : public FilterMatcherBase { return python::call_method(functor, "HasMatch", boost::ref(mol)); } - virtual boost::shared_ptr Clone() const { + virtual boost::shared_ptr copy() const { return boost::shared_ptr(new PythonFilterMatch(*this)); } }; diff --git a/Contrib/Glare/README.txt b/Contrib/Glare/README.txt new file mode 100644 index 000000000..7ee3f8dbb --- /dev/null +++ b/Contrib/Glare/README.txt @@ -0,0 +1,49 @@ +Glare Algorithm. + +Implementation of + + GLARE: A New Approach for Filtering Large Reagent Lists in + Combinatorial Library Design Using Product Properties + Jean-Francois Truchon* and Christopher I. Bayly + + http://pubs.acs.org/doi/pdf/10.1021/ci0504871 + + Usage: + # somehow make sidechains1/2 with props [mw, alogp, tpsa] + r1 = RGroups(sidechains1) + r2 = RGroups(sidechains2) + lib = Library([r1, r2]) + props = [ + Property("mw", 0, 0, 500), + Property("alogp", 1, -2.4, 5), + Property("tpsa", 2, 0, 90) + ] + + glare = Glare() + glare.optimize(lib, props) + +Notes: +Some nomenclature: + + A Libary is made of RGroups + RGroups are a collection of sidechains (the paper uses Fragments) + that can populate the rgroup position. + + We desire to optimize the Library so that we have a good chance + of making the desired products. + + From the testing code, using Fake data: + + r1 = RGroups(makeFakeSidechains("aldehydes", num=1000)) + r2 = RGroups(makeFakeSidechains("boronic_acids", num=1500)) + + libs = Library([r1,r2]) + props = [ + Property("mw", propIdx=0, minValue=0, maxValue=500), + Property("alogp", propIdx=1, minValue=-2.4, maxValue=5), + Property("tpsa", propIdx=2, minValue=0, maxValue=90) + ] + + glare = Glare() + # optimize the library... + glare.optimize(libs, props) diff --git a/Contrib/Glare/glare.py b/Contrib/Glare/glare.py new file mode 100755 index 000000000..51348589f --- /dev/null +++ b/Contrib/Glare/glare.py @@ -0,0 +1,444 @@ +from __future__ import print_function +import random, operator, itertools, math + +""" +Glare Algorithm + +Some nomenclature: + + A Libary is made of RGroups + RGroups are a collection of sidechains (the paper uses Fragments) + that can populate the rgroup position. + + We desire to optimize the Library so that we have a good chance + of making the desired products. + + Example From the testing code, using Fake data: + + r1 = RGroups(makeFakeSidechains("aldehydes", num=1000)) + r2 = RGroups(makeFakeSidechains("boronic_acids", num=1500)) + + lib = Library([r1,r2]) + props = [ + Property("mw", propIdx=0, minValue=0, maxValue=500), + Property("alogp", propIdx=1, minValue=-2.4, maxValue=5), + Property("tpsa", propIdx=2, minValue=0, maxValue=90) + ] + + glare = Glare() + # optimize the library... + glare.optimize(lib, props) +""" + +class Property: + def __init__(self, name, propIdx, minValue, maxValue, scaffoldoffset=0.0): + """name, propIdx, minValue, maxValue, scaffoldoffset -> initial a Property + name is the name of the property. + propIdx: the index of the property in the property vector + minValue: the minimum acceptable value for the property + maxValue: the maximum acceptable value for the property + scaffoldoffset: any offset from the reaction scaffold (defaults to 0) + """ + self.name = name + self.propIdx = propIdx + self.minValue = minValue + self.maxValue = maxValue + self.offset = scaffoldoffset + + + def evaluate(self, sidechains): + """sidechains -> Evaluate a list of sidechains to see if they + pass the property values. + + Each sidechain must have a property vector e.g. (s.props for s in sidechains) + which is a vector of values where s.props[propIdx] is the property + being inspected + """ + product = self.offset + propIdx = self.propIdx + for s in sidechains: + product += s.props[propIdx] + return self.minValue <= product <= self.maxValue + +class Sidechain: + """Holds the name (identifier) and property list for the + given sidechain/fragment. Properties are assumed to + be numerical values""" + def __init__(self, name, props, goodCount=0): + """name, props, goodCount=0 -> initialize a Sidechain + initialize a sidechain. + name: the unique name for the sidechain + props: the property vector (see Properties class for details) + goodCount: the number of times this reagent belongs to + a good product, where good is a product that is in the desired + property space. + """ + self.name = name + self.props = props + self.good_count = goodCount # shared variable + self.dropped = False # shared variable + + def __str__(self): + return "Sidechain %s(%s, goodCount=%s)"%(self.name, + self.props, self.good_count) + def __repr__(self): + return "Sidechain(%r, %r, %s)"%(self.name, self.props, self.good_count) + +class RGroups: + """Holds a collection of sidechains for the given RGroup""" + def __init__(self, sidechains): + """Sidechains -> RGroups + sidechains: the list of Sidechains that make up the potential + sidechains at this rgroup position""" + self.sidechains = sidechains + + self.rejected = [] # list of rejected sidechains + self.initial_size = len(sidechains) + + def count(self): + """Returns the number of possible sidechains""" + return len(self.sidechains) + + def randomize(self): + """Randomly shuffles the sidechains and reset the goodness counts""" + random.shuffle(self.sidechains) + for s in self.sidechains: + s.good_count = 0 + + def effectiveness(self): + """-> return the current effectiveness of this collection + effectiveness is the number of items left divided by the + initial amount""" + + return len(self.sidechains)/float(self.initial_size) + + def chunk_size(self, num_chunks): + """num_chunks -> return the number of sidechains in each chunk + if the sidechains are split into num_chunks chunks""" + return int(math.ceil(float(len(self.sidechains))/num_chunks)) + + def chunk(self, chunk_idx, num_chunks): + """chunk_idx, num)chunks -> RGroups + return the chunk_idxth chunk given num_chunks total chunks""" + assert chunk_idx >=0 and chunk_idx < num_chunks, "%s %s"%( + chunk_idx, num_chunks) + + n = self.chunk_size(num_chunks) + return RGroups(self.sidechains[chunk_idx*n:(chunk_idx+1)*n]) + + def prune(self, fractionToKeep): + """fractionToKeep -> Sort the sidechains from the most often + found if good products to the least, and keep the best + fractionToKeep percentage""" + assert 0 < fractionToKeep <= 1.0, "fractionToKeep: %s"%fractionToKeep + + self.sidechains.sort(lambda x,y: -cmp(x.good_count, y.good_count)) + fragment_index = int(len(self.sidechains) * fractionToKeep + 0.5) + + # update rejected set + self.rejected += self.sidechains[fragment_index:] + self.sidechains = self.sidechains[:fragment_index] + +class Library: + """A library is a collection of RGroups that need to be combinitorially + combined""" + def __init__(self, rgroups): + """rgroups -> Initialize the Library. + rgroups: the list of possible RGroups that is combinitorially + combined to make the library""" + self.rgroups = rgroups + + def isValid(self): + """If we have an empty set for any rgroup, return False""" + for rg in self.rgroups: + if len(rg.sidechains) == 0: + return False + return True + + def randomize(self): + """randomize the order of the sidechains""" + for rg in self.rgroups: + rg.randomize() + + def getSidechainsPerPartition( self, total_num_partitions_per_rgroup ): + """total_num_partitions -> [num_fragments/partition for rgroup1, + num_fragments/partition for rgroup2] + return the number of sidechains in a partition + for each rgroup""" + + sizes = [ (libIdx, max(rg.count()/total_num_partitions_per_rgroup, 1)) + for libIdx, rg in enumerate(self.rgroups) ] + + # "optimially" apportion the partitions according the + # the glare paper see Appendix eq (8) and (9) + # sort by size + sizes.sort(lambda x,y: cmp(x[1], y[1])) + last_size = 1 + opt_sizes = [] + for libIdx, current_size in sizes[:-1]: + opt_sizes.append( (libIdx, + current_size - (current_size % last_size)) ) + last_size = current_size + + # From the Glare paper: + # the last library size is set equal to the second to last + # From Table 3, it is easy to understand that, if the fourth dimension + # was split in 24 instead of 12, a factor of 2 would be gained from the + # reduced size of the sublibraries. However, twice as many sublibraries + # would be needed, and the net speedup would be null, hence, the decision to + # set p4=p3. (p4 here is the last library) + libIdx, current_size = sizes[-1] + opt_sizes.append((libIdx, last_size)) + # back to the original library order + opt_sizes.sort() + res = [size for libIdx, size in opt_sizes] + return res + + def chunk(self, num_partitions): + """num_partitions -> [Library(..), Library(...)] + + Return new libraries that are chunks of this one. + These are the libraries that get sampled to see of + sidechains participate in good products. + """ + partitions = self.getSidechainsPerPartition(num_partitions) + max_subsets = max(partitions) + + enumeration_indices = [] + for i in xrange(max_subsets): + combinations = [] + for size in partitions: + combinations.append( i % size ) + enumeration_indices.append( combinations ) + + library_sets = [] + for subset_index, combinations in enumerate(enumeration_indices): + libs = [] + partitioned_rgroups = [] + for lib_index, libpart_index in enumerate(combinations): + lib = self.rgroups[lib_index] + num_chunks = partitions[lib_index] + partitioned_rgroups.append( lib.chunk(chunk_idx=libpart_index, + num_chunks=num_chunks)) + lib = Library(partitioned_rgroups) + if lib.isValid(): + library_sets.append(lib) + + return library_sets + + def effectiveness(self): + """-> returns the average effectiveness of this library set""" + sum = 0.0 + for rg in self.rgroups: + sum += rg.effectiveness() + return sum/len(self.rgroups) + + def evaluate(self, props): + """props -> num_good_enumerations, total_enumerations + + props: a list of Property evaluators for the fragments. + + returns the number of good enumerations and the total number of + enumerations for this Library""" + frags = [rg.sidechains for rg in self.rgroups] + good = 0 + bad = 0 + for i,frag in enumerate(itertools.product(*frags)): + for p in props: + if not p.evaluate(frag): + bad += 1 + break + else: + good += 1 + for sidechain in frag: + sidechain.good_count += 1 + return good, i+1 + + +class Glare: + """Glare Algorithm. Implementation of + + GLARE: A New Approach for Filtering Large Reagent Lists in + Combinatorial Library Design Using Product Properties + Jean-Francois Truchon* and Christopher I. Bayly + + http://pubs.acs.org/doi/pdf/10.1021/ci0504871 + + Usage: + # somehow make sidechains1/2 with props [mw, alogp, tpsa] + r1 = RGroups(sidechains1) + r2 = RGroups(sidechains2) + lib = Library([r1, r2]) + props = [ + Property("mw", 0, 0, 500), + Property("alogp", 1, -2.4, 5), + Property("tpsa", 2, 0, 90) + ] + + glare = Glare() + glare.optimize(lib, props) + """ + def __init__(self, + desiredFinalGoodness=0.95, + maxIterations=100, + rgroupScale=6.0, # None if no scaling + initialFraction=None,#None=auto -100., + numPartitions=16): + self.fractionGood = self.desiredFinalGoodness = desiredFinalGoodness + self.maxIterations = maxIterations + self.rgroupScale = rgroupScale + + if initialFraction is not None: + self.initialFraction = initialFraction/100. + else: + self.initialFraction = initialFraction + self.numPartitions = numPartitions + + def optimize(self, library, props): + """library, props + Given a Library and the list of Propery evaluators, + optimize the library. + The library is modified in place by removing building blocks + (sidechains) that are not likely to pass the property + criteria. + """ + # attempt to generate report like glare application + print ("------- PARAMETERS: --------------") + print ("GOOODNESS THRESHOLD : %s%%"%(self.desiredFinalGoodness * 100)) + print ("MIN PARTITION SIZE : %s"%self.numPartitions) + if self.initialFraction is None or self.initialFraction > 0.999: + print ("INITIAL FRACTION TO KEEP : AUTOMATIC") + else: + print ("INITAL FRACTION TO KEEP : %s%%"%(self.initialFraction*100)) + print ("Actual SIZE : %s = %s"%( + " x ".join([str(len(rg.sidechains)) for rg in library.rgroups]), + reduce(operator.mul, [len(rg.sidechains) for rg in library.rgroups]) + )) + + running_total = 0.0 + Gt = self.desiredFinalGoodness + + for iteration in range(1, self.maxIterations+1): + # chunk of the total library into smaller more managable sets + # and run combinitorial analysis on the sub libraries + # each of these records the number of times a sidechain is used + # in a successful enumeration which is then used to prune the + # library at the end + # + for rg in library.rgroups: + rg.randomize() + + good = total = 0.0 + chunked_libs = library.chunk(self.numPartitions) + # for each chunk, do the combinitorial check to see + # if reagents make good products + for libidx, chunk in enumerate(chunked_libs): + g,t = chunk.evaluate(props) + good += g + total += t + running_total += total + Gi = good/total # current goodness + + if Gi < 1e-12: + # I think we're done here :) + fraction = 0.0 + elif iteration == 1: + G0 = Gi # Goodness at first iteration + + # the first time, use the initalFraction or a "good enough" + # value + if self.initialFraction is not None: + fraction = K0 = self.initialFraction + else: + # auto choose the fraction based on the current good percentage + # and the desired + fraction = K0 = min(-1.1 * ( Gt - G0) + 1.2, + 0.9) + else: + # the second time, gradually eliminate reagents slowing + # down as the number of iterations increases + # see equation (5) in reference + if abs(Gt-G0) < 1e-4: + Ki = 1.0 + else: + Ki = (1.0 - K0) * (Gi - G0) / (Gt - G0) + K0; + fraction = min(1.0, Ki) + + # prune the library to keep the highest occuring sidechains + # note that even if all sidechains are acceptable, + # some will always get pruned + + max_size = float(max([len(rg.sidechains) for rg in library.rgroups])) + for rg in library.rgroups: + scale = 1.0 + if self.rgroupScale is not None: + # scale differently size rgroups via equation (6) in paper + numSidechains = len(rg.sidechains) + numer = 1.0 + denom = 1.0 + math.exp(-self.rgroupScale * + ((numSidechains/max_size) - 0.5)) + scale = numer/denom + fraction_to_reject = (1.0 - fraction) * scale + # keep the best fraction... + rg.prune(1.0 - fraction_to_reject) + + print ("-------------- ITERATION : %s ----------------------"%iteration) + print ("GOODNESS : %s%%"%(Gi * 100)) + print ("NUMBER EVAL : %s"%(total)) + print ("CUMUL EVAL : %s"%(running_total)) + print ("KEPT IN STEP : %s%%"%(fraction*100.)) + if not iteration: + print ("GOODNESS THRESHOLD : %s"%self.desiredFinalGoodness) + print ("MIN PARTITION SIZE : %s"%self.numPartitions) + print ("INITIAL FRACTION TO KEEP : ") + if self.fractionToKeep > 0.999: + print ("AUTOMATIC") + else: + print ("%s%%"%self.fractionToKeep) + + print ("Actual SIZE : %s = %s"%( + " x ".join([str(len(rg.sidechains)) for rg in library.rgroups]), + reduce(operator.mul, [len(rg.sidechains) for rg in library.rgroups]) + )) + print ("EFFECTIVENESS : %s%%"%(library.effectiveness()*100.)) + + # stopping critieria + if iteration and Gi < 1e-12: + return + elif abs(Gi - self.desiredFinalGoodness) < 0.001 or \ + Gi > self.desiredFinalGoodness: + return + +###################################################################### +# testing codes +def makeFakeProps(): + mw = random.randint(10,500) + alogp = random.randint(-10,10) + tpsa = random.randint(0,180) + return [mw, alogp, tpsa] + +def makeFakeSidechains(lib, num): + res = [] + for i in range(num): + res.append(Sidechain(lib + "_" + str(i), makeFakeProps())) + return res + +def testGlare(): + a = RGroups(makeFakeSidechains("aldehydes", 1000)) + b = RGroups(makeFakeSidechains("boronic_acids", 1500)) + + lib = Library([a,b]) + props = [ + Property("mw", 0, 0, 500, 230.1419), + Property("alogp", 1, -2.4, 5, 2.212749), + Property("tpsa", 2, 0, 90, 24.5) + ] + + glare = Glare() + glare.optimize(lib, props) + +if __name__ == "__main__": + testGlare() + + +