Files
rdkit/Code/GraphMol/ChemReactions/Enumerate/Enumerate.cpp
Brian Kelley fa89438358 Dev/reaction enumeration (#1111)
* Adds C++ Enumeration Engine to the RDKit

* Adds Sanitization helpers, wrappers and tests

* Clang format

* Remove unused enumerationStateOnly flag

* Fixes docStrings to current API

* Adds doc strings

* Removes RGroupPosition, adds getPosition to EnumerationBase

* Fixes readability.

* Adds EnumerateLibraryBase::reset and getReaction

* Added getReagents method to EnumerateLibrary

* Make the tests have the same naming

* Need to save the initial state for resetting.

* Stupid case-insensitive file systems

* Moves ResetState to EnumerateLibraryBase

* Adds removeNonmatchingReagents helper

* Renames currentPosition to getPosition

* Adds Enumeration Toolkit tutorial

* Fixes Python3 serialization and enumerators

* Verified to run on python2 and 3

* Fixes integer issues on windows

* The number of enumerations should be unsigned.

* Adds deserialization constructor

* Moves boost_serialization to the end

* Deprecates Clone in favor of copy

* Update tests to use copy.copy not Clone

* Move RGROUPS and BBS into an EnumerationTypes namespace

* Make sure old pickles work

* Adds pickle for backwards compatibility

* Moves to uint64_t from size_t for public api

* Whoops, accidentally used the binary archiver.

* Commits boost 1.55 serialization

* Makes serialization optional (it can be turned off), like the Filter Catalog

* Fixes tests when serialization is not available. Adds more enumeration strategy tests

* Fixes a syntax error on some versions of python

* Fixes sanitizeRxn to actually make proper RGroup atoms

* Updates SanitizeRXN python API

* Updates Enumeration API to a parameter class - fixes reagent removal

* Adds a mess of tests

* Change stats to return a string.

* Exposes EvenPairSamplingStrategy Stats to python

* Fixes a crash bug in SanitizeRxn

* Adds better testing of the even pair sampling

* Fixes namespace

* One more try to fix gcc

* Enum classes are c++11 and a microsoft extension.

* Fix typo

* Fixes np.median for python3

* Fixes atom iterators

* Adds virtual tags to derived virtual functions (for clarity)

* Fixes size comparison issues

* Adds doc string

* Small cleanup (has no effect since flags aren’t used)

* Fixes crash bug on Windows

* Gets the tests working on Windows

* Updates tutorial

* Adds Glare implementation to Contrib
2016-11-05 14:42:52 +01:00

260 lines
9.0 KiB
C++

//
// Copyright (c) 2015, Novartis Institutes for BioMedical Research Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following
// disclaimer in the documentation and/or other materials provided
// with the distribution.
// * Neither the name of Novartis Institutes for BioMedical Research Inc.
// nor the names of its contributors may be used to endorse or promote
// products derived from this software without specific prior written
// permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
#include "Enumerate.h"
#include "CartesianProduct.h"
#include "RandomSample.h"
#include "RandomSampleAllBBs.h"
#include "EvenSamplePairs.h"
#include "../ReactionPickler.h"
#include <GraphMol/MolPickler.h>
#include <GraphMol/SmilesParse/SmilesWrite.h>
// Since we are exporting the classes for serialization,
// we should declare the archive types used here.
// Everything in this section is compiled only when
// RDK_USE_BOOST_SERIALIZATION is defined.
#ifdef RDK_USE_BOOST_SERIALIZATION
#include <RDGeneral/BoostStartInclude.h>
#include <boost/archive/text_oarchive.hpp>
#include <boost/archive/text_iarchive.hpp>
#include <boost/serialization/shared_ptr.hpp>
#include <boost/serialization/export.hpp>
#include <RDGeneral/BoostEndInclude.h>
// Export the strategy base class, each concrete enumeration strategy and the
// library class itself to Boost.Serialization.
// NOTE(review): presumably needed so strategies can be (de)serialized through
// base-class pointers -- confirm against the BOOST_CLASS_EXPORT documentation.
BOOST_CLASS_EXPORT(RDKit::EnumerationStrategyBase);
BOOST_CLASS_EXPORT(RDKit::CartesianProductStrategy);
BOOST_CLASS_EXPORT(RDKit::RandomSampleStrategy);
BOOST_CLASS_EXPORT(RDKit::RandomSampleAllBBsStrategy);
BOOST_CLASS_EXPORT(RDKit::EvenSamplePairsStrategy);
BOOST_CLASS_EXPORT(RDKit::EnumerateLibrary);
#endif
namespace RDKit {
using namespace EnumerationTypes;
// Returns the enumerator's current position, as reported by the enumeration
// strategy's own getPosition().
//
// Requires an enumerator to be set; like getState(), this is enforced with a
// PRECONDITION so a missing enumerator is reported instead of dereferencing
// a null pointer.
const RGROUPS &EnumerateLibraryBase::getPosition() const {
  PRECONDITION(m_enumerator.get(), "Null Enumerator");
  return m_enumerator->getPosition();
}
// Serializes the current enumerator into a string via
// EnumerationStrategyPickler::pickle and returns it.
// Requires an enumerator to be set (checked by the PRECONDITION).
std::string EnumerateLibraryBase::getState() const {
  PRECONDITION(m_enumerator.get(), "Null Enumerator");

  std::string pickledEnumerator;
  EnumerationStrategyPickler::pickle(m_enumerator, pickledEnumerator);
  return pickledEnumerator;
}
// Replaces the current enumerator with the one that
// EnumerationStrategyPickler::fromPickle reconstructs from `state`.
// NOTE(review): `state` is presumably a string produced by getState() --
// confirm against the pickler.
void EnumerateLibraryBase::setState(const std::string &state) {
m_enumerator = EnumerationStrategyPickler::fromPickle(state);
}
// Resets the enumerator to a fresh copy of m_initialEnumerator.
// Requires m_initialEnumerator to be set (checked by the PRECONDITION).
void EnumerateLibraryBase::resetState() {
PRECONDITION(m_initialEnumerator.get(),
"Unset initial enumerator");
// copy() the saved enumerator so m_initialEnumerator itself is left untouched
m_enumerator.reset(m_initialEnumerator->copy());
}
std::vector<std::vector<std::string> > EnumerateLibraryBase::nextSmiles() {
std::vector<std::vector<std::string> > result;
std::vector<MOL_SPTR_VECT> mols = next();
const bool doisomeric = true;
result.resize(mols.size());
for (size_t i = 0; i < mols.size(); ++i) {
result[i].resize(mols[i].size());
for (size_t j = 0; j < mols[i].size(); ++j) {
if (mols[i][j].get()) result[i][j] = MolToSmiles(*mols[i][j], doisomeric);
}
}
return result;
}
namespace {
size_t countMatches( const ROMol& bb, const ROMol& query, int maxMatches) {
std::vector<MatchVectType> matches;
const bool uniquify = true;
const bool useChirality = true;
const bool useQueryQueryMatches = false;
SubstructMatch(bb, query, matches,
uniquify, true, useChirality, useQueryQueryMatches,
maxMatches+1);
return matches.size();
}
}
// Filters the building blocks, keeping only reagents usable with `rxn`.
//
// bbs[i] is checked against reactant template i of `rxn`. A reagent is
// dropped when any of the following holds:
//   - it is a null pointer,
//   - it does not match the reactant template at all,
//   - it matches the template more than params.reagentMaxMatchCount times,
//   - params.sanePartialProducts is set and, for at least one entry of the
//     rxn.runReactant() result, none of that entry's products sanitizes.
//
// Returns a BBS with the same number of reactant positions as `bbs`; the
// surviving reagents keep their relative order. When reagents were removed
// at a position, the count is reported on rdInfoLog.
//
// Requires bbs.size() <= rxn.getNumReactantTemplates().
BBS removeNonmatchingReagents(const ChemicalReaction &rxn, BBS bbs,
                              const EnumerationParams &params) {
  PRECONDITION(bbs.size() <= rxn.getNumReactantTemplates(),
               "Number of Reagents not compatible with reaction templates");
  BBS result;
  result.resize(bbs.size());

  for (size_t reactant_idx = 0; reactant_idx < bbs.size(); ++reactant_idx) {
    size_t removedCount = 0;
    // INT_MAX is treated as "no limit": passing 0 makes countMatches ask for
    // a single match, which is enough to tell "matches" from "no match".
    const unsigned int maxMatches =
        (params.reagentMaxMatchCount == INT_MAX)
            ? 0
            : rdcast<unsigned int>(params.reagentMaxMatchCount);

    ROMOL_SPTR reactantTemplate = rxn.getReactants()[reactant_idx];
    for (size_t reagent_idx = 0; reagent_idx < bbs[reactant_idx].size();
         ++reagent_idx) {
      ROMOL_SPTR mol = bbs[reactant_idx][reagent_idx];
      if (!mol.get()) {
        // A null reagent can neither be matched nor reacted; drop it rather
        // than dereferencing a null pointer below.
        removedCount++;
        continue;
      }

      size_t matches =
          countMatches(*mol.get(), *reactantTemplate.get(), maxMatches);

      bool removeReagent = false;
      if (!matches ||
          matches > rdcast<size_t>(params.reagentMaxMatchCount)) {
        removeReagent = true;
      }

      if (!removeReagent && params.sanePartialProducts) {
        // see if we have any sane products in the results
        std::vector<MOL_SPTR_VECT> partialProducts =
            rxn.runReactant(mol, reactant_idx);
        for (size_t productTemplate_idx = 0;
             productTemplate_idx < partialProducts.size();
             ++productTemplate_idx) {
          int saneProducts = 0;
          for (size_t product_idx = 0;
               product_idx < partialProducts[productTemplate_idx].size();
               ++product_idx) {
            try {
              RWMol *m = dynamic_cast<RWMol *>(
                  partialProducts[productTemplate_idx][product_idx].get());
              if (!m) {
                // Null product or not an RWMol: it cannot be sanitized, so it
                // does not count as sane.
                continue;
              }
              MolOps::sanitizeMol(*m);
              saneProducts++;
            } catch (...) {
              // Deliberate best effort: a product that fails sanitization is
              // simply not counted as sane.
            }
          }

          if (!saneProducts) {
            // if any product template has no sane products, we bail
            removeReagent = true;
            break;
          }
        }
      }

      if (removeReagent) {
        removedCount++;
      } else {
        result[reactant_idx].push_back(mol);
      }
    }

    if (removedCount) {
      BOOST_LOG(rdInfoLog) << "Removed " << removedCount
                           << " non matching reagents at template "
                           << reactant_idx << std::endl;
    }
  }
  return result;
}
// Builds a library for `rxn` that enumerates with a CartesianProductStrategy.
//
// The reagents in `bbs` are first filtered with removeNonmatchingReagents()
// using `params`; the enumerator is then initialized on the filtered set and
// a copy of that initialized enumerator is stored in m_initialEnumerator.
// NOTE(review): m_rxn is presumably set from `rxn` by the base-class
// constructor before m_bbs is initialized -- confirm in EnumerateLibraryBase.
EnumerateLibrary::EnumerateLibrary(const ChemicalReaction &rxn, const BBS &bbs,
const EnumerationParams &params)
: EnumerateLibraryBase(rxn, new CartesianProductStrategy),
m_bbs(removeNonmatchingReagents(m_rxn, bbs, params)) {
m_enumerator->initialize(m_rxn, m_bbs); // uses the filtered reagents, not the caller's bbs
m_initialEnumerator.reset(m_enumerator->copy());
}
// Builds a library for `rxn` that enumerates with a copy of `enumerator`.
//
// The reagents in `bbs` are first filtered with removeNonmatchingReagents()
// using `params`. The caller's `enumerator` is not modified: it is copied,
// the copy is initialized on the filtered reagents, and a further copy of
// that initialized enumerator is stored in m_initialEnumerator.
EnumerateLibrary::EnumerateLibrary(const ChemicalReaction &rxn, const BBS &bbs,
const EnumerationStrategyBase &enumerator,
const EnumerationParams &params)
: EnumerateLibraryBase(rxn),
m_bbs(removeNonmatchingReagents(m_rxn, bbs, params)) {
m_enumerator.reset(enumerator.copy());
m_enumerator->initialize(m_rxn, m_bbs);
m_initialEnumerator.reset(m_enumerator->copy());
}
// Copy constructor: copy-constructs the EnumerateLibraryBase part from `rhs`
// and copies its building blocks (m_bbs).
EnumerateLibrary::EnumerateLibrary(const EnumerateLibrary &rhs)
: EnumerateLibraryBase(rhs), m_bbs(rhs.m_bbs) {}
std::vector<MOL_SPTR_VECT> EnumerateLibrary::next() {
PRECONDITION(static_cast<bool>(*this), "No more enumerations");
const RGROUPS &reactantIndices = m_enumerator->next();
MOL_SPTR_VECT reactants(m_bbs.size());
for (size_t i = 0; i < m_bbs.size(); ++i) {
reactants[i] = m_bbs[i][reactantIndices[i]];
}
return m_rxn.runReactants(reactants);
}
// Writes this library to `ss` using a Boost text output archive.
// When RDK_USE_BOOST_SERIALIZATION is not defined, the call fails the
// PRECONDITION instead.
void EnumerateLibrary::toStream(std::ostream &ss) const {
#ifdef RDK_USE_BOOST_SERIALIZATION
  boost::archive::text_oarchive archive(ss);
  archive << *this;
#else
  PRECONDITION(0, "BOOST SERIALIZATION NOT INSTALLED");
#endif
}
// Reads this library's contents from `ss` using a Boost text input archive.
// When RDK_USE_BOOST_SERIALIZATION is not defined, the call fails the
// PRECONDITION instead.
void EnumerateLibrary::initFromStream(std::istream &ss) {
#ifdef RDK_USE_BOOST_SERIALIZATION
  boost::archive::text_iarchive archive(ss);
  archive >> *this;
#else
  PRECONDITION(0, "BOOST SERIALIZATION NOT INSTALLED");
#endif
}
// Multiplies all entries of `sizes` together using arbitrary-precision
// arithmetic, so the product itself cannot wrap around.
//
// Returns the product when it is strictly below the largest boost::uint64_t
// value, and EnumerationStrategyBase::EnumerationOverflow otherwise. An empty
// `sizes` yields 1.
boost::uint64_t computeNumProducts(const RGROUPS &sizes) {
  boost::multiprecision::cpp_int product = 1;
  const size_t numGroups = sizes.size();
  for (size_t idx = 0; idx != numGroups; ++idx) {
    product *= sizes[idx];
  }

  const bool fits = product < std::numeric_limits<boost::uint64_t>::max();
  if (!fits) {
    return EnumerationStrategyBase::EnumerationOverflow;
  }
  return product.convert_to<boost::uint64_t>();
}
// Picks one reagent per reactant position: for each i, the result holds
// bbs[i][rgroups[i]].
//
// Requires `bbs` and `rgroups` to have the same length (checked by the
// PRECONDITION). The individual indices in `rgroups` are not range-checked.
MOL_SPTR_VECT getReactantsFromRGroups(const std::vector<MOL_SPTR_VECT> &bbs,
                                      const RGROUPS &rgroups) {
  PRECONDITION(bbs.size() == rgroups.size(),
               "BBS and RGROUPS must have the same # reactants");

  const size_t numReactants = bbs.size();
  MOL_SPTR_VECT reactants(numReactants);
  for (size_t pos = 0; pos < numReactants; ++pos) {
    reactants[pos] = bbs[pos][rgroups[pos]];
  }
  return reactants;
}
// Reports whether this build can serialize enumeration libraries: true when
// RDK_USE_BOOST_SERIALIZATION was defined at compile time, false otherwise.
bool EnumerateLibraryCanSerialize() {
#ifdef RDK_USE_BOOST_SERIALIZATION
  const bool serializationAvailable = true;
#else
  const bool serializationAvailable = false;
#endif
  return serializationAvailable;
}
}