mirror of
https://github.com/rdkit/rdkit.git
synced 2026-06-03 21:44:30 +08:00
* Adds C++ Enumeration Engine to the RDKit * Adds Sanitization helpers, wrappers and tests * Clang format * Remove unused enumerationStateOnly flag * Fixes docStrings to current API * Adds doc strings * Removes RGroupPosition, adds getPosition to EnumerationBase * Fixes readability. * Adds EnumerateLibraryBase::reset and getReaction * Added getReagents method to EnumerateLibrary * Make the tests have the same naming * Need to save the initial state for resetting. * Stupid case-insensitive file systems * Moves ResetState to EnumerateLibraryBase * Adds removeNonmatchingReagents helper * Renames currentPosition to getPosition * Adds Enumeration Toolkit tutorial * Fixes Python3 serialization and enumerators * Verified to run on python2 and 3 * Fixes integer issues on windows * The number of enumeration should be unsigned. * Adds deserialization constructor * Moves boost_serialization to the end * Deprecates Clone in favor of copy * Update tests to use copy.copy not Clone * Move RGROUPS and BBS into an EnumerationTypes namespace * Make sure old pickles work * Adds pickle for backwards compatibility * Moves to uint64_t from size_t for public api * Whups, accidentally used the binary archiver. * Commits boost 1.55 serialization * Makes serialization turnoffable Like Filter Catalog * Fixes tests when serialization not available. Adds more enumeration strategy tests * Fixes a syntax error on some versions of python * Fixes sanitizeRxn to actually make proper RGroup atoms * Updates SanitizeRXN python API * Updates Enumeration API to a parameter class - fixes reagent removal * Adds a mess of tests * Change stats to return a string. * Exposes EvenPairSamplingStrategy Stats to python * Fixes a crash bug in SanitizeRxn * Adds better testing of the even pair sampling * Fixes namespace * One more try to fix gcc * Enum classes are c++11 and a microsoft extension. 
* Fix typo * Fixes np.median for python3 * Fixes atom iterators * Adds virtual tags to derived virtual functions (for clarity) * Fixes size comparison issues * Adds doc string * Small cleanup (has no effect since flags aren’t used) * fixes crash bug on windows * get the tests working on windows * Updates tutorial * Adds Glare implementation to Contrib
260 lines
9.0 KiB
C++
260 lines
9.0 KiB
C++
//
|
|
// Copyright (c) 2015, Novartis Institutes for BioMedical Research Inc.
|
|
// All rights reserved.
|
|
//
|
|
// Redistribution and use in source and binary forms, with or without
|
|
// modification, are permitted provided that the following conditions are
|
|
// met:
|
|
//
|
|
// * Redistributions of source code must retain the above copyright
|
|
// notice, this list of conditions and the following disclaimer.
|
|
// * Redistributions in binary form must reproduce the above
|
|
// copyright notice, this list of conditions and the following
|
|
// disclaimer in the documentation and/or other materials provided
|
|
// with the distribution.
|
|
// * Neither the name of Novartis Institutes for BioMedical Research Inc.
|
|
// nor the names of its contributors may be used to endorse or promote
|
|
// products derived from this software without specific prior written
|
|
// permission.
|
|
//
|
|
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
|
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
|
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
|
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
|
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
|
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
|
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
|
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
|
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
//
|
|
|
|
#include "Enumerate.h"
|
|
#include "CartesianProduct.h"
|
|
#include "RandomSample.h"
|
|
#include "RandomSampleAllBBs.h"
|
|
#include "EvenSamplePairs.h"
|
|
#include "../ReactionPickler.h"
|
|
#include <GraphMol/MolPickler.h>
|
|
#include <GraphMol/SmilesParse/SmilesWrite.h>
|
|
|
|
// Since we are exporting the classes for serialization,
// we should declare the archive types used here.
|
|
#ifdef RDK_USE_BOOST_SERIALIZATION
|
|
#include <RDGeneral/BoostStartInclude.h>
|
|
#include <boost/archive/text_oarchive.hpp>
|
|
#include <boost/archive/text_iarchive.hpp>
|
|
#include <boost/serialization/shared_ptr.hpp>
|
|
#include <boost/serialization/export.hpp>
|
|
#include <RDGeneral/BoostEndInclude.h>
|
|
|
|
// Export the enumeration strategy hierarchy and the library class itself
// through Boost.Serialization's class-export macro.
BOOST_CLASS_EXPORT(RDKit::EnumerationStrategyBase);
BOOST_CLASS_EXPORT(RDKit::CartesianProductStrategy);
BOOST_CLASS_EXPORT(RDKit::RandomSampleStrategy);
BOOST_CLASS_EXPORT(RDKit::RandomSampleAllBBsStrategy);
BOOST_CLASS_EXPORT(RDKit::EvenSamplePairsStrategy);
BOOST_CLASS_EXPORT(RDKit::EnumerateLibrary);
|
|
#endif
|
|
|
|
namespace RDKit {
|
|
using namespace EnumerationTypes;
|
|
|
|
// Returns the current enumerator's position, i.e. whatever
// m_enumerator->getPosition() reports.
//
// Raises a PRECONDITION failure when no enumerator is set, mirroring the
// guard used by getState(), instead of dereferencing a null pointer.
const RGROUPS &EnumerateLibraryBase::getPosition() const {
  PRECONDITION(m_enumerator.get(), "Null Enumerator");
  return m_enumerator->getPosition();
}
|
|
|
|
// Pickles the current enumerator into a string and returns it.
// Requires that an enumerator is set.
std::string EnumerateLibraryBase::getState() const {
  PRECONDITION(m_enumerator.get(), "Null Enumerator");

  std::string pickled;
  EnumerationStrategyPickler::pickle(m_enumerator, pickled);
  return pickled;
}
|
|
|
|
// Replaces the current enumerator with the one rebuilt from `state`
// by EnumerationStrategyPickler::fromPickle.
void EnumerateLibraryBase::setState(const std::string &state) {
  m_enumerator = EnumerationStrategyPickler::fromPickle(state);
}
|
|
|
|
// Rewinds the enumeration: the current enumerator becomes a fresh copy of
// the saved initial enumerator. Requires the initial enumerator to be set.
void EnumerateLibraryBase::resetState() {
  PRECONDITION(m_initialEnumerator.get(), "Unset initial enumerator");
  m_enumerator.reset(m_initialEnumerator->copy());
}
|
|
|
|
std::vector<std::vector<std::string> > EnumerateLibraryBase::nextSmiles() {
|
|
std::vector<std::vector<std::string> > result;
|
|
std::vector<MOL_SPTR_VECT> mols = next();
|
|
const bool doisomeric = true;
|
|
result.resize(mols.size());
|
|
for (size_t i = 0; i < mols.size(); ++i) {
|
|
result[i].resize(mols[i].size());
|
|
for (size_t j = 0; j < mols[i].size(); ++j) {
|
|
if (mols[i][j].get()) result[i][j] = MolToSmiles(*mols[i][j], doisomeric);
|
|
}
|
|
}
|
|
return result;
|
|
}
|
|
|
|
namespace {
|
|
size_t countMatches( const ROMol& bb, const ROMol& query, int maxMatches) {
|
|
std::vector<MatchVectType> matches;
|
|
const bool uniquify = true;
|
|
const bool useChirality = true;
|
|
const bool useQueryQueryMatches = false;
|
|
|
|
SubstructMatch(bb, query, matches,
|
|
uniquify, true, useChirality, useQueryQueryMatches,
|
|
maxMatches+1);
|
|
return matches.size();
|
|
}
|
|
}
|
|
// Filters the building blocks so only reagents usable with `rxn` remain.
//
// For reactant position i, a reagent from bbs[i] is dropped when:
//   - its pointer is null,
//   - it has no substructure match against reactant template i, or has more
//     matches than params.reagentMaxMatchCount,
//   - params.sanePartialProducts is set and, for some entry of the result of
//     rxn.runReactant(reagent, i), none of the molecules in that entry can
//     be sanitized.
//
// Returns a BBS with the same number of positions as `bbs`, holding the
// surviving reagents in their original order. The number of removed
// reagents per position is reported on rdInfoLog.
//
// Requires bbs.size() <= rxn.getNumReactantTemplates().
BBS removeNonmatchingReagents(const ChemicalReaction &rxn, BBS bbs,
                              const EnumerationParams &params) {
  PRECONDITION(bbs.size() <= rxn.getNumReactantTemplates(),
               "Number of Reagents not compatible with reaction templates");
  BBS result;
  result.resize(bbs.size());

  // Loop invariant. INT_MAX means "no limit": a cap of 0 makes countMatches
  // stop after the first hit, which is all that is needed to know whether
  // the reagent matches at all.
  const unsigned int maxMatches =
      (params.reagentMaxMatchCount == INT_MAX)
          ? 0
          : rdcast<unsigned int>(params.reagentMaxMatchCount);

  for (size_t reactant_idx = 0; reactant_idx < bbs.size(); ++reactant_idx) {
    size_t removedCount = 0;
    ROMOL_SPTR reactantTemplate = rxn.getReactants()[reactant_idx];

    for (size_t reagent_idx = 0; reagent_idx < bbs[reactant_idx].size();
         ++reagent_idx) {
      ROMOL_SPTR mol = bbs[reactant_idx][reagent_idx];

      // A null reagent cannot be matched; drop it rather than dereference it.
      bool removeReagent = !mol.get();

      if (!removeReagent) {
        const size_t matches =
            countMatches(*mol.get(), *reactantTemplate.get(), maxMatches);
        if (!matches ||
            matches > rdcast<size_t>(params.reagentMaxMatchCount)) {
          removeReagent = true;
        }
      }

      if (!removeReagent && params.sanePartialProducts) {
        // see if we have any sane products in the results
        std::vector<MOL_SPTR_VECT> partialProducts =
            rxn.runReactant(mol, reactant_idx);
        for (size_t productTemplate_idx = 0;
             productTemplate_idx < partialProducts.size();
             ++productTemplate_idx) {
          int saneProducts = 0;
          for (size_t product_idx = 0;
               product_idx < partialProducts[productTemplate_idx].size();
               ++product_idx) {
            try {
              RWMol *m = dynamic_cast<RWMol *>(
                  partialProducts[productTemplate_idx][product_idx].get());
              // A failed cast (or null product) would be a null dereference,
              // which catch (...) cannot trap; treat it as "not sane".
              if (!m) {
                continue;
              }
              MolOps::sanitizeMol(*m);
              saneProducts++;
            } catch (...) {
              // Sanitization failure just means this product is not sane.
            }
          }

          if (!saneProducts) {
            // if any product template has no sane products, we bail
            removeReagent = true;
            break;
          }
        }
      }

      if (removeReagent) {
        removedCount++;
      } else {
        result[reactant_idx].push_back(mol);
      }
    }

    if (removedCount) {
      BOOST_LOG(rdInfoLog) << "Removed " << removedCount
                           << " non matching reagents at template "
                           << reactant_idx << std::endl;
    }
  }
  return result;
}
|
|
|
|
// Builds a library for `rxn` over `bbs` using a CartesianProductStrategy.
//
// The building blocks are first filtered with removeNonmatchingReagents
// according to `params`; the strategy is then initialized on the filtered
// set and a copy of it is stored as the initial enumerator.
EnumerateLibrary::EnumerateLibrary(const ChemicalReaction &rxn, const BBS &bbs,
                                   const EnumerationParams &params)
    : EnumerateLibraryBase(rxn, new CartesianProductStrategy),
      // m_rxn is used here rather than rxn; presumably it is the copy held
      // by the base class, which is constructed before m_bbs.
      m_bbs(removeNonmatchingReagents(m_rxn, bbs, params)) {
  m_enumerator->initialize(m_rxn, m_bbs);  // getSizesFromBBs(bbs));
  m_initialEnumerator.reset(m_enumerator->copy());
}
|
|
|
|
// Builds a library for `rxn` over `bbs` using a copy of `enumerator`.
//
// The building blocks are first filtered with removeNonmatchingReagents
// according to `params`. The caller's strategy object is never modified:
// it is copied, the copy is initialized on the filtered set, and a further
// copy is stored as the initial enumerator.
EnumerateLibrary::EnumerateLibrary(const ChemicalReaction &rxn, const BBS &bbs,
                                   const EnumerationStrategyBase &enumerator,
                                   const EnumerationParams &params)
    : EnumerateLibraryBase(rxn),
      // m_rxn is used here rather than rxn; presumably it is the copy held
      // by the base class, which is constructed before m_bbs.
      m_bbs(removeNonmatchingReagents(m_rxn, bbs, params)) {
  m_enumerator.reset(enumerator.copy());
  m_enumerator->initialize(m_rxn, m_bbs);
  m_initialEnumerator.reset(m_enumerator->copy());
}
|
|
|
|
// Copy constructor: copy-constructs the base part from `rhs` and copies its
// building-block container.
EnumerateLibrary::EnumerateLibrary(const EnumerateLibrary &rhs)
    : EnumerateLibraryBase(rhs),
      m_bbs(rhs.m_bbs) {
}
|
|
|
|
std::vector<MOL_SPTR_VECT> EnumerateLibrary::next() {
|
|
PRECONDITION(static_cast<bool>(*this), "No more enumerations");
|
|
const RGROUPS &reactantIndices = m_enumerator->next();
|
|
MOL_SPTR_VECT reactants(m_bbs.size());
|
|
|
|
for (size_t i = 0; i < m_bbs.size(); ++i) {
|
|
reactants[i] = m_bbs[i][reactantIndices[i]];
|
|
}
|
|
|
|
return m_rxn.runReactants(reactants);
|
|
}
|
|
|
|
// Writes this library to `ss` as a boost text archive.
// In builds without RDK_USE_BOOST_SERIALIZATION this always fails its
// PRECONDITION.
void EnumerateLibrary::toStream(std::ostream &ss) const {
#ifndef RDK_USE_BOOST_SERIALIZATION
  PRECONDITION(0, "BOOST SERIALIZATION NOT INSTALLED");
#else
  boost::archive::text_oarchive archive(ss);
  archive << *this;
#endif
}
|
|
|
|
// Restores this library from a boost text archive read from `ss`.
// In builds without RDK_USE_BOOST_SERIALIZATION this always fails its
// PRECONDITION.
void EnumerateLibrary::initFromStream(std::istream &ss) {
#ifndef RDK_USE_BOOST_SERIALIZATION
  PRECONDITION(0, "BOOST SERIALIZATION NOT INSTALLED");
#else
  boost::archive::text_iarchive archive(ss);
  archive >> *this;
#endif
}
|
|
|
|
// Multiplies all entries of `sizes` using arbitrary-precision arithmetic.
//
// Returns the product when it is strictly below the largest boost::uint64_t
// value; otherwise returns EnumerationStrategyBase::EnumerationOverflow.
// An empty `sizes` gives 1.
boost::uint64_t computeNumProducts(const RGROUPS &sizes) {
  boost::multiprecision::cpp_int product = 1;
  for (size_t idx = 0; idx < sizes.size(); ++idx) {
    product *= sizes[idx];
  }

  if (!(product < std::numeric_limits<boost::uint64_t>::max())) {
    return EnumerationStrategyBase::EnumerationOverflow;
  }
  return product.convert_to<boost::uint64_t>();
}
|
|
|
|
// Picks one building block per position: entry i of the result is
// bbs[i][rgroups[i]].
//
// Requires bbs and rgroups to have the same length. The individual indices
// are not range-checked here.
MOL_SPTR_VECT getReactantsFromRGroups(const std::vector<MOL_SPTR_VECT> &bbs,
                                      const RGROUPS &rgroups) {
  PRECONDITION(bbs.size() == rgroups.size(),
               "BBS and RGROUPS must have the same # reactants");

  MOL_SPTR_VECT selected(bbs.size());
  for (size_t pos = 0; pos < bbs.size(); ++pos) {
    selected[pos] = bbs[pos][rgroups[pos]];
  }
  return selected;
}
|
|
|
|
// Reports whether this build was compiled with boost serialization support
// (RDK_USE_BOOST_SERIALIZATION defined).
bool EnumerateLibraryCanSerialize() {
#ifndef RDK_USE_BOOST_SERIALIZATION
  return false;
#else
  return true;
#endif
}
|
|
|
|
}
|