Files
rdkit/Code/GraphMol/ChemReactions/Enumerate/testEnumerate.cpp
Brian Kelley fa89438358 Dev/reaction enumeration (#1111)
* Adds C++ Enumeration Engine to the RDKit

* Adds Sanitization helpers, wrappers and tests

* Clang format

* Remove unused enumerationStateOnly flag

* Fixes docStrings to current API

* Adds doc strings

* Removes RGroupPosition, adds getPosition to EnumerationBase

* Fixes readability.

* Adds EnumerateLibraryBase::reset and getReaction

* Added getReagents method to EnumerateLibrary

* Make the tests have the same naming

* Need to save the initial state for resetting.

* Stupid case-insensitive file systems

* Moves ResetState to EnumerateLibraryBase

* Adds removeNonmatchingReagents helper

* Renames currentPosition to getPosition

* Adds Enumeration Toolkit tutorial

* Fixes Python3 serialization and enumerators

* Verified to run on python2 and 3

* Fixes integer issues on windows

* The number of enumeration should be unsigned.

* Adds deserialization constructor

* Moves boost_serialization to the end

* Deprecates Clone in favor of copy

* Update tests to use copy.copy not Clone

* Move RGROUPS and BBS into an EnumerationTypes namespace

* Make sure old pickles work

* Adds pickle for backwards compatibility

* Moves to uint64_t from size_t for public api

* Whups, accidentally used the binary archiver.

* Commits boost 1.55 serialization

* Makes serialization turnoffable Like Filter Catalog

* Fixes tests when serialization is not available. Adds more enumeration strategy tests

* Fixes a syntax error on some versions of python

* Fixes sanitizeRxn to actually make proper RGroup atoms

* Updates SanitizeRXN python API

* Updates Enumeration API to a parameter class - fixes reagent removal

* Adds a mess of tests

* Change stats to return a string.

* Exposes EvenPairSamplingStrategy Stats to python

* Fixes a crash bug in SanitizeRxn

* Adds better testing of the even pair sampling

* Fixes namespace

* One more try to fix gcc

* Enum classes are c++11 and a microsoft extension.

* Fix typo

* Fixes np.median for python3

* Fixes atom iterators

* Adds virtual tags to derived virtual functions (for clarity)

* Fixes size comparison issues

* Adds doc string

* Small cleanup (has no effect since flags aren’t used)

* fixes crash bug on windows

* get the tests working on windows

* Updates tutorial

* Adds Glare implementation to Contrib
2016-11-05 14:42:52 +01:00

300 lines
12 KiB
C++

//
// Copyright (c) 2015, Novartis Institutes for BioMedical Research Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following
// disclaimer in the documentation and/or other materials provided
// with the distribution.
// * Neither the name of Novartis Institutes for BioMedical Research Inc.
// nor the names of its contributors may be used to endorse or promote
// products derived from this software without specific prior written
// permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
#include <RDGeneral/utils.h>
#include <GraphMol/RDKitBase.h>
#include <GraphMol/RDKitQueries.h>
#include <GraphMol/SmilesParse/SmilesParse.h>
#include <GraphMol/FileParsers/MolSupplier.h>
#include <GraphMol/ChemReactions/Enumerate/CartesianProduct.h>
#include <GraphMol/ChemReactions/Enumerate/EvenSamplePairs.h>
#include <GraphMol/ChemReactions/Enumerate/RandomSample.h>
#include <GraphMol/ChemReactions/Enumerate/RandomSampleAllBBs.h>
#include <GraphMol/ChemReactions/Enumerate/Enumerate.h>
#include <GraphMol/ChemReactions/ReactionParser.h>
#include <GraphMol/ChemReactions/ReactionUtils.h>
#include <GraphMol/ChemReactions/SanitizeRxn.h>
#ifdef RDK_USE_BOOST_SERIALIZATION
#include <RDGeneral/BoostStartInclude.h>
#include <boost/archive/text_oarchive.hpp>
#include <boost/archive/text_iarchive.hpp>
#include <RDGeneral/BoostEndInclude.h>
#endif
using namespace RDKit;
#ifdef RDK_USE_BOOST_SERIALIZATION
// Serialization round-trip check for an enumeration strategy.
//
// A working copy of `en` is created once via copy(). On each of `len` steps
// the working copy is written to a boost text archive and read back into a
// fresh object; the restored object must report the same type() and produce
// the same next() result as the working copy. Finally the working copy's
// position is compared against the next() result of `en` itself, so `en` is
// advanced `len` times by this function.
void pickleTest(EnumerationStrategyBase &en, size_t len) {
  boost::shared_ptr<EnumerationStrategyBase> working(en.copy());
  TEST_ASSERT(std::string(en.type()) == std::string(working->type()));

  size_t step = 0;
  while (step < len) {
    // Save the current state of the working copy ...
    std::stringstream buffer;
    {
      boost::archive::text_oarchive out(buffer);
      out << working;
    }

    // ... and restore it into a brand new object.
    boost::shared_ptr<EnumerationStrategyBase> restored;
    {
      boost::archive::text_iarchive in(buffer);
      in >> restored;
    }

    TEST_ASSERT(std::string(working->type()) == std::string(restored->type()));
    // Both objects must continue from the same point.
    TEST_ASSERT(working->next() == restored->next());
    // The working copy must stay in lock step with the original strategy.
    TEST_ASSERT(working->getPosition() == en.next());

    ++step;
  }
}
#endif
void testSamplers() {
EnumerationTypes::BBS bbs;
bbs.resize(3);
for (int i = 0; i < 10; ++i)
bbs[0].push_back(boost::shared_ptr<ROMol>(SmilesToMol("C=CCN=C=S")));
for (int i = 0; i < 5; ++i)
bbs[1].push_back(boost::shared_ptr<ROMol>(SmilesToMol("NCc1ncc(Cl)cc1Br")));
for (int i = 0; i < 6; ++i)
bbs[2].push_back(
boost::shared_ptr<ROMol>(SmilesToMol("NCCCc1ncc(Cl)cc1Br")));
ChemicalReaction rxn;
CartesianProductStrategy cart;
cart.initialize(rxn, bbs);
RandomSampleStrategy rand;
rand.initialize(rxn, bbs);
RandomSampleAllBBsStrategy randBBs;
randBBs.initialize(rxn, bbs);
EvenSamplePairsStrategy even;
even.initialize(rxn, bbs);
std::vector<boost::shared_ptr<EnumerationStrategyBase> > enumerators;
enumerators.push_back(
boost::shared_ptr<EnumerationStrategyBase>(cart.copy()));
enumerators.push_back(
boost::shared_ptr<EnumerationStrategyBase>(rand.copy()));
enumerators.push_back(
boost::shared_ptr<EnumerationStrategyBase>(randBBs.copy()));
enumerators.push_back(
boost::shared_ptr<EnumerationStrategyBase>(even.copy()));
#ifdef RDK_USE_BOOST_SERIALIZATION
for (size_t i = 0; i < enumerators.size(); ++i) {
TEST_ASSERT(enumerators[i]->getNumPermutations() == 10 * 5 * 6);
pickleTest(*enumerators[i], 10 * 5 * 6);
}
#endif
// for(auto&& i: enumerators) {
// TEST_ASSERT(i->getNumPermutations() == 10*5*6);
//}
}
// Initializes EvenSamplePairsStrategy on a large (6000 x 500 x 10000)
// building-block set, checks the reported permutation count, draws 5000
// samples and then calls stats().
void testEvenSamplers() {
  EnumerationTypes::BBS bbs;
  bbs.resize(3);

  // The product of these three counts is 3e10, which does not fit in 32 bits.
  // `unsigned long` is only 32 bits wide on some platforms (e.g. Windows), so
  // the expected value must be computed in a type that is at least 64 bits
  // wide or it silently wraps around.
  const unsigned long long R1 = 6000;
  const unsigned long long R2 = 500;
  const unsigned long long R3 = 10000;

  for (unsigned long long i = 0; i < R1; ++i) {
    bbs[0].push_back(boost::shared_ptr<ROMol>(SmilesToMol("C=CCN=C=S")));
  }
  for (unsigned long long i = 0; i < R2; ++i) {
    bbs[1].push_back(
        boost::shared_ptr<ROMol>(SmilesToMol("NCc1ncc(Cl)cc1Br")));
  }
  for (unsigned long long i = 0; i < R3; ++i) {
    bbs[2].push_back(
        boost::shared_ptr<ROMol>(SmilesToMol("NCCCc1ncc(Cl)cc1Br")));
  }

  ChemicalReaction rxn;
  EvenSamplePairsStrategy even;
  even.initialize(rxn, bbs);

  const unsigned long long expectedPermutations = R1 * R2 * R3;
  std::cout << even.getNumPermutations() << " " << expectedPermutations
            << std::endl;
  TEST_ASSERT(even.getNumPermutations() == expectedPermutations);

  for (size_t i = 0; i < 5000; ++i) {
    even.next();
  }
  // Exercises stats(); whatever it returns is not inspected here.
  even.stats();
}
// Expected product SMILES, in the exact order testEnumerations() compares
// them against the enumeration output (index i is the i-th product).
const char *smiresults[] = {
    "C=CCNC(=S)NCc1ncc(Cl)cc1Br",     // 0
    "CC=CCNC(=S)NCc1ncc(Cl)cc1Br",    // 1
    "C=CCNC(=S)NCCc1ncc(Cl)cc1Br",    // 2
    "CC=CCNC(=S)NCCc1ncc(Cl)cc1Br",   // 3
    "C=CCNC(=S)NCCCc1ncc(Cl)cc1Br",   // 4
    "CC=CCNC(=S)NCCCc1ncc(Cl)cc1Br",  // 5
};
void testEnumerations() {
EnumerationTypes::BBS bbs;
bbs.resize(2);
bbs[0].push_back(boost::shared_ptr<ROMol>(SmilesToMol("C=CCN=C=S")));
bbs[0].push_back(boost::shared_ptr<ROMol>(SmilesToMol("CC=CCN=C=S")));
bbs[1].push_back(boost::shared_ptr<ROMol>(SmilesToMol("NCc1ncc(Cl)cc1Br")));
bbs[1].push_back(boost::shared_ptr<ROMol>(SmilesToMol("NCCc1ncc(Cl)cc1Br")));
bbs[1].push_back(boost::shared_ptr<ROMol>(SmilesToMol("NCCCc1ncc(Cl)cc1Br")));
ChemicalReaction *rxn = RxnSmartsToChemicalReaction(
"[N;$(N-[#6]):3]=[C;$(C=S):1].[N;$(N[#6]);!$(N=*);!$([N-]);!$(N#*);"
"!$([ND3]);!$([ND4]);!$(N[O,N]);!$(N[C,S]=[S,O,N]):2]>>[N:3]-[C:1]-[N+0:"
"2]");
{
EnumerateLibrary en(*rxn, bbs);
size_t i = 0;
for (; (bool)en; ++i) {
std::vector<std::vector<std::string> > res = en.nextSmiles();
TEST_ASSERT(res.size() == 1);
TEST_ASSERT(res[0].size() == 1);
TEST_ASSERT(res[0][0] == smiresults[i]);
TEST_ASSERT(i<=6);
}
TEST_ASSERT(i == 6);
// tests reset
en.resetState();
i = 0;
for (; (bool)en; ++i) {
std::vector<std::vector<std::string> > res = en.nextSmiles();
TEST_ASSERT(res.size() == 1);
TEST_ASSERT(res[0].size() == 1);
TEST_ASSERT(res[0][0] == smiresults[i]);
TEST_ASSERT(i<=6);
}
TEST_ASSERT(i == 6);
}
#ifdef RDK_USE_BOOST_SERIALIZATION
{
boost::shared_ptr<EnumerateLibrary> en(
new EnumerateLibrary(*rxn, bbs, RandomSampleStrategy()));
std::vector<std::vector<std::vector<std::string> > >smir;
for (size_t j = 0; j < 10; ++j) {
std::vector<std::vector<std::string> > smiles = en->nextSmiles();
smir.push_back(smiles);
}
en->resetState();
for (size_t i = 0; i < 1000; ++i) {
// pickle and unpickle
std::stringstream ss;
{
boost::archive::text_oarchive ar(ss);
ar &en;
}
boost::shared_ptr<EnumerateLibrary> copy;
{
boost::archive::text_iarchive ar(ss);
ar &copy;
}
for (size_t j = 0; j < 10; ++j) {
TEST_ASSERT(en->nextSmiles() == copy->nextSmiles());
}
copy->resetState();
for (size_t j = 0; j < 10; ++j) {
TEST_ASSERT(smir[j] == copy->nextSmiles());
}
}
}
#endif
delete rxn;
}
// Reaction block ("$RXN" header, ChemDraw-generated per its header line) that
// testInsaneEnumerations() parses with RxnBlockToChemicalReaction(). Its
// counts line reads "3 1" and four "$MOL" sections follow (presumably three
// reactant templates and one product template). The R1/R2/R4 entries are
// R-group placeholder atoms.
const char *rxndata = "$RXN\nUntitled Document-1\n ChemDraw10291618492D\n\n 3 1\n$MOL\n\n\n\n 2 1 0 0 0 0 0 0 0 0999 V2000\n 0.4125 0.0000 0.0000 N 0 0 0 0 0 0 0 0 0 3 0 0\n -0.4125 0.0000 0.0000 R2 0 0 0 0 0 0 0 0 0 2 0 0\n 1 2 1 0 0\nM END\n$MOL\n\n\n\n 2 1 0 0 0 0 0 0 0 0999 V2000\n -0.4125 0.0000 0.0000 R1 0 0 0 0 0 0 0 0 0 1 0 0\n 0.4125 0.0000 0.0000 Cl 0 0 0 0 0 0 0 0 0 0 0 0\n 1 2 1 0 0\nM END\n$MOL\n\n\n\n 2 1 0 0 0 0 0 0 0 0999 V2000\n 0.4125 0.0000 0.0000 N 0 0 0 0 0 0 0 0 0 5 0 0\n -0.4125 0.0000 0.0000 R4 0 0 0 0 0 0 0 0 0 4 0 0\n 1 2 1 0 0\nM END\n$MOL\n\n\n\n 14 15 0 0 0 0 0 0 0 0999 V2000\n 0.5072 -0.5166 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n 0.5072 0.3084 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n 1.2949 -0.7616 0.0000 N 0 0 0 0 0 0 0 0 0 0 0 0\n 1.7817 -0.0880 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n 1.2967 0.5794 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n 1.5558 -1.5443 0.0000 R1 0 0 0 0 0 0 0 0 0 1 0 0\n -0.2073 0.7208 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n -0.9218 0.3083 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n -0.9217 -0.5167 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n -0.2073 -0.9292 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n -1.6362 0.7208 0.0000 N 0 0 0 0 0 0 0 0 0 3 0 0\n 1.5452 1.3661 0.0000 N 0 0 0 0 0 0 0 0 0 5 0 0\n 2.3507 1.5443 0.0000 R4 0 0 0 0 0 0 0 0 0 4 0 0\n -2.3507 0.3083 0.0000 R2 0 0 0 0 0 0 0 0 0 2 0 0\n 1 2 2 0 0\n 1 3 1 0 0\n 3 4 1 0 0\n 4 5 1 0 0\n 5 2 1 0 0\n 3 6 1 0 0\n 2 7 1 0 0\n 7 8 2 0 0\n 8 9 1 0 0\n 9 10 2 0 0\n 10 1 1 0 0\n 8 11 1 0 0\n 12 13 1 0 0\n 11 14 1 0 0\n 12 5 1 0 0\nM END\n";
void testInsaneEnumerations() {
EnumerationTypes::BBS bbs;
bbs.resize(3);
ChemicalReaction *rxn2 = RxnBlockToChemicalReaction(rxndata);
//RxnOps::sanitizeRxn(*rxn2, MolOps::AdjustQueryParameters());
MatchVectType tvect;
bbs[0].push_back(boost::shared_ptr<ROMol>(SmilesToMol("CCNCC")));
bbs[0].push_back(boost::shared_ptr<ROMol>(SmilesToMol("NCC")));
std::cerr << "0,0 " << (int)SubstructMatch(*bbs[0][0].get(), *rxn2->getReactants()[0].get(), tvect) << std::endl;
std::cerr << "0,1 " << (int)SubstructMatch(*bbs[0][1].get(), *rxn2->getReactants()[0].get(), tvect) << std::endl;
bbs[1].push_back(boost::shared_ptr<ROMol>(SmilesToMol("ClC1CCC1")));
bbs[1].push_back(boost::shared_ptr<ROMol>(SmilesToMol("ClC1CCC1Cl")));
std::cerr << "1,0 " << (int)SubstructMatch(*bbs[1][0].get(), *rxn2->getReactants()[1].get(), tvect) << std::endl;
std::cerr << "1,1 " << (int)SubstructMatch(*bbs[1][1].get(), *rxn2->getReactants()[1].get(), tvect) << std::endl;
bbs[2].push_back(boost::shared_ptr<ROMol>(SmilesToMol("CCNCC")));
bbs[2].push_back(boost::shared_ptr<ROMol>(SmilesToMol("NCC")));
std::cerr << "2,0 " << (int)SubstructMatch(*bbs[2][0].get(), *rxn2->getReactants()[2].get(), tvect) << std::endl;
std::cerr << "2,1 " << (int)SubstructMatch(*bbs[2][1].get(), *rxn2->getReactants()[2].get(), tvect) << std::endl;
{
ChemicalReaction *rxn = RxnBlockToChemicalReaction(rxndata);
RxnOps::sanitizeRxn(*rxn, MolOps::AdjustQueryParameters());
std::cerr << ChemicalReactionToRxnBlock(*rxn) << std::endl;
EnumerationParams ThereCanBeOnlyOne;
ThereCanBeOnlyOne.reagentMaxMatchCount = 1;
EnumerationTypes::BBS bbs2 = removeNonmatchingReagents(
*rxn, bbs,
ThereCanBeOnlyOne);
TEST_ASSERT(bbs2[0].size() == 1);
TEST_ASSERT(bbs2[1].size() == 1);
TEST_ASSERT(bbs2[2].size() == 1);
delete rxn;
}
delete rxn2;
}
// Test driver: initializes RDKit logging and runs the enumeration tests.
int main(int argc, char *argv[]) {
  RDLog::InitLogs();

  // A first argument starting with "-l" sets doLong.
  // NOTE(review): doLong is never read afterwards, so the flag currently has
  // no effect.
  bool doLong = false;
  if (argc > 1 && strncmp(argv[1], "-l", 2) == 0) {
    doLong = true;
  }

  // NOTE(review): the three tests below are disabled (commented out); only
  // testInsaneEnumerations() runs. Confirm whether this is intentional.
  //   testSamplers();
  //   testEvenSamplers();
  //   testEnumerations();
  testInsaneEnumerations();

  return 0;
}