Add tautomer query to the substructlibrary (#3808)

* Fixes #3797

* [WIP] Add tautomer queries to the substruct library

* Add TautomerQuery to CMake

* Add missing TautomerQuery functions, python wrapper and tests

* Add python wrappers for Substruct Library Tautomer Queries

* Explicitly label non-const pattern function now that we have both

* Use boost::shared_ptr not std::shared_ptr

* Fix java builds

* One more try to fix java builds

* Fix Java Tests

* Run clang format

* Reenable tests

* Fix annoyingly stupid bug and annoying commit of debug code

* Fix documentation

* reenable ifdef threadsafe check

* Throw warning and perform tautomer search instead of bailing with incorrect fingerprints

* Simplify API with templates

* Fix SubstructLibrary java issues

* minor API cleanup

* simplify the SWIG wrappers

Co-authored-by: Brian Kelley <bkelley@relaytx.com>
Co-authored-by: greg landrum <greg.landrum@gmail.com>
This commit is contained in:
Brian Kelley
2021-03-04 22:56:20 -05:00
committed by GitHub
parent e21e2c2af6
commit c8aa10c80f
13 changed files with 643 additions and 273 deletions

View File

@@ -41,7 +41,6 @@
#include <GraphMol/Substruct/SubstructMatch.h>
#include <GraphMol/SmilesParse/SmilesParse.h>
#include <GraphMol/SmilesParse/SmilesWrite.h>
#include <GraphMol/FileParsers/FileParsers.h>
@@ -51,10 +50,12 @@ using namespace RDKit;
namespace {
boost::dynamic_bitset<> runTest(SubstructLibrary &ssslib, const ROMol &pattern,
int nThreads) {
int nThreads) {
std::vector<unsigned int> libMatches = ssslib.getMatches(pattern, nThreads);
boost::dynamic_bitset<> hasMatch(ssslib.size());
for (auto idx : libMatches) { hasMatch[idx] = 1; }
for (auto idx : libMatches) {
hasMatch[idx] = 1;
}
for (unsigned int i = 0; i < ssslib.size(); ++i) {
MatchVectType match;
@@ -66,16 +67,15 @@ boost::dynamic_bitset<> runTest(SubstructLibrary &ssslib, const ROMol &pattern,
return hasMatch;
};
void runTest(SubstructLibrary &ssslib,
const ROMol &pattern,
int nThreads,
const boost::dynamic_bitset<> &hasMatch
) {
void runTest(SubstructLibrary &ssslib, const ROMol &pattern, int nThreads,
const boost::dynamic_bitset<> &hasMatch) {
std::vector<unsigned int> libMatches = ssslib.getMatches(pattern, nThreads);
boost::dynamic_bitset<> hasMatch2(ssslib.size());
for (auto idx : libMatches) { hasMatch2[idx] = 1; }
for (auto idx : libMatches) {
hasMatch2[idx] = 1;
}
TEST_ASSERT(hasMatch == hasMatch2);
for (unsigned int i = 0; i < ssslib.size(); ++i) {
MatchVectType match;
bool matched = SubstructMatch(*ssslib.getMol(i), pattern, match);
@@ -85,7 +85,6 @@ void runTest(SubstructLibrary &ssslib,
}
};
} // namespace
void test1() {
@@ -110,7 +109,7 @@ void test1() {
delete mol;
}
std::vector<SubstructLibrary*> libs;
std::vector<SubstructLibrary *> libs;
libs.push_back(&ssslib);
#ifdef RDK_USE_BOOST_SERIALIZATION
@@ -122,9 +121,9 @@ void test1() {
#endif
boost::dynamic_bitset<> hasMatch;
int i=0;
for(auto lib: libs) {
int i = 0;
for (auto lib : libs) {
ROMol *query = SmartsToMol("[#6;$([#6]([#6])[!#6])]");
if (i == 0) {
hasMatch = runTest(*lib, *query, 1);
@@ -140,7 +139,7 @@ void test1() {
}
i = 0;
for(auto lib: libs) {
for (auto lib : libs) {
ROMol *query = SmartsToMol("[$([O,S]-[!$(*=O)])]");
if (i == 0) {
hasMatch = runTest(*lib, *query, 1);
@@ -185,7 +184,7 @@ void test2() {
delete mol;
}
std::vector<SubstructLibrary*> libs;
std::vector<SubstructLibrary *> libs;
libs.push_back(&ssslib);
#ifdef RDK_USE_BOOST_SERIALIZATION
@@ -197,14 +196,17 @@ void test2() {
// check to see if we are still the right base type
MolHolderBase *_holder = serialized.getMolHolder().get();
TEST_ASSERT(_holder != nullptr);
TEST_ASSERT(dynamic_cast<MolHolder*>(_holder) != nullptr);
try { serialized.getFingerprints(); }
catch(...) { TEST_ASSERT(0); }
TEST_ASSERT(dynamic_cast<MolHolder *>(_holder) != nullptr);
try {
serialized.getFingerprints();
} catch (...) {
TEST_ASSERT(0);
}
libs.push_back(&serialized);
#endif
for(auto lib: libs) {
for (auto lib : libs) {
ROMol *query = SmartsToMol("[#6]([#6])[!#6]");
runTest(*lib, *query, 1);
#ifdef RDK_TEST_MULTITHREADED
@@ -236,7 +238,7 @@ void test3() {
delete m4;
}
std::vector<SubstructLibrary*> libs;
std::vector<SubstructLibrary *> libs;
libs.push_back(&ssslib);
#ifdef RDK_USE_BOOST_SERIALIZATION
@@ -248,11 +250,10 @@ void test3() {
// check to see if we are still the right base type
MolHolderBase *_holder = serialized.getMolHolder().get();
TEST_ASSERT(_holder != nullptr);
TEST_ASSERT(dynamic_cast<MolHolder*>(_holder) != nullptr);
TEST_ASSERT(dynamic_cast<MolHolder *>(_holder) != nullptr);
#endif
for(auto lib: libs) {
for (auto lib : libs) {
ROMol *query = SmartsToMol("C-1-C-C-O-C(-[O])(-[N])1");
std::vector<unsigned int> res = lib->getMatches(*query, true, false);
TEST_ASSERT(res.size() == 40);
@@ -286,7 +287,7 @@ void test4() {
holder->addSmiles("C1CCO[C@](O)(N)1");
}
std::vector<SubstructLibrary*> libs;
std::vector<SubstructLibrary *> libs;
libs.push_back(&ssslib);
#ifdef RDK_USE_BOOST_SERIALIZATION
@@ -298,26 +299,26 @@ void test4() {
// check to see if we are still the right base type
MolHolderBase *_holder = serialized.getMolHolder().get();
TEST_ASSERT(_holder != nullptr);
TEST_ASSERT(dynamic_cast<CachedSmilesMolHolder*>(_holder) != nullptr);
TEST_ASSERT(dynamic_cast<CachedSmilesMolHolder *>(_holder) != nullptr);
#endif
for(auto lib: libs) {
for (auto lib : libs) {
ROMol *query = SmartsToMol("C-1-C-C-O-C(-[O])(-[N])1");
std::vector<unsigned int> res = lib->getMatches(*query, true, false);
TEST_ASSERT(res.size() == 40);
delete query;
query = SmartsToMol("C-1-C-C-O-[C@@](-[O])(-[N])1");
res = lib->getMatches(*query, true, true);
TEST_ASSERT(res.size() == 20);
res = lib->getMatches(*query, true, false);
TEST_ASSERT(res.size() == 40);
delete query;
}
BOOST_LOG(rdErrorLog) << " Done (trusted smiles)" << std::endl;
}
@@ -379,9 +380,10 @@ void docTest() {
BOOST_LOG(rdErrorLog) << " Done (C++ doc tests)" << std::endl;
}
void ringTest() {
template <class Holder>
void ringTest(const std::string &name) {
BOOST_LOG(rdErrorLog) << "-------------------------------------" << std::endl;
BOOST_LOG(rdErrorLog) << " Testing C++ ring query" << std::endl;
BOOST_LOG(rdErrorLog) << " Testing C++ ring query: " << name << std::endl;
std::unique_ptr<ROMol> q(SmartsToMol("[C&R1]"));
std::unique_ptr<ROMol> q2(SmartsToMol("C@C"));
@@ -389,10 +391,9 @@ void ringTest() {
std::unique_ptr<ROMol> m(SmilesToMol("C1CCO[C@@](N)(O)1"));
boost::shared_ptr<CachedTrustedSmilesMolHolder> molHolder =
boost::make_shared<CachedTrustedSmilesMolHolder>();
boost::shared_ptr<PatternHolder> patternHolder =
boost::make_shared<PatternHolder>();
boost::make_shared<CachedTrustedSmilesMolHolder>();
boost::shared_ptr<Holder> patternHolder = boost::make_shared<Holder>();
SubstructLibrary lib(molHolder, patternHolder);
lib.addMol(*m.get());
std::vector<unsigned int> results = lib.getMatches(*q.get());
@@ -438,20 +439,27 @@ void testAddPatterns() {
"CC1(C)CNc2cc(NC(=O)c3cccnc3NCc3ccncc3)ccc21"};
boost::shared_ptr<CachedSmilesMolHolder> holder =
boost::make_shared<CachedSmilesMolHolder>();
boost::make_shared<CachedSmilesMolHolder>();
for(auto s : pdb_ligands) {
for (auto s : pdb_ligands) {
holder->addSmiles(s);
}
SubstructLibrary ssslib(holder);
std::vector<int> num_threads = { 1, 0 };
for(auto nthreads : num_threads) {
std::vector<int> num_threads = {1, 0};
for (auto nthreads : num_threads) {
SubstructLibrary ssslib_with_patterns(holder);
SubstructLibrary ssslib_with_taut_patterns(holder);
addPatterns(ssslib_with_patterns, nthreads);
for(unsigned int i=0; i<ssslib.size(); ++i) {
TEST_ASSERT( ssslib.countMatches( *ssslib.getMol(i).get() ) ==
ssslib_with_patterns.countMatches( *ssslib.getMol(i).get() ) );
boost::shared_ptr<TautomerPatternHolder> patterns(
new TautomerPatternHolder);
addPatterns(ssslib_with_taut_patterns, patterns, nthreads);
for (unsigned int i = 0; i < ssslib.size(); ++i) {
TEST_ASSERT(ssslib.countMatches(*ssslib.getMol(i).get()) ==
ssslib_with_patterns.countMatches(*ssslib.getMol(i).get()));
TEST_ASSERT(
ssslib.countMatches(*ssslib.getMol(i).get()) ==
ssslib_with_taut_patterns.countMatches(*ssslib.getMol(i).get()));
}
}
}
@@ -564,20 +572,21 @@ void testMaxResultsAllSameNumThreads() {
}
}
void testPatternHolder() {
template <class Holder>
void testPatternHolder(const std::string &name) {
BOOST_LOG(rdErrorLog) << "-------------------------------------" << std::endl;
BOOST_LOG(rdErrorLog) << " testPatternHolder" << std::endl;
BOOST_LOG(rdErrorLog) << " testing " << name << std::endl;
std::string fName = getenv("RDBASE");
fName += "/Data/NCI/first_5K.smi";
SmilesMolSupplier suppl(fName, "\t", 0, 1, false);
boost::shared_ptr<CachedTrustedSmilesMolHolder> mols1(
new CachedTrustedSmilesMolHolder());
boost::shared_ptr<PatternHolder> fps1(new PatternHolder());
boost::shared_ptr<Holder> fps1(new Holder());
SubstructLibrary ssslib1(mols1, fps1);
boost::shared_ptr<CachedTrustedSmilesMolHolder> mols2(
new CachedTrustedSmilesMolHolder());
boost::shared_ptr<PatternHolder> fps2(new PatternHolder());
boost::shared_ptr<Holder> fps2(new Holder());
SubstructLibrary ssslib2(mols2, fps2);
boost::logging::disable_logs("rdApp.error");
@@ -634,10 +643,10 @@ void testPatternHolder() {
}
for (size_t i = 0; i < 2; ++i) {
auto serialized_pattern_holder =
dynamic_cast<PatternHolder *>(serialized.getFpHolder().get());
dynamic_cast<Holder *>(serialized.getFpHolder().get());
TEST_ASSERT(serialized_pattern_holder);
auto orig_pattern_holder =
dynamic_cast<PatternHolder *>(ssslib1.getFpHolder().get());
dynamic_cast<Holder *>(ssslib1.getFpHolder().get());
TEST_ASSERT(orig_pattern_holder);
TEST_ASSERT(serialized_pattern_holder->getNumBits() ==
orig_pattern_holder->getNumBits());
@@ -657,16 +666,15 @@ void testSegFaultInHolder() {
boost::shared_ptr<CachedTrustedSmilesMolHolder> mols1(
new CachedTrustedSmilesMolHolder());
boost::shared_ptr<CachedSmilesMolHolder> mols2(
new CachedSmilesMolHolder());
for(int i=0; i<100; ++i) {
if(i%2==0) {
mols1->addSmiles("dsafsdf");
mols2->addSmiles("dsafsdf");
} else {
mols1->addSmiles("c1ccccc1");
mols2->addSmiles("c1ccccc1");
}
boost::shared_ptr<CachedSmilesMolHolder> mols2(new CachedSmilesMolHolder());
for (int i = 0; i < 100; ++i) {
if (i % 2 == 0) {
mols1->addSmiles("dsafsdf");
mols2->addSmiles("dsafsdf");
} else {
mols1->addSmiles("c1ccccc1");
mols2->addSmiles("c1ccccc1");
}
}
SubstructLibrary sss(mols1);
SubstructLibrary sss2(mols2);
@@ -681,6 +689,32 @@ void testSegFaultInHolder() {
addPatterns(sss2, 2);
}
void testTautomerQueries() {
BOOST_LOG(rdErrorLog) << "-------------------------------------" << std::endl;
BOOST_LOG(rdErrorLog) << " testTautomerQueries" << std::endl;
boost::shared_ptr<CachedTrustedSmilesMolHolder> mols1(
new CachedTrustedSmilesMolHolder());
mols1->addSmiles("CN1C2=C(C(=O)Nc3ccccc3)C(=O)CCN2c2ccccc21");
SubstructLibrary sss(mols1);
auto query = "Cc1nc2ccccc2[nH]1"_smiles;
//auto matches1 = sss.getMatches(*query);
//TEST_ASSERT(matches1.size() == 0);
auto tq = TautomerQuery::fromMol(*query);
auto matches2 = sss.getMatches(*tq);
TEST_ASSERT(matches2.size() == 1);
SubstructLibrary sss2(sss);
addPatterns(sss, boost::make_shared<TautomerPatternHolder>());
matches2 = sss.getMatches(*tq);
TEST_ASSERT(matches2.size() == 1);
// should work but throw logging errors
addPatterns(sss2);
matches2 = sss2.getMatches(*tq);
TEST_ASSERT(matches2.size() == 1);
}
int main() {
RDLog::InitLogs();
#if 1
@@ -689,13 +723,16 @@ int main() {
test3();
test4();
docTest();
ringTest();
ringTest<PatternHolder>("PatternHolder");
ringTest<TautomerPatternHolder>("TautomerPatternHolder");
testAddPatterns();
testPatternHolder();
testPatternHolder<PatternHolder>("PatternHolder");
testPatternHolder<TautomerPatternHolder>("TautomerPatternHolder");
testSegFaultInHolder();
#ifdef RDK_TEST_MULTITHREADED
testMaxResultsNumThreads();
testMaxResultsAllSameNumThreads();
testTautomerQueries();
#endif
#endif
return 0;