Files
rdkit/Code/GraphMol/SynthonSpaceSearch/LONG_TEST_catch_tests.cpp
2025-12-29 21:08:10 +01:00

466 lines
17 KiB
C++

//
//
// Copyright (C) David Cosgrove 2025.
//
// @@ All Rights Reserved @@
// This file is part of the RDKit.
// The contents are covered by the terms of the BSD license
// which is included in the file license.txt, found at the root
// of the RDKit source tree.
#include <algorithm>
#include <chrono>
#include <filesystem>
#include <fstream>
#include <GraphMol/SubstructLibrary/SubstructLibrary.h>
#include <GraphMol/FileParsers/MolSupplier.h>
#include <GraphMol/Fingerprints/MorganGenerator.h>
#include <GraphMol/Fingerprints/RDKitFPGenerator.h>
#include <GraphMol/RascalMCES/RascalMCES.h>
#include <GraphMol/SynthonSpaceSearch/SynthonSpace.h>
#include <GraphMol/SynthonSpaceSearch/SearchResults.h>
#include <GraphMol/SynthonSpaceSearch/SynthonSpaceSearch_details.h>
#include <GraphMol/SmilesParse/SmilesParse.h>
#include <GraphMol/SmilesParse/SmilesWrite.h>
#include <boost/parameter/aux_/pp_impl/match.hpp>
#include <catch2/catch_all.hpp>
using namespace RDKit;
using namespace RDKit::SynthonSpaceSearch;
using namespace RDKit::SynthonSpaceSearch::details;
using namespace RDKit::RascalMCES;
const char *rdbase = getenv("RDBASE");
int num5567MReads = 0;
int num5567RReads = 0;
int num5567NReads = 0;
// The Synthons_5567.csv is in the test data. It is used multiple times
// so convert to the binary if that isn't already there. This makes the
// tests faster and also tests the conversion process.
void convert5567FileR() {
num5567RReads++;
std::string fName(rdbase);
std::string libName =
fName + "/Code/GraphMol/SynthonSpaceSearch/data/Syntons_5567.csv";
std::string binName1 =
fName + "/Code/GraphMol/SynthonSpaceSearch/data/Syntons_5567_rdkit.spc";
if (!std::filesystem::exists(binName1)) {
bool cancelled = false;
std::unique_ptr<FingerprintGenerator<std::uint64_t>> fpGen(
RDKitFP::getRDKitFPGenerator<std::uint64_t>());
convertTextToDBFile(libName, binName1, cancelled, fpGen.get());
}
}
void convert5567FileM() {
num5567MReads++;
std::string fName(rdbase);
std::string libName =
fName + "/Code/GraphMol/SynthonSpaceSearch/data/Syntons_5567.csv";
std::string binName2 =
fName + "/Code/GraphMol/SynthonSpaceSearch/data/Syntons_5567_morgan.spc";
if (!std::filesystem::exists(binName2)) {
bool cancelled = false;
std::unique_ptr<FingerprintGenerator<std::uint64_t>> fpGen(
MorganFingerprint::getMorganGenerator<std::uint64_t>(2));
convertTextToDBFile(libName, binName2, cancelled, fpGen.get());
}
}
void convert5567FileN() {
num5567NReads++;
std::string fName(rdbase);
std::string libName =
fName + "/Code/GraphMol/SynthonSpaceSearch/data/Syntons_5567.csv";
std::string binName =
fName + "/Code/GraphMol/SynthonSpaceSearch/data/Syntons_5567.spc";
if (!std::filesystem::exists(binName)) {
bool cancelled = false;
convertTextToDBFile(libName, binName, cancelled);
}
}
void tidy5567Binary() {
std::string fName(rdbase);
std::string binName1 =
fName + "/Code/GraphMol/SynthonSpaceSearch/data/Syntons_5567_rdkit.spc";
std::string binName2 =
fName + "/Code/GraphMol/SynthonSpaceSearch/data/Syntons_5567_morgan.spc";
if (num5567RReads == 1 && std::filesystem::exists(binName1)) {
std::remove(binName1.c_str());
}
if (num5567MReads == 3 && std::filesystem::exists(binName2)) {
std::remove(binName2.c_str());
}
std::string binName3 =
fName + "/Code/GraphMol/SynthonSpaceSearch/data/Syntons_5567.spc";
if (num5567NReads == 4 && std::filesystem::exists(binName3)) {
std::cout << "removing " << binName3 << std::endl;
std::remove(binName3.c_str());
}
}
TEST_CASE("S Biggy") {
REQUIRE(rdbase);
std::string fName(rdbase);
convert5567FileN();
std::string libName =
fName + "/Code/GraphMol/SynthonSpaceSearch/data/Syntons_5567.spc";
SynthonSpace synthonspace;
synthonspace.readDBFile(libName);
const std::vector<std::string> smis{"c1ccccc1C(=O)N1CCCC1",
"c1ccccc1NC(=O)C1CCN1",
"c12ccccc1c(N)nc(N)n2",
"c12ccc(C)cc1[nH]nc2C(=O)NCc1cncs1",
"c1nncn1",
"C(=O)NC(CC)C(=O)N(CC)C"};
const std::vector<size_t> numRes{6785, 4544, 48892, 1, 29147, 5651};
const std::vector<size_t> maxRes{6785, 4544, 48893, 1, 29312, 5869};
SubstructMatchParameters matchParams;
SynthonSpaceSearchParams params;
params.maxHits = -1;
for (auto numThreads : std::vector<int>{1, 2, -1}) {
params.numThreads = numThreads;
for (size_t i = 0; i < smis.size(); ++i) {
auto queryMol = v2::SmilesParse::MolFromSmarts(smis[i]);
auto results =
synthonspace.substructureSearch(*queryMol, matchParams, params);
CHECK(results.getHitMolecules().size() == numRes[i]);
CHECK(results.getMaxNumResults() == maxRes[i]);
}
}
tidy5567Binary();
}
TEST_CASE("S Random Hits") {
REQUIRE(rdbase);
std::string fName(rdbase);
std::string libName =
fName + "/Code/GraphMol/SynthonSpaceSearch/data/Syntons_5567.spc";
SynthonSpace synthonspace;
convert5567FileN();
synthonspace.readDBFile(libName);
auto queryMol = "c1ccccc1C(=O)N1CCCC1"_smiles;
SubstructMatchParameters matchParams;
SynthonSpaceSearchParams params;
params.maxHits = 100;
params.randomSample = true;
params.randomSeed = 1;
auto results =
synthonspace.substructureSearch(*queryMol, matchParams, params);
std::map<std::string, int> libCounts;
for (const auto &m : results.getHitMolecules()) {
auto molName = m->getProp<std::string>(common_properties::_Name);
std::string lib(molName.substr(molName.length() - 2));
if (const auto &c = libCounts.find(lib); c == libCounts.end()) {
libCounts.insert(std::make_pair(lib, 1));
} else {
c->second++;
}
}
CHECK(results.getHitMolecules().size() == 100);
// std::shuffle gives different results on macOS, Linux and Windows, so
// just check we have some hits from each of a1, a6 and a7.
CHECK(libCounts.find("a1") != libCounts.end());
CHECK(libCounts.find("a6") != libCounts.end());
CHECK(libCounts.find("a7") != libCounts.end());
tidy5567Binary();
}
TEST_CASE("S Later hits") {
REQUIRE(rdbase);
std::string fName(rdbase);
// Test use of params.hitStart
std::string libName =
fName + "/Code/GraphMol/SynthonSpaceSearch/data/Syntons_5567.spc";
SynthonSpace synthonspace;
convert5567FileN();
synthonspace.readDBFile(libName);
auto queryMol = "c1ccccc1C(=O)N1CCCC1"_smiles;
SubstructMatchParameters matchParams;
SynthonSpaceSearchParams params;
params.maxHits = 200;
auto results =
synthonspace.substructureSearch(*queryMol, matchParams, params);
std::vector<std::string> hitNames1;
for (const auto &m : results.getHitMolecules()) {
hitNames1.push_back(m->getProp<std::string>(common_properties::_Name));
}
params.maxHits = 100;
params.hitStart = 100;
results = synthonspace.substructureSearch(*queryMol, matchParams, params);
std::vector<std::string> hitNames2;
for (const auto &m : results.getHitMolecules()) {
hitNames2.push_back(m->getProp<std::string>(common_properties::_Name));
}
CHECK(hitNames1.size() == 200);
CHECK(hitNames2.size() == 100);
for (int i = 0; i < 100; ++i) {
CHECK(hitNames1[100 + i] == hitNames2[i]);
}
params.hitStart = 6780;
results = synthonspace.substructureSearch(*queryMol, matchParams, params);
CHECK(results.getHitMolecules().size() == 5);
params.hitStart = 7000;
results = synthonspace.substructureSearch(*queryMol, matchParams, params);
CHECK(results.getHitMolecules().empty());
tidy5567Binary();
}
TEST_CASE("S Complex query") {
REQUIRE(rdbase);
std::string fName(rdbase);
// Just to demonstrate that a complex query works.
std::string libName =
fName + "/Code/GraphMol/SynthonSpaceSearch/data/Syntons_5567.spc";
SynthonSpace synthonspace;
convert5567FileN();
synthonspace.readDBFile(libName);
auto queryMol = v2::SmilesParse::MolFromSmarts(
"[$(c1ccccc1),$(c1ccncc1),$(c1cnccc1)]C(=O)N1[C&!$(CC(=O))]CCC1");
REQUIRE(queryMol);
SubstructMatchParameters matchParams;
SynthonSpaceSearchParams params;
params.maxHits = -1;
auto results =
synthonspace.substructureSearch(*queryMol, matchParams, params);
CHECK(results.getHitMolecules().size() == 7649);
// The screenout is poor for a complex query, so a lot of things
// will be identified as possible that aren't.
CHECK(results.getMaxNumResults() == 32814);
tidy5567Binary();
}
TEST_CASE("FP Biggy") {
REQUIRE(rdbase);
std::string fName(rdbase);
convert5567FileM();
std::string binName =
fName + "/Code/GraphMol/SynthonSpaceSearch/data/Syntons_5567_morgan.spc";
SynthonSpace synthonspace;
synthonspace.readDBFile(binName);
std::unique_ptr<FingerprintGenerator<std::uint64_t>> fpGen(
MorganFingerprint::getMorganGenerator<std::uint64_t>(2));
std::map<std::string, std::unique_ptr<RWMol>> mols;
const std::vector<std::string> smis{
"c1ccccc1C(=O)N1CCCC1", "c1ccccc1NC(=O)C1CCN1",
"c12ccccc1c(N)nc(N)n2", "c12ccc(C)cc1[nH]nc2C(=O)NCc1cncs1",
"c1n[nH]cn1", "C(=O)NC(CC)C(=O)N(CC)C"};
const std::vector<size_t> numRes{46, 2, 0, 123, 0, 0};
const std::vector<size_t> maxRes{2408, 197, 0, 833, 0, 4};
SynthonSpaceSearchParams params;
params.approxSimilarityAdjuster = 0.2;
params.maxHits = -1;
for (size_t i = 0; i < smis.size(); ++i) {
auto queryMol = v2::SmilesParse::MolFromSmiles(smis[i]);
auto results = synthonspace.fingerprintSearch(*queryMol, *fpGen, params);
CHECK(results.getHitMolecules().size() == numRes[i]);
CHECK(results.getMaxNumResults() == maxRes[i]);
}
tidy5567Binary();
}
TEST_CASE("FP Random Hits") {
REQUIRE(rdbase);
convert5567FileM();
std::string fName(rdbase);
std::string binName =
fName + "/Code/GraphMol/SynthonSpaceSearch/data/Syntons_5567_morgan.spc";
SynthonSpace synthonspace;
synthonspace.readDBFile(binName);
auto queryMol = "c12ccc(C)cc1[nH]nc2C(=O)NCc1cncs1"_smiles;
SynthonSpaceSearchParams params;
params.maxHits = 100;
params.randomSample = true;
params.randomSeed = 1;
std::unique_ptr<FingerprintGenerator<std::uint64_t>> fpGen(
MorganFingerprint::getMorganGenerator<std::uint64_t>(2));
auto results = synthonspace.fingerprintSearch(*queryMol, *fpGen, params);
std::map<std::string, int> libCounts;
for (const auto &m : results.getHitMolecules()) {
std::string molName = m->getProp<std::string>(common_properties::_Name);
std::string lib(molName.substr(molName.length() - 2));
if (const auto &c = libCounts.find(lib); c == libCounts.end()) {
libCounts.insert(std::make_pair(lib, 1));
} else {
c->second++;
}
}
CHECK(results.getHitMolecules().size() == 100);
std::map<std::string, int> expCounts{{"a1", 100}};
CHECK(expCounts == libCounts);
tidy5567Binary();
}
TEST_CASE("Timeout") {
REQUIRE(rdbase);
convert5567FileM();
std::string fName(rdbase);
std::string binName =
fName + "/Code/GraphMol/SynthonSpaceSearch/data/Syntons_5567_morgan.spc";
SynthonSpace synthonspace;
synthonspace.readDBFile(binName);
SynthonSpaceSearchParams params;
params.maxHits = -1;
params.similarityCutoff = 0.3;
params.fragSimilarityAdjuster = 0.3;
params.timeOut = 2;
std::unique_ptr<FingerprintGenerator<std::uint64_t>> fpGen(
MorganFingerprint::getMorganGenerator<std::uint64_t>(2));
auto queryMol = "c12ccc(C)cc1[nH]nc2C(=O)NCc1cncs1"_smiles;
auto results = synthonspace.fingerprintSearch(*queryMol, *fpGen, params);
CHECK(results.getTimedOut());
// Make sure no timeout also works, but only on a short search.
params.maxHits = 100;
params.similarityCutoff = 0.3;
params.fragSimilarityAdjuster = 0.2;
params.timeOut = 0;
auto results1 = synthonspace.fingerprintSearch(*queryMol, *fpGen, params);
CHECK(!results1.getTimedOut());
tidy5567Binary();
}
TEST_CASE("FP Approx Similarity") {
REQUIRE(rdbase);
convert5567FileR();
std::string fName(rdbase);
std::string binName =
fName + "/Code/GraphMol/SynthonSpaceSearch/data/Syntons_5567_rdkit.spc";
SynthonSpace synthonspace;
synthonspace.readDBFile(binName);
SynthonSpaceSearchParams params;
// The addFP and subtractFP are built from a random selection of
// products so do occasionally vary, so use a fixed seed.
params.randomSeed = 1;
params.similarityCutoff = 0.5;
params.timeOut = 0;
params.maxHits = 1000;
std::unique_ptr<FingerprintGenerator<std::uint64_t>> fpGen(
RDKitFP::getRDKitFPGenerator<std::uint64_t>());
auto queryMol = "c12ccc(C)cc1[nH]nc2C(=O)NCc1cncs1"_smiles;
// With RDKit fingerprints, 0.05 gives a reasonable compromise
// between speed and hits missed.
params.approxSimilarityAdjuster = 0.05;
auto results = synthonspace.fingerprintSearch(*queryMol, *fpGen, params);
CHECK(results.getHitMolecules().size() == 546);
CHECK(results.getMaxNumResults() == 1467);
// A tighter adjuster misses more hits.
params.approxSimilarityAdjuster = 0.01;
results = synthonspace.fingerprintSearch(*queryMol, *fpGen, params);
CHECK(results.getHitMolecules().size() == 187);
// This is the actual number of hits achievable.
params.approxSimilarityAdjuster = 0.25;
results = synthonspace.fingerprintSearch(*queryMol, *fpGen, params);
CHECK(results.getHitMolecules().size() == 981);
tidy5567Binary();
}
TEST_CASE("Rascal Biggy") {
REQUIRE(rdbase);
std::string fName(rdbase);
std::string libName =
fName + "/Code/GraphMol/SynthonSpaceSearch/data/Syntons_5567.csv";
SynthonSpace synthonspace;
bool cancelled = false;
synthonspace.readTextFile(libName, cancelled);
const std::vector<std::string> smis{
"c1ccccc1C(=O)N1CCCC1", "c1ccccc1NC(=O)C1CCN1",
"c12ccccc1c(N)nc(N)n2", "c12ccc(C)cc1[nH]nc2C(=O)NCc1cncs1",
"c1n[nH]cn1", "C(=O)NC(CC)C(=O)N(CC)C"};
const std::vector<size_t> numRes{254, 89, 2, 34, 0, 14};
const std::vector<size_t> maxRes{376110, 278747, 79833, 34817, 190, 45932};
SynthonSpaceSearchParams params;
params.maxHits = -1;
params.numThreads = 1;
RascalOptions rascalOptions;
for (size_t i = 0; i < smis.size(); ++i) {
auto queryMol = v2::SmilesParse::MolFromSmiles(smis[i]);
auto results = synthonspace.rascalSearch(*queryMol, rascalOptions, params);
CHECK(results.getHitMolecules().size() == numRes[i]);
CHECK(results.getMaxNumResults() == maxRes[i]);
}
}
#if 0
// Whilst the code is still under active development, it's convenient to have this
// in here. It can come out later.
TEST_CASE("FP Binary File2") {
REQUIRE(rdbase);
std::string fName(rdbase);
SynthonSpace synthonspace;
std::string libName =
"/Users/david/Projects/SynthonSpaceTests/REAL/2024-09_RID-4-Cozchemix/2024-09_REAL_synthons_rdkit_3000.spc";
std::unique_ptr<FingerprintGenerator<std::uint64_t>> fpGen(
RDKitFP::getRDKitFPGenerator<std::uint64_t>());
synthonspace.readDBFile(libName, -1);
CHECK(synthonspace.getNumReactions() == 1008);
CHECK(synthonspace.getNumProducts() == 70575407790);
std::cout << synthonspace.getNumReactions() << std::endl;
std::cout << synthonspace.getNumProducts() << std::endl;
SearchResults results;
auto queryMol = "O=C(Nc1c(CNC=O)cc[s]1)c1nccnc1"_smiles;
CHECK_NOTHROW(results = synthonspace.fingerprintSearch(*queryMol, *fpGen));
CHECK(results.getHitMolecules().size() == 211);
CHECK(results.getMaxNumResults() == 1397664);
}
#endif
#if 0
// Whilst the code is still under active development, it's convenient to have this
// in here. It can come out later.
TEST_CASE("FP Freedom Space") {
std::string libName =
"/Users/david/Projects/SynthonSpaceTests/FreedomSpace/2024-09_Freedom_synthons_rdkit.spc";
// std::string libName =
// "/Users/david/Projects/SynthonSpaceTests/REAL/2024-09_RID-4-Cozchemix/2024-09_REAL_synthons_rdkit_3000.spc";
// libName =
// "/Users/david/Projects/SynthonSpaceTests/REAL/2024-09_RID-4-Cozchemix/random_real_1_rdkit.spc";
SynthonSpace synthonspace;
synthonspace.readDBFile(libName);
auto m =
"C=CC(=O)Nc1cc(Nc2nccc(-c3cn(C)c4ccccc34)n2)c(OC)cc1N(C)CCN(C)C"_smiles;
std::unique_ptr<FingerprintGenerator<std::uint64_t>> fpGen(
RDKitFP::getRDKitFPGenerator<std::uint64_t>());
SynthonSpaceSearchParams params;
params.similarityCutoff = 0.4;
params.maxHits = 1000;
params.fragSimilarityAdjuster = 0.01;
params.approxSimilarityAdjuster = 0.05;
params.numThreads = -1;
SearchResults results;
results = synthonspace.fingerprintSearch(*m, *fpGen, params);
std::cout << "Number of results : " << results.getHitMolecules().size()
<< std::endl;
int i = 0;
for (const auto &mol : results.getHitMolecules()) {
if (i < 10 || i > params.maxHits - 10) {
std::cout << i << " : " << MolToSmiles(*mol) << " : "
<< mol->getProp<std::string>(common_properties::_Name) << " "
<< mol->getProp<double>("Similarity") << std::endl;
}
++i;
}
}
#endif