mirror of
https://github.com/rdkit/rdkit.git
synced 2026-06-04 21:54:27 +08:00
* framework for extended query. serialization works to/from text doesn't work * first pass at getting substructure search working basic tests improved error handling (try not to take down the server thread!) * add serialization to MolBundle * we really need to pickle mol properties * basic support for molbundle including substructure search * tautomer and molbundle queries to JSON * remove debug msg * cleanup debug initial index steps (not tested) * remove indexing stuff since it wasn't working will try to come back to that * add xqm to update script * add c++ testing for molbundle serialization * add serialization of molbundles to python interface * support expanding molbundles to arrays of tautomer queries * edge cases Signed-off-by: greg landrum <greg.landrum@gmail.com> * change in response to review * a bunch of updates * make sure the mol props needed for XQMs are being serialized * update update script * fix binary string output from ExtendedQueryMols in python * tautomer queries should serialize properties * more testing never hurts * combo of generic groups and generalized queries works * Update Code/PgSQL/rdkit/adapter.cpp Co-authored-by: Paolo Tosco <paolo.tosco.mail@gmail.com> * Update Code/PgSQL/rdkit/adapter.cpp Co-authored-by: Paolo Tosco <paolo.tosco.mail@gmail.com> * Update Code/PgSQL/rdkit/adapter.cpp Co-authored-by: Paolo Tosco <paolo.tosco.mail@gmail.com> * Fix weird quotes? --------- Signed-off-by: greg landrum <greg.landrum@gmail.com> Co-authored-by: Paolo Tosco <paolo.tosco.mail@gmail.com>
384 lines
13 KiB
C++
384 lines
13 KiB
C++
//
|
|
// Copyright 2020-2022 Schrodinger, Inc and other RDKit contributors
|
|
// @@ All Rights Reserved @@
|
|
// This file is part of the RDKit.
|
|
// The contents are covered by the terms of the BSD license
|
|
// which is included in the file license.txt, found at the root
|
|
// of the RDKit source tree.
|
|
|
|
#include "catch.hpp"
|
|
|
|
#include <GraphMol/RDKitBase.h>
|
|
#include <GraphMol/TautomerQuery/TautomerQuery.h>
|
|
#include <GraphMol/ROMol.h>
|
|
#include <GraphMol/SmilesParse/SmilesWrite.h>
|
|
#include <GraphMol/Substruct/SubstructMatch.h>
|
|
#include <GraphMol/Fingerprints/Fingerprints.h>
|
|
#include <DataStructs/BitOps.h>
|
|
#include <GraphMol/MolPickler.h>
|
|
#include <GraphMol/QueryOps.h>
|
|
#include <GraphMol/SmilesParse/SmilesParse.h>
|
|
|
|
// #define VERBOSE 1
|
|
|
|
using namespace RDKit;
|
|
|
|
// Regression test: for this query the template needs to account for bonds
// that are modified when the enumerated tautomers are sanitized.
TEST_CASE("TEMPLATE_ERROR") {
  auto mol = "Cc1nc2ccccc2[nH]1"_smiles;
  REQUIRE(mol);
  auto target = "CN1C2=C(C(=O)Nc3ccccc3)C(=O)CCN2c2ccccc21"_smiles;
  REQUIRE(target);

  auto tautomerQuery =
      std::unique_ptr<TautomerQuery>(TautomerQuery::fromMol(*mol));

  // Baseline: at least one enumerated tautomer must match the target when it
  // is used directly as a substructure query.
  auto match = false;
  MatchVectType matchVect;
  // Bind by const reference so the tautomer handle is not copied on every
  // iteration.
  for (const auto &taut : tautomerQuery->getTautomers()) {
    const auto test = SubstructMatch(*target, *taut, matchVect);
#ifdef VERBOSE
    std::cout << "Tautomer " << MolToSmiles(*taut) << " match " << test
              << std::endl;
#endif
    if (test) {
      match = true;
    }
  }
  CHECK(match);

  // The template-based search has to find the match as well and report the
  // tautomer responsible for it.
  SubstructMatchParameters params;
  std::vector<ROMOL_SPTR> matchingTautomers;
  auto matches =
      tautomerQuery->substructOf(*target, params, &matchingTautomers);
  CHECK(matches.size() == 1);
  REQUIRE(matchingTautomers.size() == 1);
}
|
|
|
|
// Checks how SubstructMatchParameters::uniquify changes the number of matches
// (and matching tautomers) reported by TautomerQuery::substructOf.
TEST_CASE("TEST_UNIQUIFY") {
  auto mol = "O=C1CCCCC1"_smiles;
  REQUIRE(mol);
  auto target = "O=C1CCCC(CC)C1"_smiles;
  REQUIRE(target);

  auto tautomerQuery =
      std::unique_ptr<TautomerQuery>(TautomerQuery::fromMol(*mol));
  SubstructMatchParameters params;
  params.maxMatches = 1000;
  std::vector<ROMOL_SPTR> matchingTautomers;

  // With uniquify enabled a single match is expected.
  params.uniquify = true;
  auto matches =
      tautomerQuery->substructOf(*target, params, &matchingTautomers);
  CHECK(matches.size() == 1);
  REQUIRE(matchingTautomers.size() == 1);

  // Without uniquify two matches are expected.
  params.uniquify = false;
  matches = tautomerQuery->substructOf(*target, params, &matchingTautomers);
  CHECK(matches.size() == 2);
  REQUIRE(matchingTautomers.size() == 2);
  // Both matches must be attributed to the very same tautomer object.
  CHECK(matchingTautomers[0] == matchingTautomers[1]);
}
|
|
|
|
// test shows we need to set uniquify = false when matching template
TEST_CASE("DIFFERENT_TO_ENUMERATED") {
  auto mol = "NC(N)=O"_smiles;
  // Fail cleanly on a parse error instead of dereferencing a null pointer.
  REQUIRE(mol);
  auto tautomerQuery =
      std::unique_ptr<TautomerQuery>(TautomerQuery::fromMol(*mol));
  auto tautomers = tautomerQuery->getTautomers();
  // auto target =
  // "NC1=NC2(CO1)c1cc(-c3cccnc3F)ccc1Oc1cnc(C3=CCCOC3)cc12"_smiles;
  auto target = "NC1=NCCO1"_smiles;
  REQUIRE(target);

  // Reference result: search with each enumerated tautomer in turn.
  auto enumMatch = false;
  MatchVectType matchVect;
  for (const auto &t : tautomers) {
    if (SubstructMatch(*target, *t, matchVect)) {
      enumMatch = true;
      break;
    }
  }
  CHECK(enumMatch);

  // The tautomer query itself must agree with the enumerated search.
  auto match = tautomerQuery->isSubstructOf(*target);
  CHECK(match);
}
|
|
|
|
// Checks that a small tautomer query (CC=O) matches both a small and a large
// target molecule.
TEST_CASE("SIMPLE_ERROR") {
  auto mol = "CC=O"_smiles;
  REQUIRE(mol);
  auto tautomerQuery =
      std::unique_ptr<TautomerQuery>(TautomerQuery::fromMol(*mol));

  auto target = "OC(C)=O"_smiles;
  // Fail cleanly on a parse error instead of dereferencing a null pointer.
  REQUIRE(target);
  // Plain substructure search with the original molecule as the query.
  auto matches = SubstructMatch(*target, *mol);
  CHECK(matches.size() == 1);
  auto match = tautomerQuery->isSubstructOf(*target);
  CHECK(match);

  target = "CCN(CC)C(=O)COP(=O)(O)COCCn1cnc2c(N)ncnc21"_smiles;
  REQUIRE(target);
  match = tautomerQuery->isSubstructOf(*target);
  CHECK(match);
}
|
|
|
|
// Exercises a cyclohexanone query against enol and keto targets: tautomer
// enumeration, substructure matching, and the tautomer pattern fingerprints.
TEST_CASE("TEST_ENOL") {
  auto mol = "O=C1CCCCC1"_smiles;

  REQUIRE(mol);
  auto tautomerQuery =
      std::unique_ptr<TautomerQuery>(TautomerQuery::fromMol(*mol));
  auto tautomers = tautomerQuery->getTautomers();
  CHECK(tautomers.size() == 2);
  auto modifiedAtoms = tautomerQuery->getModifiedAtoms();
  CHECK(modifiedAtoms.size() == 3);
  auto modifiedBonds = tautomerQuery->getModifiedBonds();
  CHECK(modifiedBonds.size() == 3);

  // Enol form target: the matching tautomer should be the enol.
  auto target1 = "OC1=CCCC(CC)C1"_smiles;
  REQUIRE(target1);
  SubstructMatchParameters params;
  std::vector<ROMOL_SPTR> matchingTautomers;
  auto matches =
      tautomerQuery->substructOf(*target1, params, &matchingTautomers);
  CHECK(matches.size() == 1);
  auto match = tautomerQuery->isSubstructOf(*target1, params);
  CHECK(match);

  REQUIRE(matchingTautomers.size() == 1);
  auto tautomerSmiles = MolToSmiles(*matchingTautomers[0]);
  CHECK(tautomerSmiles == "OC1=CCCCC1");

  // Keto form target: the matching tautomer should be the ketone.
  auto target2 = "O=C1CCCC(CC)C1"_smiles;
  REQUIRE(target2);
  matches = tautomerQuery->substructOf(*target2, params, &matchingTautomers);

  CHECK(matches.size() == 1);
  REQUIRE(matchingTautomers.size() == 1);
  tautomerSmiles = MolToSmiles(*matchingTautomers[0]);
  CHECK(tautomerSmiles == "O=C1CCCCC1");

  // The SubstructMatch overload that takes a TautomerQuery must also work.
  MatchVectType matchVect;
  auto hasMatch = SubstructMatch(*target1, *tautomerQuery, matchVect);
  CHECK(hasMatch);

  // The fingerprint functions hand back raw owning pointers. Keep them in
  // unique_ptr so nothing leaks when a REQUIRE below aborts the test case.
  std::unique_ptr<ExplicitBitVect> templateFingerprint(
      tautomerQuery->patternFingerprintTemplate());
  REQUIRE(templateFingerprint);

  std::unique_ptr<ExplicitBitVect> target1Fingerprint(
      TautomerQuery::patternFingerprintTarget(*target1));
  REQUIRE(target1Fingerprint);
  CHECK(AllProbeBitsMatch(*templateFingerprint, *target1Fingerprint));

  std::unique_ptr<ExplicitBitVect> target2Fingerprint(
      TautomerQuery::patternFingerprintTarget(*target2));
  REQUIRE(target2Fingerprint);
  CHECK(AllProbeBitsMatch(*templateFingerprint, *target2Fingerprint));
}
|
|
|
|
// Checks a query with many tautomers against two targets, via both the
// substructure search and the pattern-fingerprint screen.
TEST_CASE("TEST_COMPLEX") {
  auto mol = "Nc1nc(=O)c2nc[nH]c2[nH]1"_smiles;
  REQUIRE(mol);
  auto tautomerQuery =
      std::unique_ptr<TautomerQuery>(TautomerQuery::fromMol(*mol));
  CHECK(15 == tautomerQuery->getTautomers().size());

  // Raw owning pointers are wrapped in unique_ptr so they are released even
  // when a REQUIRE aborts the test case.
  std::unique_ptr<ExplicitBitVect> queryFingerprint(
      tautomerQuery->patternFingerprintTemplate());
  REQUIRE(queryFingerprint);
  std::vector<std::string> targetSmis{"CCc1nc2[nH]c(=N)nc(O)c2[nH]1",
                                      "CN1C2=NC=NC2=C(O)N=C1N"};
  for (const auto &targetSmiles : targetSmis) {
    std::unique_ptr<ROMol> target(SmilesToMol(targetSmiles));
    REQUIRE(target);
    CHECK(tautomerQuery->isSubstructOf(*target));
    std::unique_ptr<ExplicitBitVect> targetFingerprint(
        TautomerQuery::patternFingerprintTarget(*target));
    REQUIRE(targetFingerprint);
    CHECK(AllProbeBitsMatch(*queryFingerprint, *targetFingerprint));
  }
}
|
|
|
|
// Checks that the query on each modified (tautomeric) bond of the template
// molecule survives a MolPickler round trip.
TEST_CASE("TEST_PICKLE") {
  auto mol = "O=C1CCCCC1"_smiles;
  REQUIRE(mol);
  auto tautomerQuery =
      std::unique_ptr<TautomerQuery>(TautomerQuery::fromMol(*mol));
  // Bind by reference: a plain `auto` would copy the whole template molecule.
  const auto &templateMol = tautomerQuery->getTemplateMolecule();

  std::string pickle;
  MolPickler::pickleMol(templateMol, pickle);
  ROMol pickleMol;
  MolPickler::molFromPickle(pickle, pickleMol);

  for (auto modifiedBondIdx : tautomerQuery->getModifiedBonds()) {
    auto modifiedBond = pickleMol.getBondWithIdx(modifiedBondIdx);
    REQUIRE(modifiedBond->hasQuery());
    CHECK(modifiedBond->getQuery()->getDescription() ==
          "SingleOrDoubleOrAromaticBond");
  }
}
|
|
|
|
// Compares regular and tautomer-aware pattern fingerprints of the template
// molecule, of a molecule whose tautomeric bonds were replaced by generic
// single-or-aromatic query bonds, and of the original molecule.
TEST_CASE("TEST_FINGERPRINT") {
  auto mol = "O=C1CCCCC1"_smiles;
  REQUIRE(mol);
  auto tautomerQuery =
      std::unique_ptr<TautomerQuery>(TautomerQuery::fromMol(*mol));
  // Bind by reference: a plain `auto` would copy the whole template molecule.
  const auto &templateMol = tautomerQuery->getTemplateMolecule();

  // this test molecule has complex query bonds where the template has query
  // bonds, but they are not identified as tautomer bonds.
  RWMol molWithoutTautomerBonds(*mol);
  for (auto modifiedBondIdx : tautomerQuery->getModifiedBonds()) {
    // The original code deleted its bond right after replaceBond(), so the
    // molecule does not keep the pointer; a stack object is sufficient.
    QueryBond queryBond;
    queryBond.setQuery(makeBondOrderEqualsQuery(Bond::BondType::SINGLE));
    queryBond.expandQuery(makeBondOrderEqualsQuery(Bond::BondType::AROMATIC),
                          Queries::COMPOSITE_OR);
    molWithoutTautomerBonds.replaceBond(modifiedBondIdx, &queryBond, true);
  }

  // The molecule without tautomer bonds has the same regular fingerprint as the
  // template
#ifdef VERBOSE
  std::cout << std::endl << "fingerprinting template" << std::endl;
#endif
  // Fingerprints come back as raw owning pointers; unique_ptr releases them
  // even when a REQUIRE aborts the test case.
  std::unique_ptr<ExplicitBitVect> templateQueryFingerprint(
      PatternFingerprintMol(templateMol));
#ifdef VERBOSE
  std::cout << "fingerprinting mol without bonds" << std::endl;
#endif
  std::unique_ptr<ExplicitBitVect> molWithoutTautomerBondsFingerprint(
      PatternFingerprintMol(molWithoutTautomerBonds));

  REQUIRE(templateQueryFingerprint);
  REQUIRE(molWithoutTautomerBondsFingerprint);
  CHECK(AllProbeBitsMatch(*templateQueryFingerprint,
                          *molWithoutTautomerBondsFingerprint));
  CHECK(AllProbeBitsMatch(*molWithoutTautomerBondsFingerprint,
                          *templateQueryFingerprint));

  // The tautomer fingerprint for the molecule without tautomer bonds has a
  // subset of the template's tautomeric fingerprint.
  templateQueryFingerprint.reset(tautomerQuery->patternFingerprintTemplate());
  molWithoutTautomerBondsFingerprint.reset(
      TautomerQuery::patternFingerprintTarget(molWithoutTautomerBonds));
  REQUIRE(templateQueryFingerprint);
  REQUIRE(molWithoutTautomerBondsFingerprint);
#ifdef VERBOSE
  std::cout << std::endl << "molWithoutTautomerBonds" << std::endl;
  std::vector<int> onBits;
  molWithoutTautomerBondsFingerprint->getOnBits(onBits);
  for (auto bit : onBits) {
    std::cout << bit << " ";
  }
  std::cout << std::endl;
  std::cout << std::endl << "template" << std::endl;
  templateQueryFingerprint->getOnBits(onBits);
  for (auto bit : onBits) {
    std::cout << bit << " ";
  }
  std::cout << std::endl;
#endif
  CHECK(AllProbeBitsMatch(*molWithoutTautomerBondsFingerprint,
                          *templateQueryFingerprint));
  CHECK(!AllProbeBitsMatch(*templateQueryFingerprint,
                           *molWithoutTautomerBondsFingerprint));

  // The tautomer fingerprint for the template is a subset of the tautomer
  // fingerprint for the original molecule.
  std::unique_ptr<ExplicitBitVect> molFingerprint(
      TautomerQuery::patternFingerprintTarget(*mol));
  // Guard the dereferences below against a null fingerprint.
  REQUIRE(molFingerprint);
  CHECK(AllProbeBitsMatch(*templateQueryFingerprint, *molFingerprint));
  // This expected bit count for the template tautomeric fingerprint applies for
  // all queries if there are no bit clashes
  auto expectedTemplateBitsCount =
      molWithoutTautomerBondsFingerprint->getNumOnBits() +
      (molFingerprint->getNumOnBits() -
       molWithoutTautomerBondsFingerprint->getNumOnBits()) /
          2;
  CHECK(expectedTemplateBitsCount == templateQueryFingerprint->getNumOnBits());
}
|
|
|
|
// A benzene query is expected to produce exactly one tautomer with no
// modified atoms or bonds, and it must still match a Kekule-form target.
TEST_CASE("TEST_NOT_TAUTOMER") {
  auto benzene = "c1ccccc1"_smiles;
  REQUIRE(benzene);
  std::unique_ptr<TautomerQuery> query(TautomerQuery::fromMol(*benzene));

  const auto tautomerCount = query->getTautomers().size();
  CHECK(tautomerCount == 1);
  CHECK(query->getModifiedAtoms().empty());
  CHECK(query->getModifiedBonds().empty());

  auto kekuleTarget = "CC1=NC2=CC=CC=C2O1"_smiles;
  REQUIRE(kekuleTarget);
  const bool found = query->isSubstructOf(*kekuleTarget);
  CHECK(found);
}
|
|
|
|
// github #3821: a copy-constructed TautomerQuery must own its own template
// molecule rather than share the source's.
TEST_CASE("github #3821 TAUTOMERQUERY_COPY_CONSTRUCTOR") {
  auto mol = "c1ccccc1"_smiles;
  // Fail cleanly on a parse error instead of dereferencing a null pointer.
  REQUIRE(mol);
  auto tautomerQuery =
      std::unique_ptr<TautomerQuery>(TautomerQuery::fromMol(*mol));
  auto tautomerQueryCopyConstructed =
      std::unique_ptr<TautomerQuery>(new TautomerQuery(*tautomerQuery));
  // Distinct addresses show the template molecule was duplicated.
  CHECK(&(tautomerQuery->getTemplateMolecule()) !=
        &tautomerQueryCopyConstructed->getTemplateMolecule());
}
|
|
|
|
// github #3821: assigning one TautomerQuery to another must leave the two
// objects with separate template molecules.
TEST_CASE("github #3821 check TAUTOMERQUERY_OPERATOR= does a deep copy") {
  auto mol = "c1ccccc1"_smiles;
  REQUIRE(mol);
  auto tautomerQuery =
      std::unique_ptr<TautomerQuery>(TautomerQuery::fromMol(*mol));

  // Start from an unrelated, already-constructed query so that the statement
  // below is a real assignment. `auto x = *tautomerQuery;` would be
  // copy-initialization and would invoke the copy constructor instead of
  // operator=.
  auto otherMol = "O=C1CCCCC1"_smiles;
  REQUIRE(otherMol);
  auto tautomerQueryAssigned =
      std::unique_ptr<TautomerQuery>(TautomerQuery::fromMol(*otherMol));
  *tautomerQueryAssigned = *tautomerQuery;

  CHECK(&(tautomerQuery->getTemplateMolecule()) !=
        &tautomerQueryAssigned->getTemplateMolecule());
  // The assigned object should now carry the source's tautomers.
  CHECK(tautomerQueryAssigned->getTautomers().size() ==
        tautomerQuery->getTautomers().size());
}
|
|
|
|
// Round-trips a TautomerQuery through serialize() and the string constructor,
// then checks that the restored query still matches and screens as expected.
// The body is compiled only when RDK_USE_BOOST_SERIALIZATION is defined.
TEST_CASE("Serialization") {
#ifdef RDK_USE_BOOST_SERIALIZATION
  SECTION("basics") {
    auto mol = "Nc1nc(=O)c2nc[nH]c2[nH]1"_smiles;
    REQUIRE(mol);
    auto tautomerQuery =
        std::unique_ptr<TautomerQuery>(TautomerQuery::fromMol(*mol));
    CHECK(15 == tautomerQuery->getTautomers().size());

    std::string pickle = tautomerQuery->serialize();
    TautomerQuery serialized(pickle);
    CHECK(serialized.getTautomers().size() ==
          tautomerQuery->getTautomers().size());

    // Raw owning pointers are wrapped in unique_ptr so they are released even
    // when a REQUIRE aborts the test case.
    std::unique_ptr<ExplicitBitVect> queryFingerprint(
        serialized.patternFingerprintTemplate());
    REQUIRE(queryFingerprint);
    std::vector<std::string> targetSmis{"CCc1nc2[nH]c(=N)nc(O)c2[nH]1",
                                        "CN1C2=NC=NC2=C(O)N=C1N"};
    for (const auto &targetSmiles : targetSmis) {
      std::unique_ptr<ROMol> target(SmilesToMol(targetSmiles));
      REQUIRE(target);
      CHECK(serialized.isSubstructOf(*target));
      std::unique_ptr<ExplicitBitVect> targetFingerprint(
          TautomerQuery::patternFingerprintTarget(*target));
      REQUIRE(targetFingerprint);
      CHECK(AllProbeBitsMatch(*queryFingerprint, *targetFingerprint));
    }
  }
#endif
}
|
|
|
|
// An atom property set on the input molecule must be present on the
// enumerated tautomers and on the template molecule, both directly and after
// a serialization round trip.
TEST_CASE("Tautomer queries should propagate atom properties") {
  auto mol = "COC1=NNC(*)=C1"_smiles;
  REQUIRE(mol);
  mol->getAtomWithIdx(6)->setProp("_foo", 6);
  auto tq = std::unique_ptr<TautomerQuery>(TautomerQuery::fromMol(*mol));
  // The checks below index tautomers 0 and 1, so make sure both exist.
  REQUIRE(tq->getTautomers().size() >= 2);
  CHECK(tq->getTautomers()[0]->getAtomWithIdx(6)->hasProp("_foo"));
  CHECK(tq->getTautomers()[1]->getAtomWithIdx(6)->hasProp("_foo"));
  CHECK(tq->getTemplateMolecule().getAtomWithIdx(6)->hasProp("_foo"));
  // Guarded in the same way as the "Serialization" test case in this file.
#ifdef RDK_USE_BOOST_SERIALIZATION
  SECTION("serialization") {
    TautomerQuery tq2(tq->serialize());
    REQUIRE(tq2.getTautomers().size() >= 2);
    CHECK(tq2.getTautomers()[0]->getAtomWithIdx(6)->hasProp("_foo"));
    CHECK(tq2.getTautomers()[1]->getAtomWithIdx(6)->hasProp("_foo"));
    CHECK(tq2.getTemplateMolecule().getAtomWithIdx(6)->hasProp("_foo"));
  }
#endif
}