Files
rdkit/Code/GraphMol/RGroupDecomposition/catch_rgd.cpp
Greg Landrum 8eb02b8bed switch to C++20 (#8039)
* c++20 builds working

* get MolStandardize building with clang19

* get FMCS building with clang-19

* set cxx version to c++20

* remove a few more compiler warnings

* bump min boost version, CI cleanup

* boost 1.81 is not available from conda-forge

* remove unused constants

* bump linux version for CI

* remove another unused variable

* fix (hopefully) cartridge CI builds

* simplify cartridge environment

* try postgresql14 in CI

* start the postgresql service

* change the columns used in the pandastools nbtest

* remove missed merge conflict artifact

* get github4823 test to pass with numpy 2.2

* remove a compiler warning/error with g++13
2025-04-09 11:57:17 +02:00

1110 lines
35 KiB
C++

//
// Copyright (C) 2021 Greg Landrum and other RDKit contributors
//
// @@ All Rights Reserved @@
// This file is part of the RDKit.
// The contents are covered by the terms of the BSD license
// which is included in the file license.txt, found at the root
// of the RDKit source tree.
//
#include <catch2/catch_all.hpp>
#include <GraphMol/RDKitBase.h>
#include <GraphMol/FileParsers/FileParsers.h>
#include <GraphMol/FileParsers/MolSupplier.h>
#include <GraphMol/SmilesParse/SmilesParse.h>
#include <GraphMol/SmilesParse/SmilesWrite.h>
#include <GraphMol/RGroupDecomposition/RGroupDecomp.h>
#include <GraphMol/RGroupDecomposition/RGroupUtils.h>
#include <GraphMol/RGroupDecomposition/RGroupData.h>
#include <boost/format.hpp>
#include <boost/algorithm/string.hpp>
#include <boost/algorithm/string/trim_all.hpp>
#include <fstream>
#include <iterator>
#include <stdexcept>
#include <string>
using namespace RDKit;
/// Splits an indexable molecule supplier into a core and its targets.
///
/// Entry 0 of \c suppl is handed to \c core (which takes ownership of the
/// returned pointer); every later entry, in supplier order, is appended to
/// \c mols. Each entry is REQUIREd to be non-null, so a null entry aborts the
/// calling test case.
template <typename T>
void initDataset(T &suppl, ROMOL_SPTR &core, std::vector<ROMOL_SPTR> &mols) {
  core.reset(suppl[0]);
  REQUIRE(core);
  unsigned int idx = 1;
  while (idx < suppl.length()) {
    mols.emplace_back(suppl[idx]);
    REQUIRE(mols.back());
    ++idx;
  }
}
/// Returns a copy of \c txt with every space, tab, carriage return and
/// newline removed.
///
/// Used to compare JSON output against expected text without caring about
/// layout. Note that whitespace *inside* the text is dropped as well, not just
/// leading/trailing or repeated whitespace.
///
/// \param txt  text to normalise
/// \return     \c txt without any of the characters " \t\r\n"
std::string flatten_whitespace(const std::string &txt) {
  std::string res;
  res.reserve(txt.size());
  // single pass: keep everything that is not one of the four blank characters
  for (const char ch : txt) {
    const bool isBlank = ch == ' ' || ch == '\t' || ch == '\r' || ch == '\n';
    if (!isBlank) {
      res.push_back(ch);
    }
  }
  return res;
}
/// Reads the whole content of the file \c fname into a string.
///
/// \param fname  path of the reference file to load
/// \return       the file content
/// \throws std::runtime_error if the file cannot be opened. Without this
///         check a missing file made tellg() return -1, which was then passed
///         to reserve() and surfaced as an unrelated std::length_error.
std::string readReferenceData(const std::string &fname) {
  std::ifstream ins(fname);
  if (!ins) {
    throw std::runtime_error("could not open reference data file: " + fname);
  }
  std::string res;
  // determine the size up front so the string is allocated only once
  ins.seekg(0, std::ios::end);
  const std::streamoff fileSize = ins.tellg();
  if (fileSize > 0) {
    res.reserve(static_cast<std::string::size_type>(fileSize));
  } else if (fileSize < 0) {
    // size query failed; drop the error state and fall back to plain reading
    ins.clear();
  }
  ins.seekg(0, std::ios::beg);
  res.assign(std::istreambuf_iterator<char>(ins),
             std::istreambuf_iterator<char>());
  return res;
}
// Checks toJSON() for both result layouts (rows and columns) of an R-group
// decomposition of simple1.sdf. The first SDF entry is the core, the remaining
// entries are the molecules to decompose.
TEST_CASE("toJSONTests", "[unittests]") {
  // std::string(nullptr) is undefined behaviour, so fail cleanly when the
  // environment variable the test-data path is built from is not set
  const char *rdbase = getenv("RDBASE");
  REQUIRE(rdbase != nullptr);
  std::string testDataDir =
      std::string(rdbase) +
      std::string("/Code/GraphMol/RGroupDecomposition/test_data/");
  std::string fName = testDataDir + "simple1.sdf";
  SDMolSupplier suppl(fName);
  std::vector<ROMOL_SPTR> cores(1);
  std::vector<ROMOL_SPTR> mols;
  initDataset(suppl, cores.front(), mols);
  SECTION("rows") {
    RGroupRows rows;
    auto n = RGroupDecompose(cores, mols, rows);
    CHECK(n == mols.size());
    CHECK(rows.size() == mols.size());
    // layout of the expected text is irrelevant: both sides are passed through
    // flatten_whitespace() before being compared
    std::string expected = R"JSON([
{
"Core": "Cc1cccc([*:1])c1[*:2]",
"R1": "[H][*:1]",
"R2": "CO[*:2]"
},
{
"Core": "Cc1cccc([*:1])c1[*:2]",
"R1": "[H][*:1]",
"R2": "CO[*:2]"
},
{
"Core": "Cc1cccc([*:1])c1[*:2]",
"R1": "CO[*:1]",
"R2": "[H][*:2]"
}
])JSON";
    CHECK(flatten_whitespace(toJSON(rows)) == flatten_whitespace(expected));
  }
  SECTION("columns") {
    RGroupColumns cols;
    auto n = RGroupDecompose(cores, mols, cols);
    CHECK(n == mols.size());
    // cols is keyed by label, so cols.size() is the number of columns, not the
    // number of molecules; the per-molecule count is the length of each column
    for (const auto &col : cols) {
      CHECK(col.second.size() == mols.size());
    }
    std::string expected = R"JSON([
"Core": [
"Cc1cccc([*:1])c1[*:2]",
"Cc1cccc([*:1])c1[*:2]",
"Cc1cccc([*:1])c1[*:2]"
],
"R1": [
"[H][*:1]",
"[H][*:1]",
"CO[*:1]"
],
"R2": [
"CO[*:2]",
"CO[*:2]",
"[H][*:2]"
]
]
)JSON";
    CHECK(flatten_whitespace(toJSON(cols)) == flatten_whitespace(expected));
  }
}
// Decomposes simple1.sdf (first entry = core, rest = targets) and compares the
// JSON of the resulting rows with stored reference files, once with default
// parameters and once with the NoSymmetrization matching strategy.
TEST_CASE("simple1") {
  // std::string(nullptr) is undefined behaviour, so fail cleanly when the
  // environment variable the test-data path is built from is not set
  const char *rdbase = getenv("RDBASE");
  REQUIRE(rdbase != nullptr);
  std::string testDataDir =
      std::string(rdbase) +
      std::string("/Code/GraphMol/RGroupDecomposition/test_data/");
  std::string fName = testDataDir + "simple1.sdf";
  SDMolSupplier suppl(fName);
  std::vector<ROMOL_SPTR> cores(1);
  std::vector<ROMOL_SPTR> mols;
  initDataset(suppl, cores.front(), mols);
  SECTION("defaults") {
    RGroupRows rows;
    auto n = RGroupDecompose(cores, mols, rows);
    CHECK(n == mols.size());
    CHECK(rows.size() == mols.size());
    CHECK(flatten_whitespace(toJSON(rows)) ==
          flatten_whitespace(
              readReferenceData(testDataDir + "simple1.out1.json")));
  }
  SECTION("no symmetrization") {
    RGroupRows rows;
    RGroupDecompositionParameters ps;
    ps.matchingStrategy = RGroupMatching::NoSymmetrization;
    auto n = RGroupDecompose(cores, mols, rows, nullptr, ps);
    CHECK(n == mols.size());
    CHECK(rows.size() == mols.size());
    CHECK(flatten_whitespace(toJSON(rows)) ==
          flatten_whitespace(
              readReferenceData(testDataDir + "simple1.out2.json")));
  }
}
// Decomposes simple2.sdf (first entry = core, rest = targets) with default
// parameters and with onlyMatchAtRGroups enabled, comparing the JSON of the
// rows with stored reference files.
TEST_CASE("simple2 with specified R groups") {
  // std::string(nullptr) is undefined behaviour, so fail cleanly when the
  // environment variable the test-data path is built from is not set
  const char *rdbase = getenv("RDBASE");
  REQUIRE(rdbase != nullptr);
  std::string testDataDir =
      std::string(rdbase) +
      std::string("/Code/GraphMol/RGroupDecomposition/test_data/");
  std::string fName = testDataDir + "simple2.sdf";
  SDMolSupplier suppl(fName);
  std::vector<ROMOL_SPTR> cores(1);
  std::vector<ROMOL_SPTR> mols;
  initDataset(suppl, cores.front(), mols);
  SECTION("defaults") {
    RGroupRows rows;
    auto n = RGroupDecompose(cores, mols, rows);
    CHECK(n == mols.size());
    CHECK(rows.size() == mols.size());
    CHECK(flatten_whitespace(toJSON(rows)) ==
          flatten_whitespace(
              readReferenceData(testDataDir + "simple2.out1.json")));
  }
  SECTION("only match at r groups") {
    RGroupRows rows;
    RGroupDecompositionParameters ps;
    ps.onlyMatchAtRGroups = true;
    std::vector<unsigned> unmatched;
    auto n = RGroupDecompose(cores, mols, rows, &unmatched, ps);
    // only two of the targets are expected to be decomposed here
    CHECK(n == 2);
    CHECK(rows.size() == n);
    CHECK(unmatched.size() == mols.size() - n);
    // hard stop before indexing: CHECK alone would continue into unmatched[0]
    REQUIRE(!unmatched.empty());
    CHECK(unmatched[0] == 2);
    CHECK(flatten_whitespace(toJSON(rows)) ==
          flatten_whitespace(
              readReferenceData(testDataDir + "simple2.out2.json")));
  }
}
// Decomposes simple3.sdf (first entry = core, rest = targets) under the four
// combinations of removeAllHydrogenRGroups / removeAllHydrogenRGroupsAndLabels
// exercised below and compares each result with its stored reference JSON.
TEST_CASE("simple3 with user labels on aromatic N") {
  // std::string(nullptr) is undefined behaviour, so fail cleanly when the
  // environment variable the test-data path is built from is not set
  const char *rdbase = getenv("RDBASE");
  REQUIRE(rdbase != nullptr);
  std::string testDataDir =
      std::string(rdbase) +
      std::string("/Code/GraphMol/RGroupDecomposition/test_data/");
  std::string fName = testDataDir + "simple3.sdf";
  SDMolSupplier suppl(fName);
  std::vector<ROMOL_SPTR> cores(1);
  std::vector<ROMOL_SPTR> mols;
  initDataset(suppl, cores.front(), mols);
  SECTION("defaults (allH labels and R-groups are removed)") {
    RGroupRows rows;
    auto n = RGroupDecompose(cores, mols, rows);
    CHECK(n == mols.size());
    CHECK(rows.size() == mols.size());
    CHECK(flatten_whitespace(toJSON(rows)) ==
          flatten_whitespace(
              readReferenceData(testDataDir + "simple3.out1.json")));
  }
  SECTION("removeAllHydrogenRGroups = false (as defaults)") {
    RGroupRows rows;
    RGroupDecompositionParameters ps;
    ps.removeAllHydrogenRGroups = false;
    std::vector<unsigned> unmatched;
    auto n = RGroupDecompose(cores, mols, rows, &unmatched, ps);
    CHECK(n == mols.size());
    CHECK(rows.size() == mols.size());
    CHECK(unmatched.empty());
    CHECK(flatten_whitespace(toJSON(rows)) ==
          flatten_whitespace(
              readReferenceData(testDataDir + "simple3.out2.json")));
  }
  SECTION("removeAllHydrogenRGroupsAndLabels = false (allH labels retained)") {
    RGroupRows rows;
    RGroupDecompositionParameters ps;
    ps.removeAllHydrogenRGroupsAndLabels = false;
    std::vector<unsigned> unmatched;
    auto n = RGroupDecompose(cores, mols, rows, &unmatched, ps);
    CHECK(n == mols.size());
    CHECK(rows.size() == mols.size());
    CHECK(unmatched.empty());
    CHECK(flatten_whitespace(toJSON(rows)) ==
          flatten_whitespace(
              readReferenceData(testDataDir + "simple3.out3.json")));
  }
  SECTION(
      "removeAllHydrogenRGroupsAndLabels = false, removeAllHydrogenRGroups = "
      "false (allH labels and R-groups are retained)") {
    RGroupRows rows;
    RGroupDecompositionParameters ps;
    ps.removeAllHydrogenRGroups = false;
    ps.removeAllHydrogenRGroupsAndLabels = false;
    std::vector<unsigned> unmatched;
    auto n = RGroupDecompose(cores, mols, rows, &unmatched, ps);
    CHECK(n == mols.size());
    CHECK(rows.size() == mols.size());
    CHECK(unmatched.empty());
    CHECK(flatten_whitespace(toJSON(rows)) ==
          flatten_whitespace(
              readReferenceData(testDataDir + "simple3.out4.json")));
  }
}
// Decomposes jm7b00306.excerpt.sdf (first entry = core, rest = targets) with
// default parameters; exactly one target is expected to stay unmatched.
TEST_CASE("jm7b00306 Snippet") {
  // std::string(nullptr) is undefined behaviour, so fail cleanly when the
  // environment variable the test-data path is built from is not set
  const char *rdbase = getenv("RDBASE");
  REQUIRE(rdbase != nullptr);
  std::string testDataDir =
      std::string(rdbase) +
      std::string("/Code/GraphMol/RGroupDecomposition/test_data/");
  std::string fName = testDataDir + "jm7b00306.excerpt.sdf";
  SDMolSupplier suppl(fName);
  std::vector<ROMOL_SPTR> cores(1);
  std::vector<ROMOL_SPTR> mols;
  initDataset(suppl, cores.front(), mols);
  SECTION("defaults") {
    RGroupRows rows;
    std::vector<unsigned> unmatched;
    auto n = RGroupDecompose(cores, mols, rows, &unmatched);
    CHECK(n == mols.size() - 1);
    CHECK(rows.size() == n);
    // there is one structure in there that doesn't match the core
    CHECK(unmatched.size() == mols.size() - n);
    // hard stop before indexing: CHECK alone would continue into unmatched[0]
    REQUIRE(!unmatched.empty());
    CHECK(unmatched[0] == 1);
    CHECK(flatten_whitespace(toJSON(rows)) ==
          flatten_whitespace(
              readReferenceData(testDataDir + "jm7b00306.excerpt.out1.json")));
  }
}
// Decomposes jm200186n.excerpt.sdf (first entry = core, rest = targets) with
// default parameters; exactly one target is expected to stay unmatched.
TEST_CASE("jm200186n Snippet") {
  // std::string(nullptr) is undefined behaviour, so fail cleanly when the
  // environment variable the test-data path is built from is not set
  const char *rdbase = getenv("RDBASE");
  REQUIRE(rdbase != nullptr);
  std::string testDataDir =
      std::string(rdbase) +
      std::string("/Code/GraphMol/RGroupDecomposition/test_data/");
  std::string fName = testDataDir + "jm200186n.excerpt.sdf";
  SDMolSupplier suppl(fName);
  std::vector<ROMOL_SPTR> cores(1);
  std::vector<ROMOL_SPTR> mols;
  initDataset(suppl, cores.front(), mols);
  SECTION("defaults") {
    RGroupRows rows;
    std::vector<unsigned> unmatched;
    auto n = RGroupDecompose(cores, mols, rows, &unmatched);
    CHECK(n == mols.size() - 1);
    CHECK(rows.size() == n);
    // there is one structure in there that doesn't match the core
    CHECK(unmatched.size() == mols.size() - n);
    // hard stop before indexing: CHECK alone would continue into unmatched[0]
    REQUIRE(!unmatched.empty());
    CHECK(unmatched[0] == 3);
    CHECK(flatten_whitespace(toJSON(rows)) ==
          flatten_whitespace(
              readReferenceData(testDataDir + "jm200186n.excerpt.out1.json")));
  }
}
std::vector<ROMOL_SPTR> smisToMols(const std::vector<std::string> &smis) {
std::vector<ROMOL_SPTR> mols;
for (const auto &smi : smis) {
auto m = SmilesToMol(smi);
assert(m);
mols.emplace_back(m);
}
return mols;
}
// Exercises R-group decomposition with and without chirality-aware
// substructure matching (params.substructmatchParams.useChirality).
//
// Targets: four SMILES sharing the same ring; two carry opposite chirality
// tags (@ / @@) on the F-bearing carbon, one additionally carries an O
// substituent on that carbon, and one has no chirality tag.
// Cores: the same ring with a single labelled attachment point, once without
// (cores) and once with (chiral_cores) a chirality tag.
TEST_CASE("substructure parameters and RGD: chirality") {
std::vector<std::string> smis = {"C1CN[C@H]1F", "C1CN[C@]1(O)F",
"C1CN[C@@H]1F", "C1CN[CH]1F"};
auto mols = smisToMols(smis);
std::vector<std::string> csmis = {"C1CNC1[*:1]"};
auto cores = smisToMols(csmis);
std::vector<std::string> csmis2 = {"C1CN[C@H]1[*:1]"};
auto chiral_cores = smisToMols(csmis2);
SECTION("defaults") {
// rows and unmatched are reused by both decompositions in this section
RGroupRows rows;
std::vector<unsigned> unmatched;
RGroupDecompositionParameters params;
params.allowMultipleRGroupsOnUnlabelled = true;
{
// achiral core: every target is expected to be decomposed
auto n = RGroupDecompose(cores, mols, rows, &unmatched, params);
CHECK(n == mols.size());
CHECK(rows.size() == n);
CHECK(unmatched.empty());
CHECK(flatten_whitespace(toJSON(rows)) == flatten_whitespace(R"JSON(
[
{
"Core":"C1C[C@@]([*:1])([*:2])N1",
"R1":"F[*:1]",
"R2":"[H][*:2]"
},
{
"Core":"C1C[C@]([*:1])([*:2])N1",
"R1":"F[*:1]",
"R2":"O[*:2]"
},
{
"Core":"C1C[C@]([*:1])([*:2])N1",
"R1":"F[*:1]",
"R2":"[H][*:2]"
},
{
"Core":"C1CC([*:1])([*:2])N1",
"R1":"F[*:1]",
"R2":"[H][*:2]"
}
]
)JSON"));
}
{
// chiral core: one target is expected to be left unmatched
auto n = RGroupDecompose(chiral_cores, mols, rows, &unmatched, params);
CHECK(n == mols.size() - 1);
CHECK(rows.size() == n);
CHECK(unmatched.size() == 1);
CHECK(flatten_whitespace(toJSON(rows)) == flatten_whitespace(R"JSON(
[
{
"Core":"C1C[C@@]([*:1])([*:2])N1",
"R1":"F[*:1]",
"R2":"[H][*:2]"
},
{
"Core":"C1C[C@@]([*:1])([*:2])N1",
"R1":"O[*:1]",
"R2":"F[*:2]"
},
{
"Core":"C1C[C@@]([*:1])([*:2])N1",
"R1":"[H][*:1]",
"R2":"F[*:2]"
}
]
)JSON"));
}
}
SECTION("not using chirality") {
// this time both cores return the same thing and stereo information is
// removed from the chiral cores
RGroupRows rows;
std::vector<unsigned> unmatched;
RGroupDecompositionParameters params;
params.allowMultipleRGroupsOnUnlabelled = true;
params.substructmatchParams.useChirality = false;
{
// achiral core
auto n = RGroupDecompose(cores, mols, rows, &unmatched, params);
CHECK(n == mols.size());
CHECK(rows.size() == n);
CHECK(unmatched.empty());
CHECK(flatten_whitespace(toJSON(rows)) == flatten_whitespace(R"JSON(
[
{
"Core":"C1CC([*:1])([*:2])N1",
"R1":"F[*:1]",
"R2":"[H][*:2]"
},
{
"Core":"C1CC([*:1])([*:2])N1",
"R1":"F[*:1]",
"R2":"O[*:2]"
},
{
"Core":"C1CC([*:1])([*:2])N1",
"R1":"F[*:1]",
"R2":"[H][*:2]"
},
{
"Core":"C1CC([*:1])([*:2])N1",
"R1":"F[*:1]",
"R2":"[H][*:2]"
}
]
)JSON"));
}
{
// chiral core: the expected output is identical to the achiral-core case
auto n = RGroupDecompose(chiral_cores, mols, rows, &unmatched, params);
CHECK(n == mols.size());
CHECK(rows.size() == n);
CHECK(unmatched.empty());
CHECK(flatten_whitespace(toJSON(rows)) == flatten_whitespace(R"JSON(
[
{
"Core":"C1CC([*:1])([*:2])N1",
"R1":"F[*:1]",
"R2":"[H][*:2]"
},
{
"Core":"C1CC([*:1])([*:2])N1",
"R1":"F[*:1]",
"R2":"O[*:2]"
},
{
"Core":"C1CC([*:1])([*:2])N1",
"R1":"F[*:1]",
"R2":"[H][*:2]"
},
{
"Core":"C1CC([*:1])([*:2])N1",
"R1":"F[*:1]",
"R2":"[H][*:2]"
}
]
)JSON"));
}
}
}
// Exercises R-group decomposition with and without enhanced-stereo-aware
// substructure matching (params.substructmatchParams.useEnhancedStereo).
//
// Targets: five CXSMILES; three carry an "&1" stereo group, one carries an
// "o1" stereo group and one has no stereo annotation at all.
// Cores: the same chiral core annotated with an "&1" group (cores) or an "o1"
// group (cores2).
TEST_CASE("substructure parameters and RGD: enhanced stereo") {
std::vector<std::string> smis = {"F[C@H]1CCN1 |&1:1|", "C1CN[C@]1(O)F |&1:3|",
"C1CN[C@@H]1F |&1:3|", "Cl[C@H]1CCN1 |o1:1|",
"C1CN[CH]1F"};
auto mols = smisToMols(smis);
std::vector<std::string> csmis = {"C1CN[C@H]1[*:1] |&1:3|"};
auto cores = smisToMols(csmis);
std::vector<std::string> csmis2 = {"C1CN[C@H]1[*:1] |o1:3|"};
auto cores2 = smisToMols(csmis2);
SECTION("defaults: no enhanced stereo") {
// rows and unmatched are reused by both decompositions in this section;
// both cores are expected to give the same rows here
RGroupRows rows;
std::vector<unsigned> unmatched;
RGroupDecompositionParameters params;
params.allowMultipleRGroupsOnUnlabelled = true;
{
// "&1" core: one target is expected to be left unmatched
auto n = RGroupDecompose(cores, mols, rows, &unmatched, params);
CHECK(n == mols.size() - 1);
CHECK(rows.size() == n);
CHECK(unmatched.size() == mols.size() - n);
// std::cerr << toJSON(rows) << std::endl;
// the core output no longer is SMARTS as the core output is the portion
// of the target that matches the core query.
CHECK(flatten_whitespace(toJSON(rows)) == flatten_whitespace(R"JSON(
[
{
"Core":"C1C[C@@]([*:1])([*:2])N1",
"R1":"F[*:1]",
"R2":"[H][*:2]"
},
{
"Core":"C1C[C@@]([*:1])([*:2])N1",
"R1":"O[*:1]",
"R2":"F[*:2]"
},
{
"Core":"C1C[C@@]([*:1])([*:2])N1",
"R1":"[H][*:1]",
"R2":"F[*:2]"
},
{
"Core":"C1C[C@@]([*:1])([*:2])N1",
"R1":"Cl[*:1]",
"R2":"[H][*:2]"
}
]
)JSON"));
}
{
// "o1" core: again one target is expected to be left unmatched
auto n = RGroupDecompose(cores2, mols, rows, &unmatched, params);
CHECK(n == mols.size() - 1);
CHECK(rows.size() == n);
CHECK(unmatched.size() == 1);
// std::cerr << toJSON(rows) << std::endl;
// the core output no longer is SMARTS as the core output is the portion
// of the target that matches the core query.
CHECK(flatten_whitespace(toJSON(rows)) == flatten_whitespace(R"JSON(
[
{
"Core":"C1C[C@@]([*:1])([*:2])N1",
"R1":"F[*:1]",
"R2":"[H][*:2]"
},
{
"Core":"C1C[C@@]([*:1])([*:2])N1",
"R1":"O[*:1]",
"R2":"F[*:2]"
},
{
"Core":"C1C[C@@]([*:1])([*:2])N1",
"R1":"[H][*:1]",
"R2":"F[*:2]"
},
{
"Core":"C1C[C@@]([*:1])([*:2])N1",
"R1":"Cl[*:1]",
"R2":"[H][*:2]"
}
]
)JSON"));
}
}
SECTION("using enhanced stereo") {
RGroupRows rows;
std::vector<unsigned> unmatched;
RGroupDecompositionParameters params;
params.allowMultipleRGroupsOnUnlabelled = true;
params.substructmatchParams.useEnhancedStereo = true;
{
// "&1" core: two targets are expected to be left unmatched
auto n = RGroupDecompose(cores, mols, rows, &unmatched, params);
CHECK(n == mols.size() - 2);
CHECK(rows.size() == n);
CHECK(unmatched.size() == mols.size() - n);
// std::cerr << toJSON(rows) << std::endl;
// the core output no longer is SMARTS as the core output is the portion
// of the target that matches the core query.
CHECK(flatten_whitespace(toJSON(rows)) == flatten_whitespace(R"JSON(
[
{
"Core":"C1C[C@@]([*:1])([*:2])N1",
"R1":"F[*:1]",
"R2":"[H][*:2]"
},
{
"Core":"C1C[C@]([*:1])([*:2])N1",
"R1":"F[*:1]",
"R2":"O[*:2]"
},
{
"Core":"C1C[C@]([*:1])([*:2])N1",
"R1":"F[*:1]",
"R2":"[H][*:2]"
}
]
)JSON"));
}
{
// "o1" core: one target is expected to be left unmatched
auto n = RGroupDecompose(cores2, mols, rows, &unmatched, params);
CHECK(n == mols.size() - 1);
CHECK(rows.size() == n);
CHECK(unmatched.size() == 1);
// std::cerr << toJSON(rows) << std::endl;
// the core output no longer is SMARTS as the core output is the portion
// of the target that matches the core query.
CHECK(flatten_whitespace(toJSON(rows)) == flatten_whitespace(R"JSON(
[
{
"Core":"C1C[C@@]([*:1])([*:2])N1",
"R1":"F[*:1]",
"R2":"[H][*:2]"
},
{
"Core":"C1C[C@]([*:1])([*:2])N1",
"R1":"F[*:1]",
"R2":"O[*:2]"
},
{
"Core":"C1C[C@]([*:1])([*:2])N1",
"R1":"F[*:1]",
"R2":"[H][*:2]"
},
{
"Core":"C1C[C@@]([*:1])([*:2])N1",
"R1":"Cl[*:1]",
"R2":"[H][*:2]"
}
]
)JSON"));
}
}
}
// Regression test for github issue 4809 (see the test title): after
// decomposing a bicyclic target, the V3000 mol block written for the R1 group
// must not contain the "CFG=2" marker.
TEST_CASE("github4809: ring double bonds written as crossed bonds after RGD") {
  std::vector<std::string> smis = {"C1C=CCC2=C1C=CC=N2"};
  auto mols = smisToMols(smis);
  std::vector<std::string> csmis = {"c1ccnc([*:1])c1[*:2]"};
  auto cores = smisToMols(csmis);
  SECTION("basics") {
    RGroupRows rows;
    {
      auto n = RGroupDecompose(cores, mols, rows);
      CHECK(n == mols.size());
      CHECK(rows.size() == n);
      // hard stops: indexing an empty result or dereferencing a missing R1
      // entry would be undefined behaviour rather than a test failure
      REQUIRE(!rows.empty());
      auto r1 = rows[0]["R1"];
      REQUIRE(r1);
      auto mb = MolToV3KMolBlock(*r1);
      CHECK(mb.find("CFG=2") == std::string::npos);
    }
  }
}
// Checks how the rgroupLabelling parameter changes the SMILES written for the
// core and the R groups: isotope labels only, MDL R-group labels only, and
// isotope labels combined with atom maps. The same four targets and the same
// single-attachment-point core are used in every section.
TEST_CASE("rgroupLabelling") {
std::vector<std::string> smis = {"C1CN[C@H]1F", "C1CN[C@]1(O)F",
"C1CN[C@@H]1F", "C1CN[CH]1F"};
auto mols = smisToMols(smis);
std::vector<std::string> csmis = {"C1CNC1[*:1]"};
auto cores = smisToMols(csmis);
SECTION("Isotope") {
RGroupRows rows;
std::vector<unsigned> unmatched;
RGroupDecompositionParameters params;
params.rgroupLabelling = RGroupLabelling::Isotope;
params.allowMultipleRGroupsOnUnlabelled = true;
{
auto n = RGroupDecompose(cores, mols, rows, &unmatched, params);
CHECK(n == mols.size());
CHECK(rows.size() == n);
CHECK(unmatched.empty());
// expected dummies carry the R-group number as isotope, e.g. [1*]
CHECK(flatten_whitespace(toJSON(rows)) == flatten_whitespace(R"JSON(
[
{
"Core": "[1*][C@@]1([2*])CCN1",
"R1":"[1*]F",
"R2":"[2*][H]"
},
{
"Core": "[1*][C@]1([2*])CCN1",
"R1":"[1*]F",
"R2":"[2*]O"
},
{
"Core":"[1*][C@]1([2*])CCN1",
"R1":"[1*]F",
"R2":"[2*][H]"
},
{
"Core":"[1*]C1([2*])CCN1",
"R1":"[1*]F",
"R2":"[2*][H]"
}
]
)JSON"));
}
}
SECTION("RGroup") {
RGroupRows rows;
std::vector<unsigned> unmatched;
RGroupDecompositionParameters params;
params.rgroupLabelling = RGroupLabelling::MDLRGroup;
params.allowMultipleRGroupsOnUnlabelled = true;
{
auto n = RGroupDecompose(cores, mols, rows, &unmatched, params);
CHECK(n == mols.size());
CHECK(rows.size() == n);
CHECK(unmatched.empty());
// in this case the labels don't show up in the output SMILES
// Presumably the dummy atoms are no longer distinguishable without
// the isotope labels as the smiles no longer contains chirality.
// Chirality is present in the core SMARTS
CHECK(flatten_whitespace(toJSON(rows)) == flatten_whitespace(R"JSON(
[
{
"Core":"*C1(*)CCN1",
"R1":"*F",
"R2":"*[H]"
},
{
"Core":"*C1(*)CCN1",
"R1":"*F",
"R2":"*O"
},
{
"Core":"*C1(*)CCN1",
"R1":"*F",
"R2":"*[H]"
},
{
"Core":"*C1(*)CCN1",
"R1":"*F",
"R2":"*[H]"
}
]
)JSON"));
}
}
SECTION("Isotope|Map") {
RGroupRows rows;
std::vector<unsigned> unmatched;
RGroupDecompositionParameters params;
params.allowMultipleRGroupsOnUnlabelled = true;
params.rgroupLabelling =
RGroupLabelling::Isotope | RGroupLabelling::AtomMap;
{
auto n = RGroupDecompose(cores, mols, rows, &unmatched, params);
CHECK(n == mols.size());
CHECK(rows.size() == n);
CHECK(unmatched.empty());
// expected dummies carry both the isotope and the atom map, e.g. [1*:1]
CHECK(flatten_whitespace(toJSON(rows)) == flatten_whitespace(R"JSON(
[
{
"Core":"C1C[C@@]([1*:1])([2*:2])N1",
"R1":"F[1*:1]",
"R2":"[H][2*:2]"
},
{
"Core":"C1C[C@]([1*:1])([2*:2])N1",
"R1":"F[1*:1]",
"R2":"O[2*:2]"
},
{
"Core":"C1C[C@]([1*:1])([2*:2])N1",
"R1":"F[1*:1]",
"R2":"[H][2*:2]"
},
{
"Core":"C1CC([1*:1])([2*:2])N1",
"R1":"F[1*:1]",
"R2":"[H][2*:2]"
}
]
)JSON"));
}
}
}
// The core is given as CXSMILES with an "_R1" atom label on its dummy atom.
// Checks that atoms 4 and 5 of the first row's output core are dummy atoms
// (atomic number 0) and that they carry the dummyLabel property only when
// MDLRGroup is part of the requested rgroupLabelling.
TEST_CASE("MDL R labels from original core") {
  std::vector<std::string> smis = {"C1CN[C@H]1F", "C1CN[C@]1(O)F",
                                   "C1CN[C@@H]1F", "C1CN[CH]1F"};
  auto mols = smisToMols(smis);
  std::vector<std::string> csmis = {"[*]C1CCN1 |$_R1;;;;$|"};
  auto cores = smisToMols(csmis);
  SECTION("Map") {
    RGroupRows rows;
    std::vector<unsigned> unmatched;
    RGroupDecompositionParameters params;
    params.allowMultipleRGroupsOnUnlabelled = true;
    params.rgroupLabelling = RGroupLabelling::AtomMap;
    {
      auto n = RGroupDecompose(cores, mols, rows, &unmatched, params);
      CHECK(n == mols.size());
      CHECK(rows.size() == n);
      CHECK(unmatched.empty());
      // hard stops: indexing an empty result or dereferencing a missing Core
      // entry would be undefined behaviour rather than a test failure
      REQUIRE(!rows.empty());
      auto outCore = rows[0]["Core"];
      REQUIRE(outCore);
      CHECK(outCore->getAtomWithIdx(4)->getAtomicNum() == 0);
      CHECK(!outCore->getAtomWithIdx(4)->hasProp(
          common_properties::dummyLabel));
      CHECK(outCore->getAtomWithIdx(5)->getAtomicNum() == 0);
      CHECK(!outCore->getAtomWithIdx(5)->hasProp(
          common_properties::dummyLabel));
    }
  }
  SECTION("Map | MDL") {
    RGroupRows rows;
    std::vector<unsigned> unmatched;
    RGroupDecompositionParameters params;
    params.allowMultipleRGroupsOnUnlabelled = true;
    params.rgroupLabelling =
        RGroupLabelling::AtomMap | RGroupLabelling::MDLRGroup;
    {
      auto n = RGroupDecompose(cores, mols, rows, &unmatched, params);
      CHECK(n == mols.size());
      CHECK(rows.size() == n);
      CHECK(unmatched.empty());
      // hard stops, see the "Map" section
      REQUIRE(!rows.empty());
      auto outCore = rows[0]["Core"];
      REQUIRE(outCore);
      CHECK(outCore->getAtomWithIdx(4)->getAtomicNum() == 0);
      CHECK(outCore->getAtomWithIdx(4)->hasProp(
          common_properties::dummyLabel));
      CHECK(outCore->getAtomWithIdx(5)->getAtomicNum() == 0);
      CHECK(outCore->getAtomWithIdx(5)->hasProp(
          common_properties::dummyLabel));
    }
  }
}
// Checks RGroupDecomposition::getMatchingCoreIdx() with onlyMatchAtRGroups
// set, with and without collecting the matches, for three targets:
//  - carbonMol: expected to match the core on all core atoms
//  - nitrogenMol: expected to match, with one core atom fewer in the match
//  - sulfurMol: expected not to match (core index -1, no matches)
// Finally compares against a plain SubstructMatch() on H-added targets.
TEST_CASE("Mol matches core") {
  auto core = "[*:1]c1[!#1]([*:2])cc([*:3])n([*:4])c(=O)1"_smarts;
  auto carbonMol = "Clc1c(C)cc(F)n(CC)c(=O)1"_smiles;
  auto nitrogenMol = "Clc1ncc(F)n(CC)c(=O)1"_smiles;
  auto sulfurMol = "Clc1ncc(F)n(CC)c(=S)1"_smiles;

  RGroupDecompositionParameters params;
  params.onlyMatchAtRGroups = true;
  RGroupDecomposition decomp(*core, params);

  // core index only
  CHECK(decomp.getMatchingCoreIdx(*carbonMol) == 0);
  CHECK(decomp.getMatchingCoreIdx(*nitrogenMol) == 0);
  CHECK(decomp.getMatchingCoreIdx(*sulfurMol) == -1);

  // core index plus the matches themselves
  const auto nCoreAtoms = core->getNumAtoms();
  std::vector<MatchVectType> foundMatches;
  CHECK(decomp.getMatchingCoreIdx(*carbonMol, &foundMatches) == 0);
  CHECK(foundMatches.size() == 1);
  CHECK(foundMatches.front().size() == nCoreAtoms);
  CHECK(decomp.getMatchingCoreIdx(*nitrogenMol, &foundMatches) == 0);
  CHECK(foundMatches.size() == 1);
  CHECK(foundMatches.front().size() == nCoreAtoms - 1);
  CHECK(decomp.getMatchingCoreIdx(*sulfurMol, &foundMatches) == -1);
  CHECK(foundMatches.empty());

  // plain substructure search once explicit hydrogens have been added
  MolOps::addHs(*carbonMol);
  MolOps::addHs(*nitrogenMol);
  MatchVectType atomMatch;
  CHECK(SubstructMatch(*carbonMol, *core, atomMatch));
  CHECK(atomMatch.size() == nCoreAtoms);
  atomMatch.clear();
  CHECK(!SubstructMatch(*nitrogenMol, *core, atomMatch));
}
// Exercises relabelMappedDummies() on a six-membered aromatic core with two
// dummy atoms. Each section builds a core whose dummies are labelled by atom
// map, isotope and/or MDL R-group number, relabels it and compares the
// non-canonical CXSMILES with the expected string.
//
// allDifferentCore carries three different numberings at once on its dummy
// atoms (indices 3 and 5), so that the label source that won is visible in
// the output: atom maps 4/3 and R-group numbers 2/1 from the mol block, and
// isotopes 6/5 set explicitly below.
//
// NOTE(review): the mol blocks in this copy appear to have had their
// whitespace collapsed; V2000 mol blocks are column-based, so confirm the
// literals against the upstream file before relying on them.
TEST_CASE("relabelMappedDummies") {
SmilesWriteParams p;
// keep the input atom order so that atom indices in the expected strings
// are stable
p.canonical = false;
auto allDifferentCore = R"CTAB(
RDKit 2D
8 8 0 0 0 0 0 0 0 0999 V2000
1.0808 -0.8772 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
1.0827 0.1228 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
0.2177 0.6246 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
0.2198 1.6246 0.0000 R# 0 0 0 0 0 15 0 0 0 4 0 0
-0.6493 0.1262 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
-1.5142 0.6280 0.0000 R# 0 0 0 0 0 15 0 0 0 3 0 0
-0.6513 -0.8736 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
0.2137 -1.3754 0.0000 N 0 0 0 0 0 0 0 0 0 0 0 0
1 2 2 0
2 3 1 0
3 4 1 0
3 5 2 0
5 6 1 0
5 7 1 0
7 8 2 0
8 1 1 0
M RGP 2 4 2 6 1
M END
)CTAB"_ctab;
// drop the coordinates so they do not show up in the CXSMILES
allDifferentCore->removeConformer(0);
allDifferentCore->getAtomWithIdx(3)->setIsotope(6);
allDifferentCore->getAtomWithIdx(5)->setIsotope(5);
// sanity check of the starting point: isotopes 6/5, atom maps 4/3 and
// dummy labels R2/R1
CHECK(
MolToCXSmiles(*allDifferentCore, p) ==
"c1cc([6*:4])c([5*:3])cn1 |atomProp:3.dummyLabel.R2:3.molAtomMapNumber.4:5.dummyLabel.R1:5.molAtomMapNumber.3|");
SECTION("AtomMap in, MDLRGroup out") {
auto core = "c1cc([*:2])c([*:1])cn1"_smiles;
CHECK(
MolToCXSmiles(*core, p) ==
"c1cc([*:2])c([*:1])cn1 |atomProp:3.dummyLabel.*:3.molAtomMapNumber.2:5.dummyLabel.*:5.molAtomMapNumber.1|");
relabelMappedDummies(*core);
CHECK(MolToCXSmiles(*core, p) ==
"c1cc(*)c(*)cn1 |atomProp:3.dummyLabel.R2:5.dummyLabel.R1|");
}
SECTION("Isotope in, MDLRGroup out") {
auto core = "c1cc([2*])c([1*])cn1"_smiles;
CHECK(MolToCXSmiles(*core, p) ==
"c1cc([2*])c([1*])cn1 |atomProp:3.dummyLabel.*:5.dummyLabel.*|");
relabelMappedDummies(*core);
CHECK(MolToCXSmiles(*core, p) ==
"c1cc(*)c(*)cn1 |atomProp:3.dummyLabel.R2:5.dummyLabel.R1|");
}
SECTION("MDLRGroup in, MDLRGroup out") {
auto core = R"CTAB(
RDKit 2D
8 8 0 0 0 0 0 0 0 0999 V2000
1.0808 -0.8772 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
1.0827 0.1228 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
0.2177 0.6246 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
0.2198 1.6246 0.0000 R# 0 0 0 0 0 1 0 0 0 0 0 0
-0.6493 0.1262 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
-1.5142 0.6280 0.0000 R# 0 0 0 0 0 1 0 0 0 0 0 0
-0.6513 -0.8736 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
0.2137 -1.3754 0.0000 N 0 0 0 0 0 0 0 0 0 0 0 0
1 2 2 0
2 3 1 0
3 4 1 0
3 5 2 0
5 6 1 0
5 7 1 0
7 8 2 0
8 1 1 0
M RGP 2 4 2 6 1
M END
)CTAB"_ctab;
core->removeConformer(0);
CHECK(MolToCXSmiles(*core, p) ==
"c1cc([2*])c([1*])cn1 |atomProp:3.dummyLabel.R2:5.dummyLabel.R1|");
relabelMappedDummies(*core);
CHECK(MolToCXSmiles(*core, p) ==
"c1cc(*)c(*)cn1 |atomProp:3.dummyLabel.R2:5.dummyLabel.R1|");
}
SECTION("AtomMap and Isotope in, MDLRGroup out - AtomMap has priority") {
auto core = "c1cc([4*:2])c([3*:1])cn1"_smiles;
CHECK(
MolToCXSmiles(*core, p) ==
"c1cc([4*:2])c([3*:1])cn1 |atomProp:3.dummyLabel.*:3.molAtomMapNumber.2:5.dummyLabel.*:5.molAtomMapNumber.1|");
relabelMappedDummies(*core);
CHECK(MolToCXSmiles(*core, p) ==
"c1cc(*)c(*)cn1 |atomProp:3.dummyLabel.R2:5.dummyLabel.R1|");
}
SECTION("AtomMap and Isotope in, MDLRGroup out - force Isotope priority") {
auto core = "c1cc([4*:2])c([3*:1])cn1"_smiles;
CHECK(
MolToCXSmiles(*core, p) ==
"c1cc([4*:2])c([3*:1])cn1 |atomProp:3.dummyLabel.*:3.molAtomMapNumber.2:5.dummyLabel.*:5.molAtomMapNumber.1|");
relabelMappedDummies(*core, Isotope);
CHECK(MolToCXSmiles(*core, p) ==
"c1cc(*)c(*)cn1 |atomProp:3.dummyLabel.R4:5.dummyLabel.R3|");
}
// The remaining sections all start from a copy of allDifferentCore and vary
// the second (input label types) and third (output label type) arguments.
SECTION(
"AtomMap, Isotope and MDLRGroup in, MDLRGroup out - AtomMap has priority") {
ROMol core(*allDifferentCore);
relabelMappedDummies(core);
CHECK(MolToCXSmiles(core, p) ==
"c1cc(*)c(*)cn1 |atomProp:3.dummyLabel.R4:5.dummyLabel.R3|");
}
SECTION(
"AtomMap, Isotope and MDLRGroup in, MDLRGroup out - force Isotope priority") {
ROMol core(*allDifferentCore);
relabelMappedDummies(core, Isotope);
CHECK(MolToCXSmiles(core, p) ==
"c1cc(*)c(*)cn1 |atomProp:3.dummyLabel.R6:5.dummyLabel.R5|");
}
SECTION(
"AtomMap, Isotope and MDLRGroup in, MDLRGroup out - force MDLRGroup priority") {
ROMol core(*allDifferentCore);
relabelMappedDummies(core, MDLRGroup);
CHECK(MolToCXSmiles(core, p) ==
"c1cc(*)c(*)cn1 |atomProp:3.dummyLabel.R2:5.dummyLabel.R1|");
}
SECTION(
"AtomMap, Isotope and MDLRGroup in, AtomMap out - AtomMap has priority") {
ROMol core(*allDifferentCore);
relabelMappedDummies(core, AtomMap | Isotope | MDLRGroup, AtomMap);
CHECK(
MolToCXSmiles(core, p) ==
"c1cc([*:4])c([*:3])cn1 |atomProp:3.molAtomMapNumber.4:5.molAtomMapNumber.3|");
}
SECTION(
"AtomMap, Isotope and MDLRGroup in, Isotope out - AtomMap has priority") {
ROMol core(*allDifferentCore);
relabelMappedDummies(core, AtomMap | Isotope | MDLRGroup, Isotope);
CHECK(MolToCXSmiles(core, p) == "c1cc([4*])c([3*])cn1");
}
SECTION(
"AtomMap, Isotope and MDLRGroup in, AtomMap out - Isotope has priority") {
ROMol core(*allDifferentCore);
relabelMappedDummies(core, Isotope | MDLRGroup, AtomMap);
CHECK(
MolToCXSmiles(core, p) ==
"c1cc([*:6])c([*:5])cn1 |atomProp:3.molAtomMapNumber.6:5.molAtomMapNumber.5|");
}
SECTION(
"AtomMap, Isotope and MDLRGroup in, Isotope out - Isotope has priority") {
ROMol core(*allDifferentCore);
relabelMappedDummies(core, Isotope | MDLRGroup, Isotope);
CHECK(MolToCXSmiles(core, p) == "c1cc([6*])c([5*])cn1");
}
SECTION(
"AtomMap, Isotope and MDLRGroup in, AtomMap out - MDLRGroup has priority") {
ROMol core(*allDifferentCore);
relabelMappedDummies(core, MDLRGroup, AtomMap);
CHECK(
MolToCXSmiles(core, p) ==
"c1cc([*:2])c([*:1])cn1 |atomProp:3.molAtomMapNumber.2:5.molAtomMapNumber.1|");
}
SECTION(
"AtomMap, Isotope and MDLRGroup in, Isotope out - MDLRGroup has priority") {
ROMol core(*allDifferentCore);
relabelMappedDummies(core, MDLRGroup, Isotope);
CHECK(MolToCXSmiles(core, p) == "c1cc([2*])c([1*])cn1");
}
}
// With includeTargetMolInResults set, every result row also contains the
// target molecule, and the core and each R group carry the indices of the
// target atoms and bonds they were built from (_rgroupTargetAtoms /
// _rgroupTargetBonds). The test checks that, per row, those index lists are
// disjoint and together cover every atom and bond of the target molecule.
TEST_CASE("includeTargetMolInResults") {
  auto core =
      "c1cc(-c2c([*:1])nn3nc([*:2])ccc23)nc(N(c2ccc([*:4])c([*:3])c2))n1"_smiles;
  REQUIRE(core);
  std::vector<ROMOL_SPTR> mols{
      "Cc1ccc2c(c3ccnc(Nc4cccc(c4)C(F)(F)F)n3)c(nn2n1)c5ccc(F)cc5"_smiles,
      "Cc1ccc2c(c3ccnc(Nc4ccc(F)c(F)c4)n3)c(nn2n1)c5ccc(F)cc5"_smiles,
      "Cc1ccc2c(c3ccnc(Nc4ccc5OCCOc5c4)n3)c(nn2n1)c6ccc(F)cc6"_smiles,
      "Cc1ccc2c(c3ccnc(Nc4ccc(Cl)c(c4)C(F)(F)F)n3)c(nn2n1)c5ccc(F)cc5"_smiles,
      "C1CC1c2nn3ncccc3c2c4ccnc(Nc5ccccc5)n4"_smiles,
      "Fc1ccc(Nc2nccc(n2)c3c(nn4ncccc34)C5CC5)cc1F"_smiles,
      "C1CCC(CC1)c2nn3ncccc3c2c4ccnc(Nc5ccccc5)n4"_smiles,
      "Fc1ccc(Nc2nccc(n2)c3c(nn4ncccc34)C5CCCCC5)cc1F"_smiles,
      "COCCOc1cnn2ncc(c3ccnc(Nc4cccc(OC)c4)n3)c2c1"_smiles,
      "Cc1ccc2c(c3ccnc(Nc4ccc(F)c(F)c4)n3)c(nn2n1)c5ccccc5"_smiles,
      "Cc1ccc2c(c3ccnc(Nc4ccc(Cl)c(c4)C(F)(F)F)n3)c(nn2n1)c5ccccc5"_smiles,
      "Cc1ccc2c(c3ccnc(Nc4ccc5OCCOc5c4)n3)c(nn2n1)c6ccccc6"_smiles,
      "Cc1ccc2c(c3ccnc(Nc4ccccc4)n3)c(nn2n1)c5cccc(c5)C(F)(F)F"_smiles,
      "Cc1ccc2c(c3ccnc(Nc4ccc(F)c(F)c4)n3)c(nn2n1)c5cccc(c5)C(F)(F)F"_smiles,
      "Cc1ccc2c(c3ccnc(Nc4ccc(Cl)c(c4)C(F)(F)F)n3)c(nn2n1)c5cccc(c5)C(F)(F)F"_smiles,
      "Cc1ccc2c(c3ccnc(Nc4ccc5OCCOc5c4)n3)c(nn2n1)c6cccc(c6)C(F)(F)F"_smiles,
  };
  // test the pointer itself instead of returning a copy of the shared_ptr
  bool areMolsNonNull =
      std::all_of(mols.begin(), mols.end(),
                  [](const auto &mol) { return static_cast<bool>(mol); });
  REQUIRE(areMolsNonNull);
  RGroupDecompositionParameters ps;
  ps.includeTargetMolInResults = true;
  RGroupDecomposition rgd(*core, ps);
  for (const auto &mol : mols) {
    CHECK(rgd.add(*mol) != -1);
  }
  REQUIRE(rgd.process());

  // Validates a single result row (label -> molecule).
  auto checkRow = [](const RGroupRow &row) {
    ROMOL_SPTR targetMol;
    // These are sets of int vectors rather just plain int vectors
    // because there can be cyclic R groups with 2 attachment points
    // in that case it is OK for 2 R groups to have exactly the same
    // target atom and bond indices
    std::set<std::vector<int>> allAtomIndices;
    std::set<std::vector<int>> allBondIndices;
    for (const auto &pair : row) {
      if (pair.first == RGroupData::getMolLabel()) {
        targetMol = pair.second;
      } else {
        // atoms that are not mapped dummies, i.e. that are either real atoms
        // or dummies without an atom map number
        auto atoms = pair.second->atoms();
        unsigned int numNonRAtoms =
            std::count_if(atoms.begin(), atoms.end(), [](const auto &atom) {
              return atom->getAtomicNum() > 0 || !atom->getAtomMapNum();
            });
        // every core / R group must contain at least one mapped dummy
        CHECK(pair.second->getNumAtoms() > numNonRAtoms);
        unsigned int numBonds = 0;
        if (pair.first == RGroupData::getCoreLabel()) {
          // for the core only bonds between two non-R atoms are counted
          auto bonds = pair.second->bonds();
          numBonds =
              std::count_if(bonds.begin(), bonds.end(), [](const auto &bond) {
                return (bond->getBeginAtom()->getAtomicNum() > 0 ||
                        !bond->getBeginAtom()->getAtomMapNum()) &&
                       (bond->getEndAtom()->getAtomicNum() > 0 ||
                        !bond->getEndAtom()->getAtomMapNum());
              });
        } else {
          numBonds = pair.second->getNumBonds();
        }
        std::vector<int> atomIndices;
        std::vector<int> bondIndices;
        CHECK(pair.second->getPropIfPresent(
            common_properties::_rgroupTargetAtoms, atomIndices));
        CHECK(pair.second->getPropIfPresent(
            common_properties::_rgroupTargetBonds, bondIndices));
        CHECK(atomIndices.size() == numNonRAtoms);
        allAtomIndices.insert(atomIndices);
        CHECK(bondIndices.size() == numBonds);
        allBondIndices.insert(bondIndices);
      }
    }
    REQUIRE(targetMol);

    // Only the number of collected indices matters, so count them directly
    // instead of materialising a flattened copy.
    size_t totalAtomIndices = 0;
    std::set<int> uniqueAtomIndices;
    for (const auto &indices : allAtomIndices) {
      totalAtomIndices += indices.size();
      uniqueAtomIndices.insert(indices.begin(), indices.end());
    }
    // no target atom is claimed twice, and all target atoms are covered
    CHECK(totalAtomIndices == uniqueAtomIndices.size());
    CHECK(totalAtomIndices == targetMol->getNumAtoms());

    size_t totalBondIndices = 0;
    std::set<int> uniqueBondIndices;
    for (const auto &indices : allBondIndices) {
      totalBondIndices += indices.size();
      uniqueBondIndices.insert(indices.begin(), indices.end());
    }
    // no target bond is claimed twice, and all target bonds are covered
    CHECK(totalBondIndices == uniqueBondIndices.size());
    CHECK(totalBondIndices == targetMol->getNumBonds());
  };

  SECTION("rows") {
    auto rows = rgd.getRGroupsAsRows();
    CHECK(rows.size() == mols.size());
    for (const auto &row : rows) {
      checkRow(row);
    }
  }
  SECTION("columns") {
    // rebuild one row per molecule from the column layout, then run the same
    // row validation
    auto cols = rgd.getRGroupsAsColumns();
    RGroupRows rows;
    rows.reserve(mols.size());
    for (size_t i = 0; i < mols.size(); ++i) {
      RGroupRow row;
      for (const auto &pair : cols) {
        CHECK(pair.second.size() == mols.size());
        row.emplace(pair.first, pair.second.at(i));
      }
      rows.push_back(std::move(row));
    }
    CHECK(rows.size() == mols.size());
    for (const auto &row : rows) {
      checkRow(row);
    }
  }
}