// // Copyright (C) 2021-2025 Greg Landrum and other RDKit contributors // // @@ All Rights Reserved @@ // This file is part of the RDKit. // The contents are covered by the terms of the BSD license // which is included in the file license.txt, found at the root // of the RDKit source tree. // #include #include #include #include #include #include #include #include #include #include #include #include #include using namespace RDKit; template void initDataset(T &suppl, ROMOL_SPTR &core, std::vector &mols) { core.reset(suppl[0]); REQUIRE(core); for (unsigned int i = 1; i < suppl.length(); ++i) { mols.emplace_back(suppl[i]); REQUIRE(mols.back()); } } std::string flatten_whitespace(const std::string &txt) { auto res = txt; boost::algorithm::trim_fill_if(res, "", boost::is_any_of(" \t\r\n")); return res; } std::string readReferenceData(const std::string &fname) { std::ifstream ins(fname); std::string res; ins.seekg(0, std::ios::end); res.reserve(ins.tellg()); ins.seekg(0, std::ios::beg); res.assign((std::istreambuf_iterator(ins)), std::istreambuf_iterator()); return res; } TEST_CASE("toJSONTests", "[unittests]") { std::string testDataDir = std::string(getenv("RDBASE")) + std::string("/Code/GraphMol/RGroupDecomposition/test_data/"); std::string fName = testDataDir + "simple1.sdf"; SDMolSupplier suppl(fName); std::vector cores(1); std::vector mols; initDataset(suppl, cores.front(), mols); SECTION("rows") { RGroupRows rows; auto n = RGroupDecompose(cores, mols, rows); CHECK(n == mols.size()); CHECK(rows.size() == mols.size()); std::string expected = R"JSON([ { "Core": "Cc1cccc([*:1])c1[*:2]", "R1": "[H][*:1]", "R2": "CO[*:2]" }, { "Core": "Cc1cccc([*:1])c1[*:2]", "R1": "[H][*:1]", "R2": "CO[*:2]" }, { "Core": "Cc1cccc([*:1])c1[*:2]", "R1": "CO[*:1]", "R2": "[H][*:2]" } ])JSON"; CHECK(flatten_whitespace(toJSON(rows)) == flatten_whitespace(expected)); } SECTION("columns") { RGroupColumns cols; auto n = RGroupDecompose(cores, mols, cols); CHECK(n == mols.size()); CHECK(cols.size() == mols.size()); std::string expected = R"JSON([ "Core": [ "Cc1cccc([*:1])c1[*:2]", "Cc1cccc([*:1])c1[*:2]", "Cc1cccc([*:1])c1[*:2]" ], "R1": [ "[H][*:1]", "[H][*:1]", "CO[*:1]" ], "R2": [ "CO[*:2]", "CO[*:2]", "[H][*:2]" ] ] )JSON"; CHECK(flatten_whitespace(toJSON(cols)) == flatten_whitespace(expected)); } } TEST_CASE("simple1") { std::string testDataDir = std::string(getenv("RDBASE")) + std::string("/Code/GraphMol/RGroupDecomposition/test_data/"); std::string fName = testDataDir + "simple1.sdf"; SDMolSupplier suppl(fName); std::vector cores(1); std::vector mols; initDataset(suppl, cores.front(), mols); SECTION("defaults") { RGroupRows rows; auto n = RGroupDecompose(cores, mols, rows); CHECK(n == mols.size()); CHECK(rows.size() == mols.size()); CHECK(flatten_whitespace(toJSON(rows)) == flatten_whitespace( readReferenceData(testDataDir + "simple1.out1.json"))); } SECTION("no symmetrization") { RGroupRows rows; RGroupDecompositionParameters ps; ps.matchingStrategy = RGroupMatching::NoSymmetrization; auto n = RGroupDecompose(cores, mols, rows, nullptr, ps); CHECK(n == mols.size()); CHECK(rows.size() == mols.size()); CHECK(flatten_whitespace(toJSON(rows)) == flatten_whitespace( readReferenceData(testDataDir + "simple1.out2.json"))); } } TEST_CASE("simple2 with specified R groups") { std::string testDataDir = std::string(getenv("RDBASE")) + std::string("/Code/GraphMol/RGroupDecomposition/test_data/"); std::string fName = testDataDir + "simple2.sdf"; SDMolSupplier suppl(fName); std::vector cores(1); std::vector mols; initDataset(suppl, cores.front(), mols); SECTION("defaults") { RGroupRows rows; auto n = RGroupDecompose(cores, mols, rows); CHECK(n == mols.size()); CHECK(rows.size() == mols.size()); CHECK(flatten_whitespace(toJSON(rows)) == flatten_whitespace( readReferenceData(testDataDir + "simple2.out1.json"))); } SECTION("only match at r groups") { RGroupRows rows; RGroupDecompositionParameters ps; ps.onlyMatchAtRGroups = true; std::vector unmatched; auto n = RGroupDecompose(cores, mols, rows, &unmatched, ps); CHECK(n == 2); CHECK(rows.size() == n); CHECK(unmatched.size() == mols.size() - n); CHECK(unmatched[0] == 2); CHECK(flatten_whitespace(toJSON(rows)) == flatten_whitespace( readReferenceData(testDataDir + "simple2.out2.json"))); } } TEST_CASE("simple3 with user labels on aromatic N") { std::string testDataDir = std::string(getenv("RDBASE")) + std::string("/Code/GraphMol/RGroupDecomposition/test_data/"); std::string fName = testDataDir + "simple3.sdf"; SDMolSupplier suppl(fName); std::vector cores(1); std::vector mols; initDataset(suppl, cores.front(), mols); SECTION("defaults (allH labels and R-groups are removed)") { RGroupRows rows; auto n = RGroupDecompose(cores, mols, rows); CHECK(n == mols.size()); CHECK(rows.size() == mols.size()); CHECK(flatten_whitespace(toJSON(rows)) == flatten_whitespace( readReferenceData(testDataDir + "simple3.out1.json"))); } SECTION("removeAllHydrogenRGroups = false (as defaults)") { RGroupRows rows; RGroupDecompositionParameters ps; ps.removeAllHydrogenRGroups = false; std::vector unmatched; auto n = RGroupDecompose(cores, mols, rows, &unmatched, ps); CHECK(n == mols.size()); CHECK(rows.size() == mols.size()); CHECK(unmatched.empty()); CHECK(flatten_whitespace(toJSON(rows)) == flatten_whitespace( readReferenceData(testDataDir + "simple3.out2.json"))); } SECTION("removeAllHydrogenRGroupsAndLabels = false (allH labels retained)") { RGroupRows rows; RGroupDecompositionParameters ps; ps.removeAllHydrogenRGroupsAndLabels = false; std::vector unmatched; auto n = RGroupDecompose(cores, mols, rows, &unmatched, ps); CHECK(n == mols.size()); CHECK(rows.size() == mols.size()); CHECK(unmatched.empty()); CHECK(flatten_whitespace(toJSON(rows)) == flatten_whitespace( readReferenceData(testDataDir + "simple3.out3.json"))); } SECTION( "removeAllHydrogenRGroupsAndLabels = false, removeAllHydrogenRGroups = " "false (allH labels and R-groups are retained)") { RGroupRows rows; RGroupDecompositionParameters ps; ps.removeAllHydrogenRGroups = false; ps.removeAllHydrogenRGroupsAndLabels = false; std::vector unmatched; auto n = RGroupDecompose(cores, mols, rows, &unmatched, ps); CHECK(n == mols.size()); CHECK(rows.size() == mols.size()); CHECK(unmatched.empty()); CHECK(flatten_whitespace(toJSON(rows)) == flatten_whitespace( readReferenceData(testDataDir + "simple3.out4.json"))); } } TEST_CASE("jm7b00306 Snippet") { std::string testDataDir = std::string(getenv("RDBASE")) + std::string("/Code/GraphMol/RGroupDecomposition/test_data/"); std::string fName = testDataDir + "jm7b00306.excerpt.sdf"; SDMolSupplier suppl(fName); std::vector cores(1); std::vector mols; initDataset(suppl, cores.front(), mols); SECTION("defaults") { RGroupRows rows; std::vector unmatched; auto n = RGroupDecompose(cores, mols, rows, &unmatched); CHECK(n == mols.size() - 1); CHECK(rows.size() == n); // there is one structure in there that doesn't match the core CHECK(unmatched.size() == mols.size() - n); CHECK(unmatched[0] == 1); CHECK(flatten_whitespace(toJSON(rows)) == flatten_whitespace( readReferenceData(testDataDir + "jm7b00306.excerpt.out1.json"))); } } TEST_CASE("jm200186n Snippet") { std::string testDataDir = std::string(getenv("RDBASE")) + std::string("/Code/GraphMol/RGroupDecomposition/test_data/"); std::string fName = testDataDir + "jm200186n.excerpt.sdf"; SDMolSupplier suppl(fName); std::vector cores(1); std::vector mols; initDataset(suppl, cores.front(), mols); SECTION("defaults") { RGroupRows rows; std::vector unmatched; auto n = RGroupDecompose(cores, mols, rows, &unmatched); CHECK(n == mols.size() - 1); CHECK(rows.size() == n); // there is one structure in there that doesn't match the core CHECK(unmatched.size() == mols.size() - n); CHECK(unmatched[0] == 3); CHECK(flatten_whitespace(toJSON(rows)) == flatten_whitespace( readReferenceData(testDataDir + "jm200186n.excerpt.out1.json"))); } } std::vector smisToMols(const std::vector &smis) { std::vector mols; for (const auto &smi : smis) { auto m = SmilesToMol(smi); assert(m); mols.emplace_back(m); } return mols; } TEST_CASE("substructure parameters and RGD: chirality") { std::vector smis = {"C1CN[C@H]1F", "C1CN[C@]1(O)F", "C1CN[C@@H]1F", "C1CN[CH]1F"}; auto mols = smisToMols(smis); std::vector csmis = {"C1CNC1[*:1]"}; auto cores = smisToMols(csmis); std::vector csmis2 = {"C1CN[C@H]1[*:1]"}; auto chiral_cores = smisToMols(csmis2); SECTION("defaults") { RGroupRows rows; std::vector unmatched; RGroupDecompositionParameters params; params.allowMultipleRGroupsOnUnlabelled = true; { auto n = RGroupDecompose(cores, mols, rows, &unmatched, params); CHECK(n == mols.size()); CHECK(rows.size() == n); CHECK(unmatched.empty()); CHECK(flatten_whitespace(toJSON(rows)) == flatten_whitespace(R"JSON( [ { "Core":"C1C[C@@]([*:1])([*:2])N1", "R1":"F[*:1]", "R2":"[H][*:2]" }, { "Core":"C1C[C@]([*:1])([*:2])N1", "R1":"F[*:1]", "R2":"O[*:2]" }, { "Core":"C1C[C@]([*:1])([*:2])N1", "R1":"F[*:1]", "R2":"[H][*:2]" }, { "Core":"C1CC([*:1])([*:2])N1", "R1":"F[*:1]", "R2":"[H][*:2]" } ] )JSON")); } { auto n = RGroupDecompose(chiral_cores, mols, rows, &unmatched, params); CHECK(n == mols.size() - 1); CHECK(rows.size() == n); CHECK(unmatched.size() == 1); CHECK(flatten_whitespace(toJSON(rows)) == flatten_whitespace(R"JSON( [ { "Core":"C1C[C@@]([*:1])([*:2])N1", "R1":"F[*:1]", "R2":"[H][*:2]" }, { "Core":"C1C[C@@]([*:1])([*:2])N1", "R1":"O[*:1]", "R2":"F[*:2]" }, { "Core":"C1C[C@@]([*:1])([*:2])N1", "R1":"[H][*:1]", "R2":"F[*:2]" } ] )JSON")); } } SECTION("not using chirality") { // this time both cores return the same thing and stereo information is // removed from the chiral cores RGroupRows rows; std::vector unmatched; RGroupDecompositionParameters params; params.allowMultipleRGroupsOnUnlabelled = true; params.substructmatchParams.useChirality = false; { auto n = RGroupDecompose(cores, mols, rows, &unmatched, params); CHECK(n == mols.size()); CHECK(rows.size() == n); CHECK(unmatched.empty()); CHECK(flatten_whitespace(toJSON(rows)) == flatten_whitespace(R"JSON( [ { "Core":"C1CC([*:1])([*:2])N1", "R1":"F[*:1]", "R2":"[H][*:2]" }, { "Core":"C1CC([*:1])([*:2])N1", "R1":"F[*:1]", "R2":"O[*:2]" }, { "Core":"C1CC([*:1])([*:2])N1", "R1":"F[*:1]", "R2":"[H][*:2]" }, { "Core":"C1CC([*:1])([*:2])N1", "R1":"F[*:1]", "R2":"[H][*:2]" } ] )JSON")); } { auto n = RGroupDecompose(chiral_cores, mols, rows, &unmatched, params); CHECK(n == mols.size()); CHECK(rows.size() == n); CHECK(unmatched.empty()); CHECK(flatten_whitespace(toJSON(rows)) == flatten_whitespace(R"JSON( [ { "Core":"C1CC([*:1])([*:2])N1", "R1":"F[*:1]", "R2":"[H][*:2]" }, { "Core":"C1CC([*:1])([*:2])N1", "R1":"F[*:1]", "R2":"O[*:2]" }, { "Core":"C1CC([*:1])([*:2])N1", "R1":"F[*:1]", "R2":"[H][*:2]" }, { "Core":"C1CC([*:1])([*:2])N1", "R1":"F[*:1]", "R2":"[H][*:2]" } ] )JSON")); } } } TEST_CASE("substructure parameters and RGD: enhanced stereo") { std::vector smis = {"F[C@H]1CCN1 |&1:1|", "C1CN[C@]1(O)F |&1:3|", "C1CN[C@@H]1F |&1:3|", "Cl[C@H]1CCN1 |o1:1|", "C1CN[CH]1F"}; auto mols = smisToMols(smis); std::vector csmis = {"C1CN[C@H]1[*:1] |&1:3|"}; auto cores = smisToMols(csmis); std::vector csmis2 = {"C1CN[C@H]1[*:1] |o1:3|"}; auto cores2 = smisToMols(csmis2); SECTION("defaults: no enhanced stereo") { RGroupRows rows; std::vector unmatched; RGroupDecompositionParameters params; params.allowMultipleRGroupsOnUnlabelled = true; { auto n = RGroupDecompose(cores, mols, rows, &unmatched, params); CHECK(n == mols.size() - 1); CHECK(rows.size() == n); CHECK(unmatched.size() == mols.size() - n); // std::cerr << toJSON(rows) << std::endl; // the core output no longer is SMARTS as the core output is the portion // of the target that matches the core query. CHECK(flatten_whitespace(toJSON(rows)) == flatten_whitespace(R"JSON( [ { "Core":"C1C[C@@]([*:1])([*:2])N1", "R1":"F[*:1]", "R2":"[H][*:2]" }, { "Core":"C1C[C@@]([*:1])([*:2])N1", "R1":"O[*:1]", "R2":"F[*:2]" }, { "Core":"C1C[C@@]([*:1])([*:2])N1", "R1":"[H][*:1]", "R2":"F[*:2]" }, { "Core":"C1C[C@@]([*:1])([*:2])N1", "R1":"Cl[*:1]", "R2":"[H][*:2]" } ] )JSON")); } { auto n = RGroupDecompose(cores2, mols, rows, &unmatched, params); CHECK(n == mols.size() - 1); CHECK(rows.size() == n); CHECK(unmatched.size() == 1); // std::cerr << toJSON(rows) << std::endl; // the core output no longer is SMARTS as the core output is the portion // of the target that matches the core query. CHECK(flatten_whitespace(toJSON(rows)) == flatten_whitespace(R"JSON( [ { "Core":"C1C[C@@]([*:1])([*:2])N1", "R1":"F[*:1]", "R2":"[H][*:2]" }, { "Core":"C1C[C@@]([*:1])([*:2])N1", "R1":"O[*:1]", "R2":"F[*:2]" }, { "Core":"C1C[C@@]([*:1])([*:2])N1", "R1":"[H][*:1]", "R2":"F[*:2]" }, { "Core":"C1C[C@@]([*:1])([*:2])N1", "R1":"Cl[*:1]", "R2":"[H][*:2]" } ] )JSON")); } } SECTION("using enhanced stereo") { RGroupRows rows; std::vector unmatched; RGroupDecompositionParameters params; params.allowMultipleRGroupsOnUnlabelled = true; params.substructmatchParams.useEnhancedStereo = true; { auto n = RGroupDecompose(cores, mols, rows, &unmatched, params); CHECK(n == mols.size() - 2); CHECK(rows.size() == n); CHECK(unmatched.size() == mols.size() - n); // std::cerr << toJSON(rows) << std::endl; // the core output no longer is SMARTS as the core output is the portion // of the target that matches the core query. CHECK(flatten_whitespace(toJSON(rows)) == flatten_whitespace(R"JSON( [ { "Core":"C1C[C@@]([*:1])([*:2])N1", "R1":"F[*:1]", "R2":"[H][*:2]" }, { "Core":"C1C[C@]([*:1])([*:2])N1", "R1":"F[*:1]", "R2":"O[*:2]" }, { "Core":"C1C[C@]([*:1])([*:2])N1", "R1":"F[*:1]", "R2":"[H][*:2]" } ] )JSON")); } { auto n = RGroupDecompose(cores2, mols, rows, &unmatched, params); CHECK(n == mols.size() - 1); CHECK(rows.size() == n); CHECK(unmatched.size() == 1); // std::cerr << toJSON(rows) << std::endl; // the core output no longer is SMARTS as the core output is the portion // of the target that matches the core query. CHECK(flatten_whitespace(toJSON(rows)) == flatten_whitespace(R"JSON( [ { "Core":"C1C[C@@]([*:1])([*:2])N1", "R1":"F[*:1]", "R2":"[H][*:2]" }, { "Core":"C1C[C@]([*:1])([*:2])N1", "R1":"F[*:1]", "R2":"O[*:2]" }, { "Core":"C1C[C@]([*:1])([*:2])N1", "R1":"F[*:1]", "R2":"[H][*:2]" }, { "Core":"C1C[C@@]([*:1])([*:2])N1", "R1":"Cl[*:1]", "R2":"[H][*:2]" } ] )JSON")); } } } TEST_CASE("github4809: ring double bonds written as crossed bonds after RGD") { std::vector smis = {"C1C=CCC2=C1C=CC=N2"}; auto mols = smisToMols(smis); std::vector csmis = {"c1ccnc([*:1])c1[*:2]"}; auto cores = smisToMols(csmis); SECTION("basics") { RGroupRows rows; { auto n = RGroupDecompose(cores, mols, rows); CHECK(n == mols.size()); CHECK(rows.size() == n); auto r1 = rows[0]["R1"]; auto mb = MolToV3KMolBlock(*r1); CHECK(mb.find("CFG=2") == std::string::npos); } } } TEST_CASE("rgroupLabelling") { std::vector smis = {"C1CN[C@H]1F", "C1CN[C@]1(O)F", "C1CN[C@@H]1F", "C1CN[CH]1F"}; auto mols = smisToMols(smis); std::vector csmis = {"C1CNC1[*:1]"}; auto cores = smisToMols(csmis); SECTION("Isotope") { RGroupRows rows; std::vector unmatched; RGroupDecompositionParameters params; params.rgroupLabelling = RGroupLabelling::Isotope; params.allowMultipleRGroupsOnUnlabelled = true; { auto n = RGroupDecompose(cores, mols, rows, &unmatched, params); CHECK(n == mols.size()); CHECK(rows.size() == n); CHECK(unmatched.empty()); CHECK(flatten_whitespace(toJSON(rows)) == flatten_whitespace(R"JSON( [ { "Core": "[1*][C@@]1([2*])CCN1", "R1":"[1*]F", "R2":"[2*][H]" }, { "Core": "[1*][C@]1([2*])CCN1", "R1":"[1*]F", "R2":"[2*]O" }, { "Core":"[1*][C@]1([2*])CCN1", "R1":"[1*]F", "R2":"[2*][H]" }, { "Core":"[1*]C1([2*])CCN1", "R1":"[1*]F", "R2":"[2*][H]" } ] )JSON")); } } SECTION("RGroup") { auto useLegacy = GENERATE(true, false); UseLegacyStereoPerceptionFixture fx(useLegacy); RGroupRows rows; std::vector unmatched; RGroupDecompositionParameters params; params.rgroupLabelling = RGroupLabelling::MDLRGroup; params.allowMultipleRGroupsOnUnlabelled = true; { auto n = RGroupDecompose(cores, mols, rows, &unmatched, params); CHECK(n == mols.size()); CHECK(rows.size() == n); CHECK(unmatched.empty()); if (useLegacy) { // in this case the labels don't show up in the output SMILES // Presumably the dummy atoms are no longer distinguishable without // the isotope labels as the smiles no longer contains chiralty. // Chirality is present in the core SMARTS CHECK(flatten_whitespace(toJSON(rows)) == flatten_whitespace(R"JSON( [ { "Core":"*C1(*)CCN1", "R1":"*F", "R2":"*[H]" }, { "Core":"*C1(*)CCN1", "R1":"*F", "R2":"*O" }, { "Core":"*C1(*)CCN1", "R1":"*F", "R2":"*[H]" }, { "Core":"*C1(*)CCN1", "R1":"*F", "R2":"*[H]" } ] )JSON")); } else { // in this case the labels don't show up in the output SMILES // Presumably the dummy atoms are no longer distinguishable without // the isotope labels as the smiles no longer contains chiralty. // Chirality is present in the core SMARTS CHECK(flatten_whitespace(toJSON(rows)) == flatten_whitespace(R"JSON( [ { "Core":"*[C@@]1(*)CCN1", "R1":"*F", "R2":"*[H]" }, { "Core":"*[C@@]1(*)CCN1", "R1":"*F", "R2":"*O" }, { "Core":"*[C@]1(*)CCN1", "R1":"*F", "R2":"*[H]" }, { "Core":"*C1(*)CCN1", "R1":"*F", "R2":"*[H]" } ] )JSON")); } } } SECTION("Isotope|Map") { RGroupRows rows; std::vector unmatched; RGroupDecompositionParameters params; params.allowMultipleRGroupsOnUnlabelled = true; params.rgroupLabelling = RGroupLabelling::Isotope | RGroupLabelling::AtomMap; { auto n = RGroupDecompose(cores, mols, rows, &unmatched, params); CHECK(n == mols.size()); CHECK(rows.size() == n); CHECK(unmatched.empty()); CHECK(flatten_whitespace(toJSON(rows)) == flatten_whitespace(R"JSON( [ { "Core":"C1C[C@@]([1*:1])([2*:2])N1", "R1":"F[1*:1]", "R2":"[H][2*:2]" }, { "Core":"C1C[C@]([1*:1])([2*:2])N1", "R1":"F[1*:1]", "R2":"O[2*:2]" }, { "Core":"C1C[C@]([1*:1])([2*:2])N1", "R1":"F[1*:1]", "R2":"[H][2*:2]" }, { "Core":"C1CC([1*:1])([2*:2])N1", "R1":"F[1*:1]", "R2":"[H][2*:2]" } ] )JSON")); } } } TEST_CASE("MDL R labels from original core") { std::vector smis = {"C1CN[C@H]1F", "C1CN[C@]1(O)F", "C1CN[C@@H]1F", "C1CN[CH]1F"}; auto mols = smisToMols(smis); std::vector csmis = {"[*]C1CCN1 |$_R1;;;;$|"}; auto cores = smisToMols(csmis); SECTION("Map") { RGroupRows rows; std::vector unmatched; RGroupDecompositionParameters params; params.allowMultipleRGroupsOnUnlabelled = true; params.rgroupLabelling = RGroupLabelling::AtomMap; { auto n = RGroupDecompose(cores, mols, rows, &unmatched, params); CHECK(n == mols.size()); CHECK(rows.size() == n); CHECK(unmatched.empty()); CHECK(rows[0]["Core"]->getAtomWithIdx(4)->getAtomicNum() == 0); CHECK(!rows[0]["Core"]->getAtomWithIdx(4)->hasProp( common_properties::dummyLabel)); CHECK(rows[0]["Core"]->getAtomWithIdx(5)->getAtomicNum() == 0); CHECK(!rows[0]["Core"]->getAtomWithIdx(5)->hasProp( common_properties::dummyLabel)); } } SECTION("Map | MDL") { RGroupRows rows; std::vector unmatched; RGroupDecompositionParameters params; params.allowMultipleRGroupsOnUnlabelled = true; params.rgroupLabelling = RGroupLabelling::AtomMap | RGroupLabelling::MDLRGroup; { auto n = RGroupDecompose(cores, mols, rows, &unmatched, params); CHECK(n == mols.size()); CHECK(rows.size() == n); CHECK(unmatched.empty()); CHECK(rows[0]["Core"]->getAtomWithIdx(4)->getAtomicNum() == 0); CHECK(rows[0]["Core"]->getAtomWithIdx(4)->hasProp( common_properties::dummyLabel)); CHECK(rows[0]["Core"]->getAtomWithIdx(5)->getAtomicNum() == 0); CHECK(rows[0]["Core"]->getAtomWithIdx(5)->hasProp( common_properties::dummyLabel)); } } } TEST_CASE("Mol matches core") { auto core = "[*:1]c1[!#1]([*:2])cc([*:3])n([*:4])c(=O)1"_smarts; auto cmol = "Clc1c(C)cc(F)n(CC)c(=O)1"_smiles; auto nmol = "Clc1ncc(F)n(CC)c(=O)1"_smiles; auto smol = "Clc1ncc(F)n(CC)c(=S)1"_smiles; RGroupDecompositionParameters params; params.onlyMatchAtRGroups = true; RGroupDecomposition decomp(*core, params); CHECK(decomp.getMatchingCoreIdx(*cmol) == 0); CHECK(decomp.getMatchingCoreIdx(*nmol) == 0); CHECK(decomp.getMatchingCoreIdx(*smol) == -1); std::vector matches; CHECK(decomp.getMatchingCoreIdx(*cmol, &matches) == 0); CHECK(matches.size() == 1); CHECK(matches.front().size() == core->getNumAtoms()); CHECK(decomp.getMatchingCoreIdx(*nmol, &matches) == 0); CHECK(matches.size() == 1); CHECK(matches.front().size() == core->getNumAtoms() - 1); CHECK(decomp.getMatchingCoreIdx(*smol, &matches) == -1); CHECK(matches.empty()); MolOps::addHs(*cmol); MolOps::addHs(*nmol); MatchVectType match; CHECK(SubstructMatch(*cmol, *core, match)); CHECK(match.size() == core->getNumAtoms()); match.clear(); CHECK(!SubstructMatch(*nmol, *core, match)); } TEST_CASE("relabelMappedDummies") { SmilesWriteParams p; p.canonical = false; auto allDifferentCore = R"CTAB( RDKit 2D 8 8 0 0 0 0 0 0 0 0999 V2000 1.0808 -0.8772 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 1.0827 0.1228 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 0.2177 0.6246 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 0.2198 1.6246 0.0000 R# 0 0 0 0 0 15 0 0 0 4 0 0 -0.6493 0.1262 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 -1.5142 0.6280 0.0000 R# 0 0 0 0 0 15 0 0 0 3 0 0 -0.6513 -0.8736 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 0.2137 -1.3754 0.0000 N 0 0 0 0 0 0 0 0 0 0 0 0 1 2 2 0 2 3 1 0 3 4 1 0 3 5 2 0 5 6 1 0 5 7 1 0 7 8 2 0 8 1 1 0 M RGP 2 4 2 6 1 M END )CTAB"_ctab; allDifferentCore->removeConformer(0); allDifferentCore->getAtomWithIdx(3)->setIsotope(6); allDifferentCore->getAtomWithIdx(5)->setIsotope(5); CHECK(MolToCXSmiles(*allDifferentCore, p) == "c1cc([6*:4])c([5*:3])cn1 |atomProp:3.dummyLabel.R2:5.dummyLabel.R1|"); SECTION("AtomMap in, MDLRGroup out") { auto core = "c1cc([*:2])c([*:1])cn1"_smiles; CHECK(MolToCXSmiles(*core, p) == "c1cc([*:2])c([*:1])cn1"); relabelMappedDummies(*core); CHECK(MolToCXSmiles(*core, p) == "c1cc(*)c(*)cn1 |atomProp:3.dummyLabel.R2:5.dummyLabel.R1|"); } SECTION("Isotope in, MDLRGroup out") { auto core = "c1cc([2*])c([1*])cn1"_smiles; CHECK(MolToCXSmiles(*core, p) == "c1cc([2*])c([1*])cn1"); relabelMappedDummies(*core); CHECK(MolToCXSmiles(*core, p) == "c1cc(*)c(*)cn1 |atomProp:3.dummyLabel.R2:5.dummyLabel.R1|"); } SECTION("MDLRGroup in, MDLRGroup out") { auto core = R"CTAB( RDKit 2D 8 8 0 0 0 0 0 0 0 0999 V2000 1.0808 -0.8772 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 1.0827 0.1228 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 0.2177 0.6246 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 0.2198 1.6246 0.0000 R# 0 0 0 0 0 1 0 0 0 0 0 0 -0.6493 0.1262 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 -1.5142 0.6280 0.0000 R# 0 0 0 0 0 1 0 0 0 0 0 0 -0.6513 -0.8736 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 0.2137 -1.3754 0.0000 N 0 0 0 0 0 0 0 0 0 0 0 0 1 2 2 0 2 3 1 0 3 4 1 0 3 5 2 0 5 6 1 0 5 7 1 0 7 8 2 0 8 1 1 0 M RGP 2 4 2 6 1 M END )CTAB"_ctab; core->removeConformer(0); CHECK(MolToCXSmiles(*core, p) == "c1cc([2*])c([1*])cn1 |atomProp:3.dummyLabel.R2:5.dummyLabel.R1|"); relabelMappedDummies(*core); CHECK(MolToCXSmiles(*core, p) == "c1cc(*)c(*)cn1 |atomProp:3.dummyLabel.R2:5.dummyLabel.R1|"); } SECTION("AtomMap and Isotope in, MDLRGroup out - AtomMap has priority") { auto core = "c1cc([4*:2])c([3*:1])cn1"_smiles; CHECK(MolToCXSmiles(*core, p) == "c1cc([4*:2])c([3*:1])cn1"); relabelMappedDummies(*core); CHECK(MolToCXSmiles(*core, p) == "c1cc(*)c(*)cn1 |atomProp:3.dummyLabel.R2:5.dummyLabel.R1|"); } SECTION("AtomMap and Isotope in, MDLRGroup out - force Isotope priority") { auto core = "c1cc([4*:2])c([3*:1])cn1"_smiles; CHECK(MolToCXSmiles(*core, p) == "c1cc([4*:2])c([3*:1])cn1"); relabelMappedDummies(*core, Isotope); CHECK(MolToCXSmiles(*core, p) == "c1cc(*)c(*)cn1 |atomProp:3.dummyLabel.R4:5.dummyLabel.R3|"); } SECTION( "AtomMap, Isotope and MDLRGroup in, MDLRGroup out - AtomMap has priority") { ROMol core(*allDifferentCore); relabelMappedDummies(core); CHECK(MolToCXSmiles(core, p) == "c1cc(*)c(*)cn1 |atomProp:3.dummyLabel.R4:5.dummyLabel.R3|"); } SECTION( "AtomMap, Isotope and MDLRGroup in, MDLRGroup out - force Isotope priority") { ROMol core(*allDifferentCore); relabelMappedDummies(core, Isotope); CHECK(MolToCXSmiles(core, p) == "c1cc(*)c(*)cn1 |atomProp:3.dummyLabel.R6:5.dummyLabel.R5|"); } SECTION( "AtomMap, Isotope and MDLRGroup in, MDLRGroup out - force MDLRGroup priority") { ROMol core(*allDifferentCore); relabelMappedDummies(core, MDLRGroup); CHECK(MolToCXSmiles(core, p) == "c1cc(*)c(*)cn1 |atomProp:3.dummyLabel.R2:5.dummyLabel.R1|"); } SECTION( "AtomMap, Isotope and MDLRGroup in, AtomMap out - AtomMap has priority") { ROMol core(*allDifferentCore); relabelMappedDummies(core, AtomMap | Isotope | MDLRGroup, AtomMap); CHECK(MolToCXSmiles(core, p) == "c1cc([*:4])c([*:3])cn1"); } SECTION( "AtomMap, Isotope and MDLRGroup in, Isotope out - AtomMap has priority") { ROMol core(*allDifferentCore); relabelMappedDummies(core, AtomMap | Isotope | MDLRGroup, Isotope); CHECK(MolToCXSmiles(core, p) == "c1cc([4*])c([3*])cn1"); } SECTION( "AtomMap, Isotope and MDLRGroup in, AtomMap out - Isotope has priority") { ROMol core(*allDifferentCore); relabelMappedDummies(core, Isotope | MDLRGroup, AtomMap); CHECK(MolToCXSmiles(core, p) == "c1cc([*:6])c([*:5])cn1"); } SECTION( "AtomMap, Isotope and MDLRGroup in, Isotope out - Isotope has priority") { ROMol core(*allDifferentCore); relabelMappedDummies(core, Isotope | MDLRGroup, Isotope); CHECK(MolToCXSmiles(core, p) == "c1cc([6*])c([5*])cn1"); } SECTION( "AtomMap, Isotope and MDLRGroup in, AtomMap out - MDLRGroup has priority") { ROMol core(*allDifferentCore); relabelMappedDummies(core, MDLRGroup, AtomMap); CHECK(MolToCXSmiles(core, p) == "c1cc([*:2])c([*:1])cn1"); } SECTION( "AtomMap, Isotope and MDLRGroup in, Isotope out - MDLRGroup has priority") { ROMol core(*allDifferentCore); relabelMappedDummies(core, MDLRGroup, Isotope); CHECK(MolToCXSmiles(core, p) == "c1cc([2*])c([1*])cn1"); } } TEST_CASE("includeTargetMolInResults") { auto core = "c1cc(-c2c([*:1])nn3nc([*:2])ccc23)nc(N(c2ccc([*:4])c([*:3])c2))n1"_smiles; REQUIRE(core); std::vector mols{ "Cc1ccc2c(c3ccnc(Nc4cccc(c4)C(F)(F)F)n3)c(nn2n1)c5ccc(F)cc5"_smiles, "Cc1ccc2c(c3ccnc(Nc4ccc(F)c(F)c4)n3)c(nn2n1)c5ccc(F)cc5"_smiles, "Cc1ccc2c(c3ccnc(Nc4ccc5OCCOc5c4)n3)c(nn2n1)c6ccc(F)cc6"_smiles, "Cc1ccc2c(c3ccnc(Nc4ccc(Cl)c(c4)C(F)(F)F)n3)c(nn2n1)c5ccc(F)cc5"_smiles, "C1CC1c2nn3ncccc3c2c4ccnc(Nc5ccccc5)n4"_smiles, "Fc1ccc(Nc2nccc(n2)c3c(nn4ncccc34)C5CC5)cc1F"_smiles, "C1CCC(CC1)c2nn3ncccc3c2c4ccnc(Nc5ccccc5)n4"_smiles, "Fc1ccc(Nc2nccc(n2)c3c(nn4ncccc34)C5CCCCC5)cc1F"_smiles, "COCCOc1cnn2ncc(c3ccnc(Nc4cccc(OC)c4)n3)c2c1"_smiles, "Cc1ccc2c(c3ccnc(Nc4ccc(F)c(F)c4)n3)c(nn2n1)c5ccccc5"_smiles, "Cc1ccc2c(c3ccnc(Nc4ccc(Cl)c(c4)C(F)(F)F)n3)c(nn2n1)c5ccccc5"_smiles, "Cc1ccc2c(c3ccnc(Nc4ccc5OCCOc5c4)n3)c(nn2n1)c6ccccc6"_smiles, "Cc1ccc2c(c3ccnc(Nc4ccccc4)n3)c(nn2n1)c5cccc(c5)C(F)(F)F"_smiles, "Cc1ccc2c(c3ccnc(Nc4ccc(F)c(F)c4)n3)c(nn2n1)c5cccc(c5)C(F)(F)F"_smiles, "Cc1ccc2c(c3ccnc(Nc4ccc(Cl)c(c4)C(F)(F)F)n3)c(nn2n1)c5cccc(c5)C(F)(F)F"_smiles, "Cc1ccc2c(c3ccnc(Nc4ccc5OCCOc5c4)n3)c(nn2n1)c6cccc(c6)C(F)(F)F"_smiles, }; bool areMolsNonNull = std::all_of(mols.begin(), mols.end(), [](const auto &mol) { return mol; }); REQUIRE(areMolsNonNull); RGroupDecompositionParameters ps; ps.includeTargetMolInResults = true; RGroupDecomposition rgd(*core, ps); for (const auto &mol : mols) { CHECK(rgd.add(*mol) != -1); } REQUIRE(rgd.process()); auto checkRow = [](const RGroupRow &row) { ROMOL_SPTR targetMol; // These are sets of int vectors rather just plain int vectors // because there can be cyclic R groups with 2 attachment points // in that case it is OK for 2 R groups to have exactly the same // target atom and bond indices std::set> allAtomIndices; std::set> allBondIndices; for (const auto &pair : row) { if (pair.first == RGroupData::getMolLabel()) { targetMol = pair.second; } else { auto atoms = pair.second->atoms(); unsigned int numNonRAtoms = std::count_if(atoms.begin(), atoms.end(), [](const auto &atom) { return atom->getAtomicNum() > 0 || !atom->getAtomMapNum(); }); CHECK(pair.second->getNumAtoms() > numNonRAtoms); unsigned int numBonds = 0; if (pair.first == RGroupData::getCoreLabel()) { auto bonds = pair.second->bonds(); numBonds = std::count_if(bonds.begin(), bonds.end(), [](const auto &bond) { return (bond->getBeginAtom()->getAtomicNum() > 0 || !bond->getBeginAtom()->getAtomMapNum()) && (bond->getEndAtom()->getAtomicNum() > 0 || !bond->getEndAtom()->getAtomMapNum()); }); } else { numBonds = pair.second->getNumBonds(); } std::vector atomIndices; std::vector bondIndices; CHECK(pair.second->getPropIfPresent( common_properties::_rgroupTargetAtoms, atomIndices)); CHECK(pair.second->getPropIfPresent( common_properties::_rgroupTargetBonds, bondIndices)); CHECK(atomIndices.size() == numNonRAtoms); allAtomIndices.insert(atomIndices); CHECK(bondIndices.size() == numBonds); allBondIndices.insert(bondIndices); } } REQUIRE(targetMol); auto flattenedAtomIndices = std::accumulate( allAtomIndices.begin(), allAtomIndices.end(), std::vector{}, [](std::vector acc, const std::vector &v) { acc.insert(acc.end(), std::make_move_iterator(v.begin()), std::make_move_iterator(v.end())); return acc; }); auto uniqueAtomIndices = std::accumulate( allAtomIndices.begin(), allAtomIndices.end(), std::set{}, [](std::set acc, const std::vector &v) { acc.insert(std::make_move_iterator(v.begin()), std::make_move_iterator(v.end())); return acc; }); CHECK(flattenedAtomIndices.size() == uniqueAtomIndices.size()); CHECK(flattenedAtomIndices.size() == targetMol->getNumAtoms()); auto flattenedBondIndices = std::accumulate( allBondIndices.begin(), allBondIndices.end(), std::vector{}, [](std::vector acc, const std::vector &v) { acc.insert(acc.end(), std::make_move_iterator(v.begin()), std::make_move_iterator(v.end())); return acc; }); auto uniqueBondIndices = std::accumulate( allBondIndices.begin(), allBondIndices.end(), std::set{}, [](std::set acc, const std::vector &v) { acc.insert(std::make_move_iterator(v.begin()), std::make_move_iterator(v.end())); return acc; }); CHECK(flattenedBondIndices.size() == uniqueBondIndices.size()); CHECK(flattenedBondIndices.size() == targetMol->getNumBonds()); }; SECTION("rows") { auto rows = rgd.getRGroupsAsRows(); CHECK(rows.size() == mols.size()); for (const auto &row : rows) { checkRow(row); } } SECTION("columns") { auto cols = rgd.getRGroupsAsColumns(); RGroupRows rows; rows.reserve(mols.size()); for (size_t i = 0; i < mols.size(); ++i) { RGroupRow row; for (const auto &pair : cols) { CHECK(pair.second.size() == mols.size()); row.emplace(pair.first, pair.second.at(i)); } rows.push_back(std::move(row)); } CHECK(rows.size() == mols.size()); for (const auto &row : rows) { checkRow(row); } } } TEST_CASE("Multiple Core Hits") { { std::vector cores{ "c1([*:9])c([*:8])c([*:7])c2c(c1([*:10]))c(c([*:5])n2([*:6]))[CH2]C([*:3])([*:4])[N,n]([*:1])([*:2])"_smarts}; REQUIRE(cores.front()); std::vector mols{ "CC1(C)N2[C@@H](Cc3c1[nH]c4ccccc34)C(=O)N(CCc5c[nH]c6ccccc56)CC2=O"_smiles}; REQUIRE(mols.front()); RGroupRows rows; RGroupDecompositionParameters ps; ps.allowMultipleCoresInSameMol = true; auto n = RGroupDecompose(cores, mols, rows, nullptr, ps); CHECK(n == 1); CHECK(flatten_whitespace(toJSON(rows)) == flatten_whitespace(R"JSON( [ { "Core":"c1ccc2c(C[C@H](N([*:1])[*:2])[*:3])c([*:5])[nH]c2c1", "R1":"O=C(CN(CCc1c[nH]c2ccccc12)C(=O)[*:3])[*:1]", "R2":"CC(C)([*:2])[*:5]", "R3":"O=C(CN(CCc1c[nH]c2ccccc12)C(=O)[*:3])[*:1]", "R5":"CC(C)([*:2])[*:5]" }, { "Core":"c1ccc2c(CC(N([*:1])[*:2])[*:3])c([*:5])[nH]c2c1", "R1":"CC1(C)c2[nH]c3ccccc3c2C[C@@H](C(=O)[*:2])N1C(=O)C[*:1]", "R2":"CC1(C)c2[nH]c3ccccc3c2C[C@@H](C(=O)[*:2])N1C(=O)C[*:1]", "R3":"[H][*:3]", "R5":"[H][*:5]" } ])JSON")); } { std::vector cores{"c1ccccc1"_smarts}; std::vector mols{"Fc1ccccc1Nc2ccc(Cl)cc2"_smiles, "c1cc(O)cc(Oc2cccc(Br)c2)c1"_smiles, "Ic1ccccc1"_smiles}; RGroupRows rows; RGroupDecompositionParameters ps; ps.allowMultipleCoresInSameMol = true; auto n = RGroupDecompose(cores, mols, rows, nullptr, ps); CHECK(n == 3); CHECK(flatten_whitespace(toJSON(rows)) == flatten_whitespace(R"JSON( [ { "Core":"c1cc([*:4])c([*:3])c([*:2])c1[*:1]", "R1":"[H][*:1]", "R2":"[H][*:2]", "R3":"F[*:3]", "R4":"Clc1ccc(N[*:4])cc1" }, { "Core":"c1cc([*:4])c([*:3])c([*:2])c1[*:1]", "R1":"Fc1ccccc1N[*:1]", "R2":"[H][*:2]", "R3":"[H][*:3]", "R4":"Cl[*:4]" }, { "Core":"c1cc([*:4])c([*:3])c([*:2])c1[*:1]", "R1":"[H][*:1]", "R2":"O[*:2]", "R3":"[H][*:3]", "R4":"Brc1cccc(O[*:4])c1" }, { "Core":"c1cc([*:4])c([*:3])c([*:2])c1[*:1]", "R1":"[H][*:1]", "R2":"Oc1cccc(O[*:2])c1", "R3":"[H][*:3]", "R4":"Br[*:4]" }, { "Core":"c1cc([*:4])c([*:3])c([*:2])c1[*:1]", "R1":"[H][*:1]", "R2":"[H][*:2]", "R3":"[H][*:3]", "R4":"I[*:4]" } ])JSON")); } }