rdkit/Code/GraphMol/RGroupDecomposition/catch_rgd.cpp

//
//  Copyright (C) 2021 Greg Landrum and other RDKit contributors
//
//   @@ All Rights Reserved @@
//  This file is part of the RDKit.
//  The contents are covered by the terms of the BSD license
//  which is included in the file license.txt, found at the root
//  of the RDKit source tree.
//

#include <fstream>
#include <iterator>
#include <stdexcept>
#include <string>

#include <catch2/catch_all.hpp>

#include <GraphMol/RDKitBase.h>

#include <GraphMol/FileParsers/FileParsers.h>
#include <GraphMol/FileParsers/MolSupplier.h>
#include <GraphMol/SmilesParse/SmilesParse.h>
#include <GraphMol/SmilesParse/SmilesWrite.h>
#include <GraphMol/RGroupDecomposition/RGroupDecomp.h>
#include <GraphMol/RGroupDecomposition/RGroupUtils.h>
#include <GraphMol/RGroupDecomposition/RGroupData.h>

#include <boost/format.hpp>
#include <boost/algorithm/string.hpp>
#include <boost/algorithm/string/trim_all.hpp>

using namespace RDKit;

// Splits a supplier into a core and a list of molecules: entry 0 becomes
// `core`, every later entry is appended to `mols`. Each entry is REQUIREd to be
// non-null, so the calling test aborts as soon as one record fails to load.
template <typename T>
void initDataset(T &suppl, ROMOL_SPTR &core, std::vector<ROMOL_SPTR> &mols) {
  core.reset(suppl[0]);
  REQUIRE(core);
  unsigned int molIdx = 1;
  while (molIdx < suppl.length()) {
    mols.emplace_back(suppl[molIdx]);
    REQUIRE(mols.back());
    ++molIdx;
  }
}

// Returns a copy of `txt` normalised for whitespace-insensitive comparison.
// Leading and trailing characters from " \t\r\n" are trimmed and every inner
// run of them is replaced by the empty string (boost trim_fill_if).
std::string flatten_whitespace(const std::string &txt) {
  std::string flattened(txt);
  const auto isWhitespace = boost::is_any_of(" \t\r\n");
  boost::algorithm::trim_fill_if(flattened, "", isWhitespace);
  return flattened;
}

// Reads the whole of the file `fname` into a string and returns it.
//
// Throws std::runtime_error (naming the file) if the file cannot be opened.
// Without that check a missing reference file makes tellg() report -1, which
// would be handed to reserve() and surface as an unrelated std::length_error.
std::string readReferenceData(const std::string &fname) {
  std::ifstream ins(fname);
  if (!ins) {
    throw std::runtime_error("could not open reference data file: " + fname);
  }
  std::string res;
  // use the file size purely as a capacity hint; skip it if the stream cannot
  // report a position
  ins.seekg(0, std::ios::end);
  const std::streamoff fileSize = ins.tellg();
  if (fileSize > 0) {
    res.reserve(static_cast<std::string::size_type>(fileSize));
  }
  ins.seekg(0, std::ios::beg);
  res.assign((std::istreambuf_iterator<char>(ins)),
             std::istreambuf_iterator<char>());
  return res;
}
// Checks the JSON serialisation of a decomposition of simple1.sdf, both in the
// row-oriented (one object per molecule) and the column-oriented (one array
// per label) form.
TEST_CASE("toJSONTests", "[unittests]") {
  const char *rdbase = getenv("RDBASE");
  // building a std::string from a null pointer is undefined behaviour, so stop
  // with a clear failure when the environment is not configured
  REQUIRE(rdbase != nullptr);
  std::string testDataDir =
      std::string(rdbase) +
      std::string("/Code/GraphMol/RGroupDecomposition/test_data/");
  std::string fName = testDataDir + "simple1.sdf";
  SDMolSupplier suppl(fName);
  std::vector<ROMOL_SPTR> cores(1);
  std::vector<ROMOL_SPTR> mols;
  initDataset(suppl, cores.front(), mols);
  SECTION("rows") {
    RGroupRows rows;
    auto n = RGroupDecompose(cores, mols, rows);
    CHECK(n == mols.size());
    CHECK(rows.size() == mols.size());
    std::string expected = R"JSON([
    {
        "Core": "Cc1cccc([*:1])c1[*:2]",
        "R1": "[H][*:1]",
        "R2": "CO[*:2]"
    },
    {
        "Core": "Cc1cccc([*:1])c1[*:2]",
        "R1": "[H][*:1]",
        "R2": "CO[*:2]"
    },
    {
        "Core": "Cc1cccc([*:1])c1[*:2]",
        "R1": "CO[*:1]",
        "R2": "[H][*:2]"
    }
])JSON";
    CHECK(flatten_whitespace(toJSON(rows)) == flatten_whitespace(expected));
  }
  SECTION("columns") {
    RGroupColumns cols;
    auto n = RGroupDecompose(cores, mols, cols);
    CHECK(n == mols.size());
    // cols is keyed by label, so cols.size() is the number of columns (Core,
    // R1 and R2 in the expected output below), not the number of molecules;
    // the per-molecule count is the length of each individual column
    CHECK(cols.size() == 3u);
    for (const auto &col : cols) {
      INFO("column: " << col.first);
      CHECK(col.second.size() == mols.size());
    }
    std::string expected = R"JSON([
  "Core": [
    "Cc1cccc([*:1])c1[*:2]",
    "Cc1cccc([*:1])c1[*:2]",
    "Cc1cccc([*:1])c1[*:2]"
  ],
  "R1": [
    "[H][*:1]",
    "[H][*:1]",
    "CO[*:1]"
  ],
  "R2": [
    "CO[*:2]",
    "CO[*:2]",
    "[H][*:2]"
  ]
]
)JSON";
    CHECK(flatten_whitespace(toJSON(cols)) == flatten_whitespace(expected));
  }
}
// Decomposes simple1.sdf (first record is the core, the rest are the
// molecules) and compares the row output with the stored reference JSON, with
// the default parameters and with symmetrization switched off.
TEST_CASE("simple1") {
  const char *rdbase = getenv("RDBASE");
  // building a std::string from a null pointer is undefined behaviour, so stop
  // with a clear failure when the environment is not configured
  REQUIRE(rdbase != nullptr);
  std::string testDataDir =
      std::string(rdbase) +
      std::string("/Code/GraphMol/RGroupDecomposition/test_data/");
  std::string fName = testDataDir + "simple1.sdf";
  SDMolSupplier suppl(fName);
  std::vector<ROMOL_SPTR> cores(1);
  std::vector<ROMOL_SPTR> mols;
  initDataset(suppl, cores.front(), mols);
  SECTION("defaults") {
    RGroupRows rows;
    auto n = RGroupDecompose(cores, mols, rows);
    CHECK(n == mols.size());
    CHECK(rows.size() == mols.size());
    CHECK(flatten_whitespace(toJSON(rows)) ==
          flatten_whitespace(
              readReferenceData(testDataDir + "simple1.out1.json")));
  }
  SECTION("no symmetrization") {
    RGroupRows rows;
    RGroupDecompositionParameters ps;
    ps.matchingStrategy = RGroupMatching::NoSymmetrization;
    auto n = RGroupDecompose(cores, mols, rows, nullptr, ps);
    CHECK(n == mols.size());
    CHECK(rows.size() == mols.size());
    CHECK(flatten_whitespace(toJSON(rows)) ==
          flatten_whitespace(
              readReferenceData(testDataDir + "simple1.out2.json")));
  }
}

// Decomposes simple2.sdf and compares the row output with the stored reference
// JSON, with the default parameters and with onlyMatchAtRGroups enabled (where
// only two molecules are expected to match).
TEST_CASE("simple2 with specified R groups") {
  const char *rdbase = getenv("RDBASE");
  // building a std::string from a null pointer is undefined behaviour, so stop
  // with a clear failure when the environment is not configured
  REQUIRE(rdbase != nullptr);
  std::string testDataDir =
      std::string(rdbase) +
      std::string("/Code/GraphMol/RGroupDecomposition/test_data/");
  std::string fName = testDataDir + "simple2.sdf";
  SDMolSupplier suppl(fName);
  std::vector<ROMOL_SPTR> cores(1);
  std::vector<ROMOL_SPTR> mols;
  initDataset(suppl, cores.front(), mols);
  SECTION("defaults") {
    RGroupRows rows;
    auto n = RGroupDecompose(cores, mols, rows);
    CHECK(n == mols.size());
    CHECK(rows.size() == mols.size());
    CHECK(flatten_whitespace(toJSON(rows)) ==
          flatten_whitespace(
              readReferenceData(testDataDir + "simple2.out1.json")));
  }
  SECTION("only match at r groups") {
    RGroupRows rows;
    RGroupDecompositionParameters ps;
    ps.onlyMatchAtRGroups = true;
    std::vector<unsigned> unmatched;
    auto n = RGroupDecompose(cores, mols, rows, &unmatched, ps);
    CHECK(n == 2);
    CHECK(rows.size() == n);
    CHECK(unmatched.size() == mols.size() - n);
    // CHECK does not stop the test, so make sure there is an element before
    // indexing into the vector
    REQUIRE(!unmatched.empty());
    CHECK(unmatched[0] == 2);
    CHECK(flatten_whitespace(toJSON(rows)) ==
          flatten_whitespace(
              readReferenceData(testDataDir + "simple2.out2.json")));
  }
}

// Decomposes simple3.sdf under the four combinations of
// removeAllHydrogenRGroups / removeAllHydrogenRGroupsAndLabels exercised below
// and compares each result with its stored reference JSON.
TEST_CASE("simple3 with user labels on aromatic N") {
  const char *rdbase = getenv("RDBASE");
  // building a std::string from a null pointer is undefined behaviour, so stop
  // with a clear failure when the environment is not configured
  REQUIRE(rdbase != nullptr);
  std::string testDataDir =
      std::string(rdbase) +
      std::string("/Code/GraphMol/RGroupDecomposition/test_data/");
  std::string fName = testDataDir + "simple3.sdf";
  SDMolSupplier suppl(fName);
  std::vector<ROMOL_SPTR> cores(1);
  std::vector<ROMOL_SPTR> mols;
  initDataset(suppl, cores.front(), mols);
  SECTION("defaults (allH labels and R-groups are removed)") {
    RGroupRows rows;
    auto n = RGroupDecompose(cores, mols, rows);
    CHECK(n == mols.size());
    CHECK(rows.size() == mols.size());
    CHECK(flatten_whitespace(toJSON(rows)) ==
          flatten_whitespace(
              readReferenceData(testDataDir + "simple3.out1.json")));
  }
  SECTION("removeAllHydrogenRGroups = false (as defaults)") {
    RGroupRows rows;
    RGroupDecompositionParameters ps;
    ps.removeAllHydrogenRGroups = false;
    std::vector<unsigned> unmatched;
    auto n = RGroupDecompose(cores, mols, rows, &unmatched, ps);
    CHECK(n == mols.size());
    CHECK(rows.size() == mols.size());
    CHECK(unmatched.empty());
    CHECK(flatten_whitespace(toJSON(rows)) ==
          flatten_whitespace(
              readReferenceData(testDataDir + "simple3.out2.json")));
  }
  SECTION("removeAllHydrogenRGroupsAndLabels = false (allH labels retained)") {
    RGroupRows rows;
    RGroupDecompositionParameters ps;
    ps.removeAllHydrogenRGroupsAndLabels = false;
    std::vector<unsigned> unmatched;
    auto n = RGroupDecompose(cores, mols, rows, &unmatched, ps);
    CHECK(n == mols.size());
    CHECK(rows.size() == mols.size());
    CHECK(unmatched.empty());
    CHECK(flatten_whitespace(toJSON(rows)) ==
          flatten_whitespace(
              readReferenceData(testDataDir + "simple3.out3.json")));
  }
  SECTION(
      "removeAllHydrogenRGroupsAndLabels = false, removeAllHydrogenRGroups = "
      "false (allH labels and R-groups are retained)") {
    RGroupRows rows;
    RGroupDecompositionParameters ps;
    ps.removeAllHydrogenRGroups = false;
    ps.removeAllHydrogenRGroupsAndLabels = false;
    std::vector<unsigned> unmatched;
    auto n = RGroupDecompose(cores, mols, rows, &unmatched, ps);
    CHECK(n == mols.size());
    CHECK(rows.size() == mols.size());
    CHECK(unmatched.empty());
    CHECK(flatten_whitespace(toJSON(rows)) ==
          flatten_whitespace(
              readReferenceData(testDataDir + "simple3.out4.json")));
  }
}

// Decomposes jm7b00306.excerpt.sdf with default parameters; exactly one
// molecule (index 1) is expected not to match the core.
TEST_CASE("jm7b00306 Snippet") {
  const char *rdbase = getenv("RDBASE");
  // building a std::string from a null pointer is undefined behaviour, so stop
  // with a clear failure when the environment is not configured
  REQUIRE(rdbase != nullptr);
  std::string testDataDir =
      std::string(rdbase) +
      std::string("/Code/GraphMol/RGroupDecomposition/test_data/");
  std::string fName = testDataDir + "jm7b00306.excerpt.sdf";
  SDMolSupplier suppl(fName);
  std::vector<ROMOL_SPTR> cores(1);
  std::vector<ROMOL_SPTR> mols;
  initDataset(suppl, cores.front(), mols);
  SECTION("defaults") {
    RGroupRows rows;
    std::vector<unsigned> unmatched;
    auto n = RGroupDecompose(cores, mols, rows, &unmatched);
    CHECK(n == mols.size() - 1);
    CHECK(rows.size() == n);
    // there is one structure in there that doesn't match the core
    CHECK(unmatched.size() == mols.size() - n);
    // CHECK does not stop the test, so make sure there is an element before
    // indexing into the vector
    REQUIRE(!unmatched.empty());
    CHECK(unmatched[0] == 1);
    CHECK(flatten_whitespace(toJSON(rows)) ==
          flatten_whitespace(
              readReferenceData(testDataDir + "jm7b00306.excerpt.out1.json")));
  }
}

// Decomposes jm200186n.excerpt.sdf with default parameters; exactly one
// molecule (index 3) is expected not to match the core.
TEST_CASE("jm200186n Snippet") {
  const char *rdbase = getenv("RDBASE");
  // building a std::string from a null pointer is undefined behaviour, so stop
  // with a clear failure when the environment is not configured
  REQUIRE(rdbase != nullptr);
  std::string testDataDir =
      std::string(rdbase) +
      std::string("/Code/GraphMol/RGroupDecomposition/test_data/");
  std::string fName = testDataDir + "jm200186n.excerpt.sdf";
  SDMolSupplier suppl(fName);
  std::vector<ROMOL_SPTR> cores(1);
  std::vector<ROMOL_SPTR> mols;
  initDataset(suppl, cores.front(), mols);
  SECTION("defaults") {
    RGroupRows rows;
    std::vector<unsigned> unmatched;
    auto n = RGroupDecompose(cores, mols, rows, &unmatched);
    CHECK(n == mols.size() - 1);
    CHECK(rows.size() == n);
    // there is one structure in there that doesn't match the core
    CHECK(unmatched.size() == mols.size() - n);
    // CHECK does not stop the test, so make sure there is an element before
    // indexing into the vector
    REQUIRE(!unmatched.empty());
    CHECK(unmatched[0] == 3);
    CHECK(flatten_whitespace(toJSON(rows)) ==
          flatten_whitespace(
              readReferenceData(testDataDir + "jm200186n.excerpt.out1.json")));
  }
}

std::vector<ROMOL_SPTR> smisToMols(const std::vector<std::string> &smis) {
  std::vector<ROMOL_SPTR> mols;
  for (const auto &smi : smis) {
    auto m = SmilesToMol(smi);
    assert(m);
    mols.emplace_back(m);
  }
  return mols;
}

// Decomposes four azetidine derivatives against an achiral and a chiral core,
// once with the default substructure parameters and once with
// substructmatchParams.useChirality switched off.
TEST_CASE("substructure parameters and RGD: chirality") {
  const auto mols = smisToMols(
      {"C1CN[C@H]1F", "C1CN[C@]1(O)F", "C1CN[C@@H]1F", "C1CN[CH]1F"});
  const auto cores = smisToMols({"C1CNC1[*:1]"});
  const auto chiral_cores = smisToMols({"C1CN[C@H]1[*:1]"});
  SECTION("defaults") {
    RGroupDecompositionParameters params;
    params.allowMultipleRGroupsOnUnlabelled = true;
    RGroupRows rows;
    std::vector<unsigned> unmatched;
    {
      // achiral core: every molecule is expected to match
      const auto numMatched =
          RGroupDecompose(cores, mols, rows, &unmatched, params);
      CHECK(numMatched == mols.size());
      CHECK(rows.size() == numMatched);
      CHECK(unmatched.empty());
      CHECK(flatten_whitespace(toJSON(rows)) == flatten_whitespace(R"JSON(
[
  {
    "Core":"C1C[C@@]([*:1])([*:2])N1",
    "R1":"F[*:1]",
    "R2":"[H][*:2]"
  },
  {
    "Core":"C1C[C@]([*:1])([*:2])N1",
    "R1":"F[*:1]",
    "R2":"O[*:2]"
  },
  {
    "Core":"C1C[C@]([*:1])([*:2])N1",
    "R1":"F[*:1]",
    "R2":"[H][*:2]"
  },
  {
    "Core":"C1CC([*:1])([*:2])N1",
    "R1":"F[*:1]",
    "R2":"[H][*:2]"
  }
]
    )JSON"));
    }
    {
      // chiral core: one molecule is expected to be left unmatched
      const auto numMatched =
          RGroupDecompose(chiral_cores, mols, rows, &unmatched, params);
      CHECK(numMatched == mols.size() - 1);
      CHECK(rows.size() == numMatched);
      CHECK(unmatched.size() == 1);
      CHECK(flatten_whitespace(toJSON(rows)) == flatten_whitespace(R"JSON(
[
  {
    "Core":"C1C[C@@]([*:1])([*:2])N1",
    "R1":"F[*:1]",
    "R2":"[H][*:2]"
  },
  {
    "Core":"C1C[C@@]([*:1])([*:2])N1",
    "R1":"O[*:1]",
    "R2":"F[*:2]"
  },
  {
    "Core":"C1C[C@@]([*:1])([*:2])N1",
    "R1":"[H][*:1]",
    "R2":"F[*:2]"
  }
]
    )JSON"));
    }
  }

  SECTION("not using chirality") {
    // with chirality ignored both cores are expected to give the same output,
    // and the cores in the output carry no stereo information
    RGroupDecompositionParameters params;
    params.allowMultipleRGroupsOnUnlabelled = true;
    params.substructmatchParams.useChirality = false;
    RGroupRows rows;
    std::vector<unsigned> unmatched;
    {
      const auto numMatched =
          RGroupDecompose(cores, mols, rows, &unmatched, params);
      CHECK(numMatched == mols.size());
      CHECK(rows.size() == numMatched);
      CHECK(unmatched.empty());
      CHECK(flatten_whitespace(toJSON(rows)) == flatten_whitespace(R"JSON(
[
  {
    "Core":"C1CC([*:1])([*:2])N1",
    "R1":"F[*:1]",
    "R2":"[H][*:2]"
  },
  {
    "Core":"C1CC([*:1])([*:2])N1",
    "R1":"F[*:1]",
    "R2":"O[*:2]"
  },
  {
    "Core":"C1CC([*:1])([*:2])N1",
    "R1":"F[*:1]",
    "R2":"[H][*:2]"
  },
  {
    "Core":"C1CC([*:1])([*:2])N1",
    "R1":"F[*:1]",
    "R2":"[H][*:2]"
  }
]
    )JSON"));
    }
    {
      const auto numMatched =
          RGroupDecompose(chiral_cores, mols, rows, &unmatched, params);
      CHECK(numMatched == mols.size());
      CHECK(rows.size() == numMatched);
      CHECK(unmatched.empty());
      CHECK(flatten_whitespace(toJSON(rows)) == flatten_whitespace(R"JSON(
[
  {
    "Core":"C1CC([*:1])([*:2])N1",
    "R1":"F[*:1]",
    "R2":"[H][*:2]"
  },
  {
    "Core":"C1CC([*:1])([*:2])N1",
    "R1":"F[*:1]",
    "R2":"O[*:2]"
  },
  {
    "Core":"C1CC([*:1])([*:2])N1",
    "R1":"F[*:1]",
    "R2":"[H][*:2]"
  },
  {
    "Core":"C1CC([*:1])([*:2])N1",
    "R1":"F[*:1]",
    "R2":"[H][*:2]"
  }
]
    )JSON"));
    }
  }
}

// Decomposes molecules carrying enhanced-stereo groups against an "&1" core
// and an "o1" core, with substructmatchParams.useEnhancedStereo left at its
// default and then switched on.
TEST_CASE("substructure parameters and RGD: enhanced stereo") {
  const auto mols = smisToMols({"F[C@H]1CCN1 |&1:1|", "C1CN[C@]1(O)F |&1:3|",
                                "C1CN[C@@H]1F |&1:3|", "Cl[C@H]1CCN1 |o1:1|",
                                "C1CN[CH]1F"});
  const auto cores = smisToMols({"C1CN[C@H]1[*:1] |&1:3|"});
  const auto cores2 = smisToMols({"C1CN[C@H]1[*:1] |o1:3|"});
  SECTION("defaults: no enhanced stereo") {
    RGroupDecompositionParameters params;
    params.allowMultipleRGroupsOnUnlabelled = true;
    RGroupRows rows;
    std::vector<unsigned> unmatched;
    {
      const auto numMatched =
          RGroupDecompose(cores, mols, rows, &unmatched, params);
      CHECK(numMatched == mols.size() - 1);
      CHECK(rows.size() == numMatched);
      CHECK(unmatched.size() == mols.size() - numMatched);

      // the core in the output is the part of the target that matched the core
      // query, so it is no longer written as SMARTS
      CHECK(flatten_whitespace(toJSON(rows)) == flatten_whitespace(R"JSON(
[
  {
    "Core":"C1C[C@@]([*:1])([*:2])N1",
    "R1":"F[*:1]",
    "R2":"[H][*:2]"
  },
  {
    "Core":"C1C[C@@]([*:1])([*:2])N1",
    "R1":"O[*:1]",
    "R2":"F[*:2]"
  },
  {
    "Core":"C1C[C@@]([*:1])([*:2])N1",
    "R1":"[H][*:1]",
    "R2":"F[*:2]"
  },
  {
    "Core":"C1C[C@@]([*:1])([*:2])N1",
    "R1":"Cl[*:1]",
    "R2":"[H][*:2]"
  }
]
    )JSON"));
    }
    {
      const auto numMatched =
          RGroupDecompose(cores2, mols, rows, &unmatched, params);
      CHECK(numMatched == mols.size() - 1);
      CHECK(rows.size() == numMatched);
      CHECK(unmatched.size() == 1);

      // the core in the output is the part of the target that matched the core
      // query, so it is no longer written as SMARTS
      CHECK(flatten_whitespace(toJSON(rows)) == flatten_whitespace(R"JSON(
[
  {
    "Core":"C1C[C@@]([*:1])([*:2])N1",
    "R1":"F[*:1]",
    "R2":"[H][*:2]"
  },
  {
    "Core":"C1C[C@@]([*:1])([*:2])N1",
    "R1":"O[*:1]",
    "R2":"F[*:2]"
  },
  {
    "Core":"C1C[C@@]([*:1])([*:2])N1",
    "R1":"[H][*:1]",
    "R2":"F[*:2]"
  },
  {
    "Core":"C1C[C@@]([*:1])([*:2])N1",
    "R1":"Cl[*:1]",
    "R2":"[H][*:2]"
  }
]
    )JSON"));
    }
  }

  SECTION("using enhanced stereo") {
    RGroupDecompositionParameters params;
    params.allowMultipleRGroupsOnUnlabelled = true;
    params.substructmatchParams.useEnhancedStereo = true;
    RGroupRows rows;
    std::vector<unsigned> unmatched;
    {
      // "&1" core: two molecules are expected to be left unmatched
      const auto numMatched =
          RGroupDecompose(cores, mols, rows, &unmatched, params);
      CHECK(numMatched == mols.size() - 2);
      CHECK(rows.size() == numMatched);
      CHECK(unmatched.size() == mols.size() - numMatched);
      // the core in the output is the part of the target that matched the core
      // query, so it is no longer written as SMARTS
      CHECK(flatten_whitespace(toJSON(rows)) == flatten_whitespace(R"JSON(
[
  {
    "Core":"C1C[C@@]([*:1])([*:2])N1",
    "R1":"F[*:1]",
    "R2":"[H][*:2]"
  },
  {
    "Core":"C1C[C@]([*:1])([*:2])N1",
    "R1":"F[*:1]",
    "R2":"O[*:2]"
  },
  {
    "Core":"C1C[C@]([*:1])([*:2])N1",
    "R1":"F[*:1]",
    "R2":"[H][*:2]"
  }
]
    )JSON"));
    }
    {
      // "o1" core: one molecule is expected to be left unmatched
      const auto numMatched =
          RGroupDecompose(cores2, mols, rows, &unmatched, params);
      CHECK(numMatched == mols.size() - 1);
      CHECK(rows.size() == numMatched);
      CHECK(unmatched.size() == 1);
      // the core in the output is the part of the target that matched the core
      // query, so it is no longer written as SMARTS
      CHECK(flatten_whitespace(toJSON(rows)) == flatten_whitespace(R"JSON(
[
  {
    "Core":"C1C[C@@]([*:1])([*:2])N1",
    "R1":"F[*:1]",
    "R2":"[H][*:2]"
  },
  {
    "Core":"C1C[C@]([*:1])([*:2])N1",
    "R1":"F[*:1]",
    "R2":"O[*:2]"
  },
  {
    "Core":"C1C[C@]([*:1])([*:2])N1",
    "R1":"F[*:1]",
    "R2":"[H][*:2]"
  },
  {
    "Core":"C1C[C@@]([*:1])([*:2])N1",
    "R1":"Cl[*:1]",
    "R2":"[H][*:2]"
  }
]
    )JSON"));
    }
  }
}

// Regression test for github issue 4809: the V3000 mol block written for an
// R group coming out of a decomposition must not contain "CFG=2".
TEST_CASE("github4809: ring double bonds written as crossed bonds after RGD") {
  std::vector<std::string> smis = {"C1C=CCC2=C1C=CC=N2"};
  auto mols = smisToMols(smis);
  std::vector<std::string> csmis = {"c1ccnc([*:1])c1[*:2]"};
  auto cores = smisToMols(csmis);
  SECTION("basics") {
    RGroupRows rows;
    {
      auto n = RGroupDecompose(cores, mols, rows);
      CHECK(n == mols.size());
      CHECK(rows.size() == n);
      // CHECK does not stop the test; the row and its R1 entry are dereferenced
      // below, so insist that both exist first
      REQUIRE(!rows.empty());
      auto r1 = rows[0]["R1"];
      REQUIRE(r1);
      auto mb = MolToV3KMolBlock(*r1);
      CHECK(mb.find("CFG=2") == std::string::npos);
    }
  }
}

// Checks how the rgroupLabelling parameter (Isotope, MDLRGroup and
// Isotope | AtomMap) changes the SMILES written for the cores and R groups.
TEST_CASE("rgroupLabelling") {
  const auto mols = smisToMols(
      {"C1CN[C@H]1F", "C1CN[C@]1(O)F", "C1CN[C@@H]1F", "C1CN[CH]1F"});
  const auto cores = smisToMols({"C1CNC1[*:1]"});
  SECTION("Isotope") {
    RGroupDecompositionParameters params;
    params.rgroupLabelling = RGroupLabelling::Isotope;
    params.allowMultipleRGroupsOnUnlabelled = true;
    RGroupRows rows;
    std::vector<unsigned> unmatched;
    {
      const auto numMatched =
          RGroupDecompose(cores, mols, rows, &unmatched, params);
      CHECK(numMatched == mols.size());
      CHECK(rows.size() == numMatched);
      CHECK(unmatched.empty());
      CHECK(flatten_whitespace(toJSON(rows)) == flatten_whitespace(R"JSON(
[
  {
    "Core": "[1*][C@@]1([2*])CCN1",
    "R1":"[1*]F",
    "R2":"[2*][H]"
  },
  {
    "Core": "[1*][C@]1([2*])CCN1",
    "R1":"[1*]F",
    "R2":"[2*]O"
  },
  {
    "Core":"[1*][C@]1([2*])CCN1",
    "R1":"[1*]F",
    "R2":"[2*][H]"
  },
  {
    "Core":"[1*]C1([2*])CCN1",
    "R1":"[1*]F",
    "R2":"[2*][H]"
  }
]
    )JSON"));
    }
  }
  SECTION("RGroup") {
    RGroupDecompositionParameters params;
    params.rgroupLabelling = RGroupLabelling::MDLRGroup;
    params.allowMultipleRGroupsOnUnlabelled = true;
    RGroupRows rows;
    std::vector<unsigned> unmatched;
    {
      const auto numMatched =
          RGroupDecompose(cores, mols, rows, &unmatched, params);
      CHECK(numMatched == mols.size());
      CHECK(rows.size() == numMatched);
      CHECK(unmatched.empty());
      // With MDL R-group labelling the labels do not appear in the output
      // SMILES. Presumably the dummy atoms can no longer be told apart without
      // isotope labels, which is why the SMILES carries no chirality either.
      // Chirality is present in the core SMARTS.
      CHECK(flatten_whitespace(toJSON(rows)) == flatten_whitespace(R"JSON(
[
  {
    "Core":"*C1(*)CCN1",
    "R1":"*F",
    "R2":"*[H]"
  },
  {
    "Core":"*C1(*)CCN1",
    "R1":"*F",
    "R2":"*O"
  },
  {
    "Core":"*C1(*)CCN1",
    "R1":"*F",
    "R2":"*[H]"
  },
  {
    "Core":"*C1(*)CCN1",
    "R1":"*F",
    "R2":"*[H]"
  }
]
    )JSON"));
    }
  }
  SECTION("Isotope|Map") {
    RGroupDecompositionParameters params;
    params.allowMultipleRGroupsOnUnlabelled = true;
    params.rgroupLabelling =
        RGroupLabelling::Isotope | RGroupLabelling::AtomMap;
    RGroupRows rows;
    std::vector<unsigned> unmatched;
    {
      const auto numMatched =
          RGroupDecompose(cores, mols, rows, &unmatched, params);
      CHECK(numMatched == mols.size());
      CHECK(rows.size() == numMatched);
      CHECK(unmatched.empty());
      CHECK(flatten_whitespace(toJSON(rows)) == flatten_whitespace(R"JSON(

[
  {
    "Core":"C1C[C@@]([1*:1])([2*:2])N1",
    "R1":"F[1*:1]",
    "R2":"[H][2*:2]"
  },
  {
    "Core":"C1C[C@]([1*:1])([2*:2])N1",
    "R1":"F[1*:1]",
    "R2":"O[2*:2]"
  },
  {
    "Core":"C1C[C@]([1*:1])([2*:2])N1",
    "R1":"F[1*:1]",
    "R2":"[H][2*:2]"
  },
  {
    "Core":"C1CC([1*:1])([2*:2])N1",
    "R1":"F[1*:1]",
    "R2":"[H][2*:2]"
  }
]
    )JSON"));
    }
  }
}

// Uses a core whose R group is given as an MDL-style "_R1" atom label and
// checks whether the dummy atoms of the output core (atoms 4 and 5) carry the
// dummyLabel property: absent with AtomMap labelling alone, present when
// MDLRGroup labelling is requested as well.
TEST_CASE("MDL R labels from original core") {
  std::vector<std::string> smis = {"C1CN[C@H]1F", "C1CN[C@]1(O)F",
                                   "C1CN[C@@H]1F", "C1CN[CH]1F"};
  auto mols = smisToMols(smis);
  std::vector<std::string> csmis = {"[*]C1CCN1 |$_R1;;;;$|"};
  auto cores = smisToMols(csmis);
  SECTION("Map") {
    RGroupRows rows;
    std::vector<unsigned> unmatched;
    RGroupDecompositionParameters params;
    params.allowMultipleRGroupsOnUnlabelled = true;
    params.rgroupLabelling = RGroupLabelling::AtomMap;
    {
      auto n = RGroupDecompose(cores, mols, rows, &unmatched, params);
      CHECK(n == mols.size());
      CHECK(rows.size() == n);
      CHECK(unmatched.empty());
      // CHECK does not stop the test; the first row's core is dereferenced
      // below, so insist that it exists first
      REQUIRE(!rows.empty());
      const auto outCore = rows[0]["Core"];
      REQUIRE(outCore);
      CHECK(outCore->getAtomWithIdx(4)->getAtomicNum() == 0);
      CHECK(!outCore->getAtomWithIdx(4)->hasProp(
          common_properties::dummyLabel));
      CHECK(outCore->getAtomWithIdx(5)->getAtomicNum() == 0);
      CHECK(!outCore->getAtomWithIdx(5)->hasProp(
          common_properties::dummyLabel));
    }
  }
  SECTION("Map | MDL") {
    RGroupRows rows;
    std::vector<unsigned> unmatched;
    RGroupDecompositionParameters params;
    params.allowMultipleRGroupsOnUnlabelled = true;
    params.rgroupLabelling =
        RGroupLabelling::AtomMap | RGroupLabelling::MDLRGroup;
    {
      auto n = RGroupDecompose(cores, mols, rows, &unmatched, params);
      CHECK(n == mols.size());
      CHECK(rows.size() == n);
      CHECK(unmatched.empty());
      // CHECK does not stop the test; the first row's core is dereferenced
      // below, so insist that it exists first
      REQUIRE(!rows.empty());
      const auto outCore = rows[0]["Core"];
      REQUIRE(outCore);
      CHECK(outCore->getAtomWithIdx(4)->getAtomicNum() == 0);
      CHECK(outCore->getAtomWithIdx(4)->hasProp(
          common_properties::dummyLabel));
      CHECK(outCore->getAtomWithIdx(5)->getAtomicNum() == 0);
      CHECK(outCore->getAtomWithIdx(5)->hasProp(
          common_properties::dummyLabel));
    }
  }
}

// Exercises RGroupDecomposition::getMatchingCoreIdx() with onlyMatchAtRGroups
// set, and contrasts it with a plain SubstructMatch() on H-added molecules.
TEST_CASE("Mol matches core") {
  auto coreQuery = "[*:1]c1[!#1]([*:2])cc([*:3])n([*:4])c(=O)1"_smarts;
  auto carbonMol = "Clc1c(C)cc(F)n(CC)c(=O)1"_smiles;
  auto nitrogenMol = "Clc1ncc(F)n(CC)c(=O)1"_smiles;
  auto thioMol = "Clc1ncc(F)n(CC)c(=S)1"_smiles;
  const auto numCoreAtoms = coreQuery->getNumAtoms();

  RGroupDecompositionParameters rgdParams;
  rgdParams.onlyMatchAtRGroups = true;
  RGroupDecomposition rgd(*coreQuery, rgdParams);

  // without asking for the matches: only the core index is reported
  CHECK(rgd.getMatchingCoreIdx(*carbonMol) == 0);
  CHECK(rgd.getMatchingCoreIdx(*nitrogenMol) == 0);
  CHECK(rgd.getMatchingCoreIdx(*thioMol) == -1);

  // asking for the matches as well
  std::vector<MatchVectType> coreMatches;
  CHECK(rgd.getMatchingCoreIdx(*carbonMol, &coreMatches) == 0);
  CHECK(coreMatches.size() == 1);
  CHECK(coreMatches.front().size() == numCoreAtoms);
  CHECK(rgd.getMatchingCoreIdx(*nitrogenMol, &coreMatches) == 0);
  CHECK(coreMatches.size() == 1);
  // the match for this molecule is one atom shorter than the core
  CHECK(coreMatches.front().size() == numCoreAtoms - 1);
  CHECK(rgd.getMatchingCoreIdx(*thioMol, &coreMatches) == -1);
  CHECK(coreMatches.empty());

  // a direct substructure search on the H-added molecules only succeeds for
  // the first of the two
  MolOps::addHs(*carbonMol);
  MolOps::addHs(*nitrogenMol);
  MatchVectType directMatch;
  CHECK(SubstructMatch(*carbonMol, *coreQuery, directMatch));
  CHECK(directMatch.size() == numCoreAtoms);
  directMatch.clear();
  CHECK(!SubstructMatch(*nitrogenMol, *coreQuery, directMatch));
}

// Exercises relabelMappedDummies() for the different combinations of input
// label types (atom map number, isotope, MDL R-group label) and output label
// type, comparing the non-canonical CXSMILES before and after relabelling.
TEST_CASE("relabelMappedDummies") {
  SmilesWriteParams writeParams;
  writeParams.canonical = false;
  // Core whose two dummy atoms each carry three different labels: the MDL
  // R-group numbers and atom map numbers come from the mol block, the isotopes
  // are set just below.
  auto allDifferentCore = R"CTAB(
     RDKit          2D

  8  8  0  0  0  0  0  0  0  0999 V2000
    1.0808   -0.8772    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
    1.0827    0.1228    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
    0.2177    0.6246    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
    0.2198    1.6246    0.0000 R#  0  0  0  0  0 15  0  0  0  4  0  0
   -0.6493    0.1262    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
   -1.5142    0.6280    0.0000 R#  0  0  0  0  0 15  0  0  0  3  0  0
   -0.6513   -0.8736    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
    0.2137   -1.3754    0.0000 N   0  0  0  0  0  0  0  0  0  0  0  0
  1  2  2  0
  2  3  1  0
  3  4  1  0
  3  5  2  0
  5  6  1  0
  5  7  1  0
  7  8  2  0
  8  1  1  0
M  RGP  2   4   2   6   1
M  END
)CTAB"_ctab;
  allDifferentCore->removeConformer(0);
  allDifferentCore->getAtomWithIdx(3)->setIsotope(6);
  allDifferentCore->getAtomWithIdx(5)->setIsotope(5);
  CHECK(
      MolToCXSmiles(*allDifferentCore, writeParams) ==
      "c1cc([6*:4])c([5*:3])cn1 |atomProp:3.dummyLabel.R2:3.molAtomMapNumber.4:5.dummyLabel.R1:5.molAtomMapNumber.3|");
  SECTION("AtomMap in, MDLRGroup out") {
    auto core = "c1cc([*:2])c([*:1])cn1"_smiles;
    CHECK(
        MolToCXSmiles(*core, writeParams) ==
        "c1cc([*:2])c([*:1])cn1 |atomProp:3.dummyLabel.*:3.molAtomMapNumber.2:5.dummyLabel.*:5.molAtomMapNumber.1|");
    relabelMappedDummies(*core);
    CHECK(MolToCXSmiles(*core, writeParams) ==
          "c1cc(*)c(*)cn1 |atomProp:3.dummyLabel.R2:5.dummyLabel.R1|");
  }
  SECTION("Isotope in, MDLRGroup out") {
    auto core = "c1cc([2*])c([1*])cn1"_smiles;
    CHECK(MolToCXSmiles(*core, writeParams) ==
          "c1cc([2*])c([1*])cn1 |atomProp:3.dummyLabel.*:5.dummyLabel.*|");
    relabelMappedDummies(*core);
    CHECK(MolToCXSmiles(*core, writeParams) ==
          "c1cc(*)c(*)cn1 |atomProp:3.dummyLabel.R2:5.dummyLabel.R1|");
  }
  SECTION("MDLRGroup in, MDLRGroup out") {
    auto core = R"CTAB(
     RDKit          2D

  8  8  0  0  0  0  0  0  0  0999 V2000
    1.0808   -0.8772    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
    1.0827    0.1228    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
    0.2177    0.6246    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
    0.2198    1.6246    0.0000 R#  0  0  0  0  0  1  0  0  0  0  0  0
   -0.6493    0.1262    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
   -1.5142    0.6280    0.0000 R#  0  0  0  0  0  1  0  0  0  0  0  0
   -0.6513   -0.8736    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
    0.2137   -1.3754    0.0000 N   0  0  0  0  0  0  0  0  0  0  0  0
  1  2  2  0
  2  3  1  0
  3  4  1  0
  3  5  2  0
  5  6  1  0
  5  7  1  0
  7  8  2  0
  8  1  1  0
M  RGP  2   4   2   6   1
M  END
)CTAB"_ctab;
    core->removeConformer(0);
    CHECK(MolToCXSmiles(*core, writeParams) ==
          "c1cc([2*])c([1*])cn1 |atomProp:3.dummyLabel.R2:5.dummyLabel.R1|");
    relabelMappedDummies(*core);
    CHECK(MolToCXSmiles(*core, writeParams) ==
          "c1cc(*)c(*)cn1 |atomProp:3.dummyLabel.R2:5.dummyLabel.R1|");
  }
  SECTION("AtomMap and Isotope in, MDLRGroup out - AtomMap has priority") {
    auto core = "c1cc([4*:2])c([3*:1])cn1"_smiles;
    CHECK(
        MolToCXSmiles(*core, writeParams) ==
        "c1cc([4*:2])c([3*:1])cn1 |atomProp:3.dummyLabel.*:3.molAtomMapNumber.2:5.dummyLabel.*:5.molAtomMapNumber.1|");
    relabelMappedDummies(*core);
    CHECK(MolToCXSmiles(*core, writeParams) ==
          "c1cc(*)c(*)cn1 |atomProp:3.dummyLabel.R2:5.dummyLabel.R1|");
  }
  SECTION("AtomMap and Isotope in, MDLRGroup out - force Isotope priority") {
    auto core = "c1cc([4*:2])c([3*:1])cn1"_smiles;
    CHECK(
        MolToCXSmiles(*core, writeParams) ==
        "c1cc([4*:2])c([3*:1])cn1 |atomProp:3.dummyLabel.*:3.molAtomMapNumber.2:5.dummyLabel.*:5.molAtomMapNumber.1|");
    relabelMappedDummies(*core, Isotope);
    CHECK(MolToCXSmiles(*core, writeParams) ==
          "c1cc(*)c(*)cn1 |atomProp:3.dummyLabel.R4:5.dummyLabel.R3|");
  }
  // The remaining sections all start from a fresh copy of allDifferentCore and
  // vary only the accepted input label types and the requested output type.
  SECTION(
      "AtomMap, Isotope and MDLRGroup in, MDLRGroup out - AtomMap has priority") {
    ROMol coreCopy(*allDifferentCore);
    relabelMappedDummies(coreCopy);
    CHECK(MolToCXSmiles(coreCopy, writeParams) ==
          "c1cc(*)c(*)cn1 |atomProp:3.dummyLabel.R4:5.dummyLabel.R3|");
  }
  SECTION(
      "AtomMap, Isotope and MDLRGroup in, MDLRGroup out - force Isotope priority") {
    ROMol coreCopy(*allDifferentCore);
    relabelMappedDummies(coreCopy, Isotope);
    CHECK(MolToCXSmiles(coreCopy, writeParams) ==
          "c1cc(*)c(*)cn1 |atomProp:3.dummyLabel.R6:5.dummyLabel.R5|");
  }
  SECTION(
      "AtomMap, Isotope and MDLRGroup in, MDLRGroup out - force MDLRGroup priority") {
    ROMol coreCopy(*allDifferentCore);
    relabelMappedDummies(coreCopy, MDLRGroup);
    CHECK(MolToCXSmiles(coreCopy, writeParams) ==
          "c1cc(*)c(*)cn1 |atomProp:3.dummyLabel.R2:5.dummyLabel.R1|");
  }
  SECTION(
      "AtomMap, Isotope and MDLRGroup in, AtomMap out - AtomMap has priority") {
    ROMol coreCopy(*allDifferentCore);
    relabelMappedDummies(coreCopy, AtomMap | Isotope | MDLRGroup, AtomMap);
    CHECK(
        MolToCXSmiles(coreCopy, writeParams) ==
        "c1cc([*:4])c([*:3])cn1 |atomProp:3.molAtomMapNumber.4:5.molAtomMapNumber.3|");
  }
  SECTION(
      "AtomMap, Isotope and MDLRGroup in, Isotope out - AtomMap has priority") {
    ROMol coreCopy(*allDifferentCore);
    relabelMappedDummies(coreCopy, AtomMap | Isotope | MDLRGroup, Isotope);
    CHECK(MolToCXSmiles(coreCopy, writeParams) == "c1cc([4*])c([3*])cn1");
  }
  SECTION(
      "AtomMap, Isotope and MDLRGroup in, AtomMap out - Isotope has priority") {
    ROMol coreCopy(*allDifferentCore);
    relabelMappedDummies(coreCopy, Isotope | MDLRGroup, AtomMap);
    CHECK(
        MolToCXSmiles(coreCopy, writeParams) ==
        "c1cc([*:6])c([*:5])cn1 |atomProp:3.molAtomMapNumber.6:5.molAtomMapNumber.5|");
  }
  SECTION(
      "AtomMap, Isotope and MDLRGroup in, Isotope out - Isotope has priority") {
    ROMol coreCopy(*allDifferentCore);
    relabelMappedDummies(coreCopy, Isotope | MDLRGroup, Isotope);
    CHECK(MolToCXSmiles(coreCopy, writeParams) == "c1cc([6*])c([5*])cn1");
  }
  SECTION(
      "AtomMap, Isotope and MDLRGroup in, AtomMap out - MDLRGroup has priority") {
    ROMol coreCopy(*allDifferentCore);
    relabelMappedDummies(coreCopy, MDLRGroup, AtomMap);
    CHECK(
        MolToCXSmiles(coreCopy, writeParams) ==
        "c1cc([*:2])c([*:1])cn1 |atomProp:3.molAtomMapNumber.2:5.molAtomMapNumber.1|");
  }
  SECTION(
      "AtomMap, Isotope and MDLRGroup in, Isotope out - MDLRGroup has priority") {
    ROMol coreCopy(*allDifferentCore);
    relabelMappedDummies(coreCopy, MDLRGroup, Isotope);
    CHECK(MolToCXSmiles(coreCopy, writeParams) == "c1cc([2*])c([1*])cn1");
  }
}

// Exercises RGroupDecompositionParameters::includeTargetMolInResults.
//
// With the flag set, each result row is expected to hold an entry labelled
// RGroupData::getMolLabel() (the target molecule), and every other entry (the
// core and the R groups) is expected to carry the _rgroupTargetAtoms and
// _rgroupTargetBonds properties. The test verifies that, per row, those index
// lists have the expected sizes and that together they account for every atom
// and bond of the target molecule without overlap. The same checks are run on
// the row-wise output and on the column-wise output transposed into rows.
TEST_CASE("includeTargetMolInResults") {
  auto core =
      "c1cc(-c2c([*:1])nn3nc([*:2])ccc23)nc(N(c2ccc([*:4])c([*:3])c2))n1"_smiles;
  REQUIRE(core);
  std::vector<ROMOL_SPTR> mols{
      "Cc1ccc2c(c3ccnc(Nc4cccc(c4)C(F)(F)F)n3)c(nn2n1)c5ccc(F)cc5"_smiles,
      "Cc1ccc2c(c3ccnc(Nc4ccc(F)c(F)c4)n3)c(nn2n1)c5ccc(F)cc5"_smiles,
      "Cc1ccc2c(c3ccnc(Nc4ccc5OCCOc5c4)n3)c(nn2n1)c6ccc(F)cc6"_smiles,
      "Cc1ccc2c(c3ccnc(Nc4ccc(Cl)c(c4)C(F)(F)F)n3)c(nn2n1)c5ccc(F)cc5"_smiles,
      "C1CC1c2nn3ncccc3c2c4ccnc(Nc5ccccc5)n4"_smiles,
      "Fc1ccc(Nc2nccc(n2)c3c(nn4ncccc34)C5CC5)cc1F"_smiles,
      "C1CCC(CC1)c2nn3ncccc3c2c4ccnc(Nc5ccccc5)n4"_smiles,
      "Fc1ccc(Nc2nccc(n2)c3c(nn4ncccc34)C5CCCCC5)cc1F"_smiles,
      "COCCOc1cnn2ncc(c3ccnc(Nc4cccc(OC)c4)n3)c2c1"_smiles,
      "Cc1ccc2c(c3ccnc(Nc4ccc(F)c(F)c4)n3)c(nn2n1)c5ccccc5"_smiles,
      "Cc1ccc2c(c3ccnc(Nc4ccc(Cl)c(c4)C(F)(F)F)n3)c(nn2n1)c5ccccc5"_smiles,
      "Cc1ccc2c(c3ccnc(Nc4ccc5OCCOc5c4)n3)c(nn2n1)c6ccccc6"_smiles,
      "Cc1ccc2c(c3ccnc(Nc4ccccc4)n3)c(nn2n1)c5cccc(c5)C(F)(F)F"_smiles,
      "Cc1ccc2c(c3ccnc(Nc4ccc(F)c(F)c4)n3)c(nn2n1)c5cccc(c5)C(F)(F)F"_smiles,
      "Cc1ccc2c(c3ccnc(Nc4ccc(Cl)c(c4)C(F)(F)F)n3)c(nn2n1)c5cccc(c5)C(F)(F)F"_smiles,
      "Cc1ccc2c(c3ccnc(Nc4ccc5OCCOc5c4)n3)c(nn2n1)c6cccc(c6)C(F)(F)F"_smiles,
  };
  // Every SMILES above must have parsed; return a bool rather than a copy of
  // the shared pointer.
  const bool areMolsNonNull =
      std::all_of(mols.begin(), mols.end(),
                  [](const auto &mol) { return static_cast<bool>(mol); });
  REQUIRE(areMolsNonNull);

  RGroupDecompositionParameters ps;
  ps.includeTargetMolInResults = true;
  RGroupDecomposition rgd(*core, ps);
  for (const auto &mol : mols) {
    CHECK(rgd.add(*mol) != -1);
  }
  REQUIRE(rgd.process());

  // Validates a single decomposition row (label -> molecule map).
  auto checkRow = [](const RGroupRow &row) {
    // An atom is treated as an R-label placeholder only when its atomic number
    // is not positive AND it has a non-zero atom map number; everything else
    // counts as a "real" atom.
    auto isNonRAtom = [](const auto &atom) {
      return atom->getAtomicNum() > 0 || !atom->getAtomMapNum();
    };
    // Checks that the index lists, taken together, contain no repeated index
    // and contain exactly expectedCount indices.
    auto checkCoverage = [](const std::set<std::vector<int>> &indexLists,
                            unsigned int expectedCount) {
      std::vector<int> flattened;
      for (const auto &indices : indexLists) {
        flattened.insert(flattened.end(), indices.begin(), indices.end());
      }
      const std::set<int> unique(flattened.begin(), flattened.end());
      CHECK(flattened.size() == unique.size());
      CHECK(flattened.size() == expectedCount);
    };

    ROMOL_SPTR targetMol;
    // These are sets of int vectors rather than just plain int vectors
    // because there can be cyclic R groups with 2 attachment points;
    // in that case it is OK for 2 R groups to have exactly the same
    // target atom and bond indices
    std::set<std::vector<int>> allAtomIndices;
    std::set<std::vector<int>> allBondIndices;
    for (const auto &pair : row) {
      const auto &label = pair.first;
      const auto &mol = pair.second;
      if (label == RGroupData::getMolLabel()) {
        targetMol = mol;
        continue;
      }
      auto atoms = mol->atoms();
      const unsigned int numNonRAtoms =
          std::count_if(atoms.begin(), atoms.end(), isNonRAtom);
      // Each core / R group must contain at least one R-label placeholder.
      CHECK(mol->getNumAtoms() > numNonRAtoms);

      unsigned int numBonds = 0;
      if (label == RGroupData::getCoreLabel()) {
        // For the core only bonds between two real atoms are counted.
        auto bonds = mol->bonds();
        numBonds = std::count_if(
            bonds.begin(), bonds.end(), [&isNonRAtom](const auto &bond) {
              return isNonRAtom(bond->getBeginAtom()) &&
                     isNonRAtom(bond->getEndAtom());
            });
      } else {
        // For R groups every bond is counted, including the ones to the
        // R-label placeholders.
        numBonds = mol->getNumBonds();
      }

      std::vector<int> atomIndices;
      std::vector<int> bondIndices;
      CHECK(mol->getPropIfPresent(common_properties::_rgroupTargetAtoms,
                                  atomIndices));
      CHECK(mol->getPropIfPresent(common_properties::_rgroupTargetBonds,
                                  bondIndices));
      CHECK(atomIndices.size() == numNonRAtoms);
      allAtomIndices.insert(atomIndices);
      CHECK(bondIndices.size() == numBonds);
      allBondIndices.insert(bondIndices);
    }
    REQUIRE(targetMol);
    checkCoverage(allAtomIndices, targetMol->getNumAtoms());
    checkCoverage(allBondIndices, targetMol->getNumBonds());
  };

  SECTION("rows") {
    auto rows = rgd.getRGroupsAsRows();
    CHECK(rows.size() == mols.size());
    for (const auto &row : rows) {
      checkRow(row);
    }
  }
  SECTION("columns") {
    auto cols = rgd.getRGroupsAsColumns();
    // Every column must hold one entry per input molecule.
    for (const auto &pair : cols) {
      CHECK(pair.second.size() == mols.size());
    }
    // Transpose the columns into rows so that checkRow can be reused; at()
    // throws if a column is shorter than expected.
    RGroupRows rows;
    rows.reserve(mols.size());
    for (size_t i = 0; i < mols.size(); ++i) {
      RGroupRow row;
      for (const auto &pair : cols) {
        row.emplace(pair.first, pair.second.at(i));
      }
      rows.push_back(std::move(row));
    }
    CHECK(rows.size() == mols.size());
    for (const auto &row : rows) {
      checkRow(row);
    }
  }
}