Files
rdkit/Code/GraphMol/RascalMCES/mces_cluster_catch.cpp
Yakov Pechersky 0986d22c58 Deterministic kekulize, independent of atom and bond order (#9125)
* Make kekulization deterministic

* Add tautomer order-independence regression (python)

* Adjust tautomer tests for deterministic kekulization

* Update graphmol wedged-bond kekulization checks

* SmilesParse: update aromatic bond index expectations

* SmilesParse: refresh cxsmilesTest expected files

* Depictor: update testDepictor expected MolBlocks

* Depictor: update depictorCatch expectations

* Depictor Wrap: update expected MolBlock for pyDepictor

* MarvinParse: update testMrvToMol expected outputs

* FileParsers: refresh testAtropisomers expected outputs

* FileParsers: update tests for deterministic kekulization

* MolDraw2D: refresh brittle bond assertions

* RascalMCES: update expected cluster size

* MinimalLib: make cffi wedging check order-independent

* documentation fix

* MinimalLib: update Kekulé bond table in aligned-coords test

* Hoist duplicated lambdas to TEST_CASE scope

* Remove unused originalWedges variable

* Remove redundant bounds check; clarify wedge-end preference

* Pre-sort allAtms by wedge-end + rank

* Use mol.atomNeighbors() for neighbor iteration

* Check inAllAtms before linear-scanning done

* Drop redundant optsV/wedgedOptsV sorts

* Remove unused Canon.h include

* Add canonical parameter to Kekulize; skip ranking during sanitization

* Test canonical re-kekulization preserves stereo across atom orderings

* MinimalLib: update Kekulé bond orders in invertedWedges

* Change Kekulize canonical default to false, expose in Python wrappers

* keep rank order, push_back

* Revert "RascalMCES: update expected cluster size"

This reverts commit a81bb39495.

* docstring change

* expose new flag to python wrapper

* document changes in ReleaseNotes.md

* revert minimallib test changes again

* canonical = true defaults

* Revert "revert minimallib test changes again"

This reverts commit 039e1d84da.

* Reapply "RascalMCES: update expected cluster size"

This reverts commit 7b83a7a3e8.

---------

Co-authored-by: greg landrum <greg.landrum@gmail.com>
2026-03-19 08:43:13 +01:00

153 lines
4.9 KiB
C++

//
// Copyright (C) 2023 David Cosgrove
//
// @@ All Rights Reserved @@
// This file is part of the RDKit.
// The contents are covered by the terms of the BSD license
// which is included in the file license.txt, found at the root
// of the RDKit source tree.
//
#include <chrono>
#include <cstdlib>
#include <iostream>
#include <memory>
#include <random>
#include <string>
#include <vector>
#include <GraphMol/FileParsers/MolSupplier.h>
#include <GraphMol/SmilesParse/SmilesParse.h>
#include <GraphMol/SmilesParse/SmilesWrite.h>
#include <GraphMol/Substruct/SubstructMatch.h>
#include <catch2/catch_all.hpp>
#include <GraphMol/RascalMCES/RascalMCES.h>
#include <GraphMol/RascalMCES/RascalClusterOptions.h>
#include <GraphMol/RascalMCES/RascalResult.h>
// Clusters the molecules read from Contrib/Fastcluster/cdk2.smi using
// default-constructed RascalClusterOptions and checks both the number of
// clusters and the size of each cluster, in the order returned.
TEST_CASE("Small test", "[basics]") {
  // Building a std::string from a null pointer is undefined behaviour, so
  // fail the test explicitly if RDBASE is not set in the environment.
  const char *rdbase = std::getenv("RDBASE");
  REQUIRE(rdbase != nullptr);
  std::string fName(rdbase);
  fName += "/Contrib/Fastcluster/cdk2.smi";

  RDKit::SmilesMolSupplier suppl(fName, "\t", 1, 0, false);
  std::vector<std::shared_ptr<RDKit::ROMol>> mols;
  while (!suppl.atEnd()) {
    std::shared_ptr<RDKit::ROMol> mol(suppl.next());
    // Records for which the supplier hands back a null molecule are skipped.
    if (mol) {
      mols.push_back(mol);
    }
  }

  RDKit::RascalMCES::RascalClusterOptions clusOpts;
  auto clusters = RDKit::RascalMCES::rascalCluster(mols, clusOpts);

  // The expected cluster count is the length of expSizes, so the two can't
  // drift apart when the expectations are updated.
  const std::vector<size_t> expSizes{7, 7, 6, 2, 2, 2, 2, 20};
  REQUIRE(clusters.size() == expSizes.size());
  for (size_t i = 0; i < expSizes.size(); ++i) {
    CAPTURE(i);  // report which cluster is wrong if the check fails
    REQUIRE(clusters[i].size() == expSizes[i]);
  }
}
// Clusters the molecules read from the RascalMCES test_cluster1.smi data file
// with the default clustering options (none are passed) and checks the number
// of clusters and the size of each, in the order returned.
TEST_CASE("BLSets subset", "[basics]") {
  // Building a std::string from a null pointer is undefined behaviour, so
  // fail the test explicitly if RDBASE is not set in the environment.
  const char *rdbase = std::getenv("RDBASE");
  REQUIRE(rdbase != nullptr);
  std::string fName(rdbase);
  fName += "/Code/GraphMol/RascalMCES/data/test_cluster1.smi";

  RDKit::SmilesMolSupplier suppl(fName, "\t", 1, 0, false);
  std::vector<std::shared_ptr<RDKit::ROMol>> mols;
  while (!suppl.atEnd()) {
    std::shared_ptr<RDKit::ROMol> mol(suppl.next());
    // Records for which the supplier hands back a null molecule are skipped.
    if (mol) {
      mols.push_back(mol);
    }
  }

  auto clusters = RDKit::RascalMCES::rascalCluster(mols);

  // The expected cluster count is the length of expSizes, so the two can't
  // drift apart when the expectations are updated.
  const std::vector<size_t> expSizes{8, 4, 4, 3, 3, 3, 2, 2, 2, 2, 2, 21};
  REQUIRE(clusters.size() == expSizes.size());
  for (size_t i = 0; i < expSizes.size(); ++i) {
    CAPTURE(i);  // report which cluster is wrong if the check fails
    REQUIRE(clusters[i].size() == expSizes[i]);
  }
}
// Clusters the molecules read from the RascalMCES chembl_1907596.smi data
// file with similarityCutoff set to 0.7 and checks the number of clusters and
// the size of each, in the order returned.
TEST_CASE("ChEMBL 1907596") {
  // Building a std::string from a null pointer is undefined behaviour, so
  // fail the test explicitly if RDBASE is not set in the environment.
  const char *rdbase = std::getenv("RDBASE");
  REQUIRE(rdbase != nullptr);
  std::string fName(rdbase);
  fName += "/Code/GraphMol/RascalMCES/data/chembl_1907596.smi";
  // Echo the resolved path so a missing data file is easy to diagnose.
  std::cout << fName << std::endl;

  RDKit::SmilesMolSupplier suppl(fName, "\t", 1, 0, false);
  std::vector<std::shared_ptr<RDKit::ROMol>> mols;
  while (!suppl.atEnd()) {
    std::shared_ptr<RDKit::ROMol> mol(suppl.next());
    // Records for which the supplier hands back a null molecule are skipped.
    if (mol) {
      mols.push_back(mol);
    }
  }

  RDKit::RascalMCES::RascalClusterOptions clusOpts;
  clusOpts.similarityCutoff = 0.7;
  auto clusters = RDKit::RascalMCES::rascalCluster(mols, clusOpts);

  // The expected cluster count is the length of expSizes, so the two can't
  // drift apart when the expectations are updated.
  const std::vector<size_t> expSizes{343, 71, 64, 33, 23, 11, 10, 6, 6, 5, 5,
                                     4,   3,  3,  3,  3,  3,  2,  2, 2, 14};
  REQUIRE(clusters.size() == expSizes.size());
  for (size_t i = 0; i < expSizes.size(); ++i) {
    CAPTURE(i);  // report which cluster is wrong if the check fails
    REQUIRE(clusters[i].size() == expSizes[i]);
  }
}
// Runs rascalButinaCluster on the molecules read from
// Contrib/Fastcluster/cdk2.smi with default-constructed options. Checks that
// the cluster sizes sum to the number of input molecules, then checks the
// number of clusters and the size of each, in the order returned.
TEST_CASE("Small Butina test", "[basics]") {
  // Building a std::string from a null pointer is undefined behaviour, so
  // fail the test explicitly if RDBASE is not set in the environment.
  const char *rdbase = std::getenv("RDBASE");
  REQUIRE(rdbase != nullptr);
  std::string fName(rdbase);
  fName += "/Contrib/Fastcluster/cdk2.smi";

  RDKit::SmilesMolSupplier suppl(fName, "\t", 1, 0, false);
  std::vector<std::shared_ptr<RDKit::ROMol>> mols;
  while (!suppl.atEnd()) {
    std::shared_ptr<RDKit::ROMol> mol(suppl.next());
    // Records for which the supplier hands back a null molecule are skipped.
    if (mol) {
      mols.push_back(mol);
    }
  }

  RDKit::RascalMCES::RascalClusterOptions clusOpts;
  auto clusters = RDKit::RascalMCES::rascalButinaCluster(mols, clusOpts);

  // Accumulate in size_t: that is what size() returns, so there is no
  // implicit narrowing and the comparison below is between like types.
  size_t numMols = 0;
  for (const auto &cl : clusters) {
    numMols += cl.size();
  }
  REQUIRE(numMols == mols.size());

  // The expected cluster count is the length of expSizes, so the two can't
  // drift apart when the expectations are updated.
  const std::vector<size_t> expSizes{6, 6, 6, 2, 2, 2, 1, 1, 1, 1,
                                     1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                                     1, 1, 1, 1, 1, 1, 1, 1, 1};
  REQUIRE(clusters.size() == expSizes.size());
  for (size_t i = 0; i < expSizes.size(); ++i) {
    CAPTURE(i);  // report which cluster is wrong if the check fails
    REQUIRE(clusters[i].size() == expSizes[i]);
  }
}
// Repeats the cdk2 clustering with numThreads set explicitly, once to 2 and
// once to -2. Nothing here can confirm that the requested thread count was
// actually used; the test only establishes that clustering still runs and
// yields the same cluster sizes for both settings.
// NOTE(review): a negative numThreads presumably means "relative to the
// number of hardware threads" - confirm against RascalClusterOptions.
TEST_CASE("Small test, smaller number of threads", "[basics]") {
  // Building a std::string from a null pointer is undefined behaviour, so
  // fail the test explicitly if RDBASE is not set in the environment.
  const char *rdbase = std::getenv("RDBASE");
  REQUIRE(rdbase != nullptr);
  std::string fName(rdbase);
  fName += "/Contrib/Fastcluster/cdk2.smi";

  RDKit::SmilesMolSupplier suppl(fName, "\t", 1, 0, false);
  std::vector<std::shared_ptr<RDKit::ROMol>> mols;
  while (!suppl.atEnd()) {
    std::shared_ptr<RDKit::ROMol> mol(suppl.next());
    // Records for which the supplier hands back a null molecule are skipped.
    if (mol) {
      mols.push_back(mol);
    }
  }

  // Both thread settings are expected to give exactly these sizes.
  const std::vector<size_t> expSizes{7, 7, 6, 2, 2, 2, 2, 20};
  for (const int nThreads : {2, -2}) {
    CAPTURE(nThreads);  // identify the failing configuration
    RDKit::RascalMCES::RascalClusterOptions clusOpts;
    clusOpts.numThreads = nThreads;
    auto clusters = RDKit::RascalMCES::rascalCluster(mols, clusOpts);
    REQUIRE(clusters.size() == expSizes.size());
    for (size_t i = 0; i < expSizes.size(); ++i) {
      CAPTURE(i);  // report which cluster is wrong if the check fails
      REQUIRE(clusters[i].size() == expSizes[i]);
    }
  }
}