CIP labeler: attempt to resolve "easy" stereo centers first (#8582)

* resolve easy chirality labels first

* add a test
This commit is contained in:
Ricardo Rodriguez
2025-06-15 10:22:22 -04:00
committed by GitHub
parent 7b9c1a9ca9
commit d570dee093
2 changed files with 179 additions and 13 deletions

View File

@@ -11,14 +11,16 @@
#include <algorithm>
#include <memory>
#include <GraphMol/RDKitBase.h>
#include <boost/algorithm/string.hpp>
#include "GraphMol/Chirality.h"
#include "GraphMol/RDKitBase.h"
#include "CIPLabeler.h"
#include "CIPMol.h"
#include "configs/Sp2Bond.h"
#include "configs/Tetrahedral.h"
#include "configs/AtropisomerBond.h"
#include <boost/algorithm/string.hpp>
#include "rules/Rules.h"
#include "rules/Rule1a.h"
@@ -30,7 +32,6 @@
#include "rules/Rule4c.h"
#include "rules/Rule5New.h"
#include "rules/Rule6.h"
#include <GraphMol/Chirality.h>
namespace RDKit {
namespace CIPLabeler {
@@ -161,8 +162,48 @@ bool labelAux(std::vector<std::unique_ptr<Configuration>> &configs,
return true;
}
void label(std::vector<std::unique_ptr<Configuration>> &configs) {
thread_local unsigned int remainingCallCount = 0;
// The chiral centers in current rdkit examples that can be resolved using only
// the constitutional rules average about 8 iterations (the highest count is
// 1039, in one of the examples in the CIP validation suite). We use 2000 as
// threshold to allow some margin.
constexpr unsigned int constitutionalRuleTimeout = 2000;
void label(std::vector<std::unique_ptr<Configuration>> &configs,
unsigned int maxRecursiveIterations) {
// First, if the specified number of iterations allows it, run all centers
// through a fast pass with the constitutional rules to allow easy stuff to be
// resolved.
for (auto &conf : configs) {
// Make sure this stereo center has no label
conf->getFocus()->clearProp(common_properties::_CIPCode);
remainingCallCount = constitutionalRuleTimeout;
try {
auto desc = conf->label(constitutional_rules);
if (desc != Descriptor::UNKNOWN) {
conf->setPrimaryLabel(desc);
}
} catch (const MaxIterationsExceeded &) {
}
}
// Now, retry everything that hasn't been solved with a more generous
// threshold
if (maxRecursiveIterations != 0) {
remainingCallCount = maxRecursiveIterations;
} else {
remainingCallCount = UINT_MAX; // really big - will never be hit
}
// try again on everything that hasn't been resolved yet
for (const auto &conf : configs) {
if (conf->getFocus()->hasProp(common_properties::_CIPCode)) {
// already resolved!
continue;
}
auto desc = conf->label(constitutional_rules);
if (desc != Descriptor::UNKNOWN) {
conf->setPrimaryLabel(desc);
@@ -178,22 +219,16 @@ void label(std::vector<std::unique_ptr<Configuration>> &configs) {
}
}
thread_local unsigned int remainingCallCount = 0;
} // namespace
void assignCIPLabels(ROMol &mol, const boost::dynamic_bitset<> &atoms,
const boost::dynamic_bitset<> &bonds,
unsigned int maxRecursiveIterations) {
if (maxRecursiveIterations != 0) {
remainingCallCount = maxRecursiveIterations;
} else {
remainingCallCount = UINT_MAX; // really big - will never be hit
}
// reset the mark, for the case that this fails
mol.clearProp(common_properties::_CIPComputed);
CIPMol cipmol{mol};
auto configs = findConfigs(cipmol, atoms, bonds);
label(configs);
label(configs, maxRecursiveIterations);
const bool computed = true;
mol.setProp(common_properties::_CIPComputed, true, computed);
}

View File

@@ -1246,3 +1246,134 @@ M END
CHECK(thisVal == "M");
}
}
// Checks that a stereo center which is easy to resolve still receives its CIP
// label even when labeling the molecule as a whole runs out of its iteration
// budget and throws. The input is an unsanitized V3000 mol block; the
// expected label depends on the order of the atoms in that block.
TEST_CASE("Resolve easy CIP labels first", "[accurateCIP]") {
constexpr const char *molBlock = R"(
RDKit 3D
0 0 0 0 0 0 0 0 0 0999 V3000
M V30 BEGIN CTAB
M V30 COUNTS 44 54 0 0 1
M V30 BEGIN ATOM
M V30 1 H 7.548300 -7.745100 -3.341900 0
M V30 2 H 9.398400 -5.879200 -3.431800 0
M V30 3 C 9.339900 -6.542500 -1.989800 0 CFG=2
M V30 4 H 8.682000 -3.942500 -1.647400 0
M V30 5 H 5.816300 -7.103400 -1.663500 0
M V30 6 H 6.572200 -4.734300 -0.630400 0
M V30 7 C 7.170100 -7.238000 -0.889000 0 CFG=1
M V30 8 H 11.830000 -6.976500 -2.554400 0
M V30 9 C 10.663100 -7.169000 -1.537500 0 CFG=1
M V30 10 H 11.407700 -9.523900 -2.147200 0
M V30 11 H 11.900600 -10.013900 0.472200 0
M V30 12 C 10.660000 -8.874900 0.291400 0 CFG=2
M V30 13 H 12.513600 -5.911500 -0.419000 0
M V30 14 H 12.533800 -7.840600 1.437900 0
M V30 15 H 8.698100 -8.865400 3.417700 0
M V30 16 H 9.404600 -10.690700 1.714300 0
M V30 17 C 9.318500 -9.310600 0.970000 0 CFG=1
M V30 18 H 7.495800 -10.807500 -0.178300 0
M V30 19 H 6.555200 -7.878000 2.698300 0
M V30 20 H 5.852200 -9.114700 0.451400 0
M V30 21 C 7.160100 -8.299700 0.235300 0 CFG=2
M V30 22 C 7.577400 -7.598900 1.570000 0 CFG=1
M V30 23 C 10.647193 -4.032680 0.167792 0 CFG=1
M V30 24 C 11.080800 -6.489400 -0.198200 0 CFG=2
M V30 25 C 8.912300 -5.469900 -0.942400 0 CFG=1
M V30 26 C 8.915500 -8.232900 2.018400 0 CFG=2
M V30 27 C 11.076600 -7.551300 0.914700 0 CFG=1
M V30 28 H 10.678000 -6.985700 3.302100 0
M V30 29 C 7.586100 -5.893900 -0.266700 0 CFG=2
M V30 30 H 6.997300 -5.059700 1.987500 0
M V30 31 C 9.319300 -5.842900 1.504800 0 CFG=2
M V30 32 H 9.474000 -4.717300 2.574300 0
M V30 33 C 9.980900 -7.145700 1.981100 0 CFG=1
M V30 34 C 7.827000 -6.124500 1.248300 0 CFG=2
M V30 35 C 10.002300 -5.430800 0.136600 0
M V30 36 C 8.912500 -8.938500 -1.473700 0 CFG=2
M V30 37 C 8.237400 -9.352200 -0.117300 0
M V30 38 C 10.411600 -8.646100 -1.214800 0 CFG=1
M V30 39 H 8.772600 -9.975800 -2.504700 0
M V30 40 C 8.257700 -7.625000 -1.947800 0 CFG=2
M V30 41 O 9.665066 -3.068040 0.472747 0
M V30 42 Cl 11.932408 -3.993449 1.420272 0
M V30 43 H 11.083087 -3.811232 -0.806404 0
M V30 44 H 10.067077 -2.196484 0.492191 0
M V30 END ATOM
M V30 BEGIN BOND
M V30 1 1 3 2 CFG=3
M V30 2 1 7 5 CFG=3
M V30 3 1 9 3
M V30 4 1 9 8 CFG=3
M V30 5 1 12 11 CFG=3
M V30 6 1 17 12
M V30 7 1 17 16 CFG=1
M V30 8 1 21 7
M V30 9 1 21 20 CFG=1
M V30 10 1 22 19 CFG=1
M V30 11 1 22 21
M V30 12 1 24 9
M V30 13 1 24 13 CFG=3
M V30 14 1 25 3
M V30 15 1 25 4 CFG=3
M V30 16 1 26 15 CFG=1
M V30 17 1 26 17
M V30 18 1 26 22
M V30 19 1 27 12
M V30 20 1 27 14 CFG=1
M V30 21 1 27 24
M V30 22 1 29 6 CFG=1
M V30 23 1 29 7
M V30 24 1 29 25
M V30 25 1 31 32 CFG=1
M V30 26 1 33 26
M V30 27 1 33 27
M V30 28 1 33 28 CFG=1
M V30 29 1 33 31
M V30 30 1 34 22
M V30 31 1 34 29
M V30 32 1 34 30 CFG=1
M V30 33 1 34 31
M V30 34 1 35 23
M V30 35 1 35 24
M V30 36 1 35 25
M V30 37 1 35 31
M V30 38 1 37 17
M V30 39 1 37 18
M V30 40 1 37 21
M V30 41 1 37 36
M V30 42 1 38 9
M V30 43 1 38 10 CFG=3
M V30 44 1 38 12
M V30 45 1 38 36
M V30 46 1 36 39 CFG=3
M V30 47 1 40 1 CFG=3
M V30 48 1 40 3
M V30 49 1 40 7
M V30 50 1 40 36
M V30 51 1 41 23
M V30 52 1 42 23
M V30 53 1 23 43 CFG=3
M V30 54 1 44 41
M V30 END BOND
M V30 END CTAB
M END
$$$$
)";
// The mol is a modification of the one in the above test
// "CIP max iterations test"
// Parse without sanitization.
v2::FileParsers::MolFileParserParams params{.sanitize = false};
auto mol = v2::FileParsers::MolFromMolBlock(molBlock, params);
REQUIRE(mol);
// With a budget of 1000 iterations, labeling the whole molecule is expected
// to give up and throw.
REQUIRE_THROWS_AS(CIPLabeler::assignCIPLabels(*mol, 1000),
CIPLabeler::MaxIterationsExceeded);
// Presumably atom index 22 is ATOM entry 23 of the mol block (0-based
// indexing -- confirm): the carbon bonded to C 35, O 41, Cl 42 and H 43.
auto at = mol->getAtomWithIdx(22);
REQUIRE(at->getChiralTag() == Atom::ChiralType::CHI_TETRAHEDRAL_CW);
// This will fail if this chiral center is not resolved first (which
// depends on the order of the atoms in the molBlock).
CHECK(at->getProp<std::string>(common_properties::_CIPCode) == "S");
}