CIP labeller performance: Don't calculate auxiliary descriptors unnecessarily (#9171)

* CIP labeller: Don't calculate auxiliary descriptors unnecessarily

The first 3 rules (the constitutional rules) are pretty easy
to understand. After rule 3, we need to calculate auxiliary
stereo descriptors to break ties.

However, we _were actually_ calculating auxiliary stereodescriptors
for all centers! We should only need to calculate auxiliary
stereocenters for sites that are needed to break ties.

This cost time - it also caused errors if the auxiliary descriptors
needed a graph expansion, because bonds in the digraph might be
pointed in the wrong direction.

Example case PDB ID 4AXM
Before this commit, errored with "Could not calculate parity! Carrier mismatch"
after 14s. After this commit, completes successfully in 0.036s.
Labelled centers all match (for the centers that had labels in
the failure case).

Includes a test that I can imagine breaking with this optimization.
The reference labels are from before this change

* Ensure all "arms" of stereo bonds and atropisomer bonds are expanded

For tetrahedral centers, ranking using the constitutional rules
always expands as far as is needed (but no further). For SP2bond
and atropisomers, if the first side is not resolvable, the
second side is never visited.

If the constitutional rules don't resolve a side, we need to
label the auxiliary centers. It's important to label all
auxiliary centers that _will_ be visited, so we need to know
what centers will be visited.

This commit updates the label() call in SP2 and Atropisomer
bonds to always attempt to label both sides if using the
constitutional rule set.

The constitutional rules are cheap, and if they fail, we
always go on to the full rule set. It is not a savings to skip
the search on the second side if we're going to keep going
anyway!

Includes a test that reproduces Ricardo's example.

This has no measurable effect on performance relative to the
original solution

* If any parts of the center have been seen, label it.

I couldn't make an example hit this, but Ric is totally
theoretically right

* Greg's ranges suggestion #2

Co-authored-by: Greg Landrum <greg.landrum@gmail.com>

* any_of for container search

Co-authored-by: Greg Landrum <greg.landrum@gmail.com>

---------

Co-authored-by: Greg Landrum <greg.landrum@gmail.com>
This commit is contained in:
Dan Nealschneider
2026-05-05 21:12:50 -07:00
committed by GitHub
parent 372fbad131
commit 1663989053
6 changed files with 68 additions and 9 deletions

View File

@@ -113,6 +113,12 @@ bool labelAux(std::vector<std::unique_ptr<Configuration>> &configs,
}
// FIXME: specific to each descriptor
const auto &foci = config->getFoci();
// Skip if none of the foci atoms were reached during expansion
if (std::ranges::none_of(foci,
[&](auto f) { return digraph.seenAtom(f); })) {
continue;
}
for (const auto &node : digraph.getNodes(foci[0])) {
if (node->isDuplicate()) {
continue;

View File

@@ -41,6 +41,11 @@ Node &Digraph::addNode(std::vector<char> &&visit, Atom *atom,
return d_nodes.back();
}
bool Digraph::seenAtom(Atom *atom) const {
return std::ranges::any_of(
d_nodes, [&](const auto &n) { return n.getAtom() == atom; });
}
void Digraph::addEdge(Node *beg, Bond *bond, Node *end) {
d_edges.emplace_back(beg, end, bond);
auto &e = d_edges.back();
@@ -74,9 +79,9 @@ int Digraph::getNumNodes() const { return d_nodes.size(); }
std::vector<Node *> Digraph::getNodes(Atom *atom) const {
std::vector<Node *> result;
std::vector<Node*> queue = {getCurrentRoot()};
std::vector<Node *> queue = {getCurrentRoot()};
for (size_t i=0; i<queue.size(); ++i) {
for (size_t i = 0; i < queue.size(); ++i) {
auto node = queue[i];
if (atom == node->getAtom()) {
result.push_back(node);

View File

@@ -94,6 +94,9 @@ class Digraph {
Node &addNode(std::vector<char> &&visit, Atom *atom,
boost::rational<int> &&frac, int dist, int flags);
// Has `atom` been seen yet?
bool seenAtom(Atom *atom) const;
private:
const CIPMol &d_mol;

View File

@@ -492,6 +492,22 @@ TEST_CASE("para-stereochemistry", "[accurateCIP]") {
chirality));
CHECK(chirality == "S");
}
SECTION("auxiliary stereochem beyond initial expansion") {
// The label on atom 12 depends on the chirality at
// atom 2. The label on atom 9 depends on the label of
// atom 12. Atom 2 is not reached in the initial
// expansion to score atom 9
auto mol = "CC[C@H](C)CCCCC[C@H]1CC[C@@H](C)CC1"_smiles;
CIPLabeler::assignCIPLabels(*mol);
std::string chirality;
CHECK(mol->getAtomWithIdx(2)->getProp<std::string>(
common_properties::_CIPCode) == "S");
CHECK(mol->getAtomWithIdx(9)->getProp<std::string>(
common_properties::_CIPCode) == "s");
CHECK(mol->getAtomWithIdx(12)->getProp<std::string>(
common_properties::_CIPCode) == "S");
}
}
TEST_CASE("para-stereochemistry2", "[accurateCIP]") {
SECTION("example 1") {
@@ -629,9 +645,34 @@ TEST_CASE("GitHub Issue #5142", "[bug][accurateCIP]") {
CIPLabeler::assignCIPLabels(*mol);
}
auto view_labels(const ROMol &mol) {
std::stringstream msg;
std::string label;
for (auto a : mol.atoms()) {
if (a->getPropIfPresent(common_properties::_CIPCode, label)) {
msg << a->getIdx() << label << ' ';
}
}
for (auto b : mol.bonds()) {
if (b->getPropIfPresent(common_properties::_CIPCode, label)) {
msg << b->getBeginAtomIdx() << '=' << b->getEndAtomIdx() << label << ' ';
}
}
return msg.str();
}
TEST_CASE("Bad skip side", "[accurateCIP]") {
auto m1 = "C1CC[C@H]2C/C(=C3\\C[C@H]4CCCC[C@H]4C3)C[C@H]2C1"_smiles;
CIPLabeler::assignCIPLabels(*m1);
CHECK("3S 8R 13S 16R 5=6E " == view_labels(*m1));
auto m2 = "C1/C(=C2\\C[C@H]3CCCC[C@H]3C2)C[C@H]2CCCC[C@@H]12"_smiles;
CIPLabeler::assignCIPLabels(*m2);
CHECK("4R 9S 12R 17S 1=2E " == view_labels(*m2));
}
TEST_CASE("Test early termination of CIP calculation", "[accurateCIP]") {
constexpr const char *molBlock = R"(
Mrv2117 11112217353D
Mrv2117 11112217353D
40 50 0 0 0 0 999 V2000
7.5483 -7.7451 -3.3419 H 0 0 0 0 0 0 0 0 0 0 0 0
@@ -1534,4 +1575,4 @@ $$$$
ranked_anchors) == true);
CHECK(ranked_anchors == std::vector<unsigned int>{0, Atom::NOATOM});
}
}
}

View File

@@ -94,6 +94,8 @@ Descriptor AtropisomerBond::label(Node *root1, Digraph &digraph,
const auto &focus1 = getFoci()[0];
const auto &focus2 = getFoci()[1];
const bool is_constitutional = comp.getNumSubRules() == 3;
d_ranked_anchors.clear();
const auto &internal = findInternalEdge(root1->getEdges(), focus1, focus2);
@@ -119,7 +121,7 @@ Descriptor AtropisomerBond::label(Node *root1, Digraph &digraph,
digraph.changeRoot(root1);
const auto &priority1 = comp.sort(root1, edges1);
if (!priority1.isUnique()) {
if (!priority1.isUnique() && !is_constitutional) {
return Descriptor::UNKNOWN;
}
// swap
@@ -132,7 +134,7 @@ Descriptor AtropisomerBond::label(Node *root1, Digraph &digraph,
}
digraph.changeRoot(root2);
const auto &priority2 = comp.sort(root2, edges2);
if (!priority2.isUnique()) {
if (!priority2.isUnique() || !priority1.isUnique()) {
return Descriptor::UNKNOWN;
}
// swap
@@ -184,4 +186,4 @@ Descriptor AtropisomerBond::label(Node *root1, Digraph &digraph,
}
} // namespace CIPLabeler
} // namespace RDKit
} // namespace RDKit

View File

@@ -88,6 +88,8 @@ Descriptor Sp2Bond::label(Node *root1, Digraph &digraph, const Rules &comp) {
const auto &focus1 = getFoci()[0];
const auto &focus2 = getFoci()[1];
const bool is_constitutional = comp.getNumSubRules() == 3;
d_ranked_anchors.clear();
const auto &internal = findInternalEdge(root1->getEdges(), focus1, focus2);
@@ -110,7 +112,7 @@ Descriptor Sp2Bond::label(Node *root1, Digraph &digraph, const Rules &comp) {
digraph.changeRoot(root1);
const auto &priority1 = comp.sort(root1, edges1);
if (!priority1.isUnique()) {
if (!priority1.isUnique() && !is_constitutional) {
return Descriptor::UNKNOWN;
}
// swap
@@ -123,7 +125,7 @@ Descriptor Sp2Bond::label(Node *root1, Digraph &digraph, const Rules &comp) {
}
digraph.changeRoot(root2);
const auto &priority2 = comp.sort(root2, edges2);
if (!priority2.isUnique()) {
if (!priority2.isUnique() || !priority1.isUnique()) {
return Descriptor::UNKNOWN;
}
// swap