Fixes #9231: improve escaping values in CXSMILES (#9273)

* this is a fix, but it breaks other tests

* all tests pass.

This undoes the changes made as part of the "fix" for #5466

* update js tests

* response to review

---------

Co-authored-by: = <=>
This commit is contained in:
Greg Landrum
2026-05-17 17:51:14 +02:00
committed by GitHub
parent 434714e7d4
commit 1dfc9b7a1b
5 changed files with 123 additions and 49 deletions

View File

@@ -1,5 +1,5 @@
//
// Copyright (C) 2016-2021 Greg Landrum and other RDKit contributors
// Copyright (C) 2016-2026 Greg Landrum and other RDKit contributors
//
// @@ All Rights Reserved @@
// This file is part of the RDKit.
@@ -9,7 +9,6 @@
//
#include <RDGeneral/BoostStartInclude.h>
#include <boost/algorithm/string.hpp>
#include <boost/lexical_cast.hpp>
#include <boost/format.hpp>
#include <RDGeneral/BoostEndInclude.h>
#include <GraphMol/RDKitBase.h>
@@ -28,6 +27,7 @@
#include <array>
#include <map>
#include <string_view>
#include <string>
namespace SmilesParseOps {
using namespace RDKit;
@@ -124,7 +124,7 @@ bool read_int(Iterator &first, Iterator last, unsigned int &res) {
if (num.empty()) {
return false;
}
res = boost::lexical_cast<unsigned int>(num);
res = std::atoi(num.c_str());
return true;
}
template <typename Iterator>
@@ -137,7 +137,7 @@ bool read_int_list(Iterator &first, Iterator last,
++first;
}
if (!num.empty()) {
res.push_back(boost::lexical_cast<unsigned int>(num));
res.push_back(std::atoi(num.c_str()));
}
if (first >= last || *first != sep) {
break;
@@ -182,7 +182,7 @@ std::string read_text_to(Iterator &first, Iterator last, std::string delims) {
}
if (next > first + 2) {
std::string blk = std::string(first + 2, next);
res += (char)(boost::lexical_cast<int>(blk));
res += (char)(std::atoi(blk.c_str()));
}
first = next + 1;
start = first;
@@ -441,13 +441,13 @@ bool parse_coords(Iterator &first, Iterator last, RDKit::RWMol &mol,
std::vector<std::string> tokens;
boost::split(tokens, tkn, boost::is_any_of(std::string(",")));
if (tokens.size() >= 1 && tokens[0].size()) {
pt.x = boost::lexical_cast<double>(tokens[0]);
pt.x = std::atof(tokens[0].c_str());
}
if (tokens.size() >= 2 && tokens[1].size()) {
pt.y = boost::lexical_cast<double>(tokens[1]);
pt.y = std::atof(tokens[1].c_str());
}
if (tokens.size() >= 3 && tokens[2].size()) {
pt.z = boost::lexical_cast<double>(tokens[2]);
pt.z = std::atof(tokens[2].c_str());
is3D = true;
}
}
@@ -1621,17 +1621,23 @@ getSortedStereoGroupsAndIndices(
return {std::move(sgs), std::move(sgAtomIdxs)};
}
std::string quote_string(const std::string &txt) {
// FIX
return txt;
bool is_alphanumeric(char c) {
return (c >= '0' && c <= '9') || (c >= 'A' && c <= 'Z') ||
(c >= 'a' && c <= 'z');
}
std::string quote_atomprop_string(const std::string &txt) {
// at a bare minimum, . needs to be escaped
// from:
// https://docs.chemaxon.com/latest/formats_chemaxon-extended-smiles-and-smarts-cxsmiles-and-cxsmarts.html#escaping
const std::string sgroupAllowedSpecialChars = "><\"!@#$%()[]./\\?-+*^_~= ";
const std::string atomPropAllowedSpecialChars = "><\"!@#$%()[]./\\?-+*^_~= ";
const std::string labelAllowedSpecialChars = "><\"!@#%()[]./\\?-+*^_~=,: ";
std::string quote_string(const std::string &txt,
std::string allowedSpecialChars) {
std::string res;
for (auto c : txt) {
if (c == '.') {
res += "&#46;";
if (!allowedSpecialChars.empty() && !is_alphanumeric(c) &&
allowedSpecialChars.find(c) == std::string::npos) {
res += "&#" + std::to_string(static_cast<unsigned int>(c)) + ";";
} else {
res += c;
}
@@ -1801,7 +1807,7 @@ std::string get_sgroup_polymer_block(
res << ":";
std::string label;
if (sg.getPropIfPresent("LABEL", label)) {
res << label;
res << quote_string(label, sgroupAllowedSpecialChars);
}
res << ":";
std::string connect;
@@ -1876,13 +1882,13 @@ std::string get_sgroup_data_block(const ROMol &mol,
res << ":";
std::string prop;
if (sg.getPropIfPresent("FIELDNAME", prop) && !prop.empty()) {
res << prop;
res << quote_string(prop, sgroupAllowedSpecialChars);
}
res << ":";
std::vector<std::string> vprop;
if (sg.getPropIfPresent("DATAFIELDS", vprop) && !vprop.empty()) {
for (const auto &pv : vprop) {
res << pv << ",";
res << quote_string(pv, sgroupAllowedSpecialChars) << ",";
}
// remove the extra ",":
res.seekp(-1, res.cur);
@@ -1893,11 +1899,11 @@ std::string get_sgroup_data_block(const ROMol &mol,
}
res << ":";
if (sg.getPropIfPresent("FIELDINFO", prop) && !prop.empty()) {
res << prop;
res << quote_string(prop, sgroupAllowedSpecialChars);
}
res << ":";
if (sg.getPropIfPresent("FIELDTAG", prop) && !prop.empty()) {
res << prop;
res << quote_string(prop, sgroupAllowedSpecialChars);
}
res << ":";
// FIX: do something about the coordinates
@@ -1926,20 +1932,21 @@ std::string get_atomlabel_block(const ROMol &mol,
const auto atom = mol.getAtomWithIdx(idx);
if (atom->getPropIfPresent(common_properties::_QueryAtomGenericLabel,
lbl)) {
res += quote_string(lbl + "_p");
res += quote_string(lbl + "_p", labelAllowedSpecialChars);
} else if (!atom->getAtomicNum() &&
atom->getPropIfPresent(common_properties::dummyLabel, lbl) &&
std::find(SmilesParseOps::pseudoatoms.begin(),
SmilesParseOps::pseudoatoms.end(),
lbl) != SmilesParseOps::pseudoatoms.end()) {
res += quote_string(lbl + "_p");
res += quote_string(lbl + "_p", labelAllowedSpecialChars);
} else if (!atom->getAtomicNum() &&
atom->getPropIfPresent(common_properties::_fromAttachPoint,
val) &&
(val == 1 || val == 2)) {
res += quote_string("_AP" + std::to_string(val));
res +=
quote_string("_AP" + std::to_string(val), labelAllowedSpecialChars);
} else if (atom->getPropIfPresent(common_properties::atomLabel, lbl)) {
res += quote_string(lbl);
res += quote_string(lbl, labelAllowedSpecialChars);
}
}
// if we didn't find anything return an empty string
@@ -1963,7 +1970,7 @@ std::string get_value_block(const ROMol &mol,
}
std::string lbl;
if (mol.getAtomWithIdx(idx)->getPropIfPresent(prop, lbl)) {
res += quote_string(lbl);
res += quote_string(lbl, atomPropAllowedSpecialChars);
}
}
return res;
@@ -2060,9 +2067,9 @@ std::string get_atom_props_block(const ROMol &mol,
if (res.empty()) {
res += "atomProp";
}
res +=
boost::str(boost::format(":%d.%s.%s") % which %
quote_atomprop_string(pn) % quote_atomprop_string(pv));
res += boost::str(boost::format(":%d.%s.%s") % which %
quote_string(pn, atomPropAllowedSpecialChars) %
quote_string(pv, atomPropAllowedSpecialChars));
}
}
++which;

View File

@@ -256,7 +256,8 @@ TEST_CASE("github #2257: writing cxsmiles", "[smiles][cxsmiles]") {
CHECK(mol->getAtomWithIdx(1)->getProp<std::string>("p1") == "v1;2;3");
auto smi = MolToCXSmiles(*mol);
CHECK(smi == "CC1CN1 |atomProp:2.p2.v2:2.p1.v1;2;3:3.p2.v2:3.p1.v1|");
CHECK(smi ==
"CC1CN1 |atomProp:2.p2.v2:2.p1.v1&#59;2&#59;3:3.p2.v2:3.p1.v1|");
}
SECTION("atom props and values") {
//"CN |$_AV:atomv0;atomv1$,atomProp:0.p2.v2:1.p2.v1|";
@@ -2005,12 +2006,12 @@ TEST_CASE("github #5466 writing floating point atom props cxsmiles",
mol->getAtomWithIdx(0)->setProp<std::string>("foo", "7.6");
auto smi = MolToCXSmiles(*mol);
CHECK(smi == "C |atomProp:0.foo.7&#46;6|");
CHECK(smi == "C |atomProp:0.foo.7.6|");
// 7.5 is exactly representable in IEEE so this helps :)
mol->getAtomWithIdx(0)->setProp<double>("foo", 7.5);
smi = MolToCXSmiles(*mol);
CHECK(smi == "C |atomProp:0.foo.7&#46;5|");
CHECK(smi == "C |atomProp:0.foo.7.5|");
}
SECTION("label with .") {
auto mol = "C"_smiles;
@@ -2018,7 +2019,7 @@ TEST_CASE("github #5466 writing floating point atom props cxsmiles",
mol->getAtomWithIdx(0)->setProp<int>("foo.foo", 7);
auto smi = MolToCXSmiles(*mol);
CHECK(smi == "C |atomProp:0.foo&#46;foo.7|");
CHECK(smi == "C |atomProp:0.foo.foo.7|");
}
}

View File

@@ -1,5 +1,5 @@
//
// Copyright (C) 2016-2023 Greg Landrum
// Copyright (C) 2016-2026 Greg Landrum and other RDKit contributors
//
// @@ All Rights Reserved @@
// This file is part of the RDKit.
@@ -477,15 +477,51 @@ TEST_CASE("enhanced stereo") {
TEST_CASE("HTML char codes") {
{
std::string smiles = R"(CCCC* |$;;;;_AP1$,Sg:n:2:2&#44;6-7:ht|)";
SmilesParserParams params;
v2::SmilesParse::SmilesParserParams params;
params.allowCXSMILES = true;
ROMol *m = SmilesToMol(smiles, params);
auto m = v2::SmilesParse::MolFromSmiles(smiles, params);
REQUIRE(m);
CHECK(m->getNumAtoms() == 5);
auto sgs = getSubstanceGroups(*m);
REQUIRE(sgs.size() == 1);
CHECK(sgs[0].getProp<std::string>("TYPE") == "SRU");
}
{
auto m = R"CTAB(
Mrv2222 05072606252D
delete m;
0 0 0 0 0 999 V3000
M V30 BEGIN CTAB
M V30 COUNTS 4 3 1 0 0
M V30 BEGIN ATOM
M V30 1 C 2.31 -1.3337 0 0
M V30 2 C 1.54 -0 0 0
M V30 3 C -0 -0 0 0
M V30 4 C -0.77 1.3337 0 0 ATTCHPT=1
M V30 END ATOM
M V30 BEGIN BOND
M V30 1 1 1 2
M V30 2 1 2 3
M V30 3 1 3 4
M V30 END BOND
M V30 BEGIN SGROUP
M V30 1 SRU 0 ATOMS=(1 3) XBONDS=(2 2 3) BRKXYZ=(9 -1.1173 0.0872 0 0.4832 -
M V30 1.0112 0 0 0 0) BRKXYZ=(9 0.6341 0.924 0 0.6341 -0.924 0 0 0 0) -
M V30 CONNECT=HT LABEL="2,6-7"
M V30 END SGROUP
M V30 END CTAB
M END
)CTAB"_ctab;
REQUIRE(m);
CHECK(m->getNumAtoms() == 4);
m->clearConformers();
auto sgs = getSubstanceGroups(*m);
REQUIRE(sgs.size() == 1);
CHECK(sgs[0].getProp<std::string>("TYPE") == "SRU");
auto smi = MolToCXSmiles(*m);
CHECK(smi == "CCCC |atomProp:3.molAttchpt.1,Sg:n:2:2&#44;6-7:ht:::|");
}
}
@@ -1094,8 +1130,7 @@ TEST_CASE("testAtropisomersInCXSmiles") {
};
for (auto smiTest : smiTests) {
printf("Test\n\n %s\n\n", smiTest.fileName.c_str());
// RDDepict::preferCoordGen = true;
INFO(smiTest.fileName);
testOneAtropisomers(&smiTest);
}
}
@@ -1112,7 +1147,7 @@ TEST_CASE("testAtropisomersInCXSmilesCanon") {
};
for (auto smiTest : smiTests) {
printf("Test\n\n %s\n\n", smiTest.fileName.c_str());
INFO(smiTest.fileName);
testOneAtropisomersCanon(&smiTest);
}
}
@@ -1128,7 +1163,7 @@ TEST_CASE("test3DChiral") {
};
for (auto smiTest : smiTests) {
printf("Test\n\n %s\n\n", smiTest.fileName.c_str());
INFO(smiTest.fileName);
// RDDepict::preferCoordGen = true;
testOne3dChiral(&smiTest);
}
@@ -1739,4 +1774,35 @@ TEST_CASE("duplicate atoms in StereoGroup") {
CHECK_THROWS_AS(SmilesToMol(smiles), SmilesParseException);
}
}
TEST_CASE("Github #9231: quoting in CXSMILES") {
auto m = "CCCCC"_smiles;
REQUIRE(m);
SECTION("as reported") {
m->getAtomWithIdx(0)->setProp("p1", "1,2");
m->getAtomWithIdx(0)->setProp("p2", "0");
m->getAtomWithIdx(0)->setProp("p3", "|");
m->getAtomWithIdx(0)->setProp("p4,5", "foo");
auto smi = MolToCXSmiles(*m);
CHECK(smi ==
"CCCCC |atomProp:0.p1.1&#44;2:0.p2.0:0.p3.&#124;:0.p4&#44;5.foo|");
auto nmol = v2::SmilesParse::MolFromSmiles(smi);
REQUIRE(nmol);
CHECK(m->getAtomWithIdx(0)->getProp<std::string>("p1") == "1,2");
CHECK(m->getAtomWithIdx(0)->getProp<std::string>("p2") == "0");
CHECK(m->getAtomWithIdx(0)->getProp<std::string>("p3") == "|");
CHECK(m->getAtomWithIdx(0)->getProp<std::string>("p4,5") == "foo");
}
SECTION("perverse example") {
// bp-kelley came up with this beauty. :-)
m->getAtomWithIdx(0)->setProp("cxsmiles", "C |atomProp:0.apKa.7.54|");
auto smi = MolToCXSmiles(*m);
CHECK(smi ==
"CCCCC |atomProp:0.cxsmiles.C &#124;atomProp&#58;0.apKa.7.54&#124;|");
auto nmol = v2::SmilesParse::MolFromSmiles(smi);
REQUIRE(nmol);
CHECK(m->getAtomWithIdx(0)->getProp<std::string>("cxsmiles") ==
"C |atomProp:0.apKa.7.54|");
}
}

View File

@@ -2897,7 +2897,7 @@ M END\n\
"{\"CX_ALL_BUT_COORDS\":true}");
assert(cxsmiles_with_atom_prop);
assert(!strcmp(cxsmiles_with_atom_prop,
"NC1CC2CC(O)C1C2 |atomProp:5.atomProp.1&#46;234|"));
"NC1CC2CC(O)C1C2 |atomProp:5.atomProp.1.234|"));
free(cxsmiles_with_atom_prop);
free(mpkl_atom_prop);
free(mpkl);
@@ -2920,13 +2920,13 @@ M END\n\
smarts = get_cxsmarts(mpkl, mpkl_size, empty_json[i]);
assert(smarts);
assert(!strcmp(
smarts, "N-[C@&H1](-C(-O)=O)-C(-C)-C |atomProp:1.atomProp.1&#46;234|"));
smarts, "N-[C@&H1](-C(-O)=O)-C(-C)-C |atomProp:1.atomProp.1.234|"));
free(smarts);
}
smarts = get_cxsmarts(mpkl, mpkl_size, "{\"doIsomericSmiles\":false}");
assert(smarts);
assert(!strcmp(smarts,
"N-[C&H1](-C(-O)=O)-C(-C)-C |atomProp:1.atomProp.1&#46;234|"));
"N-[C&H1](-C(-O)=O)-C(-C)-C |atomProp:1.atomProp.1.234|"));
free(smarts);
free(mpkl);
}

View File

@@ -3176,11 +3176,11 @@ M END
{
const nonCanonicalCXSmilesNoStereo = mol.get_cxsmiles(JSON.stringify({doIsomericSmiles: false, canonical: false, CX_ALL_BUT_COORDS: true}));
assert(nonCanonicalCXSmilesNoStereo === 'OC1CC2CC(N)C1C2');
const nonCanonicalCXSmilesNoStereoAtomProp = `${nonCanonicalCXSmilesNoStereo} |atomProp:1.atomProp.1&#46;234|`;
const nonCanonicalCXSmilesNoStereoAtomProp = `${nonCanonicalCXSmilesNoStereo} |atomProp:1.atomProp.1.234|`;
const molWithAtomProp = RDKitModule.get_mol(nonCanonicalCXSmilesNoStereoAtomProp);
assert(molWithAtomProp);
const cxSmilesWithAtomProp = molWithAtomProp.get_cxsmiles(JSON.stringify({CX_ALL_BUT_COORDS: true}));
assert(cxSmilesWithAtomProp === 'NC1CC2CC(O)C1C2 |atomProp:5.atomProp.1&#46;234|');
assert(cxSmilesWithAtomProp === 'NC1CC2CC(O)C1C2 |atomProp:5.atomProp.1.234|');
molWithAtomProp.delete();
}
mol.delete();
@@ -3194,12 +3194,12 @@ M END
assert(chiralQuery.get_smarts(JSON.stringify({doIsomericSmiles: false})) === 'N-[C&H1](-C(-O)=O)-C(-C)-C');
}
{
const chiralQuery = RDKitModule.get_qmol('N-[C@H](-C(-O)=O)-C(-C)-C |atomProp:1.atomProp.1&#46;234|');
assert(chiralQuery.get_cxsmarts() === 'N-[C@&H1](-C(-O)=O)-C(-C)-C |atomProp:1.atomProp.1&#46;234|');
const chiralQuery = RDKitModule.get_qmol('N-[C@H](-C(-O)=O)-C(-C)-C |atomProp:1.atomProp.1.234|');
assert(chiralQuery.get_cxsmarts() === 'N-[C@&H1](-C(-O)=O)-C(-C)-C |atomProp:1.atomProp.1.234|');
['', '{}'].forEach((emptyJson) => {
assert(chiralQuery.get_cxsmarts(emptyJson) === 'N-[C@&H1](-C(-O)=O)-C(-C)-C |atomProp:1.atomProp.1&#46;234|');
assert(chiralQuery.get_cxsmarts(emptyJson) === 'N-[C@&H1](-C(-O)=O)-C(-C)-C |atomProp:1.atomProp.1.234|');
});
assert(chiralQuery.get_cxsmarts(JSON.stringify({doIsomericSmiles: false})) === 'N-[C&H1](-C(-O)=O)-C(-C)-C |atomProp:1.atomProp.1&#46;234|');
assert(chiralQuery.get_cxsmarts(JSON.stringify({doIsomericSmiles: false})) === 'N-[C&H1](-C(-O)=O)-C(-C)-C |atomProp:1.atomProp.1.234|');
}
}