CXSMILES: do not add separators for unserializable Substance Groups (#9048)

* do not write extra separators

* add a test

* update tests

* Update Code/GraphMol/SmilesParse/CXSmilesOps.cpp

---------

Co-authored-by: Greg Landrum <greg.landrum@gmail.com>
This commit is contained in:
Ricardo Rodriguez
2026-01-27 06:35:18 -05:00
committed by GitHub
parent e598f608fe
commit d3d4170e7c
6 changed files with 60 additions and 30 deletions

View File

@@ -1220,7 +1220,7 @@ TEST_CASE("CXSMILES for reactions", "[cxsmiles]") {
REQUIRE(rxn);
std::string expected_cxsmarts =
"[#6:6]-[#8:5]-[#6:3](-*)-[#8:2]-*>>[#6:6]-[#7:5]-[#6:3](-*)-[#8:2]-* |$;;;star_e;;star_e;;;;star_e;;star_e$,SgD:1,0:foo:bar::::,,SgD:7,6:foo:baz::::,,,Sg:n:4,2,1,0::ht:::,,Sg:n:10,8,7,6::ht:::,SgH:3:1.1|";
"[#6:6]-[#8:5]-[#6:3](-*)-[#8:2]-*>>[#6:6]-[#7:5]-[#6:3](-*)-[#8:2]-* |$;;;star_e;;star_e;;;;star_e;;star_e$,SgD:1,0:foo:bar::::,SgD:7,6:foo:baz::::,Sg:n:4,2,1,0::ht:::,Sg:n:10,8,7,6::ht:::,SgH:3:1.1|";
SmilesWriteParams params;
auto flags = RDKit::SmilesWrite::CX_ALL ^ RDKit::SmilesWrite::CX_ATOM_PROPS;
std::string output_cxsmarts =

View File

@@ -1 +1 @@
*C(=O)C(C)NC(=O)C(C)NC(=O)C(C)N[H] |(-9.10047,-0.90832,;-5.97455,-0.07112,;-5.97455,1.46869,;-4.64081,-0.84112,;-4.64081,-2.38112,;-3.30727,-0.07112,;-5.97455,-0.07112,;-5.97455,1.46869,;-4.64081,-0.84112,;-4.64081,-2.38112,;-3.30727,-0.07112,;0.097533,0.84112,;0.097533,2.38112,;1.43127,0.07112,;1.43127,-1.46869,;5.31655,1.11216,;9.10047,1.11216,),,,,Sg:n:11,13,12,14:1:ht:::|
*C(=O)C(C)NC(=O)C(C)NC(=O)C(C)N[H] |(-9.10047,-0.90832,;-5.97455,-0.07112,;-5.97455,1.46869,;-4.64081,-0.84112,;-4.64081,-2.38112,;-3.30727,-0.07112,;-5.97455,-0.07112,;-5.97455,1.46869,;-4.64081,-0.84112,;-4.64081,-2.38112,;-3.30727,-0.07112,;0.097533,0.84112,;0.097533,2.38112,;1.43127,0.07112,;1.43127,-1.46869,;5.31655,1.11216,;9.10047,1.11216,),Sg:n:11,13,12,14:1:ht:::|

View File

@@ -1 +1 @@
[H]NC(C)C(=O)NC(C)C(=O)NC(C)C(N)=O |(2.7453,0.5608,;0.7182,0.5608,;-1.3632,0.0031,;-1.3632,-0.8218,;-2.0777,0.4156,;-2.0777,1.2406,;-3.9017,-0.0731,;-4.6161,-0.4856,;-4.6161,-1.3106,;-5.3306,-0.0731,;-5.3306,0.7518,;-3.9017,-0.0731,;-4.6161,-0.4856,;-4.6161,-1.3106,;-5.3306,-0.0731,;-7.0052,-0.5216,;-5.3306,0.7518,),,,,Sg:n:4,2,5,3:1:ht:::|
[H]NC(C)C(=O)NC(C)C(=O)NC(C)C(N)=O |(2.7453,0.5608,;0.7182,0.5608,;-1.3632,0.0031,;-1.3632,-0.8218,;-2.0777,0.4156,;-2.0777,1.2406,;-3.9017,-0.0731,;-4.6161,-0.4856,;-4.6161,-1.3106,;-5.3306,-0.0731,;-5.3306,0.7518,;-3.9017,-0.0731,;-4.6161,-0.4856,;-4.6161,-1.3106,;-5.3306,-0.0731,;-7.0052,-0.5216,;-5.3306,0.7518,),Sg:n:4,2,5,3:1:ht:::|

View File

@@ -1768,7 +1768,7 @@ std::string get_sgroup_polymer_block(
if (sg.getPropIfPresent("TYPE", typ) &&
reverseTypemap.find(typ) != reverseTypemap.end()) {
sg.setProp("_cxsmilesOutputIndex", sgroupOutputIndex);
sgroupOutputIndex++;
++sgroupOutputIndex;
res << "Sg:";
std::string subtype;
@@ -1823,8 +1823,8 @@ std::string get_sgroup_polymer_block(
res.seekp(-1, res.cur);
}
res << ":";
res << ","; // only add a comma if we wrote something
}
res << ",";
}
std::string resStr = res.str();
@@ -1856,7 +1856,7 @@ std::string get_sgroup_data_block(const ROMol &mol,
for (const auto &sg : sgs) {
if (sg.hasProp("TYPE") && sg.getProp<std::string>("TYPE") == "DAT") {
sg.setProp("_cxsmilesOutputIndex", sgroupOutputIndex);
sgroupOutputIndex++;
++sgroupOutputIndex;
res << "SgD:";
// we don't attempt to canonicalize the atom order because the user
@@ -1894,8 +1894,8 @@ std::string get_sgroup_data_block(const ROMol &mol,
}
res << ":";
// FIX: do something about the coordinates
res << ","; // only add a comma if we wrote something
}
res << ",";
}
std::string resStr = res.str();
@@ -2113,7 +2113,7 @@ std::string get_bond_config_block(
if (!Atropisomers::getAtropisomerAtomsAndBonds(
bondNbr, atomAndBondVecs, mol)) {
throw ValueErrorException("Internal error - should not occur");
// should not happend
// should not happen
} else {
unsigned int swaps = 0;

View File

@@ -1205,7 +1205,7 @@ TEST_CASE("polymer SGroups") {
auto smi = MolToCXSmiles(*mol);
CHECK(smi ==
"*NCCO* |$star_e;;;;;star_e$,,,Sg:n:1,2::ht:::,Sg:any:3,4::hh:::|");
"*NCCO* |$star_e;;;;;star_e$,Sg:n:1,2::ht:::,Sg:any:3,4::hh:::|");
}
{ // multiple s groups + data
@@ -1245,7 +1245,7 @@ TEST_CASE("polymer SGroups") {
auto smi = MolToCXSmiles(*mol);
CHECK(smi ==
"*OCCNCC "
"|$star_e;;;;;;$,SgD:5:atomdata:val::::,,,,Sg:n:4,3::ht:::,Sg:any:"
"|$star_e;;;;;;$,SgD:5:atomdata:val::::,Sg:n:4,3::ht:::,Sg:any:"
"2,1::hh:::|");
}
}
@@ -1270,7 +1270,7 @@ TEST_CASE("SGroup hierarchy") {
CHECK(!sgs[1].hasProp("PARENT"));
CHECK(MolToCXSmiles(*mol) ==
"*CNC(C*)O* "
"|$star_e;;;;;star_e;;star_e$,,,Sg:any:2,1::ht:::,Sg:any:4,3,2,1,0,6:"
"|$star_e;;;;;star_e;;star_e$,Sg:any:2,1::ht:::,Sg:any:4,3,2,1,0,6:"
":ht:::,SgH:1:0|");
}
SECTION("nested") {
@@ -1291,7 +1291,7 @@ TEST_CASE("SGroup hierarchy") {
MolToCXSmiles(*mol) ==
"*CNC(CC(*)C*)O* |$star_e;;;;;;star_e;;star_e;;star_e$,SgD:4:internal "
"data:val::::,SgD:7:atom "
"value:value2::::,,,,,,Sg:n:7::ht:::,Sg:n:2::ht:::,Sg:any:5,7,8,4,3,2,"
"value:value2::::,Sg:n:7::ht:::,Sg:n:2::ht:::,Sg:any:5,7,8,4,3,2,"
"1,0,9::ht:::,SgH:2:1,4:0.2.3|");
}
}
@@ -1358,9 +1358,8 @@ TEST_CASE("Github #4320: Support toggling components of CXSMILES output") {
"*CNC(CC(*)C*)O* "
"|$star_e;;;;;;star_e;;star_e;;star_e$,SgD:4:internal "
"data:val::::,SgD:7:atom "
"value:value2::::,,,,,,Sg:n:7::ht:::,Sg:n:2::ht:::,Sg:any:5,7,8,4,"
"3,2,"
"1,0,9::ht:::,SgH:2:1,4:0.2.3|");
"value:value2::::,Sg:n:7::ht:::,Sg:n:2::ht:::,Sg:any:5,7,8,4,"
"3,2,1,0,9::ht:::,SgH:2:1,4:0.2.3|");
CHECK(std::unique_ptr<ROMol>(SmilesToMol(cxsmi)));
}
{
@@ -1374,11 +1373,10 @@ TEST_CASE("Github #4320: Support toggling components of CXSMILES output") {
MolToCXSmiles(*mol, ps,
SmilesWrite::CXSmilesFields::CX_ALL ^
SmilesWrite::CXSmilesFields::CX_ATOM_LABELS);
CHECK(
cxsmi ==
"*CNC(CC(*)C*)O* |SgD:4:internal data:val::::,SgD:7:atom "
"value:value2::::,,,,,,Sg:n:7::ht:::,Sg:n:2::ht:::,Sg:any:5,7,8,4,3,"
"2,1,0,9::ht:::,SgH:2:1,4:0.2.3|");
CHECK(cxsmi ==
"*CNC(CC(*)C*)O* |SgD:4:internal data:val::::,SgD:7:atom "
"value:value2::::,Sg:n:7::ht:::,Sg:n:2::ht:::,Sg:any:5,7,8,4,3,"
"2,1,0,9::ht:::,SgH:2:1,4:0.2.3|");
CHECK(std::unique_ptr<ROMol>(SmilesToMol(cxsmi)));
}
{
@@ -1387,7 +1385,7 @@ TEST_CASE("Github #4320: Support toggling components of CXSMILES output") {
SmilesWrite::CXSmilesFields::CX_SGROUPS);
CHECK(cxsmi ==
"*CNC(CC(*)C*)O* "
"|$star_e;;;;;;star_e;;star_e;;star_e$,,,Sg:n:7::ht:::,Sg:n:2::ht::"
"|$star_e;;;;;;star_e;;star_e;;star_e$,Sg:n:7::ht:::,Sg:n:2::ht::"
":,Sg:any:5,7,8,4,3,2,1,0,9::ht:::,SgH:2:0.1|");
CHECK(std::unique_ptr<ROMol>(SmilesToMol(cxsmi)));
}
@@ -1398,7 +1396,7 @@ TEST_CASE("Github #4320: Support toggling components of CXSMILES output") {
CHECK(cxsmi ==
"*CNC(CC(*)C*)O* "
"|$star_e;;;;;;star_e;;star_e;;star_e$,SgD:4:internal "
"data:val::::,SgD:7:atom value:value2::::,,,|");
"data:val::::,SgD:7:atom value:value2::::|");
CHECK(std::unique_ptr<ROMol>(SmilesToMol(cxsmi)));
}
}
@@ -3342,4 +3340,39 @@ TEST_CASE("chiral class must be nonzero") {
common_properties::_chiralPermutation) == 1);
}
}
}
}
TEST_CASE("Non-encodeable SGroups", "[CX][CXSmiles]") {
constexpr const char *molblock = R"CTAB(
RDKit 2D
0 0 0 0 0 0 0 0 0 0999 V3000
M V30 BEGIN CTAB
M V30 COUNTS 2 1 2 0 0
M V30 BEGIN ATOM
M V30 1 C -1.242424 -0.515152 0.000000 0
M V30 2 C 0.257576 -0.515152 0.000000 0
M V30 END ATOM
M V30 BEGIN BOND
M V30 1 1 1 2
M V30 END BOND
M V30 BEGIN SGROUP
M V30 1 SUP 1 ATOMS=(1 1)
M V30 2 SUP 2 ATOMS=(1 2)
M V30 END SGROUP
M V30 END CTAB
M END
$$$$)CTAB";
auto m = v2::FileParsers::MolFromMolBlock(molblock);
REQUIRE(m);
// Write to smiles to populate atom and bond output order properties
MolToSmiles(*m);
CHECK(SmilesWrite::getCXExtensions(*m) ==
"|(-1.24242,-0.515152,;0.257576,-0.515152,)|");
CHECK(SmilesWrite::getCXExtensions(
*m, RDKit::SmilesWrite::CXSmilesFields::CX_ALL_BUT_COORDS) == "");
}

View File

@@ -3604,20 +3604,17 @@ $$$$
TEST_CASE("ValidateStereo", "[accurateCIP]") {
SECTION("validateStereoUniqOldCanon1") {
testStereoValidationFromMol(validateStereoUniq1,
"*/C=C/C(=O)C1=C(*)N(*)C(=O)NC1* |,,,|", true,
true);
"*/C=C/C(=O)C1=C(*)N(*)C(=O)NC1*", true, true);
}
SECTION("validateStereoUniqNewCanon1") {
testStereoValidationFromMol(validateStereoUniq1,
"*/C=C/C(=O)C1=C(*)N(*)C(=O)NC1* |,,,|", false,
true);
"*/C=C/C(=O)C1=C(*)N(*)C(=O)NC1*", false, true);
}
SECTION("validateStereoUniqNewNoCanon1") {
testStereoValidationFromMol(validateStereoUniq1,
"N1C(=O)N(*)C(*)=C(C(/C=C/*)=O)C1* |,,,|",
false, false);
testStereoValidationFromMol(
validateStereoUniq1, "N1C(=O)N(*)C(*)=C(C(/C=C/*)=O)C1*", false, false);
}
SECTION("SprioChiralLostOldNoCanon") {