Add a custom CXSMILES feature to indicate Zero Order Bonds (#8454)

* implement the ZOB CXSMILES feature

* restore release notes
This commit is contained in:
Ricardo Rodriguez
2025-04-22 03:56:58 -04:00
committed by GitHub
parent 3ec88d643a
commit 15e0f784b7
5 changed files with 111 additions and 6 deletions

View File

@@ -509,6 +509,42 @@ bool parse_coordinate_bonds(Iterator &first, Iterator last, RDKit::RWMol &mol,
return true;
}
// Parses the custom "Z:" CXSMILES block, which lists the bonds that should
// be marked as zero order, e.g.:  C1CCCCC~CCCC1 |Z:5|
//
// On entry `first` must point at the 'Z'; it is advanced past everything that
// gets consumed. The unnamed unsigned int parameter is not used here.
//
// Returns false if the "Z:" prefix is missing, if an index cannot be read, or
// if an index accepted by VALID_BNDIDX does not resolve to a bond (a warning
// is logged in that case). Indices rejected by VALID_BNDIDX are skipped
// without an error.
template <typename Iterator>
bool parse_zero_bonds(Iterator &first, Iterator last, RDKit::RWMol &mol,
                      unsigned int, unsigned int startBondIdx) {
  // consume the two-character block header, one character at a time
  const char header[] = {'Z', ':'};
  for (const char expected : header) {
    if (first >= last || *first != expected) {
      return false;
    }
    ++first;
  }

  // comma-separated list of bond indices; stop at the first non-digit
  while (first < last && *first >= '0' && *first <= '9') {
    unsigned int cxBondIdx;
    if (!read_int(first, last, cxBondIdx)) {
      return false;
    }

    if (VALID_BNDIDX(cxBondIdx)) {
      // the lookup takes the index relative to startBondIdx
      auto zeroBond = get_bond_with_smiles_idx(mol, cxBondIdx - startBondIdx);
      if (!zeroBond) {
        BOOST_LOG(rdWarningLog)
            << "bond " << cxBondIdx
            << " not found, cannot mark as zero order bond." << std::endl;
        return false;
      }
      zeroBond->setBondType(Bond::ZERO);
    }

    // step over the separator (if any) following this index
    if (first < last && *first == ',') {
      ++first;
    }
  }
  return true;
}
template <typename Iterator>
bool parse_unsaturation(Iterator &first, Iterator last, RDKit::RWMol &mol,
unsigned int startAtomIdx) {
@@ -1407,6 +1443,10 @@ bool parse_it(Iterator &first, Iterator last, RDKit::RWMol &mol,
startAtomIdx, startBondIdx)) {
return false;
}
} else if (*first == 'Z') {
if (!parse_zero_bonds(first, last, mol, startAtomIdx, startBondIdx)) {
return false;
}
} else if (*first == '^') {
if (!parse_radicals(first, last, mol, startAtomIdx)) {
return false;
@@ -2238,6 +2278,26 @@ std::string get_coordbonds_block(const ROMol &mol,
return res;
}
// Builds the "Z:" CXSMILES block listing the zero order bonds of `mol`.
//
// Each entry of `bondOrder` is used as a bond index into `mol`; for every
// such bond whose type is ZERO, the *position* of that entry within
// `bondOrder` is written (presumably the bond's position in the SMILES
// output order -- confirm against the caller). The unnamed vector parameter
// is not used.
//
// Returns "Z:" followed by the comma-separated positions, or an empty string
// if no bond in `bondOrder` is zero order.
std::string get_zerobonds_block(const ROMol &mol,
                                const std::vector<unsigned int> &,
                                const std::vector<unsigned int> &bondOrder) {
  std::string block;
  unsigned int nextPos = 0;
  for (const auto molBondIdx : bondOrder) {
    const unsigned int pos = nextPos++;
    if (mol.getBondWithIdx(molBondIdx)->getBondType() ==
        Bond::BondType::ZERO) {
      // the first hit opens the block, later hits are comma-separated
      block += block.empty() ? "Z:" : ",";
      block += boost::str(boost::format("%d") % pos);
    }
  }
  return block;
}
std::string get_ringbond_cistrans_block(
const ROMol &mol, const std::vector<unsigned int> &atomOrder,
const std::vector<unsigned int> &bondOrder) {
@@ -2533,6 +2593,11 @@ std::string getCXExtensions(const ROMol &mol, std::uint32_t flags) {
appendToCXExtension(block, res);
}
if (flags & SmilesWrite::CXSmilesFields::CX_ZERO_BONDS) {
const auto block = get_zerobonds_block(mol, atomOrder, bondOrder);
appendToCXExtension(block, res);
}
if (flags & SmilesWrite::CXSmilesFields::CX_LINKNODES) {
const auto linknodeblock = get_linknodes_block(mol, atomOrder);
appendToCXExtension(linknodeblock, res);

View File

@@ -54,7 +54,7 @@ struct RDKIT_SMILESPARSE_EXPORT SmilesWriteParams {
namespace SmilesWrite {
BETTER_ENUM(CXSmilesFields, uint32_t,
BETTER_ENUM(CXSmilesFields, uint32_t, // clang-format off
CX_NONE = 0,
CX_ATOM_LABELS = 1 << 0,
CX_MOLFILE_VALUES = 1 << 1,
@@ -68,6 +68,7 @@ BETTER_ENUM(CXSmilesFields, uint32_t,
CX_BOND_CFG = 1 << 9,
CX_BOND_ATROPISOMER = 1 << 10,
CX_COORDINATE_BONDS = 1 << 11,
CX_ZERO_BONDS = 1 << 12,
CX_ALL = 0x7fffffff,
CX_ALL_BUT_COORDS = CX_ALL ^ CX_COORDS
);

View File

@@ -3093,4 +3093,37 @@ TEST_CASE("atoms bound to metals should always have Hs specified") {
CHECK(osmi == expected);
}
}
}
// Exercises the custom "Z" (zero order bond) CXSMILES extension in both
// directions: writing it out and reading it back in.
TEST_CASE("ZOB cx smiles extension", "[smiles][cxsmiles]") {
  SECTION("basics") {
    // writing: flag the only bond of CC as zero order and serialize
    auto mol = "CC"_smiles;
    REQUIRE(mol);
    mol->getBondWithIdx(0)->setBondType(Bond::ZERO);
    auto cxsmi = MolToCXSmiles(*mol);
    REQUIRE(cxsmi == "C~C |Z:0|");

    // reading: the bond type must survive the round trip
    auto reparsed = RDKit::v2::SmilesParse::MolFromSmiles(cxsmi);
    REQUIRE(reparsed);
    CHECK(reparsed->getBondWithIdx(0)->getBondType() == Bond::ZERO);
  }
  SECTION("Reverse") {
    constexpr const char *smi = "FB1(F)N2CCCC/C2=N/C2=[NH+]~1CCC=C2 |Z:12|";

    // parse without sanitization
    auto params = v2::SmilesParse::SmilesParserParams();
    params.sanitize = false;
    auto mol = v2::SmilesParse::MolFromSmiles(smi, params);
    REQUIRE(mol);

    // the bond listed as 12 in the Z block is expected at molecule bond
    // index 15, running from an N (7) to a B (5)
    auto zob = mol->getBondWithIdx(15);
    CHECK(zob->getBondType() == Bond::BondType::ZERO);
    CHECK(zob->getBeginAtom()->getAtomicNum() == 7);
    CHECK(zob->getEndAtom()->getAtomicNum() == 5);

    // writing the molecule back out must reproduce the input exactly
    REQUIRE(MolToCXSmiles(*mol) == smi);
  }
}

View File

@@ -156,8 +156,8 @@ ROMol *MolFromMolBlock(python::object imolBlock, bool sanitize, bool removeHs,
return static_cast<ROMol *>(newM);
}
ROMol *MolFromMolFile(const std::string &molFilename, bool sanitize, bool removeHs,
bool strictParsing) {
ROMol *MolFromMolFile(const std::string &molFilename, bool sanitize,
bool removeHs, bool strictParsing) {
RWMol *newM = nullptr;
try {
newM = MolFileToMol(molFilename, sanitize, removeHs, strictParsing);
@@ -171,7 +171,8 @@ ROMol *MolFromMolFile(const std::string &molFilename, bool sanitize, bool remove
return static_cast<ROMol *>(newM);
}
ROMol *MolFromMrvFile(const std::string &molFilename, bool sanitize, bool removeHs) {
ROMol *MolFromMrvFile(const std::string &molFilename, bool sanitize,
bool removeHs) {
RWMol *newM = nullptr;
try {
newM = MrvFileToMol(molFilename, sanitize, removeHs);
@@ -604,7 +605,8 @@ python::object addMetadataToPNGStringHelper(python::dict pymetadata,
return retval;
}
python::object MolsFromPNGFile(const std::string &filename, const std::string &tag,
python::object MolsFromPNGFile(const std::string &filename,
const std::string &tag,
python::object pyParams) {
SmilesParserParams params;
if (pyParams) {
@@ -1729,6 +1731,7 @@ BOOST_PYTHON_MODULE(rdmolfiles) {
RDKit::SmilesWrite::CXSmilesFields::CX_BOND_ATROPISOMER)
.value("CX_COORDINATE_BONDS",
RDKit::SmilesWrite::CXSmilesFields::CX_COORDINATE_BONDS)
.value("CX_ZERO_BONDS", RDKit::SmilesWrite::CXSmilesFields::CX_ZERO_BONDS)
.value("CX_ALL", RDKit::SmilesWrite::CXSmilesFields::CX_ALL)
.value("CX_ALL_BUT_COORDS",
RDKit::SmilesWrite::CXSmilesFields::CX_ALL_BUT_COORDS);

View File

@@ -251,6 +251,8 @@ The features which are parsed include:
``Q_e``, ``QH_p``, ``AH_P``, ``X_p``, ``XH_p``, ``M_p``, ``MH_p``, ``*``)
- atomic properties ``atomprop``
- coordinate/dative bonds ``C`` (these are translated into dative bonds)
- hydrogen bonds ``H``
- zero order bonds ``Z`` (custom extension, same syntax as c/t/ctu below)
- radicals ``^``
- enhanced stereo (these are converted into ``StereoGroups``)
- linknodes ``LN``
@@ -292,8 +294,9 @@ The features which are written by :py:func:`rdkit.Chem.rdmolfiles.MolToCXSmiles`
>>> m.GetAtomWithIdx(1).SetProp('p2','A1')
>>> m.GetAtomWithIdx(0).SetProp('atomLabel','O1')
>>> m.GetAtomWithIdx(1).SetProp('atomLabel','C2')
>>> m.GetBondWithIdx(0).SetBondType(Chem.BondType.ZERO)
>>> Chem.MolToCXSmiles(m)
'CO |$C2;O1$,atomProp:0.p1.5:0.p2.A1:1.p1.2|'
'C~O |$C2;O1$,atomProp:0.p1.5:0.p2.A1:1.p1.2,Z:0|'
Reading molecule names
----------------------