Files
rdkit/Code/GraphMol/SmilesParse/CXSmilesOps.cpp

2582 lines
79 KiB
C++

//
// Copyright (C) 2016-2021 Greg Landrum and other RDKit contributors
//
// @@ All Rights Reserved @@
// This file is part of the RDKit.
// The contents are covered by the terms of the BSD license
// which is included in the file license.txt, found at the root
// of the RDKit source tree.
//
#include <RDGeneral/BoostStartInclude.h>
#include <boost/algorithm/string.hpp>
#include <boost/lexical_cast.hpp>
#include <boost/format.hpp>
#include <RDGeneral/BoostEndInclude.h>
#include <GraphMol/RDKitBase.h>
#include <GraphMol/RDKitQueries.h>
#include <GraphMol/FileParsers/MolFileStereochem.h>
#include <GraphMol/Atropisomers.h>
#include <GraphMol/Chirality.h>
#include <iostream>
#include <algorithm>
#include "SmilesWrite.h"
#include "SmilesParse.h"
#include "SmilesParseOps.h"
#include <GraphMol/MolEnumerator/LinkNode.h>
#include <GraphMol/Chirality.h>
#include <map>
namespace SmilesParseOps {
using namespace RDKit;
const std::string cxsmilesindex = "_cxsmilesindex";
const std::string cxsgTracker = "_sgTracker";
// FIX: once this can be automated using constexpr, do so
const std::vector<std::string_view> pseudoatoms{"Pol", "Mod"};
const std::vector<std::string_view> pseudoatoms_p{"Pol_p", "Mod_p"};
std::map<std::string, std::string> sgroupTypemap = {
{"n", "SRU"}, {"mon", "MON"}, {"mer", "MER"}, {"co", "COP"},
{"xl", "CRO"}, {"mod", "MOD"}, {"mix", "MIX"}, {"f", "FOR"},
{"any", "ANY"}, {"gen", "GEN"}, {"c", "COM"}, {"grf", "GRA"},
{"alt", "COP"}, {"ran", "COP"}, {"blk", "COP"}};
template <typename Q>
void addquery(Q *qry, std::string symbol, RDKit::RWMol &mol, unsigned int idx) {
PRECONDITION(qry, "bad query");
auto *qa = new QueryAtom(0);
qa->setQuery(qry);
qa->setNoImplicit(true);
mol.replaceAtom(idx, qa);
if (symbol != "") {
mol.getAtomWithIdx(idx)->setProp(RDKit::common_properties::atomLabel,
symbol);
}
delete qa;
}
void processCXSmilesLabels(RWMol &mol) {
if (mol.hasProp("_cxsmilesLabelsProcessed")) {
return;
}
for (auto atom : mol.atoms()) {
std::string symb = "";
if (atom->getPropIfPresent(common_properties::atomLabel, symb)) {
atom->clearProp(common_properties::dummyLabel);
if (symb == "star_e") {
/* according to the MDL spec, these match anything, but in MARVIN they
are "unspecified end groups" for polymers */
addquery(makeAtomNullQuery(), symb, mol, atom->getIdx());
} else if (symb == "Q_e") {
addquery(makeQAtomQuery(), symb, mol, atom->getIdx());
} else if (symb == "QH_p") {
addquery(makeQHAtomQuery(), symb, mol, atom->getIdx());
} else if (symb == "AH_p") { // this seems wrong...
/* According to the MARVIN Sketch, AH is "any atom, including H" -
this would be "*" in SMILES - and "A" is "any atom except H".
The CXSMILES docs say that "A" can be represented normally in SMILES
and that "AH" needs to be written out as AH_p. I'm going to assume that
this is a Marvin internal thing and just parse it as they describe it.
This means that "*" in the SMILES itself needs to be treated
differently, which we do below. */
addquery(makeAHAtomQuery(), symb, mol, atom->getIdx());
} else if (symb == "X_p") {
addquery(makeXAtomQuery(), symb, mol, atom->getIdx());
} else if (symb == "XH_p") {
addquery(makeXHAtomQuery(), symb, mol, atom->getIdx());
} else if (symb == "M_p") {
addquery(makeMAtomQuery(), symb, mol, atom->getIdx());
} else if (symb == "MH_p") {
addquery(makeMHAtomQuery(), symb, mol, atom->getIdx());
} else if (std::find(pseudoatoms_p.begin(), pseudoatoms_p.end(), symb) !=
pseudoatoms_p.end()) {
// strip off the "_p":
atom->setProp(common_properties::dummyLabel,
symb.substr(0, symb.size() - 2));
atom->clearProp(common_properties::atomLabel);
}
} else if (atom->getAtomicNum() == 0 && !atom->hasQuery() &&
atom->getSymbol() == "*") {
addquery(makeAAtomQuery(), "", mol, atom->getIdx());
}
}
mol.setProp("_cxsmilesLabelsProcessed", 1, true);
}
namespace parser {
const std::string _headCrossings = "_headCrossings";
const std::string _tailCrossings = "_tailCrossings";
template <typename Iterator>
bool read_int(Iterator &first, Iterator last, unsigned int &res) {
std::string num = "";
while (first <= last && *first >= '0' && *first <= '9') {
num += *first;
++first;
}
if (num.empty()) {
return false;
}
res = boost::lexical_cast<unsigned int>(num);
return true;
}
template <typename Iterator>
bool read_int_list(Iterator &first, Iterator last,
std::vector<unsigned int> &res, char sep = ',') {
while (1) {
std::string num = "";
while (first <= last && *first >= '0' && *first <= '9') {
num += *first;
++first;
}
if (!num.empty()) {
res.push_back(boost::lexical_cast<unsigned int>(num));
}
if (first >= last || *first != sep) {
break;
}
++first;
}
return true;
}
template <typename Iterator>
bool read_int_pair(Iterator &first, Iterator last, unsigned int &n1,
unsigned int &n2, char sep = '.') {
if (!read_int(first, last, n1)) {
return false;
}
if (first >= last || *first != sep) {
return false;
}
++first;
return read_int(first, last, n2);
}
template <typename Iterator>
std::string read_text_to(Iterator &first, Iterator last, std::string delims) {
std::string res = "";
Iterator start = first;
// EFF: there are certainly faster ways to do this
while (first <= last && delims.find_first_of(*first) == std::string::npos) {
if (*first == '&' && std::distance(first, last) > 2 &&
*(first + 1) == '#') {
// escaped char
if (start != first) {
res += std::string(start, first);
}
Iterator next = first + 2;
while (next != last && *next >= '0' && *next <= '9') {
++next;
}
if (next == last || *next != ';') {
throw RDKit::SmilesParseException(
"failure parsing CXSMILES extensions: quoted block not terminated "
"with ';'");
}
if (next > first + 2) {
std::string blk = std::string(first + 2, next);
res += (char)(boost::lexical_cast<int>(blk));
}
first = next + 1;
start = first;
} else {
++first;
}
}
if (start != first) {
res += std::string(start, first);
}
return res;
}
namespace {
// this is the super fun case where no information about bonds in/out of the
// sgroup is present.
void setupUnmarkedPolymerSGroup(RWMol &mol, SubstanceGroup &sgroup,
std::vector<unsigned int> &headCrossings,
std::vector<unsigned int> &tailCrossings) {
const auto &atoms = sgroup.getAtoms();
if (atoms.empty()) {
throw SmilesParseException("no atoms in polymer sgroup");
}
const auto firstAtom = mol.getAtomWithIdx(atoms.front());
for (auto nbr : boost::make_iterator_range(mol.getAtomNeighbors(firstAtom))) {
const auto nbrAtom = mol[nbr];
if (std::find(atoms.begin(), atoms.end(), nbrAtom->getIdx()) ==
atoms.end()) {
// in most cases we just add this to the set of headCrossings.
// The exception occurs when there's only one atom in the SGroup and
// we already have a headCrossing, in which case we may put this one
// as a tailCrossing
if (atoms.size() > 1 || headCrossings.empty()) {
headCrossings.push_back(
mol.getBondBetweenAtoms(firstAtom->getIdx(), nbrAtom->getIdx())
->getIdx());
} else if (atoms.size() == 1) {
if (tailCrossings.empty()) {
tailCrossings.push_back(
mol.getBondBetweenAtoms(firstAtom->getIdx(), nbrAtom->getIdx())
->getIdx());
} else {
BOOST_LOG(rdWarningLog)
<< " single atom polymer Sgroup has more than two bonds to "
"external atoms. Ignoring all bonds after the first two."
<< std::endl;
}
}
}
}
if (atoms.size() > 1) {
const auto lastAtom = mol.getAtomWithIdx(atoms.back());
for (auto nbr :
boost::make_iterator_range(mol.getAtomNeighbors(lastAtom))) {
const auto nbrAtom = mol[nbr];
if (std::find(atoms.begin(), atoms.end(), nbrAtom->getIdx()) ==
atoms.end()) {
tailCrossings.push_back(
mol.getBondBetweenAtoms(lastAtom->getIdx(), nbrAtom->getIdx())
->getIdx());
}
}
}
}
// deal with setting up the crossing bonds, etc.
void finalizePolymerSGroup(RWMol &mol, SubstanceGroup &sgroup) {
bool isFlipped = false;
std::string connect = "EU";
if (sgroup.getPropIfPresent("CONNECT", connect)) {
if (connect.find(",f") != std::string::npos) {
isFlipped = true;
boost::replace_all(connect, ",f", "");
}
}
if (connect == "hh") {
connect = "HH";
} else if (connect == "ht") {
connect = "HT";
} else if (connect == "eu") {
connect = "EU";
} else {
BOOST_LOG(rdWarningLog) << "unrecognized CXSMILES CONNECT value: '"
<< connect << "'. Assuming 'eu'" << std::endl;
connect = "EU";
}
sgroup.setProp("CONNECT", connect);
std::vector<unsigned int> headCrossings;
std::vector<unsigned int> tailCrossings;
sgroup.getPropIfPresent(_headCrossings, headCrossings);
sgroup.clearProp(_headCrossings);
sgroup.getPropIfPresent(_tailCrossings, tailCrossings);
sgroup.clearProp(_tailCrossings);
if (headCrossings.empty() && tailCrossings.empty()) {
setupUnmarkedPolymerSGroup(mol, sgroup, headCrossings, tailCrossings);
}
if (headCrossings.empty() && tailCrossings.empty()) {
// we tried... nothing more we can do
return;
}
for (auto &bondIdx : headCrossings) {
sgroup.addBondWithIdx(bondIdx);
}
sgroup.setProp("XBHEAD", headCrossings);
for (auto &bondIdx : tailCrossings) {
sgroup.addBondWithIdx(bondIdx);
}
// now we can setup XBCORR
std::vector<unsigned int> xbcorr;
for (unsigned int i = 0;
i < std::min(headCrossings.size(), tailCrossings.size()); ++i) {
unsigned headIdx = headCrossings[i];
unsigned tailIdx = tailCrossings[i];
if (isFlipped) {
tailIdx = tailCrossings[tailCrossings.size() - i - 1];
}
xbcorr.push_back(headIdx);
xbcorr.push_back(tailIdx);
}
sgroup.setProp("XBCORR", xbcorr);
}
Bond *get_bond_with_smiles_idx(const ROMol &mol, unsigned idx) {
for (auto bnd : mol.bonds()) {
unsigned int smilesIdx;
if (bnd->getPropIfPresent("_cxsmilesBondIdx", smilesIdx) &&
smilesIdx == idx) {
return bnd;
}
}
return nullptr;
}
} // end of anonymous namespace
// we use this pattern a lot and it's a long function call, but a very short
// #define
#define VALID_ATIDX(_atidx_) \
((_atidx_) >= startAtomIdx && (_atidx_) < startAtomIdx + mol.getNumAtoms())
#define VALID_BNDIDX(_bidx_) \
((_bidx_) >= startBondIdx && (_bidx_) < startBondIdx + mol.getNumBonds())
template <typename Iterator>
bool parse_atom_values(Iterator &first, Iterator last, RDKit::RWMol &mol,
unsigned int startAtomIdx) {
if (first >= last || *first != ':') {
return false;
}
++first;
unsigned int atIdx = 0;
while (first <= last && *first != '$') {
std::string tkn = read_text_to(first, last, ";$");
if (tkn != "" && VALID_ATIDX(atIdx)) {
mol.getAtomWithIdx(atIdx)->setProp(RDKit::common_properties::molFileValue,
tkn);
}
++atIdx;
if (first <= last && *first != '$') {
++first;
}
}
if (first >= last || *first != '$') {
return false;
}
++first;
return true;
}
template <typename Iterator>
bool parse_atom_props(Iterator &first, Iterator last, RDKit::RWMol &mol,
unsigned int startAtomIdx) {
if (first >= last) {
return false;
}
while (first <= last && *first != '|' && *first != ',') {
unsigned int atIdx;
if (read_int(first, last, atIdx)) {
if (first >= last || *first != '.') {
return false;
}
++first;
std::string pname = read_text_to(first, last, ".");
if (!pname.empty()) {
if (first >= last || *first != '.') {
return false;
}
++first;
std::string pval = read_text_to(first, last, ":|,");
if (VALID_ATIDX(atIdx) && !pval.empty()) {
mol.getAtomWithIdx(atIdx - startAtomIdx)->setProp(pname, pval);
}
}
}
if (first <= last && *first != '|' && *first != ',') {
++first;
}
}
if (first <= last && *first != '|' && *first != ',') {
return false;
}
if (*first != '|') {
++first;
}
return true;
}
template <typename Iterator>
bool parse_atom_labels(Iterator &first, Iterator last, RDKit::RWMol &mol,
unsigned int startAtomIdx) {
if (first >= last || *first != '$') {
return false;
}
++first;
unsigned int atIdx = 0;
while (first <= last && *first != '$') {
std::string tkn = read_text_to(first, last, ";$");
if (!tkn.empty() && VALID_ATIDX(atIdx)) {
mol.getAtomWithIdx(atIdx - startAtomIdx)
->setProp(RDKit::common_properties::atomLabel, tkn);
}
++atIdx;
if (first <= last && *first != '$') {
++first;
}
}
if (first >= last || *first != '$') {
return false;
}
++first;
return true;
}
template <typename Iterator>
bool parse_coords(Iterator &first, Iterator last, RDKit::RWMol &mol,
unsigned int startAtomIdx, unsigned int confIdx) {
if (first >= last || *first != '(') {
return false;
}
auto *conf = new Conformer(mol.getNumAtoms());
mol.addConformer(conf);
conf->setId(confIdx);
++first;
unsigned int atIdx = 0;
bool is3D = false;
while (first <= last && *first != ')') {
RDGeom::Point3D pt;
std::string tkn = read_text_to(first, last, ";)");
if (VALID_ATIDX(atIdx)) {
if (!tkn.empty()) {
std::vector<std::string> tokens;
boost::split(tokens, tkn, boost::is_any_of(std::string(",")));
if (tokens.size() >= 1 && tokens[0].size()) {
pt.x = boost::lexical_cast<double>(tokens[0]);
}
if (tokens.size() >= 2 && tokens[1].size()) {
pt.y = boost::lexical_cast<double>(tokens[1]);
}
if (tokens.size() >= 3 && tokens[2].size()) {
pt.z = boost::lexical_cast<double>(tokens[2]);
is3D = true;
}
}
conf->setAtomPos(atIdx - startAtomIdx, pt);
}
++atIdx;
if (first <= last && *first != ')') {
++first;
}
}
// make sure that the conformer really is 3D!
if (is3D && hasNonZeroZCoords(*conf)) {
conf->set3D(true);
} else {
conf->set3D(false);
}
if (first >= last || *first != ')') {
return false;
}
++first;
return true;
}
template <typename Iterator>
bool parse_coordinate_bonds(Iterator &first, Iterator last, RDKit::RWMol &mol,
Bond::BondType typ, unsigned int startAtomIdx,
unsigned int startBondIdx) {
if (first >= last || (*first != 'C' && *first != 'H')) {
return false;
}
++first;
if (first >= last || *first != ':') {
return false;
}
++first;
while (first <= last && *first >= '0' && *first <= '9') {
unsigned int aidx;
unsigned int bidx;
if (read_int_pair(first, last, aidx, bidx)) {
if (VALID_ATIDX(aidx) && VALID_BNDIDX(bidx)) {
auto bnd = get_bond_with_smiles_idx(mol, bidx - startBondIdx);
if (!bnd || (bnd->getBeginAtomIdx() != aidx - startAtomIdx &&
bnd->getEndAtomIdx() != aidx - startAtomIdx)) {
BOOST_LOG(rdWarningLog) << "BOND NOT FOUND! " << bidx
<< " involving atom " << aidx << std::endl;
return false;
}
bnd->setBondType(typ);
if (bnd->getBeginAtomIdx() != aidx - startAtomIdx) {
unsigned int tmp = bnd->getBeginAtomIdx();
bnd->setBeginAtomIdx(aidx - startAtomIdx);
bnd->setEndAtomIdx(tmp);
}
}
} else {
return false;
}
if (first < last && *first == ',') {
++first;
}
}
return true;
}
template <typename Iterator>
bool parse_unsaturation(Iterator &first, Iterator last, RDKit::RWMol &mol,
unsigned int startAtomIdx) {
if (first + 1 >= last || *first != 'u') {
return false;
}
++first;
if (first >= last || *first != ':') {
return false;
}
++first;
while (first < last && *first >= '0' && *first <= '9') {
unsigned int idx;
if (!read_int(first, last, idx)) {
return false;
}
if (VALID_ATIDX(idx)) {
auto atom = mol.getAtomWithIdx(idx - startAtomIdx);
if (!atom->hasQuery()) {
atom = QueryOps::replaceAtomWithQueryAtom(&mol, atom);
}
atom->expandQuery(makeAtomUnsaturatedQuery(), Queries::COMPOSITE_AND);
}
if (first < last && *first == ',') {
++first;
}
}
return true;
}
template <typename Iterator>
bool parse_ring_bonds(Iterator &first, Iterator last, RDKit::RWMol &mol,
unsigned int startAtomIdx) {
if (first >= last || *first != 'r' || first + 1 >= last ||
*(first + 1) != 'b' || first + 2 >= last || *(first + 2) != ':') {
return false;
}
first += 3;
while (first < last && *first >= '0' && *first <= '9') {
unsigned int n1;
if (!read_int(first, last, n1)) {
return false;
}
// check that we can read at least two more characters:
if (first + 1 >= last || *first != ':') {
return false;
}
++first;
unsigned int n2;
bool gt = false;
if (*first == '*') {
++first;
n2 = 0xDEADBEEF;
if (VALID_ATIDX(n1)) {
mol.setProp(common_properties::_NeedsQueryScan, 1);
}
} else {
if (!read_int(first, last, n2)) {
return false;
}
switch (n2) {
case 0:
case 2:
case 3:
break;
case 4:
gt = true;
break;
default:
BOOST_LOG(rdWarningLog)
<< "unrecognized rb value: " << n2 << std::endl;
return false;
}
}
if (VALID_ATIDX(n1)) {
auto atom = mol.getAtomWithIdx(n1 - startAtomIdx);
if (!atom->hasQuery()) {
atom = QueryOps::replaceAtomWithQueryAtom(&mol, atom);
}
if (!gt) {
atom->expandQuery(makeAtomRingBondCountQuery(n2),
Queries::COMPOSITE_AND);
} else {
auto q = static_cast<ATOM_EQUALS_QUERY *>(new ATOM_LESSEQUAL_QUERY);
q->setVal(n2);
q->setDescription("AtomRingBondCount");
q->setDataFunc(queryAtomRingBondCount);
atom->expandQuery(q, Queries::COMPOSITE_AND);
}
}
if (first < last && *first == ',') {
++first;
}
}
return true;
}
template <typename Iterator>
bool parse_linknodes(Iterator &first, Iterator last, RDKit::RWMol &mol,
unsigned int startAtomIdx) {
// these look like: |LN:1:1.3.2.6,4:1.4.3.6|
// that's two records:
// 1:1.3.2.6: 1-3 repeats, atom 1-2, 1-6
// 4:1.4.3.6: 1-4 repeats, atom 4-3, 4-6
// which maps to the property value "1 3 2 2 3 2 7|1 4 2 5 4 5 7"
// If the linking atom only has two neighbors then the outer atom
// specification (the last two digits) can be left out. So for a molecule
// where atom 1 has bonds only to atoms 2 and 6 we could have
// |LN:1:1.3|
// instead of
// |LN:1:1.3.2.6|
if (first >= last || *first != 'L' || first + 1 >= last ||
*(first + 1) != 'N' || first + 2 >= last || *(first + 2) != ':') {
return false;
}
first += 3;
std::string accum = "";
while (first < last && *first >= '0' && *first <= '9') {
unsigned int atidx;
if (!read_int(first, last, atidx)) {
return false;
}
// check that we can read at least two more characters:
if (first + 1 >= last || *first != ':') {
return false;
}
++first;
unsigned int startReps;
if (!read_int(first, last, startReps)) {
return false;
}
if (first + 1 >= last || *first != '.') {
return false;
}
++first;
unsigned int endReps;
if (!read_int(first, last, endReps)) {
return false;
}
unsigned int idx1;
unsigned int idx2;
if (first < last && *first == '.') {
++first;
if (!read_int(first, last, idx1)) {
return false;
}
++first;
if (!read_int(first, last, idx2)) {
return false;
}
} else if (VALID_ATIDX(atidx) &&
mol.getAtomWithIdx(atidx - startAtomIdx)->getDegree() == 2) {
auto nbrs =
mol.getAtomNeighbors(mol.getAtomWithIdx(atidx - startAtomIdx));
idx1 = *nbrs.first;
nbrs.first++;
idx2 = *nbrs.first;
} else if (VALID_ATIDX(atidx)) {
return false;
}
if (first < last && *first == ',') {
++first;
}
if (VALID_ATIDX(atidx)) {
if (!accum.empty()) {
accum += "|";
}
accum += (boost::format("%d %d 2 %d %d %d %d") % startReps % endReps %
(atidx - startAtomIdx + 1) % (idx1 - startAtomIdx + 1) %
(atidx - startAtomIdx + 1) % (idx2 - startAtomIdx + 1))
.str();
}
}
if (!accum.empty()) {
mol.setProp(common_properties::molFileLinkNodes, accum);
}
return true;
}
template <typename Iterator>
void parse_data_sgroup_attr(Iterator &first, Iterator last,
SubstanceGroup &sgroup, bool keepSGroup,
std::string fieldName, bool fieldIsArray = false) {
PRECONDITION(first < last);
if (first != last && *first != '|') {
std::string data = read_text_to(first, last, ":");
++first;
if (!data.empty() && keepSGroup) {
if (fieldIsArray) {
std::vector<std::string> dataFields = {data};
sgroup.setProp(fieldName, dataFields);
} else {
sgroup.setProp(fieldName, data);
}
}
}
}
template <typename Iterator>
bool parse_data_sgroup(Iterator &first, Iterator last, RDKit::RWMol &mol,
unsigned int startAtomIdx, unsigned int nSGroups) {
// these look like: |SgD:2,1:FIELD:info::::|
// example from CXSMILES docs:
// SgD:3,2,1,0:name:data:like:unit:t:(1.,1.)
// the fields are:
// SgD:[atom indices]:[field name]:[data value]:[query
// operator]:[unit]:[tag]:[coords]
// coords are (-1) if atomic coordinates are present
if (first >= last || *first != 'S' || first + 3 >= last ||
*(first + 1) != 'g' || *(first + 2) != 'D' || *(first + 3) != ':') {
return false;
}
first += 4;
std::vector<unsigned int> atoms;
if (!read_int_list(first, last, atoms)) {
return false;
}
SubstanceGroup sgroup(&mol, std::string("DAT"));
sgroup.setProp(cxsmilesindex, nSGroups);
bool keepSGroup = false;
for (auto idx : atoms) {
if (VALID_ATIDX(idx)) {
keepSGroup = true;
sgroup.addAtomWithIdx(idx - startAtomIdx);
}
}
++first;
if (first >= last) {
return false;
}
parse_data_sgroup_attr(first, last, sgroup, keepSGroup, "FIELDNAME");
// FIX:
if (keepSGroup) {
sgroup.setProp("FIELDDISP", " 0.0000 0.0000 DR ALL 0 0");
}
if (first >= last) {
return false;
}
parse_data_sgroup_attr(first, last, sgroup, keepSGroup, "DATAFIELDS", true);
parse_data_sgroup_attr(first, last, sgroup, keepSGroup, "QUERYOP");
if (first >= last) {
return false;
}
parse_data_sgroup_attr(first, last, sgroup, keepSGroup, "FIELDINFO");
if (first >= last) {
return false;
}
parse_data_sgroup_attr(first, last, sgroup, keepSGroup, "FIELDTAG");
if (first < last && *first == '(') {
// FIX
std::string coords = read_text_to(first, last, ")");
++first;
if (keepSGroup) {
sgroup.setProp("COORDS", coords);
}
}
// the label processing can destroy sgroup info, so do that now
// (the function will immediately return if already called)
if (keepSGroup) {
processCXSmilesLabels(mol);
sgroup.setProp<unsigned int>("index", getSubstanceGroups(mol).size() + 1);
addSubstanceGroup(mol, sgroup);
}
return true;
}
namespace {
std::vector<RDKit::SubstanceGroup>::iterator find_matching_sgroup(
std::vector<RDKit::SubstanceGroup> &sgs, unsigned int targetId) {
return std::find_if(sgs.begin(), sgs.end(), [targetId](const auto &sg) {
unsigned int pval;
if (sg.getPropIfPresent(cxsmilesindex, pval)) {
if (pval == targetId) {
return true;
}
}
return false;
});
}
} // namespace
template <typename Iterator>
bool parse_sgroup_hierarchy(Iterator &first, Iterator last, RDKit::RWMol &mol) {
// these look like: |SgH:1:0|
// from CXSMILES docs:
// SgH:parentSgroupIndex1:childSgroupIndex1.childSgroupIndex2,parentSgroupIndex2:childSgroupIndex1
if (first >= last || *first != 'S' || first + 3 >= last ||
*(first + 1) != 'g' || *(first + 2) != 'H' || *(first + 3) != ':') {
return false;
}
first += 4;
auto &sgs = getSubstanceGroups(mol);
while (1) {
unsigned int parentId;
if (!read_int(first, last, parentId)) {
return false;
}
bool validParent = true;
auto psg = find_matching_sgroup(sgs, parentId);
if (psg == sgs.end()) {
validParent = false;
} else {
psg->getPropIfPresent("index", parentId);
}
if (first <= last && *first == ':') {
++first;
std::vector<unsigned int> children;
if (!read_int_list(first, last, children, '.')) {
return false;
}
if (validParent) {
for (auto childId : children) {
if (childId >= sgs.size()) {
throw SmilesParseException(
"child id references non-existent SGroup");
}
auto csg = find_matching_sgroup(sgs, childId);
if (csg != sgs.end()) {
unsigned int cid;
csg->getProp("index", cid);
csg->setProp("PARENT", parentId);
}
}
}
if (first <= last && *first == ',') {
++first;
} else {
break;
}
} else {
return false;
}
}
return true;
}
template <typename Iterator>
bool parse_polymer_sgroup(Iterator &first, Iterator last, RDKit::RWMol &mol,
unsigned int startAtomIdx, unsigned int nSGroups) {
// these look like:
// |Sg:n:6,1,2,4::hh&#44;f:6,0,:4,2,|
// example from CXSMILES docs:
// the fields are:
// Sg:[type]:[atom indices]:[subscript]:[superscript]:[head crossing
// bonds]:[tail crossing bonds]:
//
// note that it's legit for empty fields to be completely missing.
// for example, this doesn't have any crossing bonds indicated:
// *-CCCN-* |$star_e;;;;;star_e$,Sg:n:4,1,2,3::hh|
// this last bit makes the whole thing doubleplusfun to parse
if (first >= last || *first != 'S' || first + 2 >= last ||
*(first + 1) != 'g' || *(first + 2) != ':') {
return false;
}
first += 3;
std::string typ = read_text_to(first, last, ":");
++first;
if (sgroupTypemap.find(typ) == sgroupTypemap.end()) {
return false;
}
bool keepSGroup = false;
SubstanceGroup sgroup(&mol, sgroupTypemap[typ]);
sgroup.setProp(cxsmilesindex, nSGroups);
if (typ == "alt") {
sgroup.setProp("SUBTYPE", std::string("ALT"));
} else if (typ == "ran") {
sgroup.setProp("SUBTYPE", std::string("RAN"));
} else if (typ == "blk") {
sgroup.setProp("SUBTYPE", std::string("BLO"));
}
std::vector<unsigned int> atoms;
if (!read_int_list(first, last, atoms)) {
return false;
}
//++first;
for (auto idx : atoms) {
if (VALID_ATIDX(idx)) {
sgroup.addAtomWithIdx(idx - startAtomIdx);
keepSGroup = true;
}
}
std::vector<unsigned int> headCrossing;
std::vector<unsigned int> tailCrossing;
if (first <= last && *first == ':') {
++first;
std::string subscript = read_text_to(first, last, ":|");
if (keepSGroup && !subscript.empty()) {
sgroup.setProp("LABEL", subscript);
}
if (first <= last && *first == ':') {
++first;
std::string superscript = read_text_to(first, last, ":|,");
if (keepSGroup && !superscript.empty()) {
sgroup.setProp("CONNECT", superscript);
}
if (first <= last && *first == ':') {
++first;
if (!read_int_list(first, last, headCrossing)) {
return false;
}
if (keepSGroup && !headCrossing.empty()) {
for (auto &cidx : headCrossing) {
if (VALID_ATIDX(cidx)) {
cidx -= startAtomIdx;
} else {
keepSGroup = false;
break;
}
}
sgroup.setProp(_headCrossings, headCrossing, true);
}
if (first <= last && *first == ':') {
++first;
if (!read_int_list(first, last, tailCrossing)) {
return false;
}
}
if (keepSGroup && !tailCrossing.empty()) {
for (auto &cidx : tailCrossing) {
if (VALID_ATIDX(cidx)) {
cidx -= startAtomIdx;
} else {
keepSGroup = false;
break;
}
}
sgroup.setProp("_tailCrossings", tailCrossing, true);
}
}
}
}
if (keepSGroup) { // the label processing can destroy sgroup info, so do that
// now (the function will immediately return if already
// called)
processCXSmilesLabels(mol);
finalizePolymerSGroup(mol, sgroup);
sgroup.setProp<unsigned int>("index", getSubstanceGroups(mol).size() + 1);
addSubstanceGroup(mol, sgroup);
}
return true;
}
template <typename Iterator>
bool parse_variable_attachments(Iterator &first, Iterator last,
RDKit::RWMol &mol, unsigned int startAtomIdx) {
// these look like: CO*.C1=CC=NC=C1 |m:2:3.5.4|
// that corresponds to replacing the bond to atom 2 with bonds to atom 3, 5,
// or 4
//
if (first >= last || *first != 'm' || first + 1 >= last ||
*(first + 1) != ':') {
return false;
}
first += 2;
while (first < last && *first >= '0' && *first <= '9') {
unsigned int at1idx;
if (!read_int(first, last, at1idx)) {
return false;
}
if (VALID_ATIDX(at1idx) &&
mol.getAtomWithIdx(at1idx - startAtomIdx)->getDegree() != 1) {
BOOST_LOG(rdWarningLog)
<< "position variation bond to atom with more than one bond"
<< std::endl;
return false;
}
if (first < last && *first == ':') {
++first;
} else {
BOOST_LOG(rdWarningLog) << "improperly formatted m: block" << std::endl;
return false;
}
std::vector<std::string> others;
while (first < last && *first >= '0' && *first <= '9') {
unsigned int aidx;
if (!read_int(first, last, aidx)) {
return false;
}
if (VALID_ATIDX(aidx)) {
others.push_back(std::to_string(aidx - startAtomIdx + 1));
}
if (first < last && *first == '.') {
++first;
}
}
if (VALID_ATIDX(at1idx)) {
std::string endPts = "(" + std::to_string(others.size());
for (auto idx : others) {
endPts += " " + idx;
}
endPts += ")";
for (auto nbri : boost::make_iterator_range(
mol.getAtomBonds(mol.getAtomWithIdx(at1idx - startAtomIdx)))) {
auto bnd = mol[nbri];
bnd->setProp(common_properties::_MolFileBondEndPts, endPts);
bnd->setProp(common_properties::_MolFileBondAttach, std::string("ANY"));
}
}
if (first < last && *first == ',') {
++first;
}
}
return true;
}
template <typename Iterator>
bool parse_wedged_bonds(Iterator &first, Iterator last, RDKit::RWMol &mol,
unsigned int startAtomIdx, unsigned int startBondIdx) {
// these look like: CC(O)Cl |w:1.0|
// also wD and wU for down and up wedges.
//
// We do not end up using this to set stereochemistry, but the relevant bond
// properties are set in case client code wants to do something with the
// information.
if (first >= last || *first != 'w' || first + 1 >= last) {
return false;
}
++first;
Bond::BondDir state = Bond::BondDir::NONE;
unsigned int cfg = 0;
switch (*first) {
case ':':
state = Bond::BondDir::UNKNOWN;
cfg = 2;
break;
case 'U':
state = Bond::BondDir::BEGINWEDGE;
cfg = 1;
++first;
break;
case 'D':
state = Bond::BondDir::BEGINDASH;
cfg = 3;
++first;
break;
default:
break;
}
if (state == Bond::BondDir::NONE || first >= last || first + 1 >= last ||
*first != ':') {
return false;
}
++first;
while (first < last && *first >= '0' && *first <= '9') {
unsigned int atomIdx;
if (!read_int(first, last, atomIdx)) {
return false;
}
if (first < last && *first == '.') {
++first;
} else {
BOOST_LOG(rdWarningLog) << "improperly formatted w block" << std::endl;
return false;
}
unsigned int bondIdx;
if (!read_int(first, last, bondIdx)) {
return false;
}
if (VALID_ATIDX(atomIdx) && VALID_BNDIDX(bondIdx)) {
auto atom = mol.getAtomWithIdx(atomIdx - startAtomIdx);
auto bond = get_bond_with_smiles_idx(mol, bondIdx - startBondIdx);
if (!bond) {
BOOST_LOG(rdWarningLog)
<< "bond " << bondIdx << " not found, wedge from atom " << atomIdx
<< " cannot be applied." << std::endl;
return false;
}
// we can't set wedging twice:
if (bond->hasProp(common_properties::_MolFileBondCfg)) {
BOOST_LOG(rdWarningLog)
<< "w block attempts to set wedging on bond " << bond->getIdx()
<< " more than once." << std::endl;
return false;
}
// first things first, the atom needs to be the start atom of the bond for
// any of this to make sense
if (atom->getIdx() != bond->getBeginAtomIdx()) {
if (atom->getIdx() != bond->getEndAtomIdx()) {
BOOST_LOG(rdWarningLog)
<< "atom " << atomIdx << " is not associated with bond "
<< bondIdx << "(" << bond->getBeginAtomIdx() + startAtomIdx << "-"
<< bond->getEndAtomIdx() + startAtomIdx << ")"
<< " in w block" << std::endl;
return false;
}
auto eidx = bond->getBeginAtomIdx();
bond->setBeginAtomIdx(atom->getIdx());
bond->setEndAtomIdx(eidx);
}
bond->setProp(common_properties::_MolFileBondCfg, cfg);
bond->setBondDir(state);
if (cfg == 2 && canHaveDirection(*bond)) {
bond->getBeginAtom()->setChiralTag(Atom::ChiralType::CHI_UNSPECIFIED);
mol.setProp(detail::_needsDetectBondStereo, 1);
}
if ((cfg == 1 || cfg == 3) && canHaveDirection(*bond)) {
mol.setProp(detail::_needsDetectAtomStereo, 1);
}
}
if (first < last && *first == ',') {
++first;
}
}
return true;
}
template <typename Iterator>
bool parse_doublebond_stereo(Iterator &first, Iterator last, RDKit::RWMol &mol,
unsigned int, unsigned int startBondIdx,
Bond::BondStereo stereo) {
// these look like: C1CCCC/C=C/CCC1 |ctu:5|
// also c and t for cis or trans
//
while (first < last && *first != ':') {
++first;
}
if (first >= last || *first != ':') {
return false;
}
++first;
while (first < last && *first >= '0' && *first <= '9') {
unsigned int bondIdx;
if (!read_int(first, last, bondIdx)) {
return false;
}
if (VALID_BNDIDX(bondIdx)) {
auto bond = get_bond_with_smiles_idx(mol, bondIdx - startBondIdx);
if (!bond) {
BOOST_LOG(rdWarningLog)
<< "bond " << bondIdx
<< " not found, cannot mark as stereo double bond." << std::endl;
return false;
}
Chirality::detail::setStereoForBond(mol, bond, stereo);
}
if (first < last && *first == ',') {
++first;
}
}
return true;
}
template <typename Iterator>
bool parse_substitution(Iterator &first, Iterator last, RDKit::RWMol &mol,
unsigned int startAtomIdx) {
if (first >= last || *first != 's' || first + 1 >= last ||
*(first + 1) != ':') {
return false;
}
first += 2;
while (first < last && *first >= '0' && *first <= '9') {
unsigned int n1;
if (!read_int(first, last, n1)) {
return false;
}
// check that we can read at least two more characters:
if (first + 1 >= last || *first != ':') {
return false;
}
++first;
unsigned int n2;
if (*first == '*') {
++first;
n2 = 0xDEADBEEF;
if (VALID_ATIDX(n1)) {
mol.setProp(common_properties::_NeedsQueryScan, 1);
}
} else {
if (!read_int(first, last, n2)) {
return false;
}
}
if (VALID_ATIDX(n1)) {
auto atom = mol.getAtomWithIdx(n1 - startAtomIdx);
if (!atom->hasQuery()) {
atom = QueryOps::replaceAtomWithQueryAtom(&mol, atom);
}
atom->expandQuery(makeAtomNonHydrogenDegreeQuery(n2),
Queries::COMPOSITE_AND);
}
if (first < last && *first == ',') {
++first;
}
}
return true;
}
template <typename Iterator>
bool processRadicalSection(Iterator &first, Iterator last, RDKit::RWMol &mol,
unsigned int numRadicalElectrons,
unsigned int startAtomIdx) {
if (first >= last) {
return false;
}
++first;
if (first >= last || *first != ':') {
return false;
}
++first;
unsigned int atIdx;
if (!read_int(first, last, atIdx)) {
return false;
}
if (VALID_ATIDX(atIdx)) {
mol.getAtomWithIdx(atIdx - startAtomIdx)
->setNumRadicalElectrons(numRadicalElectrons);
}
while (first < last && *first == ',') {
++first;
if (first < last && (*first < '0' || *first > '9')) {
return true;
}
if (!read_int(first, last, atIdx)) {
return false;
}
if (VALID_ATIDX(atIdx)) {
mol.getAtomWithIdx(atIdx - startAtomIdx)
->setNumRadicalElectrons(numRadicalElectrons);
}
}
return first < last;
}
template <typename Iterator>
bool parse_radicals(Iterator &first, Iterator last, RDKit::RWMol &mol,
unsigned int startAtomIdx) {
if (first >= last || *first != '^') {
return false;
}
while (*first == '^') {
++first;
if (first >= last) {
return false;
}
if (*first < '1' || *first > '7') {
return false; // these are the values that are allowed to be there
}
switch (*first) {
case '1':
if (!processRadicalSection(first, last, mol, 1, startAtomIdx)) {
return false;
}
break;
case '2':
case '3':
case '4':
if (!processRadicalSection(first, last, mol, 2, startAtomIdx)) {
return false;
}
break;
case '5':
case '6':
case '7':
if (!processRadicalSection(first, last, mol, 3, startAtomIdx)) {
return false;
}
break;
default:
BOOST_LOG(rdWarningLog)
<< "Radical specification " << *first << " ignored.";
}
}
return true;
}
template <typename Iterator>
bool parse_enhanced_stereo(Iterator &first, Iterator last, RDKit::RWMol &mol,
unsigned int startAtomIdx) {
StereoGroupType group_type = StereoGroupType::STEREO_ABSOLUTE;
if (*first == 'a') {
group_type = StereoGroupType::STEREO_ABSOLUTE;
} else if (*first == 'o') {
group_type = StereoGroupType::STEREO_OR;
} else if (*first == '&') {
group_type = StereoGroupType::STEREO_AND;
}
++first;
// OR and AND groups carry a group number
unsigned int group_id = 0;
if (group_type != StereoGroupType::STEREO_ABSOLUTE) {
read_int(first, last, group_id);
}
if (first >= last || *first != ':') {
return false;
}
++first;
std::vector<Atom *> atoms;
std::vector<Bond *> bonds;
while (first <= last && *first >= '0' && *first <= '9') {
unsigned int aidx;
if (read_int(first, last, aidx)) {
if (VALID_ATIDX(aidx)) {
Atom *atom = mol.getAtomWithIdx(aidx - startAtomIdx);
if (!atom) {
BOOST_LOG(rdWarningLog)
<< "Atom " << aidx << " not found!" << std::endl;
return false;
}
atoms.push_back(atom);
}
} else {
return false;
}
if (first < last && *first == ',') {
++first;
}
}
if (!atoms.empty()) {
// we need to do a bit of work to check whether or not we've already seen
// this particular StereoGroup (was Github #6050)
const auto group_hash =
10 * group_id + static_cast<unsigned int>(group_type);
std::vector<unsigned int> sgTracker;
mol.getPropIfPresent(cxsgTracker, sgTracker);
std::vector<StereoGroup> mol_stereo_groups(mol.getStereoGroups());
TEST_ASSERT(mol_stereo_groups.size() == sgTracker.size());
auto iter = std::find(sgTracker.begin(), sgTracker.end(), group_hash);
if (iter != sgTracker.end()) {
auto index = iter - sgTracker.begin();
auto gAtoms = mol_stereo_groups[index].getAtoms();
gAtoms.insert(gAtoms.end(), atoms.begin(), atoms.end());
mol_stereo_groups[index] =
StereoGroup(mol_stereo_groups[index].getGroupType(),
std::move(gAtoms), std::move(bonds), group_id);
} else {
// not seen this before, create a new stereogroup
mol_stereo_groups.emplace_back(group_type, std::move(atoms),
std::move(bonds), group_id);
sgTracker.push_back(group_hash);
mol.setProp(cxsgTracker, sgTracker);
}
mol.setStereoGroups(std::move(mol_stereo_groups));
}
return true;
}
template <typename Iterator>
bool parse_it(Iterator &first, Iterator last, RDKit::RWMol &mol,
unsigned int startAtomIdx, unsigned int startBondIdx) {
if (first >= last || *first != '|') {
return false;
}
++first;
unsigned int nSGroups = 0;
unsigned int confIndex = 0;
while (first < last && *first != '|') {
typename Iterator::difference_type length = std::distance(first, last);
if (*first == '(') {
if (!parse_coords(first, last, mol, startAtomIdx, confIndex++)) {
return false;
}
} else if (*first == '$') {
if (length > 4 && *(first + 1) == '_' && *(first + 2) == 'A' &&
*(first + 3) == 'V' && *(first + 4) == ':') {
first += 4;
if (!parse_atom_values(first, last, mol, startAtomIdx)) {
return false;
}
} else {
if (!parse_atom_labels(first, last, mol, startAtomIdx)) {
return false;
}
}
} else if (length > 9 && std::string(first, first + 9) == "atomProp:") {
first += 9;
if (!parse_atom_props(first, last, mol, startAtomIdx)) {
return false;
}
} else if (*first == 'C') {
if (!parse_coordinate_bonds(first, last, mol, Bond::DATIVE, startAtomIdx,
startBondIdx)) {
return false;
}
} else if (*first == 'H') {
if (!parse_coordinate_bonds(first, last, mol, Bond::HYDROGEN,
startAtomIdx, startBondIdx)) {
return false;
}
} else if (*first == '^') {
if (!parse_radicals(first, last, mol, startAtomIdx)) {
return false;
}
} else if (*first == 'a' || *first == 'o' ||
(*first == '&' && first + 1 < last && first[1] != '#')) {
if (!parse_enhanced_stereo(first, last, mol, startAtomIdx)) {
return false;
}
} else if (*first == 'r' && first + 1 < last && first[1] == 'b') {
if (!parse_ring_bonds(first, last, mol, startAtomIdx)) {
return false;
}
} else if (*first == 'L' && first + 1 < last && first[1] == 'N') {
if (!parse_linknodes(first, last, mol, startAtomIdx)) {
return false;
}
} else if (*first == 'S' && first + 2 < last && first[1] == 'g' &&
first[2] == 'D') {
if (!parse_data_sgroup(first, last, mol, startAtomIdx, nSGroups++)) {
return false;
}
} else if (*first == 'S' && first + 2 < last && first[1] == 'g' &&
first[2] == 'H') {
if (!parse_sgroup_hierarchy(first, last, mol)) {
return false;
}
} else if (*first == 'S' && first + 1 < last && first[1] == 'g') {
if (!parse_polymer_sgroup(first, last, mol, startAtomIdx, nSGroups++)) {
return false;
}
} else if (*first == 'u') {
if (!parse_unsaturation(first, last, mol, startAtomIdx)) {
return false;
}
} else if (*first == 's') {
if (!parse_substitution(first, last, mol, startAtomIdx)) {
return false;
}
} else if (*first == 'm') {
if (!parse_variable_attachments(first, last, mol, startAtomIdx)) {
return false;
}
} else if (*first == 'w') {
if (!parse_wedged_bonds(first, last, mol, startAtomIdx, startBondIdx)) {
return false;
}
} else if (*first == 'c' && first + 2 < last && first[1] == 't' &&
first[2] == 'u') {
if (!parse_doublebond_stereo(first, last, mol, startAtomIdx, startBondIdx,
Bond::BondStereo::STEREOANY)) {
return false;
}
} else if (*first == 'c') {
if (!parse_doublebond_stereo(first, last, mol, startAtomIdx, startBondIdx,
Bond::BondStereo::STEREOCIS)) {
return false;
}
} else if (*first == 't') {
if (!parse_doublebond_stereo(first, last, mol, startAtomIdx, startBondIdx,
Bond::BondStereo::STEREOTRANS)) {
return false;
}
} else {
++first;
}
// if(first < last && *first != '|') ++first;
}
if (first >= last || *first != '|') {
return false;
}
++first; // step past the last '|'
return true;
}
} // namespace parser
void parseCXExtensions(RDKit::RWMol &mol, const std::string &extText,
std::string::const_iterator &first,
unsigned int startAtomIdx, unsigned int startBondIdx) {
// BOOST_LOG(rdWarningLog) << "parseCXNExtensions: " << extText << std::endl;
if (extText.empty()) {
return;
}
if (extText[0] != '|') {
throw RDKit::SmilesParseException(
"CXSMILES extension does not start with |");
}
first = extText.begin();
bool ok =
parser::parse_it(first, extText.end(), mol, startAtomIdx, startBondIdx);
if (!ok) {
throw RDKit::SmilesParseException("failure parsing CXSMILES extensions");
}
processCXSmilesLabels(mol);
mol.clearProp("_cxsmilesLabelsProcessed");
mol.clearProp(cxsgTracker);
}
} // end of namespace SmilesParseOps
namespace RDKit {
namespace SmilesWrite {
namespace {
std::vector<unsigned> getSortedMappedIndexes(
const std::vector<unsigned int> &atomIds,
const std::vector<unsigned> &revOrder) {
std::vector<unsigned> res;
res.reserve(atomIds.size());
for (auto atomId : atomIds) {
res.push_back(revOrder[atomId]);
}
std::sort(res.begin(), res.end());
return res;
}
std::pair<std::vector<StereoGroup>, std::vector<std::vector<unsigned>>>
getSortedStereoGroupsAndIndices(
const ROMol &mol, const std::vector<unsigned int> &revOrder,
std::map<int, std::unique_ptr<RDKit::Chirality::WedgeInfoBase>>
&wedgeBonds) {
using StGrpIdxPair = std::pair<StereoGroup, std::vector<unsigned>>;
auto &groups = mol.getStereoGroups();
std::vector<StGrpIdxPair> sortingGroups;
sortingGroups.reserve(groups.size());
for (const auto &sg : groups) {
std::vector<unsigned int> atomIds;
Atropisomers::getAllAtomIdsForStereoGroup(mol, sg, atomIds, wedgeBonds);
const auto newAtomIndexes = getSortedMappedIndexes(atomIds, revOrder);
if (!newAtomIndexes.empty()) {
sortingGroups.emplace_back(sg, newAtomIndexes);
}
}
// sort by 1) StereoGroup type; 2) StereoGroup id; 3) atom indexes
std::sort(sortingGroups.begin(), sortingGroups.end(),
[](const StGrpIdxPair &a, const StGrpIdxPair &b) {
const auto &[sgA, idxsA] = a;
const auto &[sgB, idxsB] = b;
if (sgA.getGroupType() == sgB.getGroupType()) {
if (sgA.getWriteId() == sgB.getWriteId()) {
return idxsA < idxsB;
}
return sgA.getWriteId() < sgB.getWriteId();
}
return sgA.getGroupType() < sgB.getGroupType();
});
std::vector<StereoGroup> sgs;
std::vector<std::vector<unsigned>> sgAtomIdxs;
sgs.reserve(sortingGroups.size());
sgAtomIdxs.reserve(sortingGroups.size());
for (auto &&p : sortingGroups) {
sgs.push_back(std::move(p.first));
sgAtomIdxs.push_back(std::move(p.second));
}
return {std::move(sgs), std::move(sgAtomIdxs)};
}
std::string quote_string(const std::string &txt) {
// FIX
return txt;
}
std::string quote_atomprop_string(const std::string &txt) {
// at a bare minimum, . needs to be escaped
std::string res;
for (auto c : txt) {
if (c == '.') {
res += "&#46;";
} else {
res += c;
}
}
return res;
}
std::string get_enhanced_stereo_block(
const ROMol &mol, const std::vector<unsigned int> &atomOrder,
std::map<int, std::unique_ptr<RDKit::Chirality::WedgeInfoBase>>
&wedgeBonds) {
if (mol.getStereoGroups().empty()) {
return "";
}
std::stringstream res;
// we need a map from original atom idx to output idx:
std::vector<unsigned int> revOrder(mol.getNumAtoms());
for (unsigned i = 0; i < atomOrder.size(); ++i) {
revOrder[atomOrder[i]] = i;
}
auto [groups, groupsAtoms] =
getSortedStereoGroupsAndIndices(mol, revOrder, wedgeBonds);
assignStereoGroupIds(groups);
auto grpAtomsItr = groupsAtoms.begin();
for (auto sgItr = groups.begin(); sgItr != groups.end();
++sgItr, ++grpAtomsItr) {
switch (sgItr->getGroupType()) {
case StereoGroupType::STEREO_ABSOLUTE:
res << "a:";
break;
case StereoGroupType::STEREO_OR:
res << "o" << sgItr->getWriteId() << ":";
break;
case StereoGroupType::STEREO_AND:
res << "&" << sgItr->getWriteId() << ":";
break;
}
for (const auto &aid : *grpAtomsItr) {
res << aid << ",";
}
}
std::string resStr = res.str();
if (!resStr.empty() && resStr.back() == ',') {
resStr.pop_back();
}
return resStr;
}
std::string get_sgroup_hierarchy_block(const ROMol &mol) {
const auto &sgs = getSubstanceGroups(mol);
if (sgs.empty()) {
return "";
}
std::stringstream res;
// we need a map from sgroup index to output index;
std::map<unsigned int, unsigned int> sgroupOrder;
bool parentPresent = false;
for (const auto &sg : sgs) {
if (sg.hasProp("_cxsmilesOutputIndex")) {
unsigned int sgidx = sg.getIndexInMol();
sg.getPropIfPresent("index", sgidx);
sgroupOrder[sgidx] = sg.getProp<unsigned int>("_cxsmilesOutputIndex");
sg.clearProp("_cxsmilesOutputIndex");
}
if (sg.hasProp("PARENT")) {
parentPresent = true;
}
}
if (parentPresent) {
// now loop over them and add the information
std::map<unsigned int, std::vector<unsigned int>> accum;
for (const auto &sg : sgs) {
unsigned pidx;
if (sg.getPropIfPresent("PARENT", pidx) &&
sgroupOrder.find(pidx) != sgroupOrder.end()) {
unsigned int sgidx = sg.getIndexInMol();
sg.getPropIfPresent("index", sgidx);
if (sgroupOrder.find(sgidx) != sgroupOrder.end()) {
accum[sgroupOrder[pidx]].push_back(sgroupOrder[sgidx]);
}
}
}
if (!accum.empty()) {
res << "SgH:";
for (const auto &pr : accum) {
res << pr.first << ":";
for (auto v : pr.second) {
res << v << ".";
}
// remove the extra ".":
res.seekp(-1, res.cur);
res << ",";
}
}
std::string resStr = res.str();
while (!resStr.empty() && resStr.back() == ',') {
resStr.pop_back();
}
return resStr;
} else {
return "";
}
}
std::string get_sgroup_polymer_block(
const ROMol &mol, const std::vector<unsigned int> &atomOrder,
const std::vector<unsigned int> &bondOrder) {
const auto &sgs = getSubstanceGroups(mol);
if (sgs.empty()) {
return "";
}
unsigned int sgroupOutputIndex = 0;
mol.getPropIfPresent("_cxsmilesOutputIndex", sgroupOutputIndex);
std::stringstream res;
// we need a map from original atom idx to output idx:
std::vector<unsigned int> revAtomOrder(mol.getNumAtoms());
for (unsigned i = 0; i < atomOrder.size(); ++i) {
revAtomOrder[atomOrder[i]] = i;
}
// we need a map from original bond idx to output idx:
std::vector<unsigned int> revBondOrder(mol.getNumBonds());
for (unsigned i = 0; i < bondOrder.size(); ++i) {
revBondOrder[bondOrder[i]] = i;
}
std::map<std::string, std::string> reverseTypemap;
for (const auto &pr : SmilesParseOps::sgroupTypemap) {
if (reverseTypemap.find(pr.second) == reverseTypemap.end()) {
reverseTypemap[pr.second] = pr.first;
}
}
for (const auto &sg : sgs) {
std::string typ;
if (sg.getPropIfPresent("TYPE", typ) &&
reverseTypemap.find(typ) != reverseTypemap.end()) {
sg.setProp("_cxsmilesOutputIndex", sgroupOutputIndex);
sgroupOutputIndex++;
res << "Sg:";
std::string subtype;
if (typ == "COP" && sg.getPropIfPresent("SUBTYPE", subtype)) {
if (subtype == "ALT") {
res << "alt";
} else if (subtype == "RAN") {
res << "ran";
} else if (subtype == "BLO") {
res << "blk";
} else {
res << reverseTypemap["COP"];
}
} else {
res << reverseTypemap[typ];
}
res << ":";
for (const auto oaid : sg.getAtoms()) {
res << revAtomOrder[oaid] << ",";
}
// remove the extra ",":
res.seekp(-1, res.cur);
res << ":";
std::string label;
if (sg.getPropIfPresent("LABEL", label)) {
res << label;
}
res << ":";
std::string connect;
if (sg.getPropIfPresent("CONNECT", connect)) {
boost::algorithm::to_lower(connect);
res << connect;
}
res << ":";
std::vector<unsigned int> headCrossings;
if (sg.getPropIfPresent("XBHEAD", headCrossings) &&
headCrossings.size() > 1) {
for (auto v : headCrossings) {
res << bondOrder[v] << ",";
}
// remove the extra ",":
res.seekp(-1, res.cur);
}
res << ":";
std::vector<unsigned int> tailCrossings;
if (sg.getPropIfPresent("XBCORR", tailCrossings) &&
tailCrossings.size() > 2) {
for (unsigned int i = 1; i < tailCrossings.size(); i += 2) {
res << bondOrder[tailCrossings[i]] << ",";
}
// remove the extra ",":
res.seekp(-1, res.cur);
}
res << ":";
}
res << ",";
}
std::string resStr = res.str();
while (!resStr.empty() && resStr.back() == ',') {
resStr.pop_back();
}
mol.setProp("_cxsmilesOutputIndex", sgroupOutputIndex);
return resStr;
}
std::string get_sgroup_data_block(const ROMol &mol,
const std::vector<unsigned int> &atomOrder) {
const auto &sgs = getSubstanceGroups(mol);
if (sgs.empty()) {
return "";
}
unsigned int sgroupOutputIndex = 0;
mol.getPropIfPresent("_cxsmilesOutputIndex", sgroupOutputIndex);
std::stringstream res;
// we need a map from original atom idx to output idx:
std::vector<unsigned int> revOrder(mol.getNumAtoms());
for (unsigned i = 0; i < atomOrder.size(); ++i) {
revOrder[atomOrder[i]] = i;
}
for (const auto &sg : sgs) {
if (sg.hasProp("TYPE") && sg.getProp<std::string>("TYPE") == "DAT") {
sg.setProp("_cxsmilesOutputIndex", sgroupOutputIndex);
sgroupOutputIndex++;
res << "SgD:";
// we don't attempt to canonicalize the atom order because the user
// may ascribe some significance to the ordering of the atoms
for (const auto oaid : sg.getAtoms()) {
res << revOrder[oaid] << ",";
}
// remove the extra ",":
res.seekp(-1, res.cur);
res << ":";
std::string prop;
if (sg.getPropIfPresent("FIELDNAME", prop) && !prop.empty()) {
res << prop;
}
res << ":";
std::vector<std::string> vprop;
if (sg.getPropIfPresent("DATAFIELDS", vprop) && !vprop.empty()) {
for (const auto &pv : vprop) {
res << pv << ",";
}
// remove the extra ",":
res.seekp(-1, res.cur);
}
res << ":";
if (sg.getPropIfPresent("QUERYOP", prop) && !prop.empty()) {
res << prop;
}
res << ":";
if (sg.getPropIfPresent("FIELDINFO", prop) && !prop.empty()) {
res << prop;
}
res << ":";
if (sg.getPropIfPresent("FIELDTAG", prop) && !prop.empty()) {
res << prop;
}
res << ":";
// FIX: do something about the coordinates
}
res << ",";
}
std::string resStr = res.str();
if (!resStr.empty() && resStr.back() == ',') {
resStr.pop_back();
}
mol.setProp("_cxsmilesOutputIndex", sgroupOutputIndex);
return resStr;
}
std::string get_atomlabel_block(const ROMol &mol,
const std::vector<unsigned int> &atomOrder) {
std::string res = "";
for (auto idx : atomOrder) {
if (idx != atomOrder.front()) {
res += ";";
}
std::string lbl;
int val;
const auto atom = mol.getAtomWithIdx(idx);
if (atom->getPropIfPresent(common_properties::_QueryAtomGenericLabel,
lbl)) {
res += quote_string(lbl + "_p");
} else if (!atom->getAtomicNum() &&
atom->getPropIfPresent(common_properties::dummyLabel, lbl) &&
std::find(SmilesParseOps::pseudoatoms.begin(),
SmilesParseOps::pseudoatoms.end(),
lbl) != SmilesParseOps::pseudoatoms.end()) {
res += quote_string(lbl + "_p");
} else if (!atom->getAtomicNum() &&
atom->getPropIfPresent(common_properties::_fromAttachPoint,
val) &&
(val == 1 || val == 2)) {
res += quote_string("_AP" + std::to_string(val));
} else if (atom->getPropIfPresent(common_properties::atomLabel, lbl)) {
res += quote_string(lbl);
}
}
// if we didn't find anything return an empty string
if (std::find_if_not(res.begin(), res.end(),
[](const auto c) { return c == ';'; }) == res.end()) {
res.clear();
}
return res;
}
std::string get_value_block(const ROMol &mol,
const std::vector<unsigned int> &atomOrder,
const std::string &prop) {
std::string res = "";
bool first = true;
for (auto idx : atomOrder) {
if (!first) {
res += ";";
} else {
first = false;
}
std::string lbl;
if (mol.getAtomWithIdx(idx)->getPropIfPresent(prop, lbl)) {
res += quote_string(lbl);
}
}
return res;
}
std::string get_radical_block(const ROMol &mol,
const std::vector<unsigned int> &atomOrder) {
std::string res = "";
std::map<unsigned int, std::vector<unsigned int>> rads;
for (unsigned int i = 0; i < atomOrder.size(); ++i) {
auto idx = atomOrder[i];
auto nrad = mol.getAtomWithIdx(idx)->getNumRadicalElectrons();
if (nrad) {
rads[nrad].push_back(i);
}
}
if (rads.size()) {
for (const auto &pr : rads) {
switch (pr.first) {
case 1:
res += "^1:";
break;
case 2:
res += "^2:";
break;
case 3:
res += "^5:";
break;
default:
BOOST_LOG(rdWarningLog) << "unsupported number of radical electrons "
<< pr.first << std::endl;
}
for (auto aidx : pr.second) {
res += boost::str(boost::format("%d,") % aidx);
}
}
}
return res;
}
double zero_small_vals(double val) {
if (fabs(val) < 1e-4) {
return 0.0;
}
return val;
}
std::string get_coords_block(const ROMol &mol,
const std::vector<unsigned int> &atomOrder) {
std::string res = "";
const auto &conf = mol.getConformer();
bool first = true;
for (auto idx : atomOrder) {
const auto &pt = conf.getAtomPos(idx);
if (!first) {
res += ";";
} else {
first = false;
}
res += boost::str(boost::format("%g,%g,") % zero_small_vals(pt.x) %
zero_small_vals(pt.y));
if (conf.is3D()) {
auto zc = boost::str(boost::format("%g") % zero_small_vals(pt.z));
if (zc != "0") {
res += zc;
}
}
}
return res;
}
std::string get_atom_props_block(const ROMol &mol,
const std::vector<unsigned int> &atomOrder) {
std::vector<std::string> skip = {common_properties::atomLabel,
common_properties::molFileValue,
common_properties::molParity};
std::string res = "";
unsigned int which = 0;
for (auto idx : atomOrder) {
const auto atom = mol.getAtomWithIdx(idx);
bool isAttachmentPoint = !atom->getAtomicNum() &&
atom->hasProp(common_properties::_fromAttachPoint);
bool includePrivate = false, includeComputed = false;
for (const auto &pn : atom->getPropList(includePrivate, includeComputed)) {
if (std::find(skip.begin(), skip.end(), pn) == skip.end()) {
std::string pv = atom->getProp<std::string>(pn);
if (pn == "dummyLabel" &&
(isAttachmentPoint ||
std::find(SmilesParseOps::pseudoatoms.begin(),
SmilesParseOps::pseudoatoms.end(),
pv) != SmilesParseOps::pseudoatoms.end())) {
// it's a pseudoatom or attachment point, skip it
continue;
}
if (res.empty()) {
res += "atomProp";
}
res +=
boost::str(boost::format(":%d.%s.%s") % which %
quote_atomprop_string(pn) % quote_atomprop_string(pv));
}
}
++which;
}
return res;
}
std::string get_bond_config_block(
const ROMol &mol, const std::vector<unsigned int> &atomOrder,
const std::vector<unsigned int> &bondOrder, bool coordsIncluded,
std::map<int, std::unique_ptr<RDKit::Chirality::WedgeInfoBase>> &wedgeBonds,
bool atropisomerOnly = false) {
std::map<std::string, std ::vector<std::string>> wParts;
for (unsigned int i = 0; i < bondOrder.size(); ++i) {
auto idx = bondOrder[i];
const auto bond = mol.getBondWithIdx(idx);
unsigned int wedgeStartAtomIdx = bond->getBeginAtomIdx();
if (!canHaveDirection(*bond)) {
continue;
}
// when figuring out what to output for the bond, favor the wedge state:
Bond::BondDir bd = bond->getBondDir();
switch (bd) {
case Bond::BondDir::BEGINDASH:
case Bond::BondDir::BEGINWEDGE:
case Bond::BondDir::UNKNOWN:
break;
default:
bd = Bond::BondDir::NONE;
}
if (atropisomerOnly && bd == Bond::BondDir::NONE) {
continue;
}
// see if this one is an atropisomer
bool isAnAtropisomer = false;
const Atom *firstAtom = bond->getBeginAtom();
if (bd == Bond::BondDir::BEGINDASH || bd == Bond::BondDir::BEGINWEDGE) {
for (auto bondNbr : mol.atomBonds(firstAtom)) {
if (bondNbr->getIdx() == bond->getIdx()) {
continue; // a bond is not its own neighbor
}
if (bondNbr->getStereo() == Bond::BondStereo::STEREOATROPCW ||
bondNbr->getStereo() == Bond::BondStereo::STEREOATROPCCW) {
isAnAtropisomer = true;
// if it is for an atropisomer and there are no coords, check to see
// if the wedge needs to be flipped based on the smiles reordering
if (!coordsIncluded && isAnAtropisomer) {
Atropisomers::AtropAtomAndBondVec atomAndBondVecs[2];
if (!Atropisomers::getAtropisomerAtomsAndBonds(
bondNbr, atomAndBondVecs, mol)) {
throw ValueErrorException("Internal error - should not occur");
// should not happend
} else {
unsigned int swaps = 0;
unsigned int firstReorderedIdx =
std::find(atomOrder.begin(), atomOrder.end(),
bondNbr->getBeginAtom()->getIdx()) -
atomOrder.begin();
unsigned int secondReorderedIdx =
std::find(atomOrder.begin(), atomOrder.end(),
bondNbr->getEndAtom()->getIdx()) -
atomOrder.begin();
if (firstReorderedIdx > secondReorderedIdx) {
++swaps;
}
for (unsigned int bondAtomIndex = 0; bondAtomIndex < 2;
++bondAtomIndex) {
if (atomAndBondVecs[bondAtomIndex].first == firstAtom)
continue; // swapped atoms on the side where the wedge bond
// is does NOT change the wedge bond
if (atomAndBondVecs[bondAtomIndex].second.size() == 2) {
unsigned int firstOtherAtomIdx =
atomAndBondVecs[bondAtomIndex]
.second[0]
->getOtherAtom(atomAndBondVecs[bondAtomIndex].first)
->getIdx();
unsigned int secondOtherAtomIdx =
atomAndBondVecs[bondAtomIndex]
.second[1]
->getOtherAtom(atomAndBondVecs[bondAtomIndex].first)
->getIdx();
unsigned int firstReorderedAtomIdx =
std::find(atomOrder.begin(), atomOrder.end(),
firstOtherAtomIdx) -
atomOrder.begin();
unsigned int secondReorderedAtomIdx =
std::find(atomOrder.begin(), atomOrder.end(),
secondOtherAtomIdx) -
atomOrder.begin();
if (firstReorderedAtomIdx > secondReorderedAtomIdx) {
++swaps;
}
}
}
if (swaps % 2) {
bd = (bd == Bond::BondDir::BEGINWEDGE)
? Bond::BondDir::BEGINDASH
: Bond::BondDir::BEGINWEDGE;
}
}
}
break;
}
}
}
if (atropisomerOnly) {
// one of the bonds on the beginning atom of this bond must be an
// atropisomer
if (!isAnAtropisomer) {
continue;
}
} else { // atropisomeronly is FALSE - check for a wedging caused by
// chiral atom
unsigned int cfg = 0;
if (bd == Bond::BondDir::NONE &&
bond->getPropIfPresent(common_properties::_MolFileBondCfg, cfg)) {
switch (cfg) {
case 1:
bd = Bond::BondDir::BEGINWEDGE;
break;
case 2:
bd = Bond::BondDir::UNKNOWN;
break;
case 3:
bd = Bond::BondDir::BEGINDASH;
break;
default:
bd = Bond::BondDir::NONE;
}
}
if (bd == Bond::BondDir::NONE && coordsIncluded) {
int dirCode;
bool reverse;
Chirality::GetMolFileBondStereoInfo(
bond, wedgeBonds, &mol.getConformer(), dirCode, reverse);
switch (dirCode) {
case 1:
bd = Bond::BondDir::BEGINWEDGE;
break;
case 3:
bd = Bond::BondDir::UNKNOWN;
break;
case 6:
bd = Bond::BondDir::BEGINDASH;
break;
default:
bd = Bond::BondDir::NONE;
}
if (reverse) {
wedgeStartAtomIdx = bond->getEndAtomIdx();
}
}
}
auto begAtomOrder =
std::find(atomOrder.begin(), atomOrder.end(), wedgeStartAtomIdx) -
atomOrder.begin();
std::string wType = "";
if (bd == Bond::BondDir::UNKNOWN) {
wType = "w";
} else if (coordsIncluded || isAnAtropisomer) {
// we only do wedgeUp and wedgeDown if coordinates are being output
// or its an atropisomer
if (bd == Bond::BondDir::BEGINWEDGE) {
wType = "wU";
} else if (bd == Bond::BondDir::BEGINDASH) {
wType = "wD";
}
}
if (wType != "") {
if (wParts.find(wType) == wParts.end()) {
wParts[wType] = std::vector<std::string>();
}
wParts[wType].push_back(
boost::str(boost::format("%d.%d") % begAtomOrder % i));
}
}
std::string res = "";
for (auto wPart : wParts) {
if (res != "") {
res += ",";
}
res += wPart.first + ":" + boost::algorithm::join(wPart.second, ",");
}
return res;
}
std::string get_coordbonds_block(const ROMol &mol,
const std::vector<unsigned int> &atomOrder,
const std::vector<unsigned int> &bondOrder) {
std::string res = "";
for (unsigned int i = 0; i < bondOrder.size(); ++i) {
auto idx = bondOrder[i];
const auto bond = mol.getBondWithIdx(idx);
if (bond->getBondType() != Bond::BondType::DATIVE) {
continue;
}
auto begAtomOrder =
std::find(atomOrder.begin(), atomOrder.end(), bond->getBeginAtomIdx()) -
atomOrder.begin();
if (!res.empty()) {
res += ",";
} else {
res = "C:";
}
res += boost::str(boost::format("%d.%d") % begAtomOrder % i);
}
return res;
}
std::string get_ringbond_cistrans_block(
const ROMol &mol, const std::vector<unsigned int> &atomOrder,
const std::vector<unsigned int> &bondOrder) {
if (!mol.getRingInfo()->isInitialized()) {
return "";
}
const auto rinfo = mol.getRingInfo();
std::string c = "", t = "", ctu = "";
for (unsigned int i = 0; i < bondOrder.size(); ++i) {
auto idx = bondOrder[i];
if (!rinfo->numBondRings(idx) ||
rinfo->minBondRingSize(idx) <
Chirality::minRingSizeForDoubleBondStereo) {
// we only do ring bonds of a minimum size
continue;
}
const auto bond = mol.getBondWithIdx(idx);
if (bond->getBondType() != Bond::BondType::DOUBLE &&
bond->getBondType() != Bond::BondType::AROMATIC) {
continue;
}
Bond::BondStereo bstereo = bond->getStereo();
if (bstereo != Bond::BondStereo::STEREOANY &&
bstereo != Bond::BondStereo::STEREOCIS &&
bstereo != Bond::BondStereo::STEREOTRANS) {
continue;
}
auto label = std::to_string(i);
if (bstereo == Bond::BondStereo::STEREOANY) {
// this one's easy because we don't care about the atom order.
if (ctu.empty()) {
ctu += "ctu:";
} else {
ctu += ",";
}
ctu += label;
} else {
Atom *begAtom = bond->getBeginAtom();
Atom *endAtom = bond->getEndAtom();
bool needSwap = false;
if (begAtom->getDegree() > 2) {
unsigned int o1 = atomOrder[bond->getStereoAtoms()[0]];
for (const auto nbr : mol.atomNeighbors(begAtom)) {
if (nbr == endAtom ||
nbr->getIdx() ==
static_cast<unsigned>(bond->getStereoAtoms()[0])) {
continue;
}
if (atomOrder[nbr->getIdx() < o1]) {
// this neighbor came first, we need to swap:
needSwap = !needSwap;
}
}
}
if (endAtom->getDegree() > 2) {
unsigned int o1 = atomOrder[bond->getStereoAtoms()[1]];
for (const auto nbr : mol.atomNeighbors(endAtom)) {
if (nbr == begAtom ||
nbr->getIdx() ==
static_cast<unsigned>(bond->getStereoAtoms()[1])) {
continue;
}
if (atomOrder[nbr->getIdx() < o1]) {
// this neighbor came first, we need to swap:
needSwap = !needSwap;
}
}
}
if (bstereo == Bond::BondStereo::STEREOCIS || needSwap) {
if (c.empty()) {
c += "c:";
} else {
c += ",";
}
c += label;
} else {
if (t.empty()) {
t += "t:";
} else {
t += ",";
}
t += label;
}
}
}
return c + t + ctu;
}
std::string get_linknodes_block(const ROMol &mol,
const std::vector<unsigned int> &atomOrder) {
bool strict = false;
auto linkNodes = MolEnumerator::utils::getMolLinkNodes(mol, strict);
if (linkNodes.empty()) {
return "";
}
// we need a map from original atom idx to output idx:
std::vector<unsigned int> revOrder(mol.getNumAtoms());
for (unsigned i = 0; i < atomOrder.size(); ++i) {
revOrder[atomOrder[i]] = i;
}
std::stringstream res;
res << "LN:";
for (const auto &ln : linkNodes) {
unsigned int atomIdx = atomOrder[ln.bondAtoms[0].first];
res << atomIdx << ":" << ln.minRep << "." << ln.maxRep;
if (mol.getAtomWithIdx(ln.bondAtoms[0].first)->getDegree() > 2) {
// include the outer atom indices
res << "." << atomOrder[ln.bondAtoms[0].second] << "."
<< atomOrder[ln.bondAtoms[1].second];
}
res << ",";
}
std::string resStr = res.str();
if (!resStr.empty() && resStr.back() == ',') {
resStr.pop_back();
}
return resStr;
}
void appendToCXExtension(const std::string &addition, std::string &base) {
if (!addition.empty()) {
if (base.size() > 1) {
base += ",";
}
base += addition;
}
}
} // namespace
void checkCXFeatures(const ROMol &mol) {
std::string lns;
if (mol.getPropIfPresent(common_properties::molFileLinkNodes, lns)) {
BOOST_LOG(rdWarningLog)
<< "CX Extensions: mol has link nodes which are not currently supported"
<< std::endl;
}
const auto &sgs = getSubstanceGroups(mol);
auto parent_check =
std::any_of(sgs.cbegin(), sgs.cend(), [&](const SubstanceGroup &sg) {
if (sg.hasProp("PARENT")) {
return true;
}
return false;
});
if (parent_check) {
BOOST_LOG(rdWarningLog)
<< "CX Extensions: Substance group hierarchy is not always preserved."
<< std::endl;
}
}
std::string getCXExtensions(const std::vector<ROMol *> &mols,
std::uint32_t flags) {
for (const auto &mol : mols) {
checkCXFeatures(*mol);
if (!mol->hasProp(RDKit::common_properties::_smilesAtomOutputOrder) ||
!mol->hasProp(RDKit::common_properties::_smilesBondOutputOrder)) {
throw ValueErrorException(
"Input molecule does not have the required "
"smiles ordering properties set");
}
}
RDKit::RWMol rwmol;
std::vector<unsigned int> atomOrdering;
std::vector<unsigned int> bondOrdering;
for (const auto &mol : mols) {
const auto at_count = rwmol.getNumAtoms();
const auto bond_count = rwmol.getNumBonds();
std::vector<unsigned int> prevAtomOrdering;
std::vector<unsigned int> prevBondOrdering;
rwmol.insertMol(*mol);
mol->getProp(RDKit::common_properties::_smilesAtomOutputOrder,
prevAtomOrdering);
mol->getProp(RDKit::common_properties::_smilesBondOutputOrder,
prevBondOrdering);
for (auto i : prevAtomOrdering) {
atomOrdering.push_back(i + at_count);
}
for (auto i : prevBondOrdering) {
bondOrdering.push_back(i + bond_count);
}
}
rwmol.setProp(RDKit::common_properties::_smilesAtomOutputOrder, atomOrdering,
true);
rwmol.setProp(RDKit::common_properties::_smilesBondOutputOrder, bondOrdering,
true);
return getCXExtensions(rwmol, flags);
}
std::string getCXExtensions(const ROMol &mol, std::uint32_t flags) {
std::string res = "|";
const std::vector<unsigned int> &atomOrder =
mol.getProp<std::vector<unsigned int>>(
common_properties::_smilesAtomOutputOrder);
const std::vector<unsigned int> &bondOrder =
mol.getProp<std::vector<unsigned int>>(
common_properties::_smilesBondOutputOrder);
bool needLabels = false;
bool needValues = false;
for (auto idx : atomOrder) {
const auto at = mol.getAtomWithIdx(idx);
if (at->hasProp(common_properties::atomLabel) ||
at->hasProp(common_properties::_QueryAtomGenericLabel) ||
at->hasProp(common_properties::dummyLabel) ||
at->hasProp(common_properties::_fromAttachPoint)) {
needLabels = true;
}
if (at->hasProp(common_properties::molFileValue)) {
needValues = true;
}
}
if ((flags & SmilesWrite::CXSmilesFields::CX_COORDS) &&
mol.getNumConformers()) {
res += "(" + get_coords_block(mol, atomOrder) + ")";
}
if ((flags & SmilesWrite::CXSmilesFields::CX_ATOM_LABELS) && needLabels) {
auto lbls = get_atomlabel_block(mol, atomOrder);
if (!lbls.empty()) {
if (res.size() > 1) {
res += ",";
}
res += "$" + lbls + "$";
}
}
if ((flags & SmilesWrite::CXSmilesFields::CX_MOLFILE_VALUES) && needValues) {
if (res.size() > 1) {
res += ",";
}
res += "$_AV:" +
get_value_block(mol, atomOrder, common_properties::molFileValue) +
"$";
}
auto radblock = get_radical_block(mol, atomOrder);
if ((flags & SmilesWrite::CXSmilesFields::CX_RADICALS) && radblock.size()) {
if (res.size() > 1) {
res += ",";
}
res += radblock;
if (res.back() == ',') {
res.erase(res.size() - 1);
}
}
if (flags & SmilesWrite::CXSmilesFields::CX_ATOM_PROPS) {
const auto atomblock = get_atom_props_block(mol, atomOrder);
appendToCXExtension(atomblock, res);
}
const Conformer *conf = nullptr;
if (mol.getNumConformers() && (flags & SmilesWrite::CX_COORDS)) {
conf = &mol.getConformer();
}
std::map<int, std::unique_ptr<RDKit::Chirality::WedgeInfoBase>> wedgeBonds;
if (flags & SmilesWrite::CXSmilesFields::CX_BOND_CFG) {
wedgeBonds = Chirality::pickBondsToWedge(mol, nullptr, conf);
bool includeCoords = flags & SmilesWrite::CXSmilesFields::CX_COORDS &&
mol.getNumConformers();
const auto cfgblock = get_bond_config_block(mol, atomOrder, bondOrder,
includeCoords, wedgeBonds);
appendToCXExtension(cfgblock, res);
const auto cistransblock =
get_ringbond_cistrans_block(mol, atomOrder, bondOrder);
appendToCXExtension(cistransblock, res);
}
// do the CX_BOND_ATROPISOMER only if CX_BOND_CFG s not done. CX_BOND_CFG
// includes the atropisomer wedging
else if (flags & SmilesWrite::CXSmilesFields::CX_BOND_ATROPISOMER) {
Atropisomers::wedgeBondsFromAtropisomers(mol, conf, wedgeBonds);
const auto cfgblock = get_bond_config_block(
mol, atomOrder, bondOrder, conf != nullptr, wedgeBonds, true);
appendToCXExtension(cfgblock, res);
}
if (flags & SmilesWrite::CXSmilesFields::CX_COORDINATE_BONDS) {
const auto block = get_coordbonds_block(mol, atomOrder, bondOrder);
appendToCXExtension(block, res);
}
if (flags & SmilesWrite::CXSmilesFields::CX_LINKNODES) {
const auto linknodeblock = get_linknodes_block(mol, atomOrder);
appendToCXExtension(linknodeblock, res);
}
if (flags & SmilesWrite::CXSmilesFields::CX_ENHANCEDSTEREO) {
const auto stereoblock =
get_enhanced_stereo_block(mol, atomOrder, wedgeBonds);
appendToCXExtension(stereoblock, res);
}
if (flags & SmilesWrite::CXSmilesFields::CX_SGROUPS) {
const auto sgroupdatablock = get_sgroup_data_block(mol, atomOrder);
appendToCXExtension(sgroupdatablock, res);
}
if (flags & SmilesWrite::CXSmilesFields::CX_POLYMER) {
const auto sgrouppolyblock =
get_sgroup_polymer_block(mol, atomOrder, bondOrder);
appendToCXExtension(sgrouppolyblock, res);
}
if (flags & (SmilesWrite::CXSmilesFields::CX_SGROUPS |
SmilesWrite::CXSmilesFields::CX_POLYMER)) {
const auto sgrouphierarchyblock = get_sgroup_hierarchy_block(mol);
appendToCXExtension(sgrouphierarchyblock, res);
}
mol.clearProp("_cxsmilesOutputIndex");
if (res.size() > 1) {
res += "|";
} else {
res = "";
}
return res;
}
} // namespace SmilesWrite
} // namespace RDKit