rdkit/Code/GraphMol/SmilesParse/CXSmilesOps.cpp

//
//  Copyright (C) 2016-2021 Greg Landrum and other RDKit contributors
//
//   @@ All Rights Reserved @@
//  This file is part of the RDKit.
//  The contents are covered by the terms of the BSD license
//  which is included in the file license.txt, found at the root
//  of the RDKit source tree.
//
#include <RDGeneral/BoostStartInclude.h>
#include <boost/algorithm/string.hpp>
#include <boost/lexical_cast.hpp>
#include <boost/format.hpp>
#include <RDGeneral/BoostEndInclude.h>
#include <GraphMol/RDKitBase.h>
#include <GraphMol/RDKitQueries.h>
#include <GraphMol/FileParsers/MolFileStereochem.h>
#include <GraphMol/Atropisomers.h>
#include <GraphMol/Chirality.h>

#include <iostream>
#include <algorithm>
#include "SmilesWrite.h"
#include "SmilesParse.h"
#include "SmilesParseOps.h"
#include <GraphMol/MolEnumerator/LinkNode.h>
#include <GraphMol/Chirality.h>
#include <map>

namespace SmilesParseOps {
using namespace RDKit;

// Name of the SubstanceGroup property that records the position of an SGroup
// within the CXSMILES extension text. It is set by the SgD/Sg parsers below
// and used by find_matching_sgroup() to locate SGroups again.
const std::string cxsmilesindex = "_cxsmilesindex";
// NOTE(review): not referenced in the visible part of this file; presumably
// used by code further down or in another translation unit - confirm.
const std::string cxsgTracker = "_sgTracker";

// FIX: once this can be automated using constexpr, do so
// Pseudoatom names, and the "_p"-suffixed spellings that appear as atom
// labels. processCXSmilesLabels() strips the suffix from the latter and stores
// the result as the atom's dummy label.
const std::vector<std::string_view> pseudoatoms{"Pol", "Mod"};
const std::vector<std::string_view> pseudoatoms_p{"Pol_p", "Mod_p"};

// Maps the type token of a CXSMILES "Sg:" record to the type string used to
// construct the SubstanceGroup. "alt", "ran" and "blk" all map to "COP";
// parse_polymer_sgroup() keeps them apart via the SUBTYPE property.
std::map<std::string, std::string> sgroupTypemap = {
    {"n", "SRU"},   {"mon", "MON"}, {"mer", "MER"}, {"co", "COP"},
    {"xl", "CRO"},  {"mod", "MOD"}, {"mix", "MIX"}, {"f", "FOR"},
    {"any", "ANY"}, {"gen", "GEN"}, {"c", "COM"},   {"grf", "GRA"},
    {"alt", "COP"}, {"ran", "COP"}, {"blk", "COP"}};

template <typename Q>
void addquery(Q *qry, std::string symbol, RDKit::RWMol &mol, unsigned int idx) {
  PRECONDITION(qry, "bad query");
  auto *qa = new QueryAtom(0);
  qa->setQuery(qry);
  qa->setNoImplicit(true);
  mol.replaceAtom(idx, qa);
  if (symbol != "") {
    mol.getAtomWithIdx(idx)->setProp(RDKit::common_properties::atomLabel,
                                     symbol);
  }
  delete qa;
}

// Converts the atom labels read from a CXSMILES extension into query atoms or
// dummy labels. The work is done at most once per molecule: a computed
// property on the molecule records that it has already happened.
void processCXSmilesLabels(RWMol &mol) {
  const std::string processedFlag = "_cxsmilesLabelsProcessed";
  if (mol.hasProp(processedFlag)) {
    return;
  }
  for (auto atom : mol.atoms()) {
    std::string label;
    if (!atom->getPropIfPresent(common_properties::atomLabel, label)) {
      // no label: a plain, non-query "*" from the SMILES itself is still
      // turned into an "A" query (see the AH_p discussion below)
      if (atom->getAtomicNum() == 0 && !atom->hasQuery() &&
          atom->getSymbol() == "*") {
        addquery(makeAAtomQuery(), "", mol, atom->getIdx());
      }
      continue;
    }

    atom->clearProp(common_properties::dummyLabel);
    // addquery() replaces the atom in the molecule, so grab the index up
    // front and do not touch `atom` after any of the calls below
    const auto where = atom->getIdx();
    if (label == "star_e") {
      /* according to the MDL spec, these match anything, but in MARVIN they
      are "unspecified end groups" for polymers */
      addquery(makeAtomNullQuery(), label, mol, where);
    } else if (label == "Q_e") {
      addquery(makeQAtomQuery(), label, mol, where);
    } else if (label == "QH_p") {
      addquery(makeQHAtomQuery(), label, mol, where);
    } else if (label == "AH_p") {  // this seems wrong...
      /* According to the MARVIN Sketch, AH is "any atom, including H" -
      this would be "*" in SMILES - and "A" is "any atom except H".
      The CXSMILES docs say that "A" can be represented normally in SMILES
      and that "AH" needs to be written out as AH_p. We parse it the way the
      docs describe it, which is why an unlabeled "*" gets the special
      treatment above. */
      addquery(makeAHAtomQuery(), label, mol, where);
    } else if (label == "X_p") {
      addquery(makeXAtomQuery(), label, mol, where);
    } else if (label == "XH_p") {
      addquery(makeXHAtomQuery(), label, mol, where);
    } else if (label == "M_p") {
      addquery(makeMAtomQuery(), label, mol, where);
    } else if (label == "MH_p") {
      addquery(makeMHAtomQuery(), label, mol, where);
    } else if (std::find(pseudoatoms_p.begin(), pseudoatoms_p.end(), label) !=
               pseudoatoms_p.end()) {
      // pseudoatom: drop the trailing "_p" and move the name from the atom
      // label to the dummy label
      atom->setProp(common_properties::dummyLabel,
                    label.substr(0, label.size() - 2));
      atom->clearProp(common_properties::atomLabel);
    }
  }
  mol.setProp(processedFlag, 1, true);
}

namespace parser {

// Names of the temporary SubstanceGroup properties in which
// parse_polymer_sgroup() stores the head/tail crossing lists. They are read
// and cleared again by finalizePolymerSGroup().
const std::string _headCrossings = "_headCrossings";
const std::string _tailCrossings = "_tailCrossings";

// Reads a run of decimal digits starting at `first` into `res`.
//
//   first - current read position; advanced past the digits that were read
//   last  - one-past-the-end of the input; never dereferenced
//   res   - receives the parsed value; left untouched if nothing was read
//
// Returns false (without moving `first`) if there is no digit at `first`.
// boost::lexical_cast throws if the digit string does not fit in an
// unsigned int.
template <typename Iterator>
bool read_int(Iterator &first, Iterator last, unsigned int &res) {
  std::string num;
  // strictly less-than: `last` is the end iterator and must not be read
  while (first < last && *first >= '0' && *first <= '9') {
    num += *first;
    ++first;
  }
  if (num.empty()) {
    return false;
  }
  res = boost::lexical_cast<unsigned int>(num);
  return true;
}
// Reads a `sep`-separated list of unsigned integers and appends them to `res`.
//
//   first - current read position; left on the first character that is
//           neither a digit nor `sep` (or at `last`)
//   last  - one-past-the-end of the input; never dereferenced
//   res   - values are appended; existing contents are kept
//   sep   - the separator character
//
// Empty fields are tolerated and skipped, so "6,0," yields {6, 0} and an
// empty list is not an error. Always returns true; boost::lexical_cast throws
// if a value does not fit in an unsigned int.
template <typename Iterator>
bool read_int_list(Iterator &first, Iterator last,
                   std::vector<unsigned int> &res, char sep = ',') {
  while (true) {
    std::string num;
    // strictly less-than: `last` is the end iterator and must not be read
    while (first < last && *first >= '0' && *first <= '9') {
      num += *first;
      ++first;
    }
    if (!num.empty()) {
      res.push_back(boost::lexical_cast<unsigned int>(num));
    }
    if (first >= last || *first != sep) {
      break;
    }
    ++first;
  }
  return true;
}
// Reads two unsigned integers separated by a single `sep` character, e.g.
// "12.3", into `n1` and `n2`. `first` is advanced past whatever was consumed;
// the return value is false if either number or the separator is missing.
template <typename Iterator>
bool read_int_pair(Iterator &first, Iterator last, unsigned int &n1,
                   unsigned int &n2, char sep = '.') {
  const bool gotLeft = read_int(first, last, n1);
  // only look for the separator if the first number was there
  const bool gotSep = gotLeft && first < last && *first == sep;
  if (!gotSep) {
    return false;
  }
  ++first;
  return read_int(first, last, n2);
}

// Reads text starting at `first` up to (but not including) the first
// character contained in `delims`, or up to `last` if no delimiter is found.
//
//   first  - current read position; left on the delimiter (or at `last`)
//   last   - one-past-the-end of the input; never dereferenced
//   delims - the set of characters that terminate the token
//
// Escapes of the form "&#NNN;" are replaced in the result by the character
// with decimal code NNN; an escaped delimiter therefore does not end the
// token. An escape with no digits ("&#;") contributes nothing.
//
// Throws SmilesParseException if an escape is not terminated by ';'.
template <typename Iterator>
std::string read_text_to(Iterator &first, Iterator last, std::string delims) {
  std::string res;
  Iterator start = first;
  // EFF: there are certainly faster ways to do this
  // strictly less-than: `last` is the end iterator and must neither be read
  // nor stepped over
  while (first < last && delims.find_first_of(*first) == std::string::npos) {
    if (*first == '&' && std::distance(first, last) > 2 &&
        *(first + 1) == '#') {
      // escaped char: flush the plain text collected so far
      if (start != first) {
        res += std::string(start, first);
      }
      Iterator next = first + 2;
      while (next != last && *next >= '0' && *next <= '9') {
        ++next;
      }
      if (next == last || *next != ';') {
        throw RDKit::SmilesParseException(
            "failure parsing CXSMILES extensions: quoted block not terminated "
            "with ';'");
      }
      if (next > first + 2) {
        std::string blk = std::string(first + 2, next);
        res += static_cast<char>(boost::lexical_cast<int>(blk));
      }
      // continue after the ';'
      first = next + 1;
      start = first;
    } else {
      ++first;
    }
  }
  if (start != first) {
    res += std::string(start, first);
  }
  return res;
}
namespace {

// this is the super fun case where no information about bonds in/out of the
// sgroup is present.
// Derives crossing bonds for a polymer SGroup from the molecular graph:
// bonds from the first SGroup atom to atoms outside the group become head
// crossings, bonds from the last SGroup atom to outside atoms become tail
// crossings. For a single-atom SGroup the first external bond is the head
// crossing, the second the tail crossing, and further ones are ignored with a
// warning.
//
// Throws SmilesParseException if the SGroup has no atoms.
void setupUnmarkedPolymerSGroup(RWMol &mol, SubstanceGroup &sgroup,
                                std::vector<unsigned int> &headCrossings,
                                std::vector<unsigned int> &tailCrossings) {
  const auto &members = sgroup.getAtoms();
  if (members.empty()) {
    throw SmilesParseException("no atoms in polymer sgroup");
  }
  const bool singleAtom = members.size() == 1;
  const auto isOutside = [&members](unsigned int aidx) {
    return std::find(members.begin(), members.end(), aidx) == members.end();
  };

  const auto headAtom = mol.getAtomWithIdx(members.front());
  for (auto nbr : boost::make_iterator_range(mol.getAtomNeighbors(headAtom))) {
    const auto other = mol[nbr];
    if (!isOutside(other->getIdx())) {
      continue;
    }
    const auto crossingIdx = [&]() {
      return mol.getBondBetweenAtoms(headAtom->getIdx(), other->getIdx())
          ->getIdx();
    };
    // in most cases we just add this to the set of headCrossings.
    // The exception occurs when there's only one atom in the SGroup and
    //  we already have a headCrossing, in which case we may put this one
    //  as a tailCrossing
    if (!singleAtom || headCrossings.empty()) {
      headCrossings.push_back(crossingIdx());
    } else if (tailCrossings.empty()) {
      tailCrossings.push_back(crossingIdx());
    } else {
      BOOST_LOG(rdWarningLog)
          << " single atom polymer Sgroup has more than two bonds to "
             "external atoms. Ignoring all bonds after the first two."
          << std::endl;
    }
  }

  if (singleAtom) {
    return;
  }
  const auto tailAtom = mol.getAtomWithIdx(members.back());
  for (auto nbr : boost::make_iterator_range(mol.getAtomNeighbors(tailAtom))) {
    const auto other = mol[nbr];
    if (isOutside(other->getIdx())) {
      tailCrossings.push_back(
          mol.getBondBetweenAtoms(tailAtom->getIdx(), other->getIdx())
              ->getIdx());
    }
  }
}

// deal with setting up the crossing bonds, etc.
// deal with setting up the crossing bonds, etc.
//
// Normalizes the CONNECT property of a polymer SGroup ("hh"/"ht"/"eu",
// optionally followed by ",f" for "flipped") to "HH"/"HT"/"EU", then turns
// the temporary head/tail crossing lists into the SGroup's bonds and its
// XBHEAD and XBCORR properties. If no crossings were given, they are derived
// from the graph with setupUnmarkedPolymerSGroup(); if that finds none
// either, only CONNECT is set.
void finalizePolymerSGroup(RWMol &mol, SubstanceGroup &sgroup) {
  bool isFlipped = false;
  // The default has to be spelled the way CXSMILES spells it (lower case):
  // an upper-case default would not match any branch below and would produce
  // an "unrecognized CONNECT value" warning for every SGroup without CONNECT.
  std::string connect = "eu";
  if (sgroup.getPropIfPresent("CONNECT", connect)) {
    if (connect.find(",f") != std::string::npos) {
      isFlipped = true;
      boost::replace_all(connect, ",f", "");
    }
  }
  if (connect == "hh") {
    connect = "HH";
  } else if (connect == "ht") {
    connect = "HT";
  } else if (connect == "eu") {
    connect = "EU";
  } else {
    BOOST_LOG(rdWarningLog) << "unrecognized CXSMILES CONNECT value: '"
                            << connect << "'. Assuming 'eu'" << std::endl;
    connect = "EU";
  }
  sgroup.setProp("CONNECT", connect);

  // pick up (and remove) the crossing lists left by the parser
  std::vector<unsigned int> headCrossings;
  std::vector<unsigned int> tailCrossings;
  sgroup.getPropIfPresent(_headCrossings, headCrossings);
  sgroup.clearProp(_headCrossings);
  sgroup.getPropIfPresent(_tailCrossings, tailCrossings);
  sgroup.clearProp(_tailCrossings);
  if (headCrossings.empty() && tailCrossings.empty()) {
    setupUnmarkedPolymerSGroup(mol, sgroup, headCrossings, tailCrossings);
  }
  if (headCrossings.empty() && tailCrossings.empty()) {
    // we tried... nothing more we can do
    return;
  }

  for (const auto bondIdx : headCrossings) {
    sgroup.addBondWithIdx(bondIdx);
  }
  sgroup.setProp("XBHEAD", headCrossings);

  for (const auto bondIdx : tailCrossings) {
    sgroup.addBondWithIdx(bondIdx);
  }

  // now we can setup XBCORR: head/tail crossings are paired up in order, or
  // with the tail list reversed if the SGroup is flipped
  std::vector<unsigned int> xbcorr;
  const auto nPairs = std::min(headCrossings.size(), tailCrossings.size());
  for (std::size_t i = 0; i < nPairs; ++i) {
    const unsigned headIdx = headCrossings[i];
    const unsigned tailIdx =
        isFlipped ? tailCrossings[tailCrossings.size() - i - 1]
                  : tailCrossings[i];
    xbcorr.push_back(headIdx);
    xbcorr.push_back(tailIdx);
  }
  sgroup.setProp("XBCORR", xbcorr);
}

// Returns the first bond whose "_cxsmilesBondIdx" property equals `idx`, or
// nullptr if no bond carries that value.
Bond *get_bond_with_smiles_idx(const ROMol &mol, unsigned idx) {
  for (auto candidate : mol.bonds()) {
    unsigned int tag;
    if (!candidate->getPropIfPresent("_cxsmilesBondIdx", tag)) {
      continue;
    }
    if (tag == idx) {
      return candidate;
    }
  }
  return nullptr;
}

}  // end of anonymous namespace

// we use this pattern a lot and it's a long function call, but a very short
// #define
//
// VALID_ATIDX is true if the atom index taken from the CXSMILES text falls
// into [startAtomIdx, startAtomIdx + number of atoms in mol). It relies on
// variables named `mol` and `startAtomIdx` being in scope at the point of
// use. Subtract startAtomIdx to get the index within `mol`.
#define VALID_ATIDX(_atidx_) \
  ((_atidx_) >= startAtomIdx && (_atidx_) < startAtomIdx + mol.getNumAtoms())

// The same test for bond indices; relies on `mol` and `startBondIdx` being in
// scope at the point of use.
#define VALID_BNDIDX(_bidx_) \
  ((_bidx_) >= startBondIdx && (_bidx_) < startBondIdx + mol.getNumBonds())

template <typename Iterator>
bool parse_atom_values(Iterator &first, Iterator last, RDKit::RWMol &mol,
                       unsigned int startAtomIdx) {
  if (first >= last || *first != ':') {
    return false;
  }
  ++first;
  unsigned int atIdx = 0;
  while (first <= last && *first != '$') {
    std::string tkn = read_text_to(first, last, ";$");
    if (tkn != "" && VALID_ATIDX(atIdx)) {
      mol.getAtomWithIdx(atIdx)->setProp(RDKit::common_properties::molFileValue,
                                         tkn);
    }
    ++atIdx;
    if (first <= last && *first != '$') {
      ++first;
    }
  }
  if (first >= last || *first != '$') {
    return false;
  }
  ++first;
  return true;
}

// Parses a list of atom properties of the form
//     idx.name.value:idx.name.value:...
// and sets each value as a string property on the corresponding atom. The
// list ends at a '|' or ',' (or at the end of the input).
//
//   first        - start of the first record; on return it is positioned
//                  after a terminating ',' or on a terminating '|'
//   last         - one-past-the-end of the input; never dereferenced
//   startAtomIdx - CXSMILES index of the first atom of `mol`; records for
//                  atoms outside this molecule are parsed but ignored
//
// Records with an empty name are skipped, records with an empty value are
// ignored. Returns false if there is no input or a '.' separator is missing.
template <typename Iterator>
bool parse_atom_props(Iterator &first, Iterator last, RDKit::RWMol &mol,
                      unsigned int startAtomIdx) {
  if (first >= last) {
    return false;
  }
  while (first < last && *first != '|' && *first != ',') {
    unsigned int atIdx;
    if (read_int(first, last, atIdx)) {
      if (first >= last || *first != '.') {
        return false;
      }
      ++first;
      std::string pname = read_text_to(first, last, ".");
      if (!pname.empty()) {
        if (first >= last || *first != '.') {
          return false;
        }
        ++first;
        std::string pval = read_text_to(first, last, ":|,");
        if (VALID_ATIDX(atIdx) && !pval.empty()) {
          mol.getAtomWithIdx(atIdx - startAtomIdx)->setProp(pname, pval);
        }
      }
    }
    // step over the record separator (or any character that could not be
    // interpreted), but leave a terminator in place
    if (first < last && *first != '|' && *first != ',') {
      ++first;
    }
  }
  // At this point we are either at the end of the input or on a terminator.
  // A ',' is consumed, a '|' is left for the caller; the end of the input is
  // never read.
  if (first < last && *first != '|') {
    ++first;
  }
  return true;
}

// Parses an atom label block: '$', then ';'-separated labels (one per atom,
// possibly empty), then a closing '$'. Non-empty labels are stored as the
// atomLabel property of the corresponding atom.
//
//   first        - must point at the opening '$'; on success it is left just
//                  after the closing '$'
//   last         - one-past-the-end of the input; never dereferenced
//   startAtomIdx - CXSMILES index of the first atom of `mol`; labels for
//                  positions outside this molecule are skipped
//
// Returns false if the opening or closing '$' is missing.
template <typename Iterator>
bool parse_atom_labels(Iterator &first, Iterator last, RDKit::RWMol &mol,
                       unsigned int startAtomIdx) {
  if (first >= last || *first != '$') {
    return false;
  }
  ++first;
  unsigned int atIdx = 0;
  while (first < last && *first != '$') {
    std::string tkn = read_text_to(first, last, ";$");
    if (!tkn.empty() && VALID_ATIDX(atIdx)) {
      mol.getAtomWithIdx(atIdx - startAtomIdx)
          ->setProp(RDKit::common_properties::atomLabel, tkn);
    }
    ++atIdx;
    // step over the ';' separator, but leave the closing '$' in place
    if (first < last && *first != '$') {
      ++first;
    }
  }
  if (first >= last || *first != '$') {
    return false;
  }
  ++first;
  return true;
}

// Parses a coordinate block: '(' followed by ';'-separated "x,y,z" triples
// (one per atom; individual components may be left empty and then default to
// zero) and a closing ')'. The coordinates are stored in a new conformer with
// id `confIdx`.
//
//   first        - must point at the '('; on success it is left just after
//                  the closing ')'
//   last         - one-past-the-end of the input; never dereferenced
//   startAtomIdx - CXSMILES index of the first atom of `mol`; coordinates for
//                  positions outside this molecule are skipped
//
// The conformer is flagged as 3D only if a z component was given and it has
// a non-zero z coordinate. Returns false if the opening or closing
// parenthesis is missing; note that in the latter case the conformer has
// already been added to the molecule. boost::lexical_cast throws on a
// component that is not a number.
template <typename Iterator>
bool parse_coords(Iterator &first, Iterator last, RDKit::RWMol &mol,
                  unsigned int startAtomIdx, unsigned int confIdx) {
  if (first >= last || *first != '(') {
    return false;
  }

  // the molecule takes the conformer over right away
  auto *conf = new Conformer(mol.getNumAtoms());
  mol.addConformer(conf);
  conf->setId(confIdx);
  ++first;
  unsigned int atIdx = 0;
  bool is3D = false;
  while (first < last && *first != ')') {
    RDGeom::Point3D pt;
    std::string tkn = read_text_to(first, last, ";)");
    if (VALID_ATIDX(atIdx)) {
      if (!tkn.empty()) {
        std::vector<std::string> tokens;
        boost::split(tokens, tkn, boost::is_any_of(std::string(",")));
        if (tokens.size() >= 1 && tokens[0].size()) {
          pt.x = boost::lexical_cast<double>(tokens[0]);
        }
        if (tokens.size() >= 2 && tokens[1].size()) {
          pt.y = boost::lexical_cast<double>(tokens[1]);
        }
        if (tokens.size() >= 3 && tokens[2].size()) {
          pt.z = boost::lexical_cast<double>(tokens[2]);
          is3D = true;
        }
      }

      conf->setAtomPos(atIdx - startAtomIdx, pt);
    }
    ++atIdx;
    // step over the ';' separator, but leave the closing ')' in place
    if (first < last && *first != ')') {
      ++first;
    }
  }
  // make sure that the conformer really is 3D!
  conf->set3D(is3D && hasNonZeroZCoords(*conf));
  if (first >= last || *first != ')') {
    return false;
  }
  ++first;
  return true;
}

// Parses a "C:" or "H:" block: a comma-separated list of "atomIdx.bondIdx"
// pairs. Each listed bond gets the bond type `typ`, and is re-oriented if
// necessary so that the listed atom is its begin atom.
//
//   first        - must point at the 'C' or 'H'; left after the last pair
//   last         - one-past-the-end of the input; never dereferenced
//   typ          - bond type to assign (chosen by the caller)
//   startAtomIdx - CXSMILES index of the first atom of `mol`
//   startBondIdx - CXSMILES index of the first bond of `mol`
//
// Pairs referring to atoms or bonds outside this molecule are skipped.
// Returns false if the block is malformed or if a bond cannot be found or
// does not involve the given atom.
template <typename Iterator>
bool parse_coordinate_bonds(Iterator &first, Iterator last, RDKit::RWMol &mol,
                            Bond::BondType typ, unsigned int startAtomIdx,
                            unsigned int startBondIdx) {
  if (first >= last || (*first != 'C' && *first != 'H')) {
    return false;
  }
  ++first;
  if (first >= last || *first != ':') {
    return false;
  }
  ++first;
  // strictly less-than: `last` is the end iterator and must not be read
  while (first < last && *first >= '0' && *first <= '9') {
    unsigned int aidx;
    unsigned int bidx;
    if (!read_int_pair(first, last, aidx, bidx)) {
      return false;
    }
    if (VALID_ATIDX(aidx) && VALID_BNDIDX(bidx)) {
      const unsigned int localAtomIdx = aidx - startAtomIdx;
      auto bnd = get_bond_with_smiles_idx(mol, bidx - startBondIdx);
      if (!bnd || (bnd->getBeginAtomIdx() != localAtomIdx &&
                   bnd->getEndAtomIdx() != localAtomIdx)) {
        BOOST_LOG(rdWarningLog) << "BOND NOT FOUND! " << bidx
                                << " involving atom " << aidx << std::endl;
        return false;
      }
      bnd->setBondType(typ);
      if (bnd->getBeginAtomIdx() != localAtomIdx) {
        // make the listed atom the begin atom of the bond
        unsigned int tmp = bnd->getBeginAtomIdx();
        bnd->setBeginAtomIdx(localAtomIdx);
        bnd->setEndAtomIdx(tmp);
      }
    }
    if (first < last && *first == ',') {
      ++first;
    }
  }
  return true;
}

template <typename Iterator>
bool parse_unsaturation(Iterator &first, Iterator last, RDKit::RWMol &mol,
                        unsigned int startAtomIdx) {
  if (first + 1 >= last || *first != 'u') {
    return false;
  }
  ++first;
  if (first >= last || *first != ':') {
    return false;
  }
  ++first;
  while (first < last && *first >= '0' && *first <= '9') {
    unsigned int idx;
    if (!read_int(first, last, idx)) {
      return false;
    }
    if (VALID_ATIDX(idx)) {
      auto atom = mol.getAtomWithIdx(idx - startAtomIdx);
      if (!atom->hasQuery()) {
        atom = QueryOps::replaceAtomWithQueryAtom(&mol, atom);
      }
      atom->expandQuery(makeAtomUnsaturatedQuery(), Queries::COMPOSITE_AND);
    }
    if (first < last && *first == ',') {
      ++first;
    }
  }
  return true;
}

// Parses an "rb:" block: a comma-separated list of "atomIdx:value" records,
// where value is '*' or one of 0, 2, 3, 4. For every record whose atom
// belongs to this molecule, a ring-bond-count query is AND-ed onto the atom
// (which is converted to a query atom first if necessary).
//
// Returns false if the "rb:" prefix is missing, a record is malformed, or the
// value is not one of the recognized ones.
template <typename Iterator>
bool parse_ring_bonds(Iterator &first, Iterator last, RDKit::RWMol &mol,
                      unsigned int startAtomIdx) {
  if (first >= last || *first != 'r' || first + 1 >= last ||
      *(first + 1) != 'b' || first + 2 >= last || *(first + 2) != ':') {
    return false;
  }
  first += 3;
  while (first < last && *first >= '0' && *first <= '9') {
    unsigned int n1;
    if (!read_int(first, last, n1)) {
      return false;
    }
    // check that we can read at least two more characters:
    if (first + 1 >= last || *first != ':') {
      return false;
    }
    ++first;
    unsigned int n2;
    bool gt = false;
    if (*first == '*') {
      // '*' is encoded with a sentinel count, and the molecule is flagged as
      // needing a query scan.
      // NOTE(review): presumably the scan replaces the sentinel with the
      // actual ring bond count later on - confirm against the consumer of
      // _NeedsQueryScan.
      ++first;
      n2 = 0xDEADBEEF;
      if (VALID_ATIDX(n1)) {
        mol.setProp(common_properties::_NeedsQueryScan, 1);
      }
    } else {
      if (!read_int(first, last, n2)) {
        return false;
      }
      switch (n2) {
        case 0:
        case 2:
        case 3:
          // exact ring bond counts
          break;
        case 4:
          // 4 is handled with an inequality query below rather than an exact
          // match
          gt = true;
          break;
        default:
          BOOST_LOG(rdWarningLog)
              << "unrecognized rb value: " << n2 << std::endl;
          return false;
      }
    }
    if (VALID_ATIDX(n1)) {
      auto atom = mol.getAtomWithIdx(n1 - startAtomIdx);
      if (!atom->hasQuery()) {
        // the replacement invalidates the old pointer; continue with the
        // returned atom
        atom = QueryOps::replaceAtomWithQueryAtom(&mol, atom);
      }
      if (!gt) {
        atom->expandQuery(makeAtomRingBondCountQuery(n2),
                          Queries::COMPOSITE_AND);
      } else {
        // NOTE(review): a less-or-equal query with value n2 is used for the
        // "4" case; presumably this matches atoms with at least n2 ring bonds
        // given the argument order of RDKit's comparison queries - confirm.
        auto q = static_cast<ATOM_EQUALS_QUERY *>(new ATOM_LESSEQUAL_QUERY);
        q->setVal(n2);
        q->setDescription("AtomRingBondCount");
        q->setDataFunc(queryAtomRingBondCount);
        atom->expandQuery(q, Queries::COMPOSITE_AND);
      }
    }
    if (first < last && *first == ',') {
      ++first;
    }
  }
  return true;
}

// Parses an "LN:" (link node) block and stores the result in the molecule's
// molFileLinkNodes property.
//
//   first        - must point at the 'L'; left after the last record
//   last         - one-past-the-end of the input
//   startAtomIdx - CXSMILES index of the first atom of `mol`; records for
//                  atoms outside this molecule are parsed but not stored
//
// Returns false if the prefix is missing, a record is malformed, or the outer
// atoms are omitted for a link atom that does not have exactly two neighbors.
template <typename Iterator>
bool parse_linknodes(Iterator &first, Iterator last, RDKit::RWMol &mol,
                     unsigned int startAtomIdx) {
  // these look like: |LN:1:1.3.2.6,4:1.4.3.6|
  // that's two records:
  //   1:1.3.2.6: 1-3 repeats, atom 1-2, 1-6
  //   4:1.4.3.6: 1-4 repeats, atom 4-3, 4-6
  // which maps to the property value "1 3 2 2 3 2 7|1 4 2 5 4 5 7"
  // If the linking atom only has two neighbors then the outer atom
  // specification (the last two digits) can be left out. So for a molecule
  // where atom 1 has bonds only to atoms 2 and 6 we could have
  // |LN:1:1.3|
  // instead of
  // |LN:1:1.3.2.6|
  if (first >= last || *first != 'L' || first + 1 >= last ||
      *(first + 1) != 'N' || first + 2 >= last || *(first + 2) != ':') {
    return false;
  }
  first += 3;
  std::string accum;
  while (first < last && *first >= '0' && *first <= '9') {
    unsigned int atidx;
    if (!read_int(first, last, atidx)) {
      return false;
    }
    // check that we can read at least two more characters:
    if (first + 1 >= last || *first != ':') {
      return false;
    }
    ++first;
    unsigned int startReps;
    if (!read_int(first, last, startReps)) {
      return false;
    }
    if (first + 1 >= last || *first != '.') {
      return false;
    }
    ++first;
    unsigned int endReps;
    if (!read_int(first, last, endReps)) {
      return false;
    }
    // Both outer atoms are kept in CXSMILES numbering (i.e. including
    // startAtomIdx), whichever branch they come from, so that the conversion
    // below is the same for both.
    unsigned int idx1 = 0;
    unsigned int idx2 = 0;
    if (first < last && *first == '.') {
      ++first;
      if (!read_int(first, last, idx1)) {
        return false;
      }
      // the two outer atoms have to be separated by a '.', like the other
      // fields of the record
      if (first >= last || *first != '.') {
        return false;
      }
      ++first;
      if (!read_int(first, last, idx2)) {
        return false;
      }
      // NOTE(review): idx1/idx2 are not checked against the atom range of
      // this molecule.
    } else if (VALID_ATIDX(atidx) &&
               mol.getAtomWithIdx(atidx - startAtomIdx)->getDegree() == 2) {
      auto nbrs =
          mol.getAtomNeighbors(mol.getAtomWithIdx(atidx - startAtomIdx));
      // neighbor indices are relative to this molecule: shift them into
      // CXSMILES numbering
      idx1 = static_cast<unsigned int>(*nbrs.first) + startAtomIdx;
      ++nbrs.first;
      idx2 = static_cast<unsigned int>(*nbrs.first) + startAtomIdx;
    } else if (VALID_ATIDX(atidx)) {
      return false;
    }
    if (first < last && *first == ',') {
      ++first;
    }
    if (VALID_ATIDX(atidx)) {
      if (!accum.empty()) {
        accum += "|";
      }
      // the property uses 1-based indices relative to this molecule
      accum += (boost::format("%d %d 2 %d %d %d %d") % startReps % endReps %
                (atidx - startAtomIdx + 1) % (idx1 - startAtomIdx + 1) %
                (atidx - startAtomIdx + 1) % (idx2 - startAtomIdx + 1))
                   .str();
    }
  }
  if (!accum.empty()) {
    mol.setProp(common_properties::molFileLinkNodes, accum);
  }
  return true;
}

// Reads one ':'-terminated field of a data SGroup record and, if it is not
// empty and the SGroup is being kept, stores it as property `fieldName` of
// `sgroup`.
//
//   first        - start of the field; left after the field's ':' terminator,
//                  or on the '|' that ends the extension block
//   last         - one-past-the-end of the input; never dereferenced
//   keepSGroup   - if false, the field is consumed but not stored
//   fieldName    - name of the property to set
//   fieldIsArray - if true, the value is stored as a one-element
//                  std::vector<std::string> instead of a std::string
//
// Trailing fields may be missing altogether: if the input is exhausted or the
// closing '|' has been reached, nothing is consumed. The '|' also ends a
// field, so a record with missing trailing fields (e.g. "SgD:0:name:value|")
// does not swallow the end of the extension block.
template <typename Iterator>
void parse_data_sgroup_attr(Iterator &first, Iterator last,
                            SubstanceGroup &sgroup, bool keepSGroup,
                            std::string fieldName, bool fieldIsArray = false) {
  if (first >= last || *first == '|') {
    return;
  }
  std::string data = read_text_to(first, last, ":|");
  // step over the field separator only; a '|' is left for the caller
  if (first < last && *first == ':') {
    ++first;
  }
  if (data.empty() || !keepSGroup) {
    return;
  }
  if (fieldIsArray) {
    std::vector<std::string> dataFields = {data};
    sgroup.setProp(fieldName, dataFields);
  } else {
    sgroup.setProp(fieldName, data);
  }
}

// Parses an "SgD:" (data SGroup) record and, if at least one of its atoms
// belongs to this molecule, adds a "DAT" SubstanceGroup to `mol`.
//
//   first        - must point at the 'S' of "SgD:"; left after the record
//   last         - one-past-the-end of the input
//   startAtomIdx - CXSMILES index of the first atom of `mol`; atoms outside
//                  this molecule are dropped from the SGroup
//   nSGroups     - stored as the SGroup's cxsmilesindex property, which
//                  parse_sgroup_hierarchy() uses to refer to it
//
// Returns false if the "SgD:" prefix is missing.
template <typename Iterator>
bool parse_data_sgroup(Iterator &first, Iterator last, RDKit::RWMol &mol,
                       unsigned int startAtomIdx, unsigned int nSGroups) {
  // these look like: |SgD:2,1:FIELD:info::::|
  // example from CXSMILES docs:
  //    SgD:3,2,1,0:name:data:like:unit:t:(1.,1.)
  // the fields are:
  //    SgD:[atom indices]:[field name]:[data value]:[query
  //    operator]:[unit]:[tag]:[coords]
  //   coords are (-1) if atomic coordinates are present
  if (first >= last || *first != 'S' || first + 3 >= last ||
      *(first + 1) != 'g' || *(first + 2) != 'D' || *(first + 3) != ':') {
    return false;
  }
  first += 4;
  std::vector<unsigned int> atoms;
  if (!read_int_list(first, last, atoms)) {
    return false;
  }
  SubstanceGroup sgroup(&mol, std::string("DAT"));
  sgroup.setProp(cxsmilesindex, nSGroups);
  // the SGroup is only kept if at least one of its atoms is in this molecule
  bool keepSGroup = false;
  for (auto idx : atoms) {
    if (VALID_ATIDX(idx)) {
      keepSGroup = true;
      sgroup.addAtomWithIdx(idx - startAtomIdx);
    }
  }
  // step over the character following the atom list (the ':' in a
  // well-formed record); this is not checked
  ++first;

  // the remaining fields are positional, so the order of the calls below
  // follows the order of the fields in the record
  parse_data_sgroup_attr(first, last, sgroup, keepSGroup, "FIELDNAME");

  // FIX:
  if (keepSGroup) {
    sgroup.setProp("FIELDDISP", "    0.0000    0.0000    DR    ALL  0       0");
  }

  parse_data_sgroup_attr(first, last, sgroup, keepSGroup, "DATAFIELDS", true);

  parse_data_sgroup_attr(first, last, sgroup, keepSGroup, "QUERYOP");

  parse_data_sgroup_attr(first, last, sgroup, keepSGroup, "FIELDINFO");

  parse_data_sgroup_attr(first, last, sgroup, keepSGroup, "FIELDTAG");

  if (first < last && *first == '(') {
    // FIX
    // the text is read starting at the '(' itself, so the stored value
    // includes the opening parenthesis but not the closing one
    std::string coords = read_text_to(first, last, ")");
    ++first;
    if (keepSGroup) {
      sgroup.setProp("COORDS", coords);
    }
  }
  // the label processing can destroy sgroup info, so do that now
  // (the function will immediately return if already called)
  if (keepSGroup) {
    processCXSmilesLabels(mol);
    // "index" is the 1-based position the SGroup will have in the molecule
    sgroup.setProp<unsigned int>("index", getSubstanceGroups(mol).size() + 1);
    addSubstanceGroup(mol, sgroup);
  }
  return true;
}

namespace {
// Returns an iterator to the first SGroup in `sgs` whose cxsmilesindex
// property equals `targetId`, or sgs.end() if there is none. SGroups without
// that property never match.
std::vector<RDKit::SubstanceGroup>::iterator find_matching_sgroup(
    std::vector<RDKit::SubstanceGroup> &sgs, unsigned int targetId) {
  auto hit = sgs.begin();
  for (; hit != sgs.end(); ++hit) {
    unsigned int tag;
    if (hit->getPropIfPresent(cxsmilesindex, tag) && tag == targetId) {
      break;
    }
  }
  return hit;
}
}  // namespace
// Parses an "SgH:" (SGroup hierarchy) block and sets the PARENT property on
// the child SGroups.
//
//   first - must point at the 'S' of "SgH:"; left after the last record
//   last  - one-past-the-end of the input; never dereferenced
//
// Parents and children are given by their position in the CXSMILES text and
// are looked up via the cxsmilesindex property; PARENT is set to the parent's
// "index" property. Records whose parent is not part of this molecule are
// parsed but ignored, as are children that cannot be found.
//
// Returns false if the block is malformed. Throws SmilesParseException if a
// child id is not smaller than the number of SGroups in the molecule.
template <typename Iterator>
bool parse_sgroup_hierarchy(Iterator &first, Iterator last, RDKit::RWMol &mol) {
  // these look like: |SgH:1:0|
  // from CXSMILES docs:
  //    SgH:parentSgroupIndex1:childSgroupIndex1.childSgroupIndex2,parentSgroupIndex2:childSgroupIndex1
  if (first >= last || *first != 'S' || first + 3 >= last ||
      *(first + 1) != 'g' || *(first + 2) != 'H' || *(first + 3) != ':') {
    return false;
  }
  first += 4;
  auto &sgs = getSubstanceGroups(mol);
  while (true) {
    unsigned int parentId;
    if (!read_int(first, last, parentId)) {
      return false;
    }

    const auto psg = find_matching_sgroup(sgs, parentId);
    const bool validParent = psg != sgs.end();
    if (validParent) {
      // from here on parentId is the parent's "index" property rather than
      // its position in the CXSMILES text
      psg->getPropIfPresent("index", parentId);
    }
    if (first >= last || *first != ':') {
      return false;
    }
    ++first;
    std::vector<unsigned int> children;
    if (!read_int_list(first, last, children, '.')) {
      return false;
    }
    if (validParent) {
      for (auto childId : children) {
        if (childId >= sgs.size()) {
          throw SmilesParseException(
              "child id references non-existent SGroup");
        }
        auto csg = find_matching_sgroup(sgs, childId);
        if (csg != sgs.end()) {
          csg->setProp("PARENT", parentId);
        }
      }
    }
    // another record follows only after a ','
    if (first < last && *first == ',') {
      ++first;
    } else {
      break;
    }
  }

  return true;
}

template <typename Iterator>
bool parse_polymer_sgroup(Iterator &first, Iterator last, RDKit::RWMol &mol,
                          unsigned int startAtomIdx, unsigned int nSGroups) {
  // these look like:
  //    |Sg:n:6,1,2,4::hh&#44;f:6,0,:4,2,|
  // example from CXSMILES docs:
  // the fields are:
  //    Sg:[type]:[atom indices]:[subscript]:[superscript]:[head crossing
  //    bonds]:[tail crossing bonds]:
  //
  // note that it's legit for empty fields to be completely missing.
  //   for example, this doesn't have any crossing bonds indicated:
  // *-CCCN-* |$star_e;;;;;star_e$,Sg:n:4,1,2,3::hh|
  // this last bit makes the whole thing doubleplusfun to parse

  if (first >= last || *first != 'S' || first + 2 >= last ||
      *(first + 1) != 'g' || *(first + 2) != ':') {
    return false;
  }
  first += 3;

  std::string typ = read_text_to(first, last, ":");
  ++first;
  if (sgroupTypemap.find(typ) == sgroupTypemap.end()) {
    return false;
  }
  bool keepSGroup = false;
  SubstanceGroup sgroup(&mol, sgroupTypemap[typ]);
  sgroup.setProp(cxsmilesindex, nSGroups);
  if (typ == "alt") {
    sgroup.setProp("SUBTYPE", std::string("ALT"));
  } else if (typ == "ran") {
    sgroup.setProp("SUBTYPE", std::string("RAN"));
  } else if (typ == "blk") {
    sgroup.setProp("SUBTYPE", std::string("BLO"));
  }

  std::vector<unsigned int> atoms;
  if (!read_int_list(first, last, atoms)) {
    return false;
  }
  //++first;
  for (auto idx : atoms) {
    if (VALID_ATIDX(idx)) {
      sgroup.addAtomWithIdx(idx - startAtomIdx);
      keepSGroup = true;
    }
  }
  std::vector<unsigned int> headCrossing;
  std::vector<unsigned int> tailCrossing;
  if (first <= last && *first == ':') {
    ++first;
    std::string subscript = read_text_to(first, last, ":|");
    if (keepSGroup && !subscript.empty()) {
      sgroup.setProp("LABEL", subscript);
    }
    if (first <= last && *first == ':') {
      ++first;
      std::string superscript = read_text_to(first, last, ":|,");
      if (keepSGroup && !superscript.empty()) {
        sgroup.setProp("CONNECT", superscript);
      }

      if (first <= last && *first == ':') {
        ++first;
        if (!read_int_list(first, last, headCrossing)) {
          return false;
        }
        if (keepSGroup && !headCrossing.empty()) {
          for (auto &cidx : headCrossing) {
            if (VALID_ATIDX(cidx)) {
              cidx -= startAtomIdx;
            } else {
              keepSGroup = false;
              break;
            }
          }
          sgroup.setProp(_headCrossings, headCrossing, true);
        }
        if (first <= last && *first == ':') {
          ++first;
          if (!read_int_list(first, last, tailCrossing)) {
            return false;
          }
        }
        if (keepSGroup && !tailCrossing.empty()) {
          for (auto &cidx : tailCrossing) {
            if (VALID_ATIDX(cidx)) {
              cidx -= startAtomIdx;
            } else {
              keepSGroup = false;
              break;
            }
          }
          sgroup.setProp("_tailCrossings", tailCrossing, true);
        }
      }
    }
  }
  if (keepSGroup) {  // the label processing can destroy sgroup info, so do that
                     // now (the function will immediately return if already
                     // called)
    processCXSmilesLabels(mol);

    finalizePolymerSGroup(mol, sgroup);
    sgroup.setProp<unsigned int>("index", getSubstanceGroups(mol).size() + 1);

    addSubstanceGroup(mol, sgroup);
  }
  return true;
}

// Parses an "m:" (variable attachment point) block. For every record whose
// first atom belongs to this molecule, all bonds of that atom get the
// _MolFileBondEndPts property "(N a1 a2 ...)" (1-based indices relative to
// this molecule) and the _MolFileBondAttach property "ANY".
//
// Returns false if the prefix is missing, a record is malformed, or the first
// atom of a record does not have exactly one bond.
template <typename Iterator>
bool parse_variable_attachments(Iterator &first, Iterator last,
                                RDKit::RWMol &mol, unsigned int startAtomIdx) {
  // these look like: CO*.C1=CC=NC=C1 |m:2:3.5.4|
  // that corresponds to replacing the bond to atom 2 with bonds to atom 3, 5,
  // or 4
  //
  if (first >= last || *first != 'm' || first + 1 >= last ||
      *(first + 1) != ':') {
    return false;
  }
  first += 2;

  while (first < last && *first >= '0' && *first <= '9') {
    unsigned int at1idx;
    if (!read_int(first, last, at1idx)) {
      return false;
    }
    // nothing below changes the number of atoms, so this stays valid for the
    // whole record
    const bool anchorInMol = VALID_ATIDX(at1idx);

    if (anchorInMol &&
        mol.getAtomWithIdx(at1idx - startAtomIdx)->getDegree() != 1) {
      BOOST_LOG(rdWarningLog)
          << "position variation bond to atom with more than one bond"
          << std::endl;
      return false;
    }
    if (first >= last || *first != ':') {
      BOOST_LOG(rdWarningLog) << "improperly formatted m: block" << std::endl;
      return false;
    }
    ++first;

    // collect the possible end points, already converted to 1-based indices
    std::vector<std::string> others;
    while (first < last && *first >= '0' && *first <= '9') {
      unsigned int aidx;
      if (!read_int(first, last, aidx)) {
        return false;
      }
      if (VALID_ATIDX(aidx)) {
        others.push_back(std::to_string(aidx - startAtomIdx + 1));
      }
      if (first < last && *first == '.') {
        ++first;
      }
    }

    if (anchorInMol) {
      std::string endPts = "(";
      endPts += std::to_string(others.size());
      for (const auto &other : others) {
        endPts += " ";
        endPts += other;
      }
      endPts += ")";

      const auto anchor = mol.getAtomWithIdx(at1idx - startAtomIdx);
      for (auto nbri : boost::make_iterator_range(mol.getAtomBonds(anchor))) {
        auto bnd = mol[nbri];
        bnd->setProp(common_properties::_MolFileBondEndPts, endPts);
        bnd->setProp(common_properties::_MolFileBondAttach, std::string("ANY"));
      }
    }
    if (first < last && *first == ',') {
      ++first;
    }
  }
  return true;
}

// Parses a wedged-bond block ("w:", "wU:" or "wD:") consisting of
// comma-separated <atomIdx>.<bondIdx> pairs.
//
// For every pair where both indices pass VALID_ATIDX / VALID_BNDIDX the bond
// is looked up by its SMILES index, re-oriented if necessary so that the named
// atom is its begin atom, and then given the matching bond direction and
// _MolFileBondCfg value (w -> UNKNOWN/2, wU -> BEGINWEDGE/1,
// wD -> BEGINDASH/3).
//
// Returns false if the block is malformed, if an integer cannot be read, if
// the bond cannot be found, if the bond already carries _MolFileBondCfg, or if
// the atom is neither end of the bond.
template <typename Iterator>
bool parse_wedged_bonds(Iterator &first, Iterator last, RDKit::RWMol &mol,
                        unsigned int startAtomIdx, unsigned int startBondIdx) {
  // these look like: CC(O)Cl |w:1.0|
  // also wD and wU for down and up wedges.
  //
  // We do not end up using this to set stereochemistry, but the relevant bond
  // properties are set in case client code wants to do something with the
  // information.
  if (first >= last || *first != 'w' || first + 1 >= last) {
    return false;
  }
  ++first;
  Bond::BondDir state = Bond::BondDir::NONE;
  unsigned int cfg = 0;
  // the character after 'w' selects the kind of wedge; for the plain "w:" form
  // the iterator is deliberately left on the ':' so the check below sees it
  switch (*first) {
    case ':':
      state = Bond::BondDir::UNKNOWN;
      cfg = 2;
      break;
    case 'U':
      state = Bond::BondDir::BEGINWEDGE;
      cfg = 1;
      ++first;
      break;
    case 'D':
      state = Bond::BondDir::BEGINDASH;
      cfg = 3;
      ++first;
      break;
    default:
      break;
  }
  // state is still NONE if the character after 'w' was not recognized
  if (state == Bond::BondDir::NONE || first >= last || first + 1 >= last ||
      *first != ':') {
    return false;
  }
  ++first;
  while (first < last && *first >= '0' && *first <= '9') {
    unsigned int atomIdx;
    if (!read_int(first, last, atomIdx)) {
      return false;
    }
    // atom and bond index are separated by a '.'
    if (first < last && *first == '.') {
      ++first;
    } else {
      BOOST_LOG(rdWarningLog) << "improperly formatted w block" << std::endl;
      return false;
    }
    unsigned int bondIdx;
    if (!read_int(first, last, bondIdx)) {
      return false;
    }

    // pairs whose indices are out of range for this fragment are skipped
    if (VALID_ATIDX(atomIdx) && VALID_BNDIDX(bondIdx)) {
      auto atom = mol.getAtomWithIdx(atomIdx - startAtomIdx);
      auto bond = get_bond_with_smiles_idx(mol, bondIdx - startBondIdx);

      if (!bond) {
        BOOST_LOG(rdWarningLog)
            << "bond " << bondIdx << " not found, wedge from atom " << atomIdx
            << " cannot be applied." << std::endl;
        return false;
      }

      // we can't set wedging twice:
      if (bond->hasProp(common_properties::_MolFileBondCfg)) {
        BOOST_LOG(rdWarningLog)
            << "w block attempts to set wedging on bond " << bond->getIdx()
            << " more than once." << std::endl;
        return false;
      }

      // first things first, the atom needs to be the start atom of the bond for
      // any of this to make sense
      if (atom->getIdx() != bond->getBeginAtomIdx()) {
        if (atom->getIdx() != bond->getEndAtomIdx()) {
          BOOST_LOG(rdWarningLog)
              << "atom " << atomIdx << " is not associated with bond "
              << bondIdx << "(" << bond->getBeginAtomIdx() + startAtomIdx << "-"
              << bond->getEndAtomIdx() + startAtomIdx << ")"
              << " in w block" << std::endl;
          return false;
        }
        // the atom is the end atom: swap begin and end so the wedge starts at
        // the named atom
        auto eidx = bond->getBeginAtomIdx();
        bond->setBeginAtomIdx(atom->getIdx());
        bond->setEndAtomIdx(eidx);
      }
      bond->setProp(common_properties::_MolFileBondCfg, cfg);
      bond->setBondDir(state);
      // unknown ("wiggly") bond: clear the chiral tag on the begin atom and
      // flag the molecule for bond-stereo detection
      if (cfg == 2 && canHaveDirection(*bond)) {
        bond->getBeginAtom()->setChiralTag(Atom::ChiralType::CHI_UNSPECIFIED);
        mol.setProp(detail::_needsDetectBondStereo, 1);
      }
      // up/down wedge: flag the molecule for atom-stereo detection
      if ((cfg == 1 || cfg == 3) && canHaveDirection(*bond)) {
        mol.setProp(detail::_needsDetectAtomStereo, 1);
      }
    }
    if (first < last && *first == ',') {
      ++first;
    }
  }
  return true;
}

// Parses a double-bond stereo block such as
//   C1CCCC/C=C/CCC1 |ctu:5|
// ("c", "t" and "ctu" blocks share this parser; the caller passes the
// matching `stereo` value).
//
// Everything up to the first ':' is skipped, then a comma-separated list of
// bond indices is read. Each bond whose index passes VALID_BNDIDX is looked up
// by its SMILES index and handed to Chirality::detail::setStereoForBond().
//
// Returns false if no ':' is found, if an integer cannot be read, or if a
// bond cannot be located.
template <typename Iterator>
bool parse_doublebond_stereo(Iterator &first, Iterator last, RDKit::RWMol &mol,
                             unsigned int, unsigned int startBondIdx,
                             Bond::BondStereo stereo) {
  // advance to the ':' which ends the block name
  for (; first < last && *first != ':'; ++first) {
  }
  // the scan above only stops inside the range when it is sitting on a ':'
  if (first >= last) {
    return false;
  }
  ++first;

  const auto isDigit = [](auto ch) { return ch >= '0' && ch <= '9'; };

  while (first < last && isDigit(*first)) {
    unsigned int smilesBondIdx;
    if (!read_int(first, last, smilesBondIdx)) {
      return false;
    }
    if (VALID_BNDIDX(smilesBondIdx)) {
      auto dblBond =
          get_bond_with_smiles_idx(mol, smilesBondIdx - startBondIdx);
      if (!dblBond) {
        BOOST_LOG(rdWarningLog)
            << "bond " << smilesBondIdx
            << " not found, cannot mark as stereo double bond." << std::endl;
        return false;
      }
      Chirality::detail::setStereoForBond(mol, dblBond, stereo);
    }
    if (first < last && *first == ',') {
      ++first;
    }
  }
  return true;
}

// Parses an "s:" (substitution count) block made of comma-separated
// <atomIdx>:<count> entries, where <count> is either an integer or '*'.
//
// For every atom index passing VALID_ATIDX the atom is turned into a query
// atom if it is not one already, and a non-hydrogen-degree query is AND-ed
// onto it. A '*' count stores the placeholder value 0xDEADBEEF and sets the
// _NeedsQueryScan property on the molecule.
//
// Returns false if the block does not start with "s:", if an integer cannot
// be read, or if an entry is truncated.
template <typename Iterator>
bool parse_substitution(Iterator &first, Iterator last, RDKit::RWMol &mol,
                        unsigned int startAtomIdx) {
  const bool hasPrefix = first < last && *first == 's' && first + 1 < last &&
                         *(first + 1) == ':';
  if (!hasPrefix) {
    return false;
  }
  first += 2;

  const auto isDigit = [](auto ch) { return ch >= '0' && ch <= '9'; };

  while (first < last && isDigit(*first)) {
    unsigned int atomIdx;
    if (!read_int(first, last, atomIdx)) {
      return false;
    }
    // we need a ':' followed by at least one more character
    if (first + 1 >= last || *first != ':') {
      return false;
    }
    ++first;

    const bool atomInRange = VALID_ATIDX(atomIdx);
    unsigned int degree;
    if (*first != '*') {
      if (!read_int(first, last, degree)) {
        return false;
      }
    } else {
      ++first;
      degree = 0xDEADBEEF;
      if (atomInRange) {
        mol.setProp(common_properties::_NeedsQueryScan, 1);
      }
    }

    if (atomInRange) {
      auto target = mol.getAtomWithIdx(atomIdx - startAtomIdx);
      if (!target->hasQuery()) {
        target = QueryOps::replaceAtomWithQueryAtom(&mol, target);
      }
      target->expandQuery(makeAtomNonHydrogenDegreeQuery(degree),
                          Queries::COMPOSITE_AND);
    }

    if (first < last && *first == ',') {
      ++first;
    }
  }
  return true;
}

template <typename Iterator>
bool processRadicalSection(Iterator &first, Iterator last, RDKit::RWMol &mol,
                           unsigned int numRadicalElectrons,
                           unsigned int startAtomIdx) {
  if (first >= last) {
    return false;
  }
  ++first;
  if (first >= last || *first != ':') {
    return false;
  }
  ++first;
  unsigned int atIdx;
  if (!read_int(first, last, atIdx)) {
    return false;
  }
  if (VALID_ATIDX(atIdx)) {
    mol.getAtomWithIdx(atIdx - startAtomIdx)
        ->setNumRadicalElectrons(numRadicalElectrons);
  }
  while (first < last && *first == ',') {
    ++first;
    if (first < last && (*first < '0' || *first > '9')) {
      return true;
    }
    if (!read_int(first, last, atIdx)) {
      return false;
    }
    if (VALID_ATIDX(atIdx)) {
      mol.getAtomWithIdx(atIdx - startAtomIdx)
          ->setNumRadicalElectrons(numRadicalElectrons);
    }
  }
  return first < last;
}

// Parses one or more consecutive radical sections, each of the form
//   ^<digit>:<atomIdx>,<atomIdx>,...
// The digit selects the number of radical electrons assigned to the listed
// atoms: '1' -> 1, '2'..'4' -> 2, '5'..'7' -> 3. Any other character after
// '^' makes the parse fail.
//
// The per-section work is delegated to processRadicalSection(), which is
// called with `first` still pointing at the digit.
//
// Returns false if the input does not start with '^', if the digit is missing
// or out of range, or if a section fails to parse.
template <typename Iterator>
bool parse_radicals(Iterator &first, Iterator last, RDKit::RWMol &mol,
                    unsigned int startAtomIdx) {
  if (first >= last || *first != '^') {
    return false;
  }
  // the explicit range check keeps the loop from dereferencing `last`
  while (first < last && *first == '^') {
    ++first;
    if (first >= last) {
      return false;
    }
    unsigned int numRadicalElectrons = 0;
    switch (*first) {
      case '1':
        numRadicalElectrons = 1;
        break;
      case '2':
      case '3':
      case '4':
        numRadicalElectrons = 2;
        break;
      case '5':
      case '6':
      case '7':
        numRadicalElectrons = 3;
        break;
      default:
        // only '1'..'7' are allowed to be there
        return false;
    }
    if (!processRadicalSection(first, last, mol, numRadicalElectrons,
                               startAtomIdx)) {
      return false;
    }
  }
  return true;
}

// Parses one enhanced-stereo block:
//   a:<atoms>      absolute group
//   o<N>:<atoms>   OR group number N
//   &<N>:<atoms>   AND group number N
// where <atoms> is a comma-separated list of atom indices.
//
// Atoms whose indices pass VALID_ATIDX are collected into a StereoGroup on the
// molecule. If a group with the same type and number was already created
// during this parse (tracked through the cxsgTracker molecule property), the
// atoms are appended to that group instead of creating a new one.
//
// Returns false on empty input, if the ':' is missing, or if an atom index
// cannot be read.
template <typename Iterator>
bool parse_enhanced_stereo(Iterator &first, Iterator last, RDKit::RWMol &mol,
                           unsigned int startAtomIdx) {
  // never look at *first without knowing there is something to look at
  if (first >= last) {
    return false;
  }
  StereoGroupType group_type = StereoGroupType::STEREO_ABSOLUTE;
  if (*first == 'a') {
    group_type = StereoGroupType::STEREO_ABSOLUTE;
  } else if (*first == 'o') {
    group_type = StereoGroupType::STEREO_OR;
  } else if (*first == '&') {
    group_type = StereoGroupType::STEREO_AND;
  }
  ++first;

  // OR and AND groups carry a group number
  unsigned int group_id = 0;
  if (group_type != StereoGroupType::STEREO_ABSOLUTE) {
    read_int(first, last, group_id);
  }

  if (first >= last || *first != ':') {
    return false;
  }
  ++first;

  std::vector<Atom *> atoms;
  std::vector<Bond *> bonds;

  // strictly less than: `last` is one past the end and must not be
  // dereferenced
  while (first < last && *first >= '0' && *first <= '9') {
    unsigned int aidx;
    if (read_int(first, last, aidx)) {
      if (VALID_ATIDX(aidx)) {
        Atom *atom = mol.getAtomWithIdx(aidx - startAtomIdx);
        if (!atom) {
          BOOST_LOG(rdWarningLog)
              << "Atom " << aidx << " not found!" << std::endl;
          return false;
        }
        atoms.push_back(atom);
      }
    } else {
      return false;
    }

    if (first < last && *first == ',') {
      ++first;
    }
  }
  if (!atoms.empty()) {
    // we need to do a bit of work to check whether or not we've already seen
    // this particular StereoGroup (was Github #6050)
    const auto group_hash =
        10 * group_id + static_cast<unsigned int>(group_type);
    std::vector<unsigned int> sgTracker;
    mol.getPropIfPresent(cxsgTracker, sgTracker);
    std::vector<StereoGroup> mol_stereo_groups(mol.getStereoGroups());
    TEST_ASSERT(mol_stereo_groups.size() == sgTracker.size());

    auto iter = std::find(sgTracker.begin(), sgTracker.end(), group_hash);
    if (iter != sgTracker.end()) {
      // seen before: rebuild that group with the new atoms appended
      auto index = iter - sgTracker.begin();
      auto gAtoms = mol_stereo_groups[index].getAtoms();
      gAtoms.insert(gAtoms.end(), atoms.begin(), atoms.end());
      mol_stereo_groups[index] =
          StereoGroup(mol_stereo_groups[index].getGroupType(),
                      std::move(gAtoms), std::move(bonds), group_id);
    } else {
      // not seen this before, create a new stereogroup
      mol_stereo_groups.emplace_back(group_type, std::move(atoms),
                                     std::move(bonds), group_id);
      sgTracker.push_back(group_hash);
      mol.setProp(cxsgTracker, sgTracker);
    }

    mol.setStereoGroups(std::move(mol_stereo_groups));
  }

  return true;
}

// Top-level driver for parsing a CXSMILES extension string of the form
// |block,block,...|.
//
// `first` must point at the opening '|'. The loop then looks at the current
// character(s) to decide which block parser to hand control to; each parser
// advances `first` past what it consumed. Characters that do not start a
// recognized block (such as the separators between blocks) are skipped one at
// a time. The order of the tests matters: more specific prefixes ("SgD",
// "SgH", "ctu") are checked before the shorter ones they share a start with
// ("Sg", "c").
//
// Returns false if the opening or closing '|' is missing or if any block
// parser fails; on success `first` is left just past the closing '|'.
template <typename Iterator>
bool parse_it(Iterator &first, Iterator last, RDKit::RWMol &mol,
              unsigned int startAtomIdx, unsigned int startBondIdx) {
  if (first >= last || *first != '|') {
    return false;
  }
  ++first;
  // running count of S-groups seen so far; passed to the data and polymer
  // S-group parsers
  unsigned int nSGroups = 0;
  // running count of coordinate blocks seen so far
  unsigned int confIndex = 0;
  while (first < last && *first != '|') {
    typename Iterator::difference_type length = std::distance(first, last);
    if (*first == '(') {
      if (!parse_coords(first, last, mol, startAtomIdx, confIndex++)) {
        return false;
      }
    } else if (*first == '$') {
      // "$_AV:" introduces atom values, any other '$' block holds atom labels
      if (length > 4 && *(first + 1) == '_' && *(first + 2) == 'A' &&
          *(first + 3) == 'V' && *(first + 4) == ':') {
        first += 4;
        if (!parse_atom_values(first, last, mol, startAtomIdx)) {
          return false;
        }
      } else {
        if (!parse_atom_labels(first, last, mol, startAtomIdx)) {
          return false;
        }
      }
    } else if (length > 9 && std::string(first, first + 9) == "atomProp:") {
      first += 9;
      if (!parse_atom_props(first, last, mol, startAtomIdx)) {
        return false;
      }
    } else if (*first == 'C') {
      if (!parse_coordinate_bonds(first, last, mol, Bond::DATIVE, startAtomIdx,
                                  startBondIdx)) {
        return false;
      }
    } else if (*first == 'H') {
      if (!parse_coordinate_bonds(first, last, mol, Bond::HYDROGEN,
                                  startAtomIdx, startBondIdx)) {
        return false;
      }
    } else if (*first == '^') {
      if (!parse_radicals(first, last, mol, startAtomIdx)) {
        return false;
      }
    } else if (*first == 'a' || *first == 'o' ||
               (*first == '&' && first + 1 < last && first[1] != '#')) {
      // an '&' immediately followed by '#' is not treated as an enhanced
      // stereo block
      if (!parse_enhanced_stereo(first, last, mol, startAtomIdx)) {
        return false;
      }
    } else if (*first == 'r' && first + 1 < last && first[1] == 'b') {
      if (!parse_ring_bonds(first, last, mol, startAtomIdx)) {
        return false;
      }
    } else if (*first == 'L' && first + 1 < last && first[1] == 'N') {
      if (!parse_linknodes(first, last, mol, startAtomIdx)) {
        return false;
      }
    } else if (*first == 'S' && first + 2 < last && first[1] == 'g' &&
               first[2] == 'D') {
      if (!parse_data_sgroup(first, last, mol, startAtomIdx, nSGroups++)) {
        return false;
      }
    } else if (*first == 'S' && first + 2 < last && first[1] == 'g' &&
               first[2] == 'H') {
      if (!parse_sgroup_hierarchy(first, last, mol)) {
        return false;
      }
    } else if (*first == 'S' && first + 1 < last && first[1] == 'g') {
      if (!parse_polymer_sgroup(first, last, mol, startAtomIdx, nSGroups++)) {
        return false;
      }
    } else if (*first == 'u') {
      if (!parse_unsaturation(first, last, mol, startAtomIdx)) {
        return false;
      }
    } else if (*first == 's') {
      if (!parse_substitution(first, last, mol, startAtomIdx)) {
        return false;
      }
    } else if (*first == 'm') {
      if (!parse_variable_attachments(first, last, mol, startAtomIdx)) {
        return false;
      }
    } else if (*first == 'w') {
      if (!parse_wedged_bonds(first, last, mol, startAtomIdx, startBondIdx)) {
        return false;
      }
    } else if (*first == 'c' && first + 2 < last && first[1] == 't' &&
               first[2] == 'u') {
      if (!parse_doublebond_stereo(first, last, mol, startAtomIdx, startBondIdx,
                                   Bond::BondStereo::STEREOANY)) {
        return false;
      }
    } else if (*first == 'c') {
      if (!parse_doublebond_stereo(first, last, mol, startAtomIdx, startBondIdx,
                                   Bond::BondStereo::STEREOCIS)) {
        return false;
      }
    } else if (*first == 't') {
      if (!parse_doublebond_stereo(first, last, mol, startAtomIdx, startBondIdx,
                                   Bond::BondStereo::STEREOTRANS)) {
        return false;
      }
    } else {
      // nothing recognized here: skip a single character
      ++first;
    }
    // if(first < last && *first != '|') ++first;
  }
  if (first >= last || *first != '|') {
    return false;
  }
  ++first;  // step past the last '|'
  return true;
}
}  // namespace parser

// Entry point for applying a CXSMILES extension string to a molecule.
//
// An empty extText is a no-op (and leaves `first` untouched). Otherwise the
// text must begin with '|'; `first` is set to the start of extText and
// advanced by the parser as the extensions are consumed.
//
// After a successful parse the atom labels are post-processed and the
// bookkeeping properties used during parsing are removed from the molecule.
//
// Throws RDKit::SmilesParseException if extText does not start with '|' or if
// the extensions cannot be parsed.
void parseCXExtensions(RDKit::RWMol &mol, const std::string &extText,
                       std::string::const_iterator &first,
                       unsigned int startAtomIdx, unsigned int startBondIdx) {
  if (extText.empty()) {
    return;
  }
  if (extText.front() != '|') {
    throw RDKit::SmilesParseException(
        "CXSMILES extension does not start with |");
  }

  first = extText.begin();
  if (!parser::parse_it(first, extText.end(), mol, startAtomIdx,
                        startBondIdx)) {
    throw RDKit::SmilesParseException("failure parsing CXSMILES extensions");
  }

  processCXSmilesLabels(mol);
  // clean up the temporary properties left behind by the parse
  mol.clearProp("_cxsmilesLabelsProcessed");
  mol.clearProp(cxsgTracker);
}
}  // end of namespace SmilesParseOps

namespace RDKit {
namespace SmilesWrite {
namespace {

// Translates each entry of atomIds through the revOrder lookup table and
// returns the translated values in ascending order.
std::vector<unsigned> getSortedMappedIndexes(
    const std::vector<unsigned int> &atomIds,
    const std::vector<unsigned> &revOrder) {
  std::vector<unsigned> mapped(atomIds.size());
  std::transform(atomIds.begin(), atomIds.end(), mapped.begin(),
                 [&revOrder](unsigned int origIdx) {
                   return revOrder[origIdx];
                 });
  std::sort(mapped.begin(), mapped.end());
  return mapped;
}

std::pair<std::vector<StereoGroup>, std::vector<std::vector<unsigned>>>
getSortedStereoGroupsAndIndices(
    const ROMol &mol, const std::vector<unsigned int> &revOrder,
    std::map<int, std::unique_ptr<RDKit::Chirality::WedgeInfoBase>>
        &wedgeBonds) {
  using StGrpIdxPair = std::pair<StereoGroup, std::vector<unsigned>>;

  auto &groups = mol.getStereoGroups();

  std::vector<StGrpIdxPair> sortingGroups;
  sortingGroups.reserve(groups.size());

  for (const auto &sg : groups) {
    std::vector<unsigned int> atomIds;
    Atropisomers::getAllAtomIdsForStereoGroup(mol, sg, atomIds, wedgeBonds);
    const auto newAtomIndexes = getSortedMappedIndexes(atomIds, revOrder);
    if (!newAtomIndexes.empty()) {
      sortingGroups.emplace_back(sg, newAtomIndexes);
    }
  }

  // sort by 1) StereoGroup type; 2) StereoGroup id; 3) atom indexes
  std::sort(sortingGroups.begin(), sortingGroups.end(),
            [](const StGrpIdxPair &a, const StGrpIdxPair &b) {
              const auto &[sgA, idxsA] = a;
              const auto &[sgB, idxsB] = b;
              if (sgA.getGroupType() == sgB.getGroupType()) {
                if (sgA.getWriteId() == sgB.getWriteId()) {
                  return idxsA < idxsB;
                }
                return sgA.getWriteId() < sgB.getWriteId();
              }
              return sgA.getGroupType() < sgB.getGroupType();
            });

  std::vector<StereoGroup> sgs;
  std::vector<std::vector<unsigned>> sgAtomIdxs;
  sgs.reserve(sortingGroups.size());
  sgAtomIdxs.reserve(sortingGroups.size());

  for (auto &&p : sortingGroups) {
    sgs.push_back(std::move(p.first));
    sgAtomIdxs.push_back(std::move(p.second));
  }
  return {std::move(sgs), std::move(sgAtomIdxs)};
}

// Placeholder for escaping text before it is written into a CXSMILES block.
// Currently returns a copy of the input with no characters changed.
std::string quote_string(const std::string &txt) {
  // FIX: no quoting is performed yet
  std::string quoted(txt);
  return quoted;
}

// Escapes text for use inside an atomProp block: every '.' is replaced by the
// character entity "&#46;"; all other characters are copied unchanged.
std::string quote_atomprop_string(const std::string &txt) {
  // at a bare minimum, . needs to be escaped
  std::string escaped;
  std::string::size_type pos = 0;
  while (pos <= txt.size()) {
    const auto dot = txt.find('.', pos);
    if (dot == std::string::npos) {
      // no more dots: copy the tail and finish
      escaped.append(txt, pos, std::string::npos);
      break;
    }
    escaped.append(txt, pos, dot - pos);
    escaped += "&#46;";
    pos = dot + 1;
  }
  return escaped;
}

// Builds the enhanced-stereo portion of the CXSMILES extensions, for example
// "a:1,3,o1:5,&1:7,8".
//
// The stereo groups are sorted and given ids first; each group is then
// written as its prefix ("a:", "o<id>:" or "&<id>:") followed by its atom
// indices in output order, each index followed by a comma. The final comma is
// removed from the result.
//
// Returns an empty string if the molecule has no stereo groups.
std::string get_enhanced_stereo_block(
    const ROMol &mol, const std::vector<unsigned int> &atomOrder,
    std::map<int, std::unique_ptr<RDKit::Chirality::WedgeInfoBase>>
        &wedgeBonds) {
  if (mol.getStereoGroups().empty()) {
    return "";
  }

  // we need a map from original atom idx to output idx:
  std::vector<unsigned int> revOrder(mol.getNumAtoms());
  for (unsigned outIdx = 0; outIdx < atomOrder.size(); ++outIdx) {
    revOrder[atomOrder[outIdx]] = outIdx;
  }

  auto sorted = getSortedStereoGroupsAndIndices(mol, revOrder, wedgeBonds);
  auto &groups = sorted.first;
  const auto &groupAtoms = sorted.second;

  assignStereoGroupIds(groups);

  std::string block;
  for (std::size_t i = 0; i < groups.size(); ++i) {
    const auto &grp = groups[i];
    const auto grpType = grp.getGroupType();
    if (grpType == StereoGroupType::STEREO_ABSOLUTE) {
      block += "a:";
    } else if (grpType == StereoGroupType::STEREO_OR) {
      block += "o";
      block += std::to_string(grp.getWriteId());
      block += ":";
    } else if (grpType == StereoGroupType::STEREO_AND) {
      block += "&";
      block += std::to_string(grp.getWriteId());
      block += ":";
    }

    for (const auto aid : groupAtoms[i]) {
      block += std::to_string(aid);
      block += ",";
    }
  }

  // drop the separator left behind by the last atom
  if (!block.empty() && block.back() == ',') {
    block.pop_back();
  }
  return block;
}

// Builds the S-group hierarchy ("SgH:") portion of the CXSMILES extensions.
//
// Every substance group carrying a "_cxsmilesOutputIndex" property contributes
// a mapping from its index (the "index" property when present, otherwise its
// position in the molecule) to that output index; the property is cleared as
// it is read. Groups with a "PARENT" property whose own and parent indices are
// both in the mapping are then listed as
//   SgH:<parent>:<child>.<child>,<parent>:<child>
//
// Returns an empty string if there are no substance groups, no group has a
// PARENT property, or no parent/child pair could be resolved.
std::string get_sgroup_hierarchy_block(const ROMol &mol) {
  const auto &sgs = getSubstanceGroups(mol);
  if (sgs.empty()) {
    return "";
  }

  // index of an S-group for hierarchy purposes
  const auto sgroupIndex = [](const auto &sg) {
    unsigned int idx = sg.getIndexInMol();
    sg.getPropIfPresent("index", idx);
    return idx;
  };

  // we need a map from sgroup index to output index;
  std::map<unsigned int, unsigned int> outputIndexOf;
  bool anyParent = false;
  for (const auto &sg : sgs) {
    if (sg.hasProp("_cxsmilesOutputIndex")) {
      const unsigned int sgidx = sgroupIndex(sg);
      outputIndexOf[sgidx] = sg.getProp<unsigned int>("_cxsmilesOutputIndex");
      sg.clearProp("_cxsmilesOutputIndex");
    }
    anyParent = anyParent || sg.hasProp("PARENT");
  }
  if (!anyParent) {
    return "";
  }

  // gather the children of each parent, keyed by the parent's output index
  std::map<unsigned int, std::vector<unsigned int>> childrenOf;
  for (const auto &sg : sgs) {
    unsigned pidx;
    if (!sg.getPropIfPresent("PARENT", pidx)) {
      continue;
    }
    const auto parentPos = outputIndexOf.find(pidx);
    if (parentPos == outputIndexOf.end()) {
      continue;
    }
    const auto childPos = outputIndexOf.find(sgroupIndex(sg));
    if (childPos != outputIndexOf.end()) {
      childrenOf[parentPos->second].push_back(childPos->second);
    }
  }

  std::string block;
  if (!childrenOf.empty()) {
    block = "SgH:";
    for (const auto &[parent, children] : childrenOf) {
      block += std::to_string(parent);
      block += ":";
      bool firstChild = true;
      for (const auto child : children) {
        if (!firstChild) {
          block += ".";
        }
        firstChild = false;
        block += std::to_string(child);
      }
      block += ",";
    }
  }
  while (!block.empty() && block.back() == ',') {
    block.pop_back();
  }
  return block;
}

// Builds the polymer S-group ("Sg:") portion of the CXSMILES extensions.
//
// Each substance group whose TYPE maps back to a CXSMILES polymer keyword is
// written as
//   Sg:<keyword>:<atoms>:<label>:<connect>:<head bonds>:<tail bonds>:
// with atoms and bonds given as comma-separated indices in output order.
// Every group written is tagged with a "_cxsmilesOutputIndex" property, and
// the running counter is stored back on the molecule under the same name.
//
// A separator comma is emitted after every substance group, including those
// that are not written here, so the result can start with or contain runs of
// commas; only the trailing ones are removed.
//
// \param mol        the molecule being written
// \param atomOrder  original atom indices in output order
// \param bondOrder  original bond indices in output order
// \return the block text, or an empty string if there are no substance groups
std::string get_sgroup_polymer_block(
    const ROMol &mol, const std::vector<unsigned int> &atomOrder,
    const std::vector<unsigned int> &bondOrder) {
  const auto &sgs = getSubstanceGroups(mol);
  if (sgs.empty()) {
    return "";
  }
  unsigned int sgroupOutputIndex = 0;
  mol.getPropIfPresent("_cxsmilesOutputIndex", sgroupOutputIndex);
  std::stringstream res;
  // we need a map from original atom idx to output idx:
  std::vector<unsigned int> revAtomOrder(mol.getNumAtoms());
  for (unsigned i = 0; i < atomOrder.size(); ++i) {
    revAtomOrder[atomOrder[i]] = i;
  }
  // we need a map from original bond idx to output idx:
  std::vector<unsigned int> revBondOrder(mol.getNumBonds());
  for (unsigned i = 0; i < bondOrder.size(); ++i) {
    revBondOrder[bondOrder[i]] = i;
  }

  // molfile type -> CXSMILES keyword; the first keyword found for a type wins
  // (emplace does not overwrite an existing entry)
  std::map<std::string, std::string> reverseTypemap;
  for (const auto &pr : SmilesParseOps::sgroupTypemap) {
    reverseTypemap.emplace(pr.second, pr.first);
  }

  // writes the values separated by commas, with no trailing separator; an
  // empty list writes nothing, so the surrounding ':' delimiters stay intact
  const auto writeList = [&res](const std::vector<unsigned int> &vals) {
    const char *sep = "";
    for (const auto v : vals) {
      res << sep << v;
      sep = ",";
    }
  };

  std::vector<unsigned int> outIdxs;
  for (const auto &sg : sgs) {
    std::string typ;
    if (sg.getPropIfPresent("TYPE", typ) &&
        reverseTypemap.find(typ) != reverseTypemap.end()) {
      sg.setProp("_cxsmilesOutputIndex", sgroupOutputIndex);
      sgroupOutputIndex++;

      res << "Sg:";
      if (typ == "COP") {
        // several keywords map to COP, so the reverse map cannot be used for
        // it: sgroupTypemap is ordered by key, which would make the generic
        // case come out as "alt". Pick the keyword from the SUBTYPE instead
        // and fall back to the plain copolymer keyword.
        std::string subtype;
        sg.getPropIfPresent("SUBTYPE", subtype);
        if (subtype == "ALT") {
          res << "alt";
        } else if (subtype == "RAN") {
          res << "ran";
        } else if (subtype == "BLO") {
          res << "blk";
        } else {
          res << "co";
        }
      } else {
        res << reverseTypemap[typ];
      }
      res << ":";

      outIdxs.clear();
      for (const auto oaid : sg.getAtoms()) {
        outIdxs.push_back(revAtomOrder[oaid]);
      }
      writeList(outIdxs);
      res << ":";

      std::string label;
      if (sg.getPropIfPresent("LABEL", label)) {
        res << label;
      }
      res << ":";

      std::string connect;
      if (sg.getPropIfPresent("CONNECT", connect)) {
        boost::algorithm::to_lower(connect);
        res << connect;
      }
      res << ":";

      // crossing bonds are stored as original bond indices and have to be
      // translated to output indices, just like the atoms above
      outIdxs.clear();
      std::vector<unsigned int> headCrossings;
      if (sg.getPropIfPresent("XBHEAD", headCrossings)) {
        for (const auto v : headCrossings) {
          outIdxs.push_back(revBondOrder[v]);
        }
      }
      writeList(outIdxs);
      res << ":";

      // only the odd-numbered entries of XBCORR are written as tail bonds
      outIdxs.clear();
      std::vector<unsigned int> tailCrossings;
      if (sg.getPropIfPresent("XBCORR", tailCrossings)) {
        for (unsigned int i = 1; i < tailCrossings.size(); i += 2) {
          outIdxs.push_back(revBondOrder[tailCrossings[i]]);
        }
      }
      writeList(outIdxs);
      res << ":";
    }
    res << ",";
  }

  std::string resStr = res.str();
  while (!resStr.empty() && resStr.back() == ',') {
    resStr.pop_back();
  }
  mol.setProp("_cxsmilesOutputIndex", sgroupOutputIndex);

  return resStr;
}

// Builds the data S-group ("SgD:") portion of the CXSMILES extensions.
//
// Each substance group with TYPE "DAT" is written as
//   SgD:<atoms>:<FIELDNAME>:<DATAFIELDS>:<QUERYOP>:<FIELDINFO>:<FIELDTAG>:
// where <atoms> are comma-separated indices in output order and <DATAFIELDS>
// is a comma-separated list of the stored values. Missing or empty properties
// leave their field empty. Every group written is tagged with a
// "_cxsmilesOutputIndex" property, and the running counter is stored back on
// the molecule under the same name.
//
// A separator comma is emitted after every substance group, including those
// that are not written here, so the result can start with or contain runs of
// commas; only the trailing ones are removed.
//
// \param mol        the molecule being written
// \param atomOrder  original atom indices in output order
// \return the block text, or an empty string if there are no substance groups
std::string get_sgroup_data_block(const ROMol &mol,
                                  const std::vector<unsigned int> &atomOrder) {
  const auto &sgs = getSubstanceGroups(mol);
  if (sgs.empty()) {
    return "";
  }

  unsigned int sgroupOutputIndex = 0;
  mol.getPropIfPresent("_cxsmilesOutputIndex", sgroupOutputIndex);

  std::stringstream res;
  // we need a map from original atom idx to output idx:
  std::vector<unsigned int> revOrder(mol.getNumAtoms());
  for (unsigned i = 0; i < atomOrder.size(); ++i) {
    revOrder[atomOrder[i]] = i;
  }

  for (const auto &sg : sgs) {
    if (sg.hasProp("TYPE") && sg.getProp<std::string>("TYPE") == "DAT") {
      sg.setProp("_cxsmilesOutputIndex", sgroupOutputIndex);
      sgroupOutputIndex++;

      res << "SgD:";
      // we don't attempt to canonicalize the atom order because the user
      // may ascribe some significance to the ordering of the atoms
      //
      // the separator is written before each element after the first, so an
      // S-group without atoms leaves the field empty instead of eating the
      // preceding ':'
      const char *sep = "";
      for (const auto oaid : sg.getAtoms()) {
        res << sep << revOrder[oaid];
        sep = ",";
      }
      res << ":";
      std::string prop;
      if (sg.getPropIfPresent("FIELDNAME", prop) && !prop.empty()) {
        res << prop;
      }
      res << ":";
      std::vector<std::string> vprop;
      if (sg.getPropIfPresent("DATAFIELDS", vprop)) {
        sep = "";
        for (const auto &pv : vprop) {
          res << sep << pv;
          sep = ",";
        }
      }
      res << ":";
      if (sg.getPropIfPresent("QUERYOP", prop) && !prop.empty()) {
        res << prop;
      }
      res << ":";
      if (sg.getPropIfPresent("FIELDINFO", prop) && !prop.empty()) {
        res << prop;
      }
      res << ":";
      if (sg.getPropIfPresent("FIELDTAG", prop) && !prop.empty()) {
        res << prop;
      }
      res << ":";
      // FIX: do something about the coordinates
    }
    res << ",";
  }

  // a comma is written for every S-group, so there can be more than one
  // trailing separator to remove
  std::string resStr = res.str();
  while (!resStr.empty() && resStr.back() == ',') {
    resStr.pop_back();
  }
  mol.setProp("_cxsmilesOutputIndex", sgroupOutputIndex);

  return resStr;
}

std::string get_atomlabel_block(const ROMol &mol,
                                const std::vector<unsigned int> &atomOrder) {
  std::string res = "";
  for (auto idx : atomOrder) {
    if (idx != atomOrder.front()) {
      res += ";";
    }
    std::string lbl;
    int val;
    const auto atom = mol.getAtomWithIdx(idx);
    if (atom->getPropIfPresent(common_properties::_QueryAtomGenericLabel,
                               lbl)) {
      res += quote_string(lbl + "_p");
    } else if (!atom->getAtomicNum() &&
               atom->getPropIfPresent(common_properties::dummyLabel, lbl) &&
               std::find(SmilesParseOps::pseudoatoms.begin(),
                         SmilesParseOps::pseudoatoms.end(),
                         lbl) != SmilesParseOps::pseudoatoms.end()) {
      res += quote_string(lbl + "_p");
    } else if (!atom->getAtomicNum() &&
               atom->getPropIfPresent(common_properties::_fromAttachPoint,
                                      val) &&
               (val == 1 || val == 2)) {
      res += quote_string("_AP" + std::to_string(val));
    } else if (atom->getPropIfPresent(common_properties::atomLabel, lbl)) {
      res += quote_string(lbl);
    }
  }
  // if we didn't find anything return an empty string
  if (std::find_if_not(res.begin(), res.end(),
                       [](const auto c) { return c == ';'; }) == res.end()) {
    res.clear();
  }
  return res;
}

// Builds a ';'-separated list holding, for each atom in output order, the
// value of the string property `prop`. Atoms without the property contribute
// an empty entry.
std::string get_value_block(const ROMol &mol,
                            const std::vector<unsigned int> &atomOrder,
                            const std::string &prop) {
  std::string block;
  for (std::size_t pos = 0; pos < atomOrder.size(); ++pos) {
    if (pos != 0) {
      block += ";";
    }
    std::string value;
    if (mol.getAtomWithIdx(atomOrder[pos])->getPropIfPresent(prop, value)) {
      block += quote_string(value);
    }
  }
  return block;
}
// Builds the radical portion of the CXSMILES extensions.
//
// Atoms are grouped by their number of radical electrons and each group is
// written as a prefix followed by the atoms' positions in the output order:
//   1 electron  -> "^1:"
//   2 electrons -> "^2:"
//   3 electrons -> "^5:"
// Every position is followed by a comma, including the last one, so a
// non-empty result ends with ','.
// NOTE(review): presumably the caller strips that trailing comma - confirm.
//
// Atoms with any other non-zero radical count trigger a warning and are left
// out of the result; writing their indices without a prefix would attach them
// to whichever section happened to precede them.
//
// Returns an empty string if no atom has radical electrons.
std::string get_radical_block(const ROMol &mol,
                              const std::vector<unsigned int> &atomOrder) {
  std::map<unsigned int, std::vector<unsigned int>> rads;
  for (unsigned int i = 0; i < atomOrder.size(); ++i) {
    auto nrad = mol.getAtomWithIdx(atomOrder[i])->getNumRadicalElectrons();
    if (nrad) {
      rads[nrad].push_back(i);
    }
  }

  std::string res;
  for (const auto &pr : rads) {
    const char *prefix = nullptr;
    switch (pr.first) {
      case 1:
        prefix = "^1:";
        break;
      case 2:
        prefix = "^2:";
        break;
      case 3:
        prefix = "^5:";
        break;
      default:
        break;
    }
    if (!prefix) {
      BOOST_LOG(rdWarningLog) << "unsupported number of radical electrons "
                              << pr.first << std::endl;
      // nothing sensible can be written for these atoms
      continue;
    }
    res += prefix;
    for (auto aidx : pr.second) {
      res += std::to_string(aidx);
      res += ",";
    }
  }
  return res;
}
// Snaps values whose magnitude is below 1e-4 to exactly 0.0; every other
// value is passed through untouched.
double zero_small_vals(double val) {
  return (fabs(val) < 1e-4) ? 0.0 : val;
}
// Writes the coordinates of the molecule's default conformer, one
// "x,y,[z]" entry per atom in `atomOrder`, with entries separated by ';'.
// Coordinates are passed through zero_small_vals() before formatting. The z
// component is only written for 3D conformers, and only when it does not
// format as "0".
std::string get_coords_block(const ROMol &mol,
                             const std::vector<unsigned int> &atomOrder) {
  const auto &conf = mol.getConformer();
  std::string coords;
  const char *separator = "";
  for (auto atomIdx : atomOrder) {
    const auto &pos = conf.getAtomPos(atomIdx);
    coords += separator;
    separator = ";";

    // x and y are always present; note the trailing ',' before the z slot
    coords += boost::str(boost::format("%g,%g,") % zero_small_vals(pos.x) %
                         zero_small_vals(pos.y));
    if (!conf.is3D()) {
      continue;
    }
    const auto zText =
        boost::str(boost::format("%g") % zero_small_vals(pos.z));
    if (zText != "0") {
      coords += zText;
    }
  }
  return coords;
}

std::string get_atom_props_block(const ROMol &mol,
                                 const std::vector<unsigned int> &atomOrder) {
  std::vector<std::string> skip = {common_properties::atomLabel,
                                   common_properties::molFileValue,
                                   common_properties::molParity};
  std::string res = "";
  unsigned int which = 0;
  for (auto idx : atomOrder) {
    const auto atom = mol.getAtomWithIdx(idx);
    bool isAttachmentPoint = !atom->getAtomicNum() &&
                             atom->hasProp(common_properties::_fromAttachPoint);
    bool includePrivate = false, includeComputed = false;
    for (const auto &pn : atom->getPropList(includePrivate, includeComputed)) {
      if (std::find(skip.begin(), skip.end(), pn) == skip.end()) {
        std::string pv = atom->getProp<std::string>(pn);
        if (pn == "dummyLabel" &&
            (isAttachmentPoint ||
             std::find(SmilesParseOps::pseudoatoms.begin(),
                       SmilesParseOps::pseudoatoms.end(),
                       pv) != SmilesParseOps::pseudoatoms.end())) {
          // it's a pseudoatom or attachment point, skip it
          continue;
        }
        if (res.empty()) {
          res += "atomProp";
        }
        res +=
            boost::str(boost::format(":%d.%s.%s") % which %
                       quote_atomprop_string(pn) % quote_atomprop_string(pv));
      }
    }
    ++which;
  }
  return res;
}

// Builds the bond-configuration (wedging) section of the CX extension.
//
// Every bond in `bondOrder` that can carry a direction is examined and, when
// a wedge state is found for it, recorded as
// "<output position of the wedge start atom>.<output position of the bond>"
// under one of the keys "w" (unknown), "wU" (wedge) or "wD" (dash). "wU" and
// "wD" entries are only produced when coordinates are being written or the
// bond belongs to an atropisomer.
//
// \param mol              molecule being written
// \param atomOrder        atom indices in output order
// \param bondOrder        bond indices in output order
// \param coordsIncluded   true if coordinates are part of the output
// \param wedgeBonds       wedging information handed to
//                         Chirality::GetMolFileBondStereoInfo()
// \param atropisomerOnly  if true, only wedges on bonds whose begin atom is
//                         part of an atropisomer bond are reported
// \return the groups formatted as "<key>:<entry>,<entry>..." and joined with
//         ','; groups appear in key order because they are stored in a
//         std::map. Empty if nothing needs to be written.
std::string get_bond_config_block(
    const ROMol &mol, const std::vector<unsigned int> &atomOrder,
    const std::vector<unsigned int> &bondOrder, bool coordsIncluded,
    std::map<int, std::unique_ptr<RDKit::Chirality::WedgeInfoBase>> &wedgeBonds,
    bool atropisomerOnly = false) {
  // wedge type ("w", "wU", "wD") -> entries collected for that type
  std::map<std::string, std ::vector<std::string>> wParts;
  // i is the output position of the bond, idx its index in the molecule
  for (unsigned int i = 0; i < bondOrder.size(); ++i) {
    auto idx = bondOrder[i];
    const auto bond = mol.getBondWithIdx(idx);
    // the wedge starts at the begin atom unless the stereo info lookup below
    // reports that it has to be reversed
    unsigned int wedgeStartAtomIdx = bond->getBeginAtomIdx();

    if (!canHaveDirection(*bond)) {
      continue;
    }
    // when figuring out what to output for the bond, favor the wedge state:
    // any direction other than dash, wedge or unknown is treated as "none"
    Bond::BondDir bd = bond->getBondDir();
    switch (bd) {
      case Bond::BondDir::BEGINDASH:
      case Bond::BondDir::BEGINWEDGE:
      case Bond::BondDir::UNKNOWN:
        break;
      default:
        bd = Bond::BondDir::NONE;
    }

    if (atropisomerOnly && bd == Bond::BondDir::NONE) {
      continue;
    }

    // see if this one is an atropisomer: a wedged/dashed bond counts if any
    // other bond on its begin atom carries atropisomer stereo

    bool isAnAtropisomer = false;

    const Atom *firstAtom = bond->getBeginAtom();
    if (bd == Bond::BondDir::BEGINDASH || bd == Bond::BondDir::BEGINWEDGE) {
      for (auto bondNbr : mol.atomBonds(firstAtom)) {
        if (bondNbr->getIdx() == bond->getIdx()) {
          continue;  // a bond is not its own neighbor
        }
        if (bondNbr->getStereo() == Bond::BondStereo::STEREOATROPCW ||
            bondNbr->getStereo() == Bond::BondStereo::STEREOATROPCCW) {
          isAnAtropisomer = true;

          // if it is for an atropisomer and there are no coords, check to see
          // if the wedge needs to be flipped based on the smiles reordering
          if (!coordsIncluded && isAnAtropisomer) {
            Atropisomers::AtropAtomAndBondVec atomAndBondVecs[2];
            if (!Atropisomers::getAtropisomerAtomsAndBonds(
                    bondNbr, atomAndBondVecs, mol)) {
              throw ValueErrorException("Internal error - should not occur");
              // should not happen
            } else {
              // number of order inversions introduced by the output ordering;
              // an odd count inverts the wedge
              unsigned int swaps = 0;

              // first inversion candidate: the two atoms of the atropisomer
              // bond themselves
              unsigned int firstReorderedIdx =
                  std::find(atomOrder.begin(), atomOrder.end(),
                            bondNbr->getBeginAtom()->getIdx()) -
                  atomOrder.begin();
              unsigned int secondReorderedIdx =
                  std::find(atomOrder.begin(), atomOrder.end(),
                            bondNbr->getEndAtom()->getIdx()) -
                  atomOrder.begin();
              if (firstReorderedIdx > secondReorderedIdx) {
                ++swaps;
              }

              for (unsigned int bondAtomIndex = 0; bondAtomIndex < 2;
                   ++bondAtomIndex) {
                if (atomAndBondVecs[bondAtomIndex].first == firstAtom)
                  continue;  // swapped atoms on the side where the wedge bond
                             // is does NOT change the wedge bond
                // on the other side, with two neighboring bonds, check whether
                // the output order inverts the two atoms they lead to
                if (atomAndBondVecs[bondAtomIndex].second.size() == 2) {
                  unsigned int firstOtherAtomIdx =
                      atomAndBondVecs[bondAtomIndex]
                          .second[0]
                          ->getOtherAtom(atomAndBondVecs[bondAtomIndex].first)
                          ->getIdx();
                  unsigned int secondOtherAtomIdx =
                      atomAndBondVecs[bondAtomIndex]
                          .second[1]
                          ->getOtherAtom(atomAndBondVecs[bondAtomIndex].first)
                          ->getIdx();

                  unsigned int firstReorderedAtomIdx =
                      std::find(atomOrder.begin(), atomOrder.end(),
                                firstOtherAtomIdx) -
                      atomOrder.begin();
                  unsigned int secondReorderedAtomIdx =
                      std::find(atomOrder.begin(), atomOrder.end(),
                                secondOtherAtomIdx) -
                      atomOrder.begin();

                  if (firstReorderedAtomIdx > secondReorderedAtomIdx) {
                    ++swaps;
                  }
                }
              }
              // an odd number of inversions turns a wedge into a dash and
              // vice versa
              if (swaps % 2) {
                bd = (bd == Bond::BondDir::BEGINWEDGE)
                         ? Bond::BondDir::BEGINDASH
                         : Bond::BondDir::BEGINWEDGE;
              }
            }
          }

          // only the first atropisomer bond found on the begin atom is used
          break;
        }
      }
    }

    if (atropisomerOnly) {
      // one of the bonds on the beginning atom of this bond must be an
      // atropisomer

      if (!isAnAtropisomer) {
        continue;
      }
    } else {  //  atropisomeronly is FALSE - check for a wedging caused by
              //  chiral atom
      // fall back to the stored _MolFileBondCfg value if the bond has no
      // direction of its own: 1 -> wedge, 2 -> unknown, 3 -> dash
      unsigned int cfg = 0;
      if (bd == Bond::BondDir::NONE &&
          bond->getPropIfPresent(common_properties::_MolFileBondCfg, cfg)) {
        switch (cfg) {
          case 1:
            bd = Bond::BondDir::BEGINWEDGE;
            break;
          case 2:
            bd = Bond::BondDir::UNKNOWN;
            break;
          case 3:
            bd = Bond::BondDir::BEGINDASH;
            break;

          default:
            bd = Bond::BondDir::NONE;
        }
      }

      // still nothing: when coordinates are written, derive the wedging from
      // the stereo info lookup (dirCode 1 -> wedge, 3 -> unknown, 6 -> dash)
      if (bd == Bond::BondDir::NONE && coordsIncluded) {
        int dirCode;
        bool reverse;
        Chirality::GetMolFileBondStereoInfo(
            bond, wedgeBonds, &mol.getConformer(), dirCode, reverse);
        switch (dirCode) {
          case 1:
            bd = Bond::BondDir::BEGINWEDGE;
            break;
          case 3:
            bd = Bond::BondDir::UNKNOWN;
            break;
          case 6:
            bd = Bond::BondDir::BEGINDASH;
            break;
          default:
            bd = Bond::BondDir::NONE;
        }
        // a reversed wedge starts at the end atom of the bond
        if (reverse) {
          wedgeStartAtomIdx = bond->getEndAtomIdx();
        }
      }
    }

    // output position of the atom the wedge starts from
    auto begAtomOrder =
        std::find(atomOrder.begin(), atomOrder.end(), wedgeStartAtomIdx) -
        atomOrder.begin();

    std::string wType = "";
    if (bd == Bond::BondDir::UNKNOWN) {
      wType = "w";
    } else if (coordsIncluded || isAnAtropisomer) {
      // we only do wedgeUp and wedgeDown if coordinates are being output
      // or its an atropisomer
      if (bd == Bond::BondDir::BEGINWEDGE) {
        wType = "wU";
      } else if (bd == Bond::BondDir::BEGINDASH) {
        wType = "wD";
      }
    }

    if (wType != "") {
      if (wParts.find(wType) == wParts.end()) {
        wParts[wType] = std::vector<std::string>();
      }
      // entry format: <atom output position>.<bond output position>
      wParts[wType].push_back(
          boost::str(boost::format("%d.%d") % begAtomOrder % i));
    }
  }
  std::string res = "";

  // one "<type>:<entries>" group per wedge type, groups separated by ','
  for (auto wPart : wParts) {
    if (res != "") {
      res += ",";
    }
    res += wPart.first + ":" + boost::algorithm::join(wPart.second, ",");
  }

  return res;
}

// Builds the coordinate (dative) bond section of the CX extension:
// "C:" followed by a ','-separated list of
// "<output position of the bond's begin atom>.<output position of the bond>"
// entries, one per DATIVE bond. Returns an empty string when the molecule
// has no dative bonds.
std::string get_coordbonds_block(const ROMol &mol,
                                 const std::vector<unsigned int> &atomOrder,
                                 const std::vector<unsigned int> &bondOrder) {
  std::string block;
  for (unsigned int outIdx = 0; outIdx < bondOrder.size(); ++outIdx) {
    const auto bond = mol.getBondWithIdx(bondOrder[outIdx]);
    if (bond->getBondType() == Bond::BondType::DATIVE) {
      const auto startPos = std::find(atomOrder.begin(), atomOrder.end(),
                                      bond->getBeginAtomIdx()) -
                            atomOrder.begin();
      // the first entry opens the section, later ones are comma separated
      block += block.empty() ? "C:" : ",";
      block += boost::str(boost::format("%d.%d") % startPos % outIdx);
    }
  }
  return block;
}

std::string get_ringbond_cistrans_block(
    const ROMol &mol, const std::vector<unsigned int> &atomOrder,
    const std::vector<unsigned int> &bondOrder) {
  if (!mol.getRingInfo()->isInitialized()) {
    return "";
  }

  const auto rinfo = mol.getRingInfo();
  std::string c = "", t = "", ctu = "";
  for (unsigned int i = 0; i < bondOrder.size(); ++i) {
    auto idx = bondOrder[i];
    if (!rinfo->numBondRings(idx) ||
        rinfo->minBondRingSize(idx) <
            Chirality::minRingSizeForDoubleBondStereo) {
      // we only do ring bonds of a minimum size
      continue;
    }
    const auto bond = mol.getBondWithIdx(idx);
    if (bond->getBondType() != Bond::BondType::DOUBLE &&
        bond->getBondType() != Bond::BondType::AROMATIC) {
      continue;
    }
    Bond::BondStereo bstereo = bond->getStereo();
    if (bstereo != Bond::BondStereo::STEREOANY &&
        bstereo != Bond::BondStereo::STEREOCIS &&
        bstereo != Bond::BondStereo::STEREOTRANS) {
      continue;
    }

    auto label = std::to_string(i);

    if (bstereo == Bond::BondStereo::STEREOANY) {
      // this one's easy because we don't care about the atom order.
      if (ctu.empty()) {
        ctu += "ctu:";
      } else {
        ctu += ",";
      }
      ctu += label;
    } else {
      Atom *begAtom = bond->getBeginAtom();
      Atom *endAtom = bond->getEndAtom();
      bool needSwap = false;
      if (begAtom->getDegree() > 2) {
        unsigned int o1 = atomOrder[bond->getStereoAtoms()[0]];
        for (const auto nbr : mol.atomNeighbors(begAtom)) {
          if (nbr == endAtom ||
              nbr->getIdx() ==
                  static_cast<unsigned>(bond->getStereoAtoms()[0])) {
            continue;
          }
          if (atomOrder[nbr->getIdx() < o1]) {
            // this neighbor came first, we need to swap:
            needSwap = !needSwap;
          }
        }
      }
      if (endAtom->getDegree() > 2) {
        unsigned int o1 = atomOrder[bond->getStereoAtoms()[1]];
        for (const auto nbr : mol.atomNeighbors(endAtom)) {
          if (nbr == begAtom ||
              nbr->getIdx() ==
                  static_cast<unsigned>(bond->getStereoAtoms()[1])) {
            continue;
          }
          if (atomOrder[nbr->getIdx() < o1]) {
            // this neighbor came first, we need to swap:
            needSwap = !needSwap;
          }
        }
      }
      if (bstereo == Bond::BondStereo::STEREOCIS || needSwap) {
        if (c.empty()) {
          c += "c:";
        } else {
          c += ",";
        }
        c += label;
      } else {
        if (t.empty()) {
          t += "t:";
        } else {
          t += ",";
        }
        t += label;
      }
    }
  }
  return c + t + ctu;
}

// Builds the link-node section of the CX extension.
//
// The result is "LN:" followed by one ','-separated entry per link node.
// Each entry is "<center>:<minRep>.<maxRep>", extended with
// ".<outer1>.<outer2>" when the center atom has more than two neighbors.
// All atom references are positions in the output order, not indices in
// `mol`.
//
// \param mol        molecule being written
// \param atomOrder  atom indices in output order
// \return the link-node block, or an empty string if the molecule has no
//         link nodes
std::string get_linknodes_block(const ROMol &mol,
                                const std::vector<unsigned int> &atomOrder) {
  bool strict = false;
  auto linkNodes = MolEnumerator::utils::getMolLinkNodes(mol, strict);
  if (linkNodes.empty()) {
    return "";
  }
  // atomOrder maps output position -> atom index; the link nodes refer to
  // atom indices, so we need the inverse map (atom index -> output position)
  std::vector<unsigned int> revOrder(mol.getNumAtoms());
  for (unsigned i = 0; i < atomOrder.size(); ++i) {
    revOrder[atomOrder[i]] = i;
  }

  std::stringstream res;
  res << "LN:";
  bool first = true;
  for (const auto &ln : linkNodes) {
    if (!first) {
      res << ",";
    }
    first = false;

    const auto centerIdx = ln.bondAtoms[0].first;
    res << revOrder[centerIdx] << ":" << ln.minRep << "." << ln.maxRep;
    if (mol.getAtomWithIdx(centerIdx)->getDegree() > 2) {
      // include the outer atom indices
      res << "." << revOrder[ln.bondAtoms[0].second] << "."
          << revOrder[ln.bondAtoms[1].second];
    }
  }
  return res.str();
}

// Appends `addition` to the CX extension string being assembled in `base`.
// Empty additions are ignored. A ',' separator is inserted first whenever
// `base` already holds more than one character (i.e. more than the opening
// '|').
void appendToCXExtension(const std::string &addition, std::string &base) {
  if (addition.empty()) {
    return;
  }
  const bool needsSeparator = base.size() > 1;
  if (needsSeparator) {
    base += ",";
  }
  base += addition;
}

}  // namespace

// Logs warnings about molecule features that the CX extension writer may not
// fully preserve: a molFileLinkNodes property on the molecule, and substance
// groups that carry a "PARENT" property. The hierarchy warning is emitted at
// most once per call.
void checkCXFeatures(const ROMol &mol) {
  std::string linkNodes;
  if (mol.getPropIfPresent(common_properties::molFileLinkNodes, linkNodes)) {
    BOOST_LOG(rdWarningLog)
        << "CX Extensions: mol has link nodes which are not currently supported"
        << std::endl;
  }

  for (const auto &sgroup : getSubstanceGroups(mol)) {
    if (sgroup.hasProp("PARENT")) {
      BOOST_LOG(rdWarningLog)
          << "CX Extensions: Substance group hierarchy is not always preserved."
          << std::endl;
      // one warning is enough, no matter how many groups have a parent
      break;
    }
  }
}

// Generates a single CX extension string for a set of molecules.
//
// Every molecule must carry the _smilesAtomOutputOrder and
// _smilesBondOutputOrder properties; a ValueErrorException is thrown
// otherwise. The molecules are merged into one RWMol, their output orderings
// are concatenated (shifted by the number of atoms/bonds already merged),
// and the result is handed to the single-molecule overload.
std::string getCXExtensions(const std::vector<ROMol *> &mols,
                            std::uint32_t flags) {
  // validate everything up front so nothing is merged if an input is bad
  for (const auto &mol : mols) {
    checkCXFeatures(*mol);
    const bool hasOrderings =
        mol->hasProp(RDKit::common_properties::_smilesAtomOutputOrder) &&
        mol->hasProp(RDKit::common_properties::_smilesBondOutputOrder);
    if (!hasOrderings) {
      throw ValueErrorException(
          "Input molecule does not have the required "
          "smiles ordering properties set");
    }
  }

  RDKit::RWMol combined;
  std::vector<unsigned int> atomOrdering;
  std::vector<unsigned int> bondOrdering;

  for (const auto &mol : mols) {
    // indices of the incoming molecule are shifted by what is already there
    const auto atomOffset = combined.getNumAtoms();
    const auto bondOffset = combined.getNumBonds();

    combined.insertMol(*mol);

    for (auto atomIdx : mol->getProp<std::vector<unsigned int>>(
             RDKit::common_properties::_smilesAtomOutputOrder)) {
      atomOrdering.push_back(atomIdx + atomOffset);
    }
    for (auto bondIdx : mol->getProp<std::vector<unsigned int>>(
             RDKit::common_properties::_smilesBondOutputOrder)) {
      bondOrdering.push_back(bondIdx + bondOffset);
    }
  }

  const bool computed = true;
  combined.setProp(RDKit::common_properties::_smilesAtomOutputOrder,
                   atomOrdering, computed);
  combined.setProp(RDKit::common_properties::_smilesBondOutputOrder,
                   bondOrdering, computed);

  return getCXExtensions(combined, flags);
}

std::string getCXExtensions(const ROMol &mol, std::uint32_t flags) {
  std::string res = "|";
  const std::vector<unsigned int> &atomOrder =
      mol.getProp<std::vector<unsigned int>>(
          common_properties::_smilesAtomOutputOrder);
  const std::vector<unsigned int> &bondOrder =
      mol.getProp<std::vector<unsigned int>>(
          common_properties::_smilesBondOutputOrder);

  bool needLabels = false;
  bool needValues = false;
  for (auto idx : atomOrder) {
    const auto at = mol.getAtomWithIdx(idx);
    if (at->hasProp(common_properties::atomLabel) ||
        at->hasProp(common_properties::_QueryAtomGenericLabel) ||
        at->hasProp(common_properties::dummyLabel) ||
        at->hasProp(common_properties::_fromAttachPoint)) {
      needLabels = true;
    }
    if (at->hasProp(common_properties::molFileValue)) {
      needValues = true;
    }
  }
  if ((flags & SmilesWrite::CXSmilesFields::CX_COORDS) &&
      mol.getNumConformers()) {
    res += "(" + get_coords_block(mol, atomOrder) + ")";
  }
  if ((flags & SmilesWrite::CXSmilesFields::CX_ATOM_LABELS) && needLabels) {
    auto lbls = get_atomlabel_block(mol, atomOrder);
    if (!lbls.empty()) {
      if (res.size() > 1) {
        res += ",";
      }
      res += "$" + lbls + "$";
    }
  }
  if ((flags & SmilesWrite::CXSmilesFields::CX_MOLFILE_VALUES) && needValues) {
    if (res.size() > 1) {
      res += ",";
    }
    res += "$_AV:" +
           get_value_block(mol, atomOrder, common_properties::molFileValue) +
           "$";
  }
  auto radblock = get_radical_block(mol, atomOrder);
  if ((flags & SmilesWrite::CXSmilesFields::CX_RADICALS) && radblock.size()) {
    if (res.size() > 1) {
      res += ",";
    }
    res += radblock;
    if (res.back() == ',') {
      res.erase(res.size() - 1);
    }
  }

  if (flags & SmilesWrite::CXSmilesFields::CX_ATOM_PROPS) {
    const auto atomblock = get_atom_props_block(mol, atomOrder);
    appendToCXExtension(atomblock, res);
  }

  const Conformer *conf = nullptr;
  if (mol.getNumConformers() && (flags & SmilesWrite::CX_COORDS)) {
    conf = &mol.getConformer();
  }

  std::map<int, std::unique_ptr<RDKit::Chirality::WedgeInfoBase>> wedgeBonds;
  if (flags & SmilesWrite::CXSmilesFields::CX_BOND_CFG) {
    wedgeBonds = Chirality::pickBondsToWedge(mol, nullptr, conf);

    bool includeCoords = flags & SmilesWrite::CXSmilesFields::CX_COORDS &&
                         mol.getNumConformers();
    const auto cfgblock = get_bond_config_block(mol, atomOrder, bondOrder,
                                                includeCoords, wedgeBonds);
    appendToCXExtension(cfgblock, res);
    const auto cistransblock =
        get_ringbond_cistrans_block(mol, atomOrder, bondOrder);
    appendToCXExtension(cistransblock, res);
  }

  // do the CX_BOND_ATROPISOMER only if CX_BOND_CFG s not done.  CX_BOND_CFG
  // includes the atropisomer wedging
  else if (flags & SmilesWrite::CXSmilesFields::CX_BOND_ATROPISOMER) {
    Atropisomers::wedgeBondsFromAtropisomers(mol, conf, wedgeBonds);
    const auto cfgblock = get_bond_config_block(
        mol, atomOrder, bondOrder, conf != nullptr, wedgeBonds, true);
    appendToCXExtension(cfgblock, res);
  }

  if (flags & SmilesWrite::CXSmilesFields::CX_COORDINATE_BONDS) {
    const auto block = get_coordbonds_block(mol, atomOrder, bondOrder);
    appendToCXExtension(block, res);
  }

  if (flags & SmilesWrite::CXSmilesFields::CX_LINKNODES) {
    const auto linknodeblock = get_linknodes_block(mol, atomOrder);
    appendToCXExtension(linknodeblock, res);
  }
  if (flags & SmilesWrite::CXSmilesFields::CX_ENHANCEDSTEREO) {
    const auto stereoblock =
        get_enhanced_stereo_block(mol, atomOrder, wedgeBonds);
    appendToCXExtension(stereoblock, res);
  }
  if (flags & SmilesWrite::CXSmilesFields::CX_SGROUPS) {
    const auto sgroupdatablock = get_sgroup_data_block(mol, atomOrder);
    appendToCXExtension(sgroupdatablock, res);
  }
  if (flags & SmilesWrite::CXSmilesFields::CX_POLYMER) {
    const auto sgrouppolyblock =
        get_sgroup_polymer_block(mol, atomOrder, bondOrder);
    appendToCXExtension(sgrouppolyblock, res);
  }
  if (flags & (SmilesWrite::CXSmilesFields::CX_SGROUPS |
               SmilesWrite::CXSmilesFields::CX_POLYMER)) {
    const auto sgrouphierarchyblock = get_sgroup_hierarchy_block(mol);
    appendToCXExtension(sgrouphierarchyblock, res);
  }
  mol.clearProp("_cxsmilesOutputIndex");
  if (res.size() > 1) {
    res += "|";
  } else {
    res = "";
  }
  return res;
}
}  // namespace SmilesWrite
}  // namespace RDKit