Files
rdkit/Code/GraphMol/Abbreviations/AbbreviationsUtils.cpp
Greg Landrum d2d87909de Add support for abbreviations (#3406)
* support read-only access to cstates from python

* expose GetBrackets

* expose getAttachPoints too

remove vestigial SubstanceGroupCState_VECT

* backup

* backup

* basics working

* backup

* add label_mol_abbreviations

* fix a bug in the chirality handling

* add linkers, needs more testing

* add another peptide test

* sanitize results by default

* just need rings

* getting started with the C++ form of abbreviations

* a bit of error handling

* add findApplicableMatches

* actually apply the abbreviations

* make the getDefault functions more efficient

* add labeling (creating s groups)

* docs

* basic python wrappers (maybe this is enough?)

* add _displayLabel and _displayLabelW support to MolDraw2D
update the docs for that

* use displayLabel props

* add more default abbrevs

* change default linker defns
add parseLinkers convenience function

* make sure attachment point atoms aren't aromatic

* change the color of dummies to be darker gray

* remove python implementation

* support abbreviations in the java wrappers

* add abbreviations to the csharp wrappers

* add abbreviations to the js wrappers

* add molParity to the list of atom props not written to CXSMILES

* support condensing SUP substance groups

* add that to the python wrappers

* Update testAbbreviations.py

* clear ring info if we added it

* document that the molecules with abbreviations removed have not been sanitized
2020-09-28 17:09:46 -04:00

222 lines
6.4 KiB
C++

//
// Copyright (C) 2020 Greg Landrum and T5 Informatics GmbH
//
// @@ All Rights Reserved @@
// This file is part of the RDKit.
// The contents are covered by the terms of the BSD license
// which is included in the file license.txt, found at the root
// of the RDKit source tree.
//
#include "Abbreviations.h"
#include <GraphMol/RDKitBase.h>
#include <GraphMol/SmilesParse/SmilesParse.h>
#include <GraphMol/RDKitQueries.h>
#include <boost/tokenizer.hpp>
using tokenizer = boost::tokenizer<boost::char_separator<char>>;
namespace RDKit {
namespace Abbreviations {
namespace common_properties {
// Name of the molecule property in which detail::createAbbreviationMol()
// stores the number of dummy atoms ('*') of an abbreviation's query molecule.
const std::string numDummies = "_numDummies";
}
namespace Utils {
namespace data {
/*
Translations of superatom labels to SMILES.
First atom of SMILES string should be the one connected to the rest of
the molecule.
ADAPTED FROM: https://github.com/openbabel/superatoms/blob/master/superatom.txt
Originally from http://cactus.nci.nih.gov/osra/
The left-aligned form is the one recognized in MDL alias lines;
the right-aligned form may be used in 2D depiction.

Format: one definition per line, fields separated by spaces or tabs:
label smiles display_label display_label_w
The two display label fields are optional: parseAbbreviations() only sets
them when a line provides them.
detail::createAbbreviationMol() prepends a "*" to any definition that does
not already start with one.
This is the text parsed by getDefaultAbbreviations().
*/
const std::string defaultAbbreviations =
R"ABBREVS(CO2Et C(=O)OCC CO<sub>2</sub>Et EtO<sub>2</sub>C
COOEt C(=O)OCC CO<sub>2</sub>Et EtO<sub>2</sub>C
OiBu OCC(C)C OiBu iBuO
nDec CCCCCCCCCC nDec
nNon CCCCCCCCC nNon
nOct CCCCCCCC nOct
nHept CCCCCCC nHept
nHex CCCCCC nHex
nPent CCCCC nPent
iPent C(C)CCC iPent
tBu C(C)(C)C tBu
iBu C(C)CC iBu
nBu CCCC nBu
iPr C(C)C iPr
nPr CCC nPr
Et CC Et
NCF3 NC(F)(F)F NCF<sub>3</sub> F<sub>3</sub>CN
CF3 C(F)(F)F CF<sub>3</sub> F<sub>3</sub>C
CCl3 C(Cl)(Cl)Cl CCl<sub>3</sub> Cl<sub>3</sub>C
CN C#N CN NC
NC [N+]#[C-] NC CN
N(OH)CH3 N([OH])C N(OH)CH<sub>3</sub> CH<sub>3</sub>(OH)N
NO2 [N+](=O)[O-] NO<sub>2</sub> O<sub>2</sub>N
NO N=O NO ON
SO3H S(=O)(=O)[OH] SO<sub>3</sub>H HO<sub>3</sub>S
CO2H C(=O)[OH] CO<sub>2</sub>H HO<sub>2</sub>C
COOH C(=O)[OH] COOH HOOC
OEt OCC OEt EtO
OAc OC(=O)C OAc AcO
NHAc NC(=O)C NHAc AcNH
Ac C(=O)C Ac
CHO C=O CHO OHC
NMe NC NMe MeN
SMe SC SMe MeS
OMe OC OMe MeO
CO2- C(=O)[O-] COO<sup>-</sup> <sup>-</sup>OOC
COO- C(=O)[O-] COO<sup>-</sup> <sup>-</sup>OOC)ABBREVS";
/*
Translations of linker superatom labels to SMILES.
First atom of SMILES string should be a dummy connected to the rest of
the molecule. The other linker dummy/dummies show the other attachments

Format: one definition per line, fields separated by spaces or tabs:
label smiles display_label
The display label is optional (several entries below omit it).
This is the text parsed by getDefaultLinkers(), which calls
parseAbbreviations() with removeExtraDummies and allowConnectionToDummies
both set to true.
*/
const std::string defaultLinkers =
R"ABBREVS(PEG6 *OCCOCCOCCOCCOCCOCC* PEG6
PEG5 *OCCOCCOCCOCCOCC* PEG5
PEG4 *OCCOCCOCCOCC* PEG4
PEG3 *OCCOCCOCC* PEG3
Dec *CCCCCCCCCC*
Non *CCCCCCCCC*
Oct *CCCCCCCC*
Hept *CCCCCCC*)ABBREVS";
// other possible abbreviations that might be useful:
/*
PEG6 *OCCOCCOCCOCCOCCOCC* PEG6
PEG5 *OCCOCCOCCOCCOCC* PEG5
PEG4 *OCCOCCOCCOCC* PEG4
PEG3 *OCCOCCOCC* PEG3
Dec *CCCCCCCCCC*
Non *CCCCCCCCC*
Oct *CCCCCCCC*
Hept *CCCCCCC*
Hex *CCCCCC*
Pent *CCCCC*
Cy *C1CCC(*)CC1 Cy
ala *N[C@@H](C)C(=O)* ala
arg *N[C@@H](CCCNC(N)=[NH])C(=O)* arg
asn *N[C@@H](CC(N)=O)C(=O)* asn
asp *N[C@@H](CC(O)=O)C(=O)* asp
cys *N[C@@H](CS)C(=O)* cys
gln *N[C@@H](CCC(N)=O)C(=O)* gln
glu *N[C@@H](CCC(O)=O)C(=O)* glu
gly *NCC(=O)* gly
his *N[C@@H](Cc1c[nH]cn1)C(=O)* his
ile *N[C@@H](C(C)CC)C(=O)* ile
leu *N[C@@H](CC(C)C)C(=O)* leu
lys *N[C@@H](CCCCN)C(=O)* lys
met *N[C@@H](CCSC)C(=O)* met
phe *N[C@@H](Cc1ccccc1)C(=O)* phe
pro *N1[C@@H](CCC1)C(=O)* pro
ser *N[C@@H](CO)C(=O)* ser
thr *N[C@@H](C(O)C)C(=O)* thr
trp *N[C@@H](Cc1c[nH]c2ccccc21)C(=O)* trp
tyr *N[C@@H](Cc1ccc(O)cc1)C(=O)* tyr
val *N[C@@H](C(C)C)C(=O)* val
*/
} // namespace data
namespace detail {
// Builds the query molecule used to recognise one abbreviation.
//
//  txt: the abbreviation definition; a "*" is prepended when it does not
//       already start with one, and the result is parsed as SMARTS.
//  removeExtraDummies: when true, every "AtomNull" query atom other than
//       atom 0 is removed from the query.
//  allowConnectionToDummies: when false, the query on atom 0 is expanded
//       with a negated "atomic number == 0" query.
//
// Returns nullptr when the SMARTS cannot be parsed or yields fewer than two
// atoms (the latter case is logged). Otherwise returns a newly allocated
// molecule carrying the common_properties::numDummies property; the caller
// takes ownership of it.
ROMol *createAbbreviationMol(const std::string &txt, bool removeExtraDummies,
                             bool allowConnectionToDummies) {
  // make sure the definition starts with the attachment dummy
  const std::string smarts = (txt[0] == '*') ? txt : "*" + txt;

  RWMol *query = SmartsToMol(smarts);
  if (query == nullptr) {
    return nullptr;
  }
  if (query->getNumAtoms() < 2) {
    BOOST_LOG(rdErrorLog) << "abbreviation with <2 atoms ignored" << std::endl;
    delete query;
    return nullptr;
  }

  // add degree and ring-count constraints to the query atoms, leaving the
  // dummies alone
  MolOps::AdjustQueryParameters params;
  params.adjustDegree = true;
  params.adjustDegreeFlags =
      MolOps::AdjustQueryWhichFlags::ADJUST_IGNOREDUMMIES;
  params.adjustRingCount = true;
  params.adjustRingCountFlags =
      MolOps::AdjustQueryWhichFlags::ADJUST_IGNOREDUMMIES;
  MolOps::adjustQueryProperties(*query, &params);

  if (!allowConnectionToDummies) {
    // presumably this stops the attachment atom from matching a dummy atom
    // in the target molecule
    auto notAtomicNumZero = makeAtomNumQuery(0);
    notAtomicNumZero->setNegation(true);
    query->getAtomWithIdx(0)->expandQuery(notAtomicNumZero);
  }

  // the dummy count starts as the number of '*' characters in the SMARTS text
  unsigned int nDummies = static_cast<unsigned int>(
      std::count(smarts.begin(), smarts.end(), '*'));

  if (removeExtraDummies) {
    // visit atoms from the highest index down to 1 (atom 0 is never removed);
    // going backwards presumably keeps the indices still to be visited valid
    // after a removal
    unsigned int idx = query->getNumAtoms();
    while (--idx > 0) {
      auto *atom = query->getAtomWithIdx(idx);
      const bool isNullQuery =
          atom->hasQuery() && atom->getQuery()->getDescription() == "AtomNull";
      if (isNullQuery) {
        query->removeAtom(idx);
        --nDummies;
      }
    }
  }

  query->setProp(common_properties::numDummies, nDummies);
  return query;
}
} // namespace detail
std::vector<AbbreviationDefinition> parseAbbreviations(
const std::string &text, bool removeExtraDummies,
bool allowConnectionToDummies) {
std::vector<AbbreviationDefinition> res;
boost::char_separator<char> lineSep("\n");
tokenizer lines(text, lineSep);
boost::char_separator<char> fieldSep(" \t");
for (const auto line : lines) {
AbbreviationDefinition defn;
tokenizer fields(line, fieldSep);
tokenizer::iterator field = fields.begin();
defn.label = *field;
++field;
defn.smarts = *field;
++field;
if (field != fields.end()) {
defn.displayLabel = *field;
++field;
if (field != fields.end()) {
defn.displayLabelW = *field;
}
}
defn.mol.reset(detail::createAbbreviationMol(
defn.smarts, removeExtraDummies, allowConnectionToDummies));
if (defn.mol) {
res.push_back(defn);
}
}
return res;
}
// Returns a copy of the built-in abbreviation definitions. The text in
// data::defaultAbbreviations is parsed once, on the first call, and the result
// is kept in a function-local static for all later calls.
std::vector<AbbreviationDefinition> getDefaultAbbreviations() {
  static const std::vector<AbbreviationDefinition> cachedDefinitions =
      parseAbbreviations(data::defaultAbbreviations);
  return cachedDefinitions;
}
// Returns a copy of the built-in linker definitions. The text in
// data::defaultLinkers is parsed once, on the first call, with
// removeExtraDummies and allowConnectionToDummies both enabled; the result is
// kept in a function-local static for all later calls.
std::vector<AbbreviationDefinition> getDefaultLinkers() {
  const bool removeExtraDummies = true;
  const bool allowConnectionToDummies = true;
  static const std::vector<AbbreviationDefinition> cachedLinkers =
      parseAbbreviations(data::defaultLinkers, removeExtraDummies,
                         allowConnectionToDummies);
  return cachedLinkers;
}
} // namespace Utils
} // namespace Abbreviations
} // namespace RDKit