Add support for abbreviations (#3406)

* support read-only access to cstates from python

* expose GetBrackets

* expose getAttachPoints too

remove vestigial SubstanceGroupCState_VECT

* backup

* backup

* basics working

* backup

* add label_mol_abbreviations

* fix a bug in the chirality handling

* add linkers, needs more testing

* add another peptide test

* sanitize results by default

* just need rings

* getting started with the C++ form of abbreviations

* a bit of error handling

* add findApplicableMatches

* actually apply the abbreviations

* make the getDefault functions more efficient

* add labeling (creating s groups)

* docs

* basic python wrappers (maybe this is enough?)

* add _displayLabel and _displayLabelW support to MolDraw2D
update the docs for that

* use displayLabel props

* add more default abbrevs

* change default linker defns
add parseLinkers convenience function

* make sure attachment point atoms aren't aromatic

* change the color of dummies to be darker gray

* remove python implementation

* support abbreviations in the java wrappers

* add abbreviations to the csharp wrappers

* add abbreviations to the js wrappers

* add molParity to the list of atom props not written to CXSMILES

* support condensing SUP substance groups

* add that to the python wrappers

* Update testAbbreviations.py

* clear ring info if we added it

* document that the molecules with abbreviations removed have not been sanitized
This commit is contained in:
Greg Landrum
2020-09-28 23:09:46 +02:00
committed by GitHub
parent 1775eee644
commit d2d87909de
30 changed files with 1717 additions and 62 deletions

View File

@@ -0,0 +1,264 @@
//
// Copyright (C) 2020 Greg Landrum and T5 Informatics GmbH
//
// @@ All Rights Reserved @@
// This file is part of the RDKit.
// The contents are covered by the terms of the BSD license
// which is included in the file license.txt, found at the root
// of the RDKit source tree.
//
#include "Abbreviations.h"
#include <GraphMol/RDKitBase.h>
#include <GraphMol/Substruct/SubstructMatch.h>
#include <RDGeneral/types.h>
#include <RDGeneral/Invariant.h>
#include <boost/dynamic_bitset.hpp>
#include <iostream>
namespace RDKit {
namespace Abbreviations {
//! Collapses each match onto a single labelled dummy atom, editing \c mol in
//! place.
/*!
  Layout of a match as used here: entry 0 is the attachment atom outside the
  abbreviation (left untouched), entry 1 is the atom that is converted into the
  abbreviation atom, and entries 2.. are the atoms that get removed.

  \param mol      the molecule to modify
  \param matches  the matches to apply; no atom may be claimed for removal by
                  more than one match

  A CHECK_INVARIANT fails if a match has fewer than two entries or if two
  matches overlap.
*/
void applyMatches(RWMol& mol, const std::vector<AbbreviationMatch>& matches) {
  boost::dynamic_bitset<> toRemove(mol.getNumAtoms());
  for (const auto& amatch : matches) {
    // entries 0 and 1 are both dereferenced below, so anything shorter
    // cannot be applied
    CHECK_INVARIANT(amatch.match.size() > 1, "bad match size");
    // throughout this remember that atom 0 in the match is the dummy

    // convert atom 1 to be the abbreviation so that we don't have to
    // worry about messing up chirality, etc.
    auto connectIdx = amatch.match[1].second;
    auto connectingAtom = mol.getAtomWithIdx(connectIdx);
    connectingAtom->setProp(RDKit::common_properties::atomLabel,
                            amatch.abbrev.label);
    if (!amatch.abbrev.displayLabel.empty()) {
      connectingAtom->setProp(RDKit::common_properties::_displayLabel,
                              amatch.abbrev.displayLabel);
    }
    if (!amatch.abbrev.displayLabelW.empty()) {
      connectingAtom->setProp(RDKit::common_properties::_displayLabelW,
                              amatch.abbrev.displayLabelW);
    }
    connectingAtom->setFormalCharge(0);
    connectingAtom->setAtomicNum(0);
    connectingAtom->setIsotope(0);
    connectingAtom->setIsAromatic(false);
    // set the hybridization so these are drawn linearly
    connectingAtom->setHybridization(Atom::HybridizationType::SP);

    for (unsigned int i = 2; i < amatch.match.size(); ++i) {
      const auto& pr = amatch.match[i];
      CHECK_INVARIANT(!toRemove[pr.second], "overlapping matches");
      toRemove.set(pr.second);
      // if there's a molecule associated with the match, check to see if
      // additional bonds need to be formed: a matched atom with a higher
      // degree than its query atom has neighbors outside the match
      if (amatch.abbrev.mol &&
          mol.getAtomWithIdx(pr.second)->getDegree() >
              amatch.abbrev.mol->getAtomWithIdx(pr.first)->getDegree()) {
        for (const auto& nbri : boost::make_iterator_range(
                 mol.getAtomNeighbors(mol.getAtomWithIdx(pr.second)))) {
          const auto& nbr = mol[nbri];
          auto nbrIdx = nbr->getIdx();
          // if this neighbor isn't in the match, re-attach it to the
          // abbreviation atom:
          if (!std::any_of(amatch.match.begin(), amatch.match.end(),
                           [&](const std::pair<int, int>& tpr) {
                             return tpr.second == rdcast<int>(nbrIdx);
                           })) {
            mol.addBond(nbrIdx, connectIdx, Bond::BondType::SINGLE);
          }
        }
      }
    }
    // make connections between any extraAttachAtoms and the connection point
    for (auto oaidx : amatch.abbrev.extraAttachAtoms) {
      mol.addBond(oaidx, connectIdx, Bond::BondType::SINGLE);
    }
  }
  // remove from the highest index downwards (presumably so that a removal
  // does not shift the indices that still have to be processed)
  for (unsigned int i = toRemove.size(); i > 0; --i) {
    if (toRemove[i - 1]) {
      mol.removeAtom(i - 1);
    }
  }
}
//! Adds one "SUP" SubstanceGroup to \c mol for every match.
/*!
  Entry 0 of a match is the attachment atom outside the abbreviation; entries
  1.. become the atoms of the substance group. The bond between entries 0 and 1
  is added to the group and registered as its attachment point.

  \param mol      the molecule that receives the substance groups
  \param matches  the matches to label

  A CHECK_INVARIANT fails if a match has fewer than two entries or if entries 0
  and 1 are not bonded in \c mol.
*/
void labelMatches(RWMol& mol, const std::vector<AbbreviationMatch>& matches) {
  for (const auto& amatch : matches) {
    // entries 0 and 1 are both dereferenced below
    CHECK_INVARIANT(amatch.match.size() > 1, "bad match size");
    // throughout this remember that atom 0 in the match is the dummy
    SubstanceGroup sg(&mol, "SUP");
    sg.setProp("LABEL", amatch.abbrev.label);

    for (unsigned int i = 1; i < amatch.match.size(); ++i) {
      const auto& pr = amatch.match[i];
      sg.addAtomWithIdx(pr.second);
    }
    auto bnd =
        mol.getBondBetweenAtoms(amatch.match[0].second, amatch.match[1].second);
    CHECK_INVARIANT(bnd, "bond to attachment point not found");
    sg.addBondWithIdx(bnd->getIdx());
    sg.addAttachPoint(amatch.match[1].second, amatch.match[0].second, "1");
    addSubstanceGroup(mol, sg);
  }
}
std::vector<AbbreviationMatch> findApplicableAbbreviationMatches(
const ROMol& mol, const std::vector<AbbreviationDefinition>& abbrevs,
double maxCoverage) {
std::vector<AbbreviationMatch> res;
auto nAtoms = mol.getNumAtoms();
if (!nAtoms || abbrevs.empty()) {
return res;
}
bool hasRings = mol.getRingInfo()->isInitialized();
if(!hasRings) {
MolOps::fastFindRings(mol);
}
std::vector<AbbreviationMatch> tres;
boost::dynamic_bitset<> dummies(mol.getNumAtoms());
boost::dynamic_bitset<> firstAts(mol.getNumAtoms());
boost::dynamic_bitset<> covered(mol.getNumAtoms());
for (const auto& abbrev : abbrevs) {
CHECK_INVARIANT(abbrev.mol, "molecule is null");
if (maxCoverage > 0) {
unsigned int nDummies;
abbrev.mol->getProp(common_properties::numDummies, nDummies);
if (double(abbrev.mol->getNumAtoms() - nDummies) / nAtoms >=
maxCoverage) {
continue;
}
}
auto matches = SubstructMatch(mol, *abbrev.mol);
for (const auto& match : matches) {
CHECK_INVARIANT(match.size() > 1, "bad match size");
// if we've already covered the first non-dummy atom or used it as a first
// atom skip this.
if (firstAts[match[1].second] || covered[match[1].second]) {
continue;
}
bool keepIt = true;
for (unsigned int i = 2; i < match.size(); ++i) {
const auto& pr = match[i];
if (covered[pr.second]) {
keepIt = false;
break;
}
}
if (!keepIt) {
continue;
}
for (unsigned int i = 1; i < match.size(); ++i) {
const auto& pr = match[i];
covered.set(pr.second);
}
dummies.set(match[0].second);
firstAts.set(match[1].second);
if (!firstAts[match[0].second]) {
tres.emplace_back(match, abbrev);
}
}
}
for (const auto& itm : tres) {
// if the dummy in this wasn't a first atom anywhere
if (!firstAts[itm.match[0].second]) {
res.push_back(std::move(itm));
}
}
// if we added ring info, go ahead and remove it
if(!hasRings){
mol.getRingInfo()->reset();
}
return res;
}
//! Finds the applicable abbreviations in \c mol and collapses them in place.
/*!
  \param mol          the molecule to modify
  \param abbrevs      the abbreviation definitions to look for
  \param maxCoverage  forwarded to findApplicableAbbreviationMatches()
  \param sanitize     if true, MolOps::symmetrizeSSSR() is run on the result
*/
void condenseMolAbbreviations(
    RWMol& mol, const std::vector<AbbreviationDefinition>& abbrevs,
    double maxCoverage, bool sanitize) {
  const auto hits = findApplicableAbbreviationMatches(mol, abbrevs, maxCoverage);
  applyMatches(mol, hits);
  if (!sanitize) {
    return;
  }
  MolOps::symmetrizeSSSR(mol);
}
//! Finds the applicable abbreviations in \c mol and records each one as a
//! "SUP" SubstanceGroup.
/*!
  \param mol          the molecule that receives the substance groups
  \param abbrevs      the abbreviation definitions to look for
  \param maxCoverage  forwarded to findApplicableAbbreviationMatches()
*/
void labelMolAbbreviations(RWMol& mol,
                           const std::vector<AbbreviationDefinition>& abbrevs,
                           double maxCoverage) {
  const auto hits = findApplicableAbbreviationMatches(mol, abbrevs, maxCoverage);
  labelMatches(mol, hits);
}
//! Collapses every usable "SUP" SubstanceGroup of \c mol onto a single
//! abbreviation atom.
/*!
  For each SUP group a match record is built in the layout applyMatches()
  expects: the atom on the far side of the first usable bond goes first, the
  group atom on that bond second, then the remaining group atoms. The far-side
  atoms of any further usable bonds are recorded as extra attachment atoms.

  Groups without a usable bond (no bonds at all, or only bonds that touch none
  of the group's atoms) are skipped: without an attachment bond there is no
  atom outside the group to put in front of the match.

  \param mol  the molecule to modify in place
*/
RDKIT_ABBREVIATIONS_EXPORT void condenseAbbreviationSubstanceGroups(
    RWMol& mol) {
  auto& molSGroups = getSubstanceGroups(mol);
  std::vector<AbbreviationMatch> abbrevMatches;
  for (const auto& sg : molSGroups) {
    if (sg.getProp<std::string>("TYPE") != "SUP") {
      continue;
    }
    AbbreviationMatch abbrevMatch;
    std::string label = "abbrev";  // fallback when the group has no LABEL
    sg.getPropIfPresent("LABEL", label);
    abbrevMatch.abbrev.label = label;
    auto ats = sg.getAtoms();
    auto bnds = sg.getBonds();
    if (bnds.empty()) {
      BOOST_LOG(rdWarningLog) << "SUP group without any bonds" << std::endl;
      continue;
    }
    bool firstAttachFound = false;
    for (unsigned int i = 0; i < bnds.size(); ++i) {
      auto bnd = mol.getBondWithIdx(bnds[i]);
      unsigned int mAt;  // sgroup atom in the match
      unsigned int oAt;  // atom at the other end of the bond
      if (std::find(ats.begin(), ats.end(), bnd->getBeginAtomIdx()) !=
          ats.end()) {
        oAt = bnd->getEndAtomIdx();
        mAt = bnd->getBeginAtomIdx();
      } else if (std::find(ats.begin(), ats.end(), bnd->getEndAtomIdx()) !=
                 ats.end()) {
        oAt = bnd->getBeginAtomIdx();
        mAt = bnd->getEndAtomIdx();
      } else {
        BOOST_LOG(rdWarningLog) << "SUP group includes bond not connected "
                                   "to any of the abbreviation atoms"
                                << std::endl;
        continue;
      }
      if (!firstAttachFound) {
        // make sure the atom connected to the first attachment point
        // is the first one in the match
        if (*ats.begin() != mAt) {
          ats.erase(std::find(ats.begin(), ats.end(), mAt));
          ats.insert(ats.begin(), mAt);
        }
        // add the first attachment point to the beginning of the atom list
        ats.insert(ats.begin(), oAt);
        firstAttachFound = true;
      } else {
        abbrevMatch.abbrev.extraAttachAtoms.push_back(oAt);
      }
    }
    if (!firstAttachFound) {
      // every bond was rejected (and warned about) above, so the atom list
      // has no leading attachment atom and cannot be applied
      continue;
    }
    // create a match record:
    for (unsigned int i = 0; i < ats.size(); ++i) {
      abbrevMatch.match.push_back({i, ats[i]});
    }
    abbrevMatches.push_back(abbrevMatch);
  }
  if (!abbrevMatches.empty()) {
    applyMatches(mol, abbrevMatches);
  } else {
    BOOST_LOG(rdWarningLog) << "no suitable SubstanceGroups found" << std::endl;
  }
}
} // namespace Abbreviations
} // namespace RDKit

View File

@@ -0,0 +1,131 @@
//
// Copyright (C) 2020 Greg Landrum and T5 Informatics GmbH
//
// @@ All Rights Reserved @@
// This file is part of the RDKit.
// The contents are covered by the terms of the BSD license
// which is included in the file license.txt, found at the root
// of the RDKit source tree.
//
#include <RDGeneral/export.h>
#ifndef RD_ABBREVIATIONS_H
#define RD_ABBREVIATIONS_H
#include <vector>
#include <string>
#include <memory>
namespace RDKit {
class ROMol;
class RWMol;
namespace Abbreviations {
RDKIT_ABBREVIATIONS_EXPORT struct AbbreviationDefinition {
std::string label;
std::string displayLabel;
std::string displayLabelW;
std::string smarts;
std::shared_ptr<ROMol> mol; //! optional
std::vector<unsigned int> extraAttachAtoms; //! optional
bool operator==(const AbbreviationDefinition& other) const {
return label == other.label && displayLabel == other.displayLabel &&
displayLabelW == other.displayLabelW && smarts == other.smarts;
}
bool operator!=(const AbbreviationDefinition& other) const {
return !(*this == other);
}
};
RDKIT_ABBREVIATIONS_EXPORT struct AbbreviationMatch {
std::vector<std::pair<int, int>> match;
AbbreviationDefinition abbrev;
AbbreviationMatch(const std::vector<std::pair<int, int>>& matchArg,
const AbbreviationDefinition& abbrevArg)
: match(matchArg), abbrev(abbrevArg){};
AbbreviationMatch() : match(), abbrev(){};
bool operator==(const AbbreviationMatch& other) const {
return abbrev == other.abbrev && match == other.match;
}
bool operator!=(const AbbreviationMatch& other) const {
return !(*this == other);
}
};
namespace common_properties {
RDKIT_ABBREVIATIONS_EXPORT extern const std::string numDummies;
}
namespace Utils {
//! returns the default set of abbreviation definitions
RDKIT_ABBREVIATIONS_EXPORT std::vector<AbbreviationDefinition>
getDefaultAbbreviations();
//! returns the default set of linker definitions
RDKIT_ABBREVIATIONS_EXPORT std::vector<AbbreviationDefinition>
getDefaultLinkers();
//! parses a string describing abbreviation matches and returns the result
/*!
\param text the data to be parsed, see below for the format
\param removeExtraDummies controls whether or not dummy atoms beyond atom 0 are
removed. Set this to true to create abbreviations for linkers
\param allowConnectionToDummies allows abbreviations to directly connect to
abbreviations. set this to true for linkers
Format of the text data:
A series of lines, each of which contains:
label SMARTS displayLabel displayLabelW
the "displayLabel" and "displayLabelW" fields are optional.
where label is the label used for the abbreviation,
SMARTS is the SMARTS definition of the abbreviation.
displayLabel is used in drawings to render the abbreviations.
displayLabelW is the display label if a bond comes in from the right
Use dummies to indicate attachment points. The assumption is that the first
atom is a dummy (one will be added if this is not true) and that the second
atom is the surrogate for the rest of the group.
*/
RDKIT_ABBREVIATIONS_EXPORT std::vector<AbbreviationDefinition>
parseAbbreviations(const std::string& text, bool removeExtraDummies = false,
bool allowConnectionToDummies = false);
//! \brief parses linker definitions; shorthand for
//! \c parseAbbreviations(text,true,true)
inline std::vector<AbbreviationDefinition> parseLinkers(
    const std::string& text) {
  const bool removeExtraDummies = true;
  const bool allowConnectionToDummies = true;
  return parseAbbreviations(text, removeExtraDummies,
                            allowConnectionToDummies);
}
} // namespace Utils
//! returns all matches for the abbreviations across the molecule
/*!
\param abbrevs the abbreviations to look for. This list is used in order.
\param maxCoverage any abbreviation that covers this fraction or more
of the molecule's atoms (not counting dummies) will not be returned.
A value <= 0 disables the check.
*/
RDKIT_ABBREVIATIONS_EXPORT std::vector<AbbreviationMatch>
findApplicableAbbreviationMatches(
const ROMol& mol, const std::vector<AbbreviationDefinition>& abbrevs,
double maxCoverage = 0.4);
//! applies the abbreviation matches to a molecule, modifying it in place.
//! the modified molecule is not sanitized
RDKIT_ABBREVIATIONS_EXPORT void applyMatches(
RWMol& mol, const std::vector<AbbreviationMatch>& matches);
//! creates "SUP" SubstanceGroups on the molecule describing the abbreviation
RDKIT_ABBREVIATIONS_EXPORT void labelMatches(
RWMol& mol, const std::vector<AbbreviationMatch>& matches);
//! convenience function for finding and applying abbreviations
//! the modified molecule is not sanitized; if \c sanitize is true its ring
//! information is recomputed with MolOps::symmetrizeSSSR()
RDKIT_ABBREVIATIONS_EXPORT void condenseMolAbbreviations(
RWMol& mol, const std::vector<AbbreviationDefinition>& abbrevs,
double maxCoverage = 0.4, bool sanitize = true);
//! convenience function for finding and labeling abbreviations as SUP
//! SubstanceGroups
RDKIT_ABBREVIATIONS_EXPORT void labelMolAbbreviations(
RWMol& mol, const std::vector<AbbreviationDefinition>& abbrevs,
double maxCoverage = 0.4);
//! collapses abbreviation (i.e. "SUP") substance groups
//! the modified molecule is not sanitized
RDKIT_ABBREVIATIONS_EXPORT void condenseAbbreviationSubstanceGroups(RWMol& mol);
} // namespace Abbreviations
} // namespace RDKit
#endif

View File

@@ -0,0 +1,221 @@
//
// Copyright (C) 2020 Greg Landrum and T5 Informatics GmbH
//
// @@ All Rights Reserved @@
// This file is part of the RDKit.
// The contents are covered by the terms of the BSD license
// which is included in the file license.txt, found at the root
// of the RDKit source tree.
//
#include "Abbreviations.h"
#include <GraphMol/RDKitBase.h>
#include <GraphMol/SmilesParse/SmilesParse.h>
#include <GraphMol/RDKitQueries.h>
#include <boost/tokenizer.hpp>
using tokenizer = boost::tokenizer<boost::char_separator<char>>;
namespace RDKit {
namespace Abbreviations {
namespace common_properties {
const std::string numDummies = "_numDummies";
}
namespace Utils {
namespace data {
/*
Translations of superatom labels to SMILES.
First atom of SMILES string should be the one connected to the rest of
the molecule.
ADAPTED FROM: https://github.com/openbabel/superatoms/blob/master/superatom.txt
Originally from http://cactus.nci.nih.gov/osra/
The left-aligned form is the one recognized in MDL alias lines;
the right-aligned form may be used in 2D depiction.
label smiles display_label display_label_w
*/
// Built-in abbreviation table, parsed by getDefaultAbbreviations().
// One definition per line, fields separated by whitespace:
//   label  SMARTS  [displayLabel  [displayLabelW]]
// The definitions are tried in the order listed, so the position of an entry
// matters when several of them could match the same atoms.
const std::string defaultAbbreviations =
R"ABBREVS(CO2Et C(=O)OCC CO<sub>2</sub>Et EtO<sub>2</sub>C
COOEt C(=O)OCC CO<sub>2</sub>Et EtO<sub>2</sub>C
OiBu OCC(C)C OiBu iBuO
nDec CCCCCCCCCC nDec
nNon CCCCCCCCC nNon
nOct CCCCCCCC nOct
nHept CCCCCCC nHept
nHex CCCCCC nHex
nPent CCCCC nPent
iPent C(C)CCC iPent
tBu C(C)(C)C tBu
iBu C(C)CC iBu
nBu CCCC nBu
iPr C(C)C iPr
nPr CCC nPr
Et CC Et
NCF3 NC(F)(F)F NCF<sub>3</sub> F<sub>3</sub>CN
CF3 C(F)(F)F CF<sub>3</sub> F<sub>3</sub>C
CCl3 C(Cl)(Cl)Cl CCl<sub>3</sub> Cl<sub>3</sub>C
CN C#N CN NC
NC [N+]#[C-] NC CN
N(OH)CH3 N([OH])C N(OH)CH<sub>3</sub> CH<sub>3</sub>(OH)N
NO2 [N+](=O)[O-] NO<sub>2</sub> O<sub>2</sub>N
NO N=O NO ON
SO3H S(=O)(=O)[OH] SO<sub>3</sub>H HO<sub>3</sub>S
CO2H C(=O)[OH] CO<sub>2</sub>H HO<sub>2</sub>C
COOH C(=O)[OH] COOH HOOC
OEt OCC OEt EtO
OAc OC(=O)C OAc AcO
NHAc NC(=O)C NHAc AcNH
Ac C(=O)C Ac
CHO C=O CHO OHC
NMe NC NMe MeN
SMe SC SMe MeS
OMe OC OMe MeO
CO2- C(=O)[O-] COO<sup>-</sup> <sup>-</sup>OOC
COO- C(=O)[O-] COO<sup>-</sup> <sup>-</sup>OOC)ABBREVS";
/*
Translations of linker superatom labels to SMILES.
First atom of SMILES string should be a dummy connected to the rest of
the molecule. The other linker dummy/dummies show the other attachments
*/
// Built-in linker table, parsed by getDefaultLinkers() with
// removeExtraDummies and allowConnectionToDummies both set to true.
// Same line format as the abbreviation table:
//   label  SMARTS  [displayLabel  [displayLabelW]]
const std::string defaultLinkers =
R"ABBREVS(PEG6 *OCCOCCOCCOCCOCCOCC* PEG6
PEG5 *OCCOCCOCCOCCOCC* PEG5
PEG4 *OCCOCCOCCOCC* PEG4
PEG3 *OCCOCCOCC* PEG3
Dec *CCCCCCCCCC*
Non *CCCCCCCCC*
Oct *CCCCCCCC*
Hept *CCCCCCC*)ABBREVS";
// other possible abbreviations that might be useful:
/*
PEG6 *OCCOCCOCCOCCOCCOCC* PEG6
PEG5 *OCCOCCOCCOCCOCC* PEG5
PEG4 *OCCOCCOCCOCC* PEG4
PEG3 *OCCOCCOCC* PEG3
Dec *CCCCCCCCCC*
Non *CCCCCCCCC*
Oct *CCCCCCCC*
Hept *CCCCCCC*
Hex *CCCCCC*
Pent *CCCCC*
Cy *C1CCC(*)CC1 Cy
ala *N[C@@H](C)C(=O)* ala
arg *N[C@@H](CCCNC(N)=[NH])C(=O)* arg
asn *N[C@@H](CC(N)=O)C(=O)* asn
asp *N[C@@H](CC(O)=O)C(=O)* asp
cys *N[C@@H](CS)C(=O)* cys
gln *N[C@@H](CCC(N)=O)C(=O)* gln
glu *N[C@@H](CCC(O)=O)C(=O)* glu
gly *NCC(=O)* gly
his *N[C@@H](Cc1c[nH]cn1)C(=O)* his
ile *N[C@@H](C(C)CC)C(=O)* ile
leu *N[C@@H](CC(C)C)C(=O)* leu
lys *N[C@@H](CCCCN)C(=O)* lys
met *N[C@@H](CCSC)C(=O)* met
phe *N[C@@H](Cc1ccccc1)C(=O)* phe
pro *N1[C@@H](CCC1)C(=O)* pro
ser *N[C@@H](CO)C(=O)* ser
thr *N[C@@H](C(O)C)C(=O)* thr
trp *N[C@@H](Cc1c[nH]c2ccccc21)C(=O)* trp
tyr *N[C@@H](Cc1ccc(O)cc1)C(=O)* tyr
val *N[C@@H](C(C)C)C(=O)* val
*/
} // namespace data
namespace detail {
//! Builds the query molecule for one abbreviation definition.
/*!
  \param txt                       the SMARTS of the definition; a leading
                                   dummy ('*') is added if it is missing
  \param removeExtraDummies        if true, null-query atoms other than
                                   atom 0 are removed from the query
  \param allowConnectionToDummies  if false, atom 0 is additionally required
                                   not to match atomic number 0

  \return a new molecule owned by the caller, with the number of dummies
          stored in the \c numDummies property; nullptr if the SMARTS cannot
          be parsed or yields fewer than two atoms.
*/
ROMol *createAbbreviationMol(const std::string &txt, bool removeExtraDummies,
                             bool allowConnectionToDummies) {
  // the query always starts with the attachment-point dummy
  const std::string smarts =
      (!txt.empty() && txt[0] == '*') ? txt : "*" + txt;

  // held in a unique_ptr so that nothing leaks if one of the calls below
  // throws; ownership is handed to the caller on success
  std::unique_ptr<RWMol> q(SmartsToMol(smarts));
  if (!q) {
    return nullptr;
  }
  if (q->getNumAtoms() < 2) {
    BOOST_LOG(rdErrorLog) << "abbreviation with <2 atoms ignored" << std::endl;
    return nullptr;
  }

  // degree and ring-count constraints are added to the query atoms;
  // dummies are excluded from that adjustment
  MolOps::AdjustQueryParameters ps;
  ps.adjustDegree = true;
  ps.adjustDegreeFlags = MolOps::AdjustQueryWhichFlags::ADJUST_IGNOREDUMMIES;
  ps.adjustRingCount = true;
  ps.adjustRingCountFlags = MolOps::AdjustQueryWhichFlags::ADJUST_IGNOREDUMMIES;
  MolOps::adjustQueryProperties(*q, &ps);

  if (!allowConnectionToDummies) {
    // atom 0 must not match an atom with atomic number 0
    auto qry = makeAtomNumQuery(0);
    qry->setNegation(true);
    q->getAtomWithIdx(0)->expandQuery(qry);
  }

  // counted on the SMARTS text, before any dummies are removed below
  unsigned int nDummies = std::count_if(smarts.begin(), smarts.end(),
                                        [](char c) { return c == '*'; });
  if (removeExtraDummies) {
    // walk backwards so removals do not disturb the indices still to be
    // visited; atom 0 is never examined
    for (unsigned int i = q->getNumAtoms() - 1; i > 0; --i) {
      auto at = q->getAtomWithIdx(i);
      if (at->hasQuery() && at->getQuery()->getDescription() == "AtomNull") {
        q->removeAtom(i);
        --nDummies;
      }
    }
  }
  q->setProp(common_properties::numDummies, nDummies);
  return q.release();
}
} // namespace detail
std::vector<AbbreviationDefinition> parseAbbreviations(
const std::string &text, bool removeExtraDummies,
bool allowConnectionToDummies) {
std::vector<AbbreviationDefinition> res;
boost::char_separator<char> lineSep("\n");
tokenizer lines(text, lineSep);
boost::char_separator<char> fieldSep(" \t");
for (const auto line : lines) {
AbbreviationDefinition defn;
tokenizer fields(line, fieldSep);
tokenizer::iterator field = fields.begin();
defn.label = *field;
++field;
defn.smarts = *field;
++field;
if (field != fields.end()) {
defn.displayLabel = *field;
++field;
if (field != fields.end()) {
defn.displayLabelW = *field;
}
}
defn.mol.reset(detail::createAbbreviationMol(
defn.smarts, removeExtraDummies, allowConnectionToDummies));
if (defn.mol) {
res.push_back(defn);
}
}
return res;
}
//! Returns a copy of the built-in abbreviation definitions.
std::vector<AbbreviationDefinition> getDefaultAbbreviations() {
  // the table is parsed on the first call only; every caller gets a copy
  static const std::vector<AbbreviationDefinition> parsed =
      parseAbbreviations(data::defaultAbbreviations);
  return parsed;
}
//! Returns a copy of the built-in linker definitions.
std::vector<AbbreviationDefinition> getDefaultLinkers() {
  // the table is parsed on the first call only; every caller gets a copy
  const bool removeExtraDummies = true;
  const bool allowConnectionToDummies = true;
  static const std::vector<AbbreviationDefinition> parsed = parseAbbreviations(
      data::defaultLinkers, removeExtraDummies, allowConnectionToDummies);
  return parsed;
}
} // namespace Utils
} // namespace Abbreviations
} // namespace RDKit

View File

@@ -0,0 +1,15 @@
# The Abbreviations library: matching/condensing code plus the parsing
# utilities and built-in definition tables.
rdkit_library(Abbreviations
Abbreviations.cpp
AbbreviationsUtils.cpp
LINK_LIBRARIES SmilesParse
SubstructMatch GraphMol RDGeneral)
# defined only while building this library itself
target_compile_definitions(Abbreviations PRIVATE RDKIT_ABBREVIATIONS_BUILD)
rdkit_headers(Abbreviations.h DEST GraphMol/Abbreviations)
# Catch-based C++ tests
rdkit_catch_test(testAbbreviations ../catch_main.cpp catch_tests.cpp
LINK_LIBRARIES Abbreviations SmilesParse FileParsers )
# the Python wrappers are optional
if(RDK_BUILD_PYTHON_WRAPPERS)
add_subdirectory(Wrap)
endif()

View File

@@ -0,0 +1,8 @@
# Python extension module rdAbbreviations (DEST Chem)
rdkit_python_extension(rdAbbreviations
rdAbbreviations.cpp
DEST Chem
LINK_LIBRARIES Abbreviations
GraphMol )
# Python-level tests for the wrappers
add_pytest(pyAbbreviations ${CMAKE_CURRENT_SOURCE_DIR}/testAbbreviations.py)

View File

@@ -0,0 +1,100 @@
//
// Copyright (C) 2020 Greg Landrum and T5 Informatics GmbH
//
// @@ All Rights Reserved @@
// This file is part of the RDKit.
// The contents are covered by the terms of the BSD license
// which is included in the file license.txt, found at the root
// of the RDKit source tree.
//
#include <RDBoost/python.h>
#include <boost/python/suite/indexing/vector_indexing_suite.hpp>
#include <GraphMol/GraphMol.h>
#include <RDBoost/Wrap.h>
#include <GraphMol/Abbreviations/Abbreviations.h>
namespace python = boost::python;
using namespace RDKit;
namespace {
//! Python-facing wrapper: works on a copy of \c mol and returns it.
/*!
  \return a new molecule; ownership passes to the caller (the Python layer
          registers this function with manage_new_object)
*/
ROMol *condenseMolAbbreviationsHelper(const ROMol *mol,
                                      python::object pyabbrevs,
                                      double maxCoverage, bool sanitize) {
  // owned by a unique_ptr until it is returned so that the copy is not
  // leaked if anything below throws
  std::unique_ptr<RWMol> res(new RWMol(*mol));
  auto abbrevs =
      pythonObjectToVect<Abbreviations::AbbreviationDefinition>(pyabbrevs);
  // NOTE(review): pythonObjectToVect appears to return a null pointer for a
  // None/empty argument - confirm in RDBoost/Wrap.h. With nothing to look
  // for, the unmodified copy is returned.
  if (abbrevs) {
    Abbreviations::condenseMolAbbreviations(*res, *abbrevs, maxCoverage,
                                            sanitize);
  }
  return rdcast<ROMol *>(res.release());
}
//! Python-facing wrapper: collapses the "SUP" substance groups on a copy of
//! \c mol and returns that copy.
/*!
  \return a new molecule; ownership passes to the caller (the Python layer
          registers this function with manage_new_object)
*/
ROMol *condenseAbbreviationSGroupHelper(const ROMol *mol) {
  // owned by a unique_ptr until it is returned so that the copy is not
  // leaked if the call below throws
  std::unique_ptr<RWMol> res(new RWMol(*mol));
  Abbreviations::condenseAbbreviationSubstanceGroups(*res);
  return rdcast<ROMol *>(res.release());
}
//! Python-facing wrapper: labels abbreviations on a copy of \c mol and
//! returns that copy.
/*!
  \return a new molecule; ownership passes to the caller (the Python layer
          registers this function with manage_new_object)
*/
ROMol *labelMolAbbreviationsHelper(const ROMol *mol, python::object pyabbrevs,
                                   double maxCoverage) {
  // owned by a unique_ptr until it is returned so that the copy is not
  // leaked if anything below throws
  std::unique_ptr<RWMol> res(new RWMol(*mol));
  auto abbrevs =
      pythonObjectToVect<Abbreviations::AbbreviationDefinition>(pyabbrevs);
  // NOTE(review): pythonObjectToVect appears to return a null pointer for a
  // None/empty argument - confirm in RDBoost/Wrap.h. With nothing to look
  // for, the unmodified copy is returned.
  if (abbrevs) {
    Abbreviations::labelMolAbbreviations(*res, *abbrevs, maxCoverage);
  }
  return rdcast<ROMol *>(res.release());
}
} // namespace
// Python module definition: exposes AbbreviationDefinition, the functions for
// loading/parsing definitions and the three molecule-level helpers. The
// helpers return new molecules, hence manage_new_object.
BOOST_PYTHON_MODULE(rdAbbreviations) {
  python::scope().attr("__doc__") =
      "Module containing functions for working with molecular abbreviations";

  // RegisterVectorConverter<Abbreviations::AbbreviationMatch>();
  RegisterVectorConverter<Abbreviations::AbbreviationDefinition>();

  // the display-label docstrings follow the field descriptions given for
  // parseAbbreviations() in Abbreviations.h
  python::class_<Abbreviations::AbbreviationDefinition>(
      "AbbreviationDefinition", "Abbreviation Definition", python::init<>())
      .def_readwrite("label", &Abbreviations::AbbreviationDefinition::label,
                     "the label")
      .def_readwrite(
          "displayLabel", &Abbreviations::AbbreviationDefinition::displayLabel,
          "the label in a drawing")
      .def_readwrite(
          "displayLabelW",
          &Abbreviations::AbbreviationDefinition::displayLabelW,
          "the label in a drawing when the bond comes in from the right")
      .def_readwrite(
          "mol", &Abbreviations::AbbreviationDefinition::mol,
          "the query molecule (should have a dummy as the first atom)");

  python::def("GetDefaultAbbreviations",
              &Abbreviations::Utils::getDefaultAbbreviations,
              "returns a list of the default abbreviation definitions");
  python::def("GetDefaultLinkers", &Abbreviations::Utils::getDefaultLinkers,
              "returns a list of the default linker definitions");
  python::def("ParseAbbreviations", &Abbreviations::Utils::parseAbbreviations,
              (python::arg("text"), python::arg("removeExtraDummies") = false,
               python::arg("allowConnectionToDummies") = false),
              "returns a set of abbreviation definitions from a string");
  python::def("ParseLinkers", &Abbreviations::Utils::parseLinkers,
              (python::arg("text")),
              "returns a set of linker definitions from a string");
  python::def(
      "CondenseMolAbbreviations", &condenseMolAbbreviationsHelper,
      (python::arg("mol"), python::arg("abbrevs"),
       python::arg("maxCoverage") = 0.4, python::arg("sanitize") = true),
      python::return_value_policy<python::manage_new_object>(),
      "Finds and replaces abbreviations in a molecule. The result is not sanitized.");
  python::def("LabelMolAbbreviations", &labelMolAbbreviationsHelper,
              (python::arg("mol"), python::arg("abbrevs"),
               python::arg("maxCoverage") = 0.4),
              python::return_value_policy<python::manage_new_object>(),
              "Finds abbreviations and adds them to a molecule as \"SUP\" "
              "SubstanceGroups");
  python::def(
      "CondenseAbbreviationSubstanceGroups", &condenseAbbreviationSGroupHelper,
      (python::arg("mol")),
      python::return_value_policy<python::manage_new_object>(),
      "Finds and replaces abbreviation (i.e. \"SUP\") substance groups in a "
      "molecule. The result is not sanitized.");
}

View File

@@ -0,0 +1,131 @@
#
# Copyright (C) 2020 Greg Landrum and T5 Informatics GmbH
# @@ All Rights Reserved @@
#
# This file is part of the RDKit.
# The contents are covered by the terms of the BSD license
# which is included in the file license.txt, found at the root
# of the RDKit source tree.
#
from rdkit import Chem
from rdkit.Chem import rdAbbreviations
import unittest
class TestCase(unittest.TestCase):
  """Tests for the rdAbbreviations Python wrappers."""

  def setUp(self):
    """Load the built-in definitions and parse a small custom linker set."""
    self.defaultAbbrevs = rdAbbreviations.GetDefaultAbbreviations()
    self.defaultLinkers = rdAbbreviations.GetDefaultLinkers()
    linkerText = '''PEG3 *OCCOCCOCC* PEG3
Pent *CCCCC*
Cy *C1CCC(*)CC1 Cy'''
    self.customLinkers = rdAbbreviations.ParseLinkers(linkerText)

  def _condenseAndCheck(self, mol, abbrevs, expected, **kwargs):
    """Condense `mol` with `abbrevs`, compare the CXSMILES of the result with
    `expected` and return the condensed molecule."""
    condensed = rdAbbreviations.CondenseMolAbbreviations(mol, abbrevs, **kwargs)
    self.assertEqual(Chem.MolToCXSmiles(condensed), expected)
    return condensed

  def testParsingAbbrevs(self):
    """Definitions parsed from text can be used for condensing."""
    defn = '''CO2Et C(=O)OCC
COOEt C(=O)OCC
OiBu OCC(C)C
tBu C(C)(C)C'''
    parsed = rdAbbreviations.ParseAbbreviations(defn)
    mol = Chem.MolFromSmiles('CCC(=O)OCC')
    self._condenseAndCheck(mol, parsed, '*CC |$CO2Et;;$|', maxCoverage=1.0)

  def testCondense(self):
    """Condensing with the default abbreviations."""
    mol = Chem.MolFromSmiles('FC(F)(F)CC(=O)O')
    self._condenseAndCheck(mol, self.defaultAbbrevs, '*C* |$CF3;;CO2H$|', maxCoverage=1.0)

    # default maxCoverage here
    mol = Chem.MolFromSmiles('CCC(F)(F)F')
    self._condenseAndCheck(mol, self.defaultAbbrevs, '*C(F)(F)F |$Et;;;;$|')

    # make sure we don't mess up chirality
    mol = Chem.MolFromSmiles('FC(F)(F)[C@](Cl)(F)I')
    self._condenseAndCheck(mol, self.defaultAbbrevs, '*[C@@](F)(Cl)I |$CF3;;;;$|',
                           maxCoverage=1.0)

  def testLabel(self):
    """Labeling creates one SUP SubstanceGroup per abbreviation."""
    mol = Chem.MolFromSmiles('CC(C)CC(F)(F)F')
    labeled = rdAbbreviations.LabelMolAbbreviations(mol, self.defaultAbbrevs, maxCoverage=1.0)
    sgs = Chem.GetMolSubstanceGroups(labeled)
    self.assertEqual(len(sgs), 2)
    # (label, atoms, bonds, attach point aIdx, attach point lvIdx)
    expected = (
      ("iPr", [1, 0, 2], [2], 1, 3),
      ("CF3", [4, 5, 6, 7], [3], 4, 3),
    )
    for idx, (label, atoms, bonds, aIdx, lvIdx) in enumerate(expected):
      sg = sgs[idx]
      self.assertEqual(sg.GetProp('TYPE'), "SUP")
      self.assertEqual(sg.GetProp('LABEL'), label)
      self.assertEqual(list(sg.GetAtoms()), atoms)
      self.assertEqual(list(sg.GetBonds()), bonds)
      aps = sg.GetAttachPoints()
      self.assertEqual(len(aps), 1)
      self.assertEqual(aps[0].aIdx, aIdx)
      self.assertEqual(aps[0].lvIdx, lvIdx)

  def testCondenseLinkers(self):
    """Condensing with linker definitions."""
    mol = Chem.MolFromSmiles('FCOCCOCCOCCCCCCCCCCl')
    self._condenseAndCheck(mol, self.defaultLinkers, 'FC**Cl |$;;PEG3;Hept;$|',
                           maxCoverage=1.0)

    mol = Chem.MolFromSmiles('COC1CCC(C)CC1')
    self._condenseAndCheck(mol, self.customLinkers, 'C*OC |$;Cy;;$|', maxCoverage=1.0)

  def testAbbreviationsAndLinkers(self):
    """Abbreviations and linkers applied one after the other, in both orders."""
    mol = Chem.MolFromSmiles('COC1CCC(C)CC1')
    # wouldn't normally do this in this order:
    step = self._condenseAndCheck(mol, self.defaultAbbrevs, '*C1CCC(C)CC1 |$OMe;;;;;;;$|',
                                  maxCoverage=1.0)
    self._condenseAndCheck(step, self.customLinkers, '**C |$OMe;Cy;$|', maxCoverage=1.0)

    # This is a more logical order
    step = self._condenseAndCheck(mol, self.customLinkers, 'C*OC |$;Cy;;$|', maxCoverage=1.0)
    self._condenseAndCheck(step, self.defaultAbbrevs, 'C*OC |$;Cy;;$|', maxCoverage=1.0)

  def testAbbreviationsSubstanceGroups(self):
    """A SUP SubstanceGroup read from a mol block can be condensed."""
    molBlock = '''
Mrv2014 09152006492D
0 0 0 0 0 999 V3000
M V30 BEGIN CTAB
M V30 COUNTS 7 7 1 0 0
M V30 BEGIN ATOM
M V30 1 C 5.25 -5.9858 0 0
M V30 2 C 4.48 -7.3196 0 0
M V30 3 C 6.02 -7.3196 0 0
M V30 4 F 8.6873 -8.8596 0 0
M V30 5 C 7.3537 -8.0896 0 0
M V30 6 F 6.02 -8.8596 0 0
M V30 7 F 7.3537 -6.5496 0 0
M V30 END ATOM
M V30 BEGIN BOND
M V30 1 1 1 2
M V30 2 1 3 1
M V30 3 1 2 3
M V30 4 1 3 5
M V30 5 1 4 5
M V30 6 1 5 6
M V30 7 1 5 7
M V30 END BOND
M V30 BEGIN SGROUP
M V30 1 SUP 0 ATOMS=(4 4 5 6 7) SAP=(3 5 3 1) XBONDS=(1 4) LABEL=CF3
M V30 END SGROUP
M V30 END CTAB
M END'''
    mol = Chem.MolFromMolBlock(molBlock)
    condensed = rdAbbreviations.CondenseAbbreviationSubstanceGroups(mol)
    condensed.RemoveAllConformers()  # avoid coords in CXSMILES
    self.assertEqual(Chem.MolToCXSmiles(condensed), '*C1CC1 |$CF3;;;$|')
if __name__ == '__main__': # pragma: nocover
unittest.main()

View File

@@ -0,0 +1,530 @@
//
// Copyright (C) 2020 Greg Landrum and T5 Informatics GmbH
//
// @@ All Rights Reserved @@
// This file is part of the RDKit.
// The contents are covered by the terms of the BSD license
// which is included in the file license.txt, found at the root
// of the RDKit source tree.
//
#include "catch.hpp"
#include "RDGeneral/test.h"
#include <GraphMol/RDKitBase.h>
#include <GraphMol/Abbreviations/Abbreviations.h>
#include <GraphMol/SmilesParse/SmilesWrite.h>
#include <GraphMol/SmilesParse/SmilesParse.h>
#include <GraphMol/FileParsers/SequenceParsers.h>
#include <GraphMol/FileParsers/FileParsers.h>
using namespace RDKit;
// Checks the built-in tables and the handling of unparsable definitions.
TEST_CASE("parsing") {
  SECTION("abbreviations") {
    const auto defs = Abbreviations::Utils::getDefaultAbbreviations();
    CHECK(defs.size() == 37);
    const auto &first = defs[0];
    CHECK(first.label == "CO2Et");
    CHECK(first.displayLabel == "CO<sub>2</sub>Et");
    CHECK(first.displayLabelW == "EtO<sub>2</sub>C");
    CHECK(first.smarts == "C(=O)OCC");
    REQUIRE(first.mol);
    // five atoms from the SMARTS plus the leading dummy
    CHECK(first.mol->getNumAtoms() == 6);
    unsigned int dummyCount = 0;
    CHECK(first.mol->getPropIfPresent(
        Abbreviations::common_properties::numDummies, dummyCount));
    CHECK(dummyCount == 1);
  }
  SECTION("linkers") {
    const auto defs = Abbreviations::Utils::getDefaultLinkers();
    CHECK(defs.size() == 8);
    const auto &first = defs[0];
    CHECK(first.label == "PEG6");
    CHECK(first.displayLabel == "PEG6");
    CHECK(first.displayLabelW.empty());
    CHECK(first.smarts == "*OCCOCCOCCOCCOCCOCC*");
    REQUIRE(first.mol);
    // the trailing dummy has been removed from the query
    CHECK(first.mol->getNumAtoms() == 19);
    unsigned int dummyCount = 0;
    CHECK(first.mol->getPropIfPresent(
        Abbreviations::common_properties::numDummies, dummyCount));
    CHECK(dummyCount == 1);
  }
  SECTION("bad SMILES in defintions") {
    // the middle definition cannot be parsed and is left out of the result
    const std::string text = R"ABBREVS(CO2Et C(=O)OCC
COOEt fail
OiBu OCC(C)C)ABBREVS";
    const auto defs = Abbreviations::Utils::parseAbbreviations(text);
    REQUIRE(defs.size() == 2);
    CHECK(defs[0].label == "CO2Et");
    CHECK(defs[1].label == "OiBu");
  }
}
// Match finding with the default abbreviations: coverage limit, several
// abbreviations in one molecule, and overlapping candidates.
TEST_CASE("findApplicableMatches") {
  auto abbrevs = Abbreviations::Utils::getDefaultAbbreviations();
  // shorthand for the call under test
  const auto findMatches = [&abbrevs](const ROMol &mol, double maxCoverage) {
    return Abbreviations::findApplicableAbbreviationMatches(mol, abbrevs,
                                                            maxCoverage);
  };
  SECTION("basics") {
    auto mol = "NCCC(F)(F)F"_smiles;
    REQUIRE(mol);
    {
      const auto found = findMatches(*mol, 0.4);
      CHECK(found.empty());
    }
    {
      const auto found = findMatches(*mol, 1.0);
      CHECK(found.size() == 1);
      CHECK(found[0].abbrev.label == "CF3");
      CHECK(found[0].match[0].second == 2);
      CHECK(found[0].match[1].second == 3);
    }
  }
  SECTION("multiple abbreviations") {
    {
      auto mol = "FC(F)(F)CC(=O)O"_smiles;
      REQUIRE(mol);
      const auto found = findMatches(*mol, 1.0);
      CHECK(found.size() == 2);
      CHECK(found[0].abbrev.label == "CF3");
      CHECK(found[1].abbrev.label == "CO2H");
    }
    {  // overlapping
      auto mol = "FC(F)(F)C(=O)O"_smiles;
      REQUIRE(mol);
      const auto found = findMatches(*mol, 1.0);
      CHECK(found.empty());
    }
    {  // overlapping
      auto mol = "FC(F)(F)C(F)(F)F"_smiles;
      REQUIRE(mol);
      const auto found = findMatches(*mol, 1.0);
      CHECK(found.empty());
    }
    {  // overlapping, one is too big, so there is an abbreviation for the other
      auto mol = "CCC(F)(F)F"_smiles;
      REQUIRE(mol);
      const auto limited = findMatches(*mol, 0.4);
      CHECK(limited.size() == 1);
      CHECK(limited[0].abbrev.label == "Et");
      // remove the size constraint and there's no abbreviation:
      const auto unlimited = findMatches(*mol, 1.0);
      CHECK(unlimited.empty());
    }
  }
}
// Exercises findApplicableAbbreviationMatches() with the default linker
// definitions, including two linkers that are bonded directly to each other.
TEST_CASE("findApplicableMatches linkers") {
  auto linkers = Abbreviations::Utils::getDefaultLinkers();
  SECTION("basics") {
    {
      auto m = "FCOCCOCCOCCNCCCCCCCCl"_smiles;
      REQUIRE(m);
      double maxCoverage = 1.0;
      auto matches = Abbreviations::findApplicableAbbreviationMatches(
          *m, linkers, maxCoverage);
      // REQUIRE rather than CHECK: both entries are indexed right below
      REQUIRE(matches.size() == 2);
      CHECK(matches[0].abbrev.label == "PEG3");
      CHECK(matches[1].abbrev.label == "Hept");
    }
    {  // directly connected
      auto m = "FCOCCOCCOCCCCCCCCCCl"_smiles;
      REQUIRE(m);
      double maxCoverage = 1.0;
      auto matches = Abbreviations::findApplicableAbbreviationMatches(
          *m, linkers, maxCoverage);
      REQUIRE(matches.size() == 2);
      CHECK(matches[0].abbrev.label == "PEG3");
      CHECK(matches[1].abbrev.label == "Hept");
      // guard the atom-mapping vectors before reading fixed positions
      REQUIRE(matches[0].match.size() > 9);
      REQUIRE(!matches[1].match.empty());
      // both mappings are expected to reference molecule atom 10
      CHECK(matches[0].match[9].second == 10);
      CHECK(matches[1].match[0].second == 10);
    }
  }
}
// Finds the default-abbreviation matches in a small molecule, applies them,
// and checks the condensed result.
TEST_CASE("applyMatches") {
  auto abbrevs = Abbreviations::Utils::getDefaultAbbreviations();
  SECTION("basics") {
    const double fullCoverage = 1.0;
    auto mol = "FC(F)(F)CC(=O)O"_smiles;
    REQUIRE(mol);
    auto found = Abbreviations::findApplicableAbbreviationMatches(
        *mol, abbrevs, fullCoverage);
    CHECK(found.size() == 2);
    Abbreviations::applyMatches(*mol, found);
    // the CF3 and CO2H groups each collapse to one labelled dummy atom
    CHECK(mol->getNumAtoms() == 3);
    CHECK(MolToCXSmiles(*mol) == "*C* |$CF3;;CO2H$|");
  }
}
// Applies matches found with a small custom linker set (PEG3, Pent, Cy) and
// checks the condensed molecules.
TEST_CASE("applyMatches linkers") {
  auto linkers =
      Abbreviations::Utils::parseLinkers(R"ABBREV(PEG3 *OCCOCCOCC* PEG3
Pent *CCCCC*
Cy *C1CCC(*)CC1 Cy)ABBREV");
  SECTION("basics") {
    const double fullCoverage = 1.0;
    {
      // chain containing both a PEG3 and a Pent linker
      auto chain = "FCOCCOCCOCCCCCCCCl"_smiles;
      REQUIRE(chain);
      auto found = Abbreviations::findApplicableAbbreviationMatches(
          *chain, linkers, fullCoverage);
      CHECK(found.size() == 2);
      Abbreviations::applyMatches(*chain, found);
      CHECK(chain->getNumAtoms() == 5);
      CHECK(MolToCXSmiles(*chain) == "FC**Cl |$;;PEG3;Pent;$|");
    }
    {
      // ring linker
      auto ring = "COC1CCC(C)CC1"_smiles;
      REQUIRE(ring);
      auto found = Abbreviations::findApplicableAbbreviationMatches(
          *ring, linkers, fullCoverage);
      CHECK(found.size() == 1);
      Abbreviations::applyMatches(*ring, found);
      CHECK(ring->getNumAtoms() == 4);
      CHECK(MolToCXSmiles(*ring) == "C*OC |$;Cy;;$|");
    }
  }
}
// One-step variant: condenseMolAbbreviations() with the default
// abbreviations should give the expected CXSMILES directly.
TEST_CASE("condense abbreviations") {
  auto abbrevs = Abbreviations::Utils::getDefaultAbbreviations();
  SECTION("basics") {
    const double fullCoverage = 1.0;
    auto mol = "FC(F)(F)CC(=O)O"_smiles;
    REQUIRE(mol);
    Abbreviations::condenseMolAbbreviations(*mol, abbrevs, fullCoverage);
    CHECK(MolToCXSmiles(*mol) == "*C* |$CF3;;CO2H$|");
  }
}
// condenseMolAbbreviations() with linker definitions: the default linkers,
// and a custom table that adds a ring linker and amino-acid residues.
TEST_CASE("condense abbreviations linkers") {
  auto linkers = Abbreviations::Utils::getDefaultLinkers();
  auto customLinkers =
      Abbreviations::Utils::parseLinkers(R"ABBREV(PEG3 *OCCOCCOCC* PEG3
Pent *CCCCC*
Cy *C1CCC(*)CC1 Cy
ala *N[C@@H](C)C(=O)* ala
arg *N[C@@H](CCCNC(N)=[NH])C(=O)* arg
asn *N[C@@H](CC(N)=O)C(=O)* asn
asp *N[C@@H](CC(O)=O)C(=O)* asp
cys *N[C@@H](CS)C(=O)* cys
gln *N[C@@H](CCC(N)=O)C(=O)* gln
glu *N[C@@H](CCC(O)=O)C(=O)* glu
gly *NCC(=O)* gly
his *N[C@@H](Cc1c[nH]cn1)C(=O)* his
ile *N[C@@H](C(C)CC)C(=O)* ile
leu *N[C@@H](CC(C)C)C(=O)* leu
lys *N[C@@H](CCCCN)C(=O)* lys
met *N[C@@H](CCSC)C(=O)* met
phe *N[C@@H](Cc1ccccc1)C(=O)* phe
pro *N1[C@@H](CCC1)C(=O)* pro
ser *N[C@@H](CO)C(=O)* ser
thr *N[C@@H](C(O)C)C(=O)* thr
trp *N[C@@H](Cc1c[nH]c2ccccc21)C(=O)* trp
tyr *N[C@@H](Cc1ccc(O)cc1)C(=O)* tyr
val *N[C@@H](C(C)C)C(=O)* val)ABBREV");
  SECTION("basics") {
    const double fullCoverage = 1.0;
    {
      // default linkers on a PEG/alkyl chain
      auto chain = "FCOCCOCCOCCCCCCCCCCl"_smiles;
      REQUIRE(chain);
      Abbreviations::condenseMolAbbreviations(*chain, linkers, fullCoverage);
      CHECK(chain->getNumAtoms() == 5);
      CHECK(MolToCXSmiles(*chain) == "FC**Cl |$;;PEG3;Hept;$|");
    }
    {
      // custom table on a ring system
      auto ring = "COC1CCC(C)CC1"_smiles;
      REQUIRE(ring);
      Abbreviations::condenseMolAbbreviations(*ring, customLinkers,
                                              fullCoverage);
      CHECK(ring->getNumAtoms() == 4);
      CHECK(MolToCXSmiles(*ring) == "C*OC |$;Cy;;$|");
    }
  }
  SECTION("peptides") {
    // build a peptide from its sequence and condense the residues
    const double fullCoverage = 1.0;
    std::unique_ptr<RWMol> peptide{SequenceToMol("GYTKC")};
    REQUIRE(peptide);
    Abbreviations::condenseMolAbbreviations(*peptide, customLinkers,
                                            fullCoverage);
    CHECK(MolToCXSmiles(*peptide) == "NCC(=O)****O |$;;;;tyr;thr;lys;cys;$|");
  }
}
// Condenses the same molecule with abbreviations and linkers in both
// possible orders and checks the intermediate and final results.
TEST_CASE("abbreviations and linkers") {
  auto abbrevs = Abbreviations::Utils::getDefaultAbbreviations();
  auto linkers = Abbreviations::Utils::parseLinkers(
      R"ABBREV(Cy *C1CCC(*)CC1 Cy)ABBREV");
  SECTION("basics") {
    const double fullCoverage = 1.0;
    {  // this isn't the order we'd normally do this in:
      auto mol = "COC1CCC(C)CC1"_smiles;
      REQUIRE(mol);
      // abbreviations first ...
      Abbreviations::condenseMolAbbreviations(*mol, abbrevs, fullCoverage);
      CHECK(mol->getNumAtoms() == 8);
      CHECK(MolToCXSmiles(*mol) == "*C1CCC(C)CC1 |$OMe;;;;;;;$|");
      // ... then linkers
      Abbreviations::condenseMolAbbreviations(*mol, linkers, fullCoverage);
      CHECK(mol->getNumAtoms() == 3);
      CHECK(MolToCXSmiles(*mol) == "**C |$OMe;Cy;$|");
    }
    {  // a more sensible order
      auto mol = "COC1CCC(C)CC1"_smiles;
      REQUIRE(mol);
      // linkers first ...
      Abbreviations::condenseMolAbbreviations(*mol, linkers, fullCoverage);
      CHECK(mol->getNumAtoms() == 4);
      CHECK(MolToCXSmiles(*mol) == "C*OC |$;Cy;;$|");
      // ... and the abbreviation pass then leaves the molecule unchanged
      Abbreviations::condenseMolAbbreviations(*mol, abbrevs, fullCoverage);
      CHECK(mol->getNumAtoms() == 4);
      CHECK(MolToCXSmiles(*mol) == "C*OC |$;Cy;;$|");
    }
  }
}
// labelMatches() keeps all atoms and records each match as a SUP substance
// group; this checks the groups' type, label, atoms, bonds and attach points.
TEST_CASE("labelMatches") {
  auto abbrevs = Abbreviations::Utils::getDefaultAbbreviations();
  SECTION("basics") {
    {
      auto m = "CC(C)CC(F)(F)F"_smiles;
      REQUIRE(m);
      double maxCoverage = 1.0;
      auto matches = Abbreviations::findApplicableAbbreviationMatches(
          *m, abbrevs, maxCoverage);
      CHECK(matches.size() == 2);
      Abbreviations::labelMatches(*m, matches);
      // labelling must not remove any atoms
      CHECK(m->getNumAtoms() == 8);
      const auto &sgs = getSubstanceGroups(*m);
      REQUIRE(sgs.size() == 2);
      CHECK(sgs[0].getProp<std::string>("TYPE") == "SUP");
      CHECK(sgs[0].getProp<std::string>("LABEL") == "iPr");
      CHECK(sgs[0].getBonds() == std::vector<unsigned int>({2}));
      CHECK(sgs[0].getAtoms() == std::vector<unsigned int>({1, 0, 2}));
      // REQUIRE rather than CHECK: the first attach point is indexed below
      REQUIRE(sgs[0].getAttachPoints().size() == 1);
      CHECK(sgs[0].getAttachPoints()[0].aIdx == 1);
      CHECK(sgs[0].getAttachPoints()[0].lvIdx == 3);
      CHECK(sgs[1].getProp<std::string>("TYPE") == "SUP");
      CHECK(sgs[1].getProp<std::string>("LABEL") == "CF3");
      CHECK(sgs[1].getBonds() == std::vector<unsigned int>({3}));
      CHECK(sgs[1].getAtoms() == std::vector<unsigned int>({4, 5, 6, 7}));
      REQUIRE(sgs[1].getAttachPoints().size() == 1);
      CHECK(sgs[1].getAttachPoints()[0].aIdx == 4);
      CHECK(sgs[1].getAttachPoints()[0].lvIdx == 3);
    }
  }
}
// One-step variant of the labelling test: labelMolAbbreviations() should
// leave the atoms in place and add one SUP substance group per abbreviation.
TEST_CASE("labelMolAbbreviations") {
  auto abbrevs = Abbreviations::Utils::getDefaultAbbreviations();
  SECTION("basics") {
    {
      auto m = "CC(C)CC(F)(F)F"_smiles;
      REQUIRE(m);
      double maxCoverage = 1.0;
      Abbreviations::labelMolAbbreviations(*m, abbrevs, maxCoverage);
      // labelling must not remove any atoms
      CHECK(m->getNumAtoms() == 8);
      const auto &sgs = getSubstanceGroups(*m);
      REQUIRE(sgs.size() == 2);
      CHECK(sgs[0].getProp<std::string>("TYPE") == "SUP");
      CHECK(sgs[0].getProp<std::string>("LABEL") == "iPr");
      CHECK(sgs[0].getBonds() == std::vector<unsigned int>({2}));
      CHECK(sgs[0].getAtoms() == std::vector<unsigned int>({1, 0, 2}));
      // REQUIRE rather than CHECK: the first attach point is indexed below
      REQUIRE(sgs[0].getAttachPoints().size() == 1);
      CHECK(sgs[0].getAttachPoints()[0].aIdx == 1);
      CHECK(sgs[0].getAttachPoints()[0].lvIdx == 3);
      CHECK(sgs[1].getProp<std::string>("TYPE") == "SUP");
      CHECK(sgs[1].getProp<std::string>("LABEL") == "CF3");
      CHECK(sgs[1].getBonds() == std::vector<unsigned int>({3}));
      CHECK(sgs[1].getAtoms() == std::vector<unsigned int>({4, 5, 6, 7}));
      REQUIRE(sgs[1].getAttachPoints().size() == 1);
      CHECK(sgs[1].getAttachPoints()[0].aIdx == 4);
      CHECK(sgs[1].getAttachPoints()[0].lvIdx == 3);
    }
  }
}
// Checks condenseAbbreviationSubstanceGroups(): each section parses a V3000
// CTAB that already carries SUP substance groups, condenses them, and compares
// the atom count and CXSMILES of the result. The fixtures come in pairs: one
// with an "ACCLDraw" header line (groups described with CSTATE) and one with a
// "Mrv2014" header line (groups described with SAP).
TEST_CASE("condenseAbbreviationSubstanceGroups") {
// two single-attachment groups (CO2H and CF3) on a three-membered ring
SECTION("abbreviations") {
auto m = R"CTAB(
ACCLDraw09152005292D
0 0 0 0 0 999 V3000
M V30 BEGIN CTAB
M V30 COUNTS 10 10 2 0 0
M V30 BEGIN ATOM
M V30 1 C 12.8333 -9.32 0 0 CFG=3
M V30 2 C 13.8565 -8.7293 0 0
M V30 3 O 14.8802 -9.3201 0 0
M V30 4 O 13.8565 -7.5471 0 0
M V30 5 C 11.6489 -9.32 0 0
M V30 6 C 12.241 -10.3432 0 0 CFG=3
M V30 7 C 12.241 -11.5253 0 0 CFG=3
M V30 8 F 12.241 -12.5874 0 0
M V30 9 F 11.0366 -11.5253 0 0
M V30 10 F 13.4231 -11.5253 0 0
M V30 END ATOM
M V30 BEGIN BOND
M V30 1 2 2 4
M V30 2 1 2 3
M V30 3 1 1 2
M V30 4 1 5 6
M V30 5 1 5 1
M V30 6 1 1 6
M V30 7 1 7 10
M V30 8 1 7 9
M V30 9 1 7 8
M V30 10 1 6 7
M V30 END BOND
M V30 BEGIN SGROUP
M V30 1 SUP 1 ATOMS=(3 2 3 4) XBONDS=(1 3) CSTATE=(4 3 -1.02 -0.59 0) LABEL=-
M V30 CO2H
M V30 2 SUP 2 ATOMS=(4 7 8 9 10) XBONDS=(1 10) CSTATE=(4 10 0 1.18 0) LABEL=-
M V30 CF3
M V30 END SGROUP
M V30 END CTAB
M END)CTAB"_ctab;
REQUIRE(m);
CHECK(m->getNumAtoms() == 10);
Abbreviations::condenseAbbreviationSubstanceGroups(*m);
// 10 atoms -> 5: each of the two groups is replaced by a single atom
CHECK(m->getNumAtoms() == 5);
// remove the conformer before generating CXSMILES
m->clearConformers();
CHECK(MolToCXSmiles(*m) == "*C1CC1* |$CO2H;;;;CF3$|");
}
// a single CF3 group, Mrv2014-style fixture
SECTION("abbreviations MRV") {
auto m = R"CTAB(
Mrv2014 09152006492D
0 0 0 0 0 999 V3000
M V30 BEGIN CTAB
M V30 COUNTS 7 7 1 0 0
M V30 BEGIN ATOM
M V30 1 C 5.25 -5.9858 0 0
M V30 2 C 4.48 -7.3196 0 0
M V30 3 C 6.02 -7.3196 0 0
M V30 4 F 8.6873 -8.8596 0 0
M V30 5 C 7.3537 -8.0896 0 0
M V30 6 F 6.02 -8.8596 0 0
M V30 7 F 7.3537 -6.5496 0 0
M V30 END ATOM
M V30 BEGIN BOND
M V30 1 1 1 2
M V30 2 1 3 1
M V30 3 1 2 3
M V30 4 1 3 5
M V30 5 1 4 5
M V30 6 1 5 6
M V30 7 1 5 7
M V30 END BOND
M V30 BEGIN SGROUP
M V30 1 SUP 0 ATOMS=(4 4 5 6 7) SAP=(3 5 3 1) XBONDS=(1 4) LABEL=CF3
M V30 END SGROUP
M V30 END CTAB
M END
)CTAB"_ctab;
REQUIRE(m);
CHECK(m->getNumAtoms() == 7);
Abbreviations::condenseAbbreviationSubstanceGroups(*m);
CHECK(m->getNumAtoms() == 4);
// remove the conformer before generating CXSMILES
m->clearConformers();
CHECK(MolToCXSmiles(*m) == "*C1CC1 |$CF3;;;$|");
}
// a PEG2 group with two crossing bonds, i.e. a linker between two carbons
SECTION("linker") {
auto m = R"CTAB(
ACCLDraw09152006102D
0 0 0 0 0 999 V3000
M V30 BEGIN CTAB
M V30 COUNTS 8 7 1 0 0
M V30 BEGIN ATOM
M V30 1 C 7.2482 -5.1911 0 0
M V30 2 O 5.8143 -6.2327 0 0
M V30 3 C 6.77 -5.5382 0 0
M V30 4 C 7.8494 -6.0186 0 0
M V30 5 O 8.8052 -5.3241 0 0
M V30 6 C 9.8845 -5.8046 0 0
M V30 7 C 10.8403 -5.1101 0 0
M V30 8 C 9.4066 -6.1518 0 0
M V30 END ATOM
M V30 BEGIN BOND
M V30 1 1 1 2
M V30 2 1 2 3
M V30 3 1 3 4
M V30 4 1 4 5
M V30 5 1 5 6
M V30 6 1 6 7
M V30 7 1 7 8
M V30 END BOND
M V30 BEGIN SGROUP
M V30 1 SUP 1 ATOMS=(6 2 3 4 5 6 7) XBONDS=(2 1 7) CSTATE=(4 1 -1.08 0.48 0) -
M V30 CSTATE=(4 7 1.08 -0.48 0) LABEL=PEG2
M V30 END SGROUP
M V30 END CTAB
M END)CTAB"_ctab;
REQUIRE(m);
CHECK(m->getNumAtoms() == 8);
Abbreviations::condenseAbbreviationSubstanceGroups(*m);
CHECK(m->getNumAtoms() == 3);
// remove the conformer before generating CXSMILES
m->clearConformers();
CHECK(MolToCXSmiles(*m) == "C*C |$;PEG2;$|");
}
// the same PEG2 linker, Mrv2014-style fixture with two SAP entries
SECTION("linker MRV") {
auto m = R"CTAB(
Mrv2014 09152006522D
0 0 0 0 0 999 V3000
M V30 BEGIN CTAB
M V30 COUNTS 8 7 1 0 0
M V30 BEGIN ATOM
M V30 1 C 1.625 -8.9167 0 0
M V30 2 O 2.9587 -8.1467 0 0
M V30 3 C 4.2924 -8.9167 0 0
M V30 4 C 5.626 -8.1467 0 0
M V30 5 O 6.9597 -8.9167 0 0
M V30 6 C 8.2934 -8.1467 0 0
M V30 7 C 9.6271 -8.9167 0 0
M V30 8 C 10.9608 -8.1467 0 0
M V30 END ATOM
M V30 BEGIN BOND
M V30 1 1 1 2
M V30 2 1 2 3
M V30 3 1 3 4
M V30 4 1 4 5
M V30 5 1 5 6
M V30 6 1 6 7
M V30 7 1 7 8
M V30 END BOND
M V30 BEGIN SGROUP
M V30 1 SUP 0 ATOMS=(6 2 3 4 5 6 7) SAP=(3 2 1 1) SAP=(3 7 8 2) XBONDS=(2 1 -
M V30 7) LABEL=PEG2 ESTATE=E
M V30 END SGROUP
M V30 END CTAB
M END
)CTAB"_ctab;
REQUIRE(m);
CHECK(m->getNumAtoms() == 8);
Abbreviations::condenseAbbreviationSubstanceGroups(*m);
CHECK(m->getNumAtoms() == 3);
// remove the conformer before generating CXSMILES
m->clearConformers();
CHECK(MolToCXSmiles(*m) == "C*C |$;PEG2;$|");
}
}

View File

@@ -109,6 +109,7 @@ endif()
add_subdirectory(MolStandardize)
add_subdirectory(ScaffoldNetwork)
add_subdirectory(MolEnumerator)
add_subdirectory(Abbreviations)
rdkit_test(graphmolTest1 test1.cpp LINK_LIBRARIES FileParsers SmilesParse GraphMol

View File

@@ -3018,13 +3018,14 @@ void MolDraw2D::adjustBondEndForLabel(int atnum, const Point2D &nbr_cds,
pair<string, OrientType> MolDraw2D::getAtomSymbolAndOrientation(
const Atom &atom) const {
OrientType orient = getAtomOrientation(atom);
string symbol = getAtomSymbol(atom);
string symbol = getAtomSymbol(atom, orient);
return std::make_pair(symbol, orient);
}
// ****************************************************************************
string MolDraw2D::getAtomSymbol(const RDKit::Atom &atom) const {
string MolDraw2D::getAtomSymbol(const RDKit::Atom &atom,
OrientType orientation) const {
// adds XML-like annotation for super- and sub-script, in the same manner
// as MolDrawing.py. My first thought was for a LaTeX-like system,
// obviously...
@@ -3037,6 +3038,25 @@ string MolDraw2D::getAtomSymbol(const RDKit::Atom &atom) const {
// specified labels are trump: no matter what else happens we will show
// them.
symbol = drawOptions().atomLabels.find(atom.getIdx())->second;
} else if (atom.hasProp(common_properties::_displayLabel) ||
atom.hasProp(common_properties::_displayLabelW)) {
// logic here: if either _displayLabel or _displayLabelW is set, we will
// definitely use one of those. If only one is set, we'll use that one. If
// both are set and the orientation is W then we'll use _displayLabelW,
// otherwise _displayLabel.
std::string lbl;
std::string lblw;
atom.getPropIfPresent(common_properties::_displayLabel, lbl);
atom.getPropIfPresent(common_properties::_displayLabelW, lblw);
if (lbl.empty()) {
lbl = lblw;
}
if (orientation == OrientType::W && !lblw.empty()) {
symbol = lblw;
} else {
symbol = lbl;
}
} else if (atom.hasProp(common_properties::atomLabel)) {
symbol = atom.getProp<std::string>(common_properties::atomLabel);
} else if (drawOptions().dummiesAreAttachments && atom.getAtomicNum() == 0 &&
@@ -3120,7 +3140,7 @@ string MolDraw2D::getAtomSymbol(const RDKit::Atom &atom) const {
}
// cout << "Atom symbol " << atom.getIdx() << " : " << symbol << endl;
return symbol;
}
} // namespace RDKit
// ****************************************************************************
OrientType MolDraw2D::getAtomOrientation(const RDKit::Atom &atom) const {

View File

@@ -151,7 +151,7 @@ typedef std::vector<unsigned int> DashPattern;
inline void assignDefaultPalette(ColourPalette &palette) {
palette.clear();
palette[-1] = DrawColour(0, 0, 0);
palette[0] = DrawColour(0.5, 0.5, 0.5);
palette[0] = DrawColour(0.1, 0.1, 0.1);
palette[1] = palette[6] = DrawColour(0.0, 0.0, 0.0);
palette[7] = DrawColour(0.0, 0.0, 1.0);
palette[8] = DrawColour(1.0, 0.0, 0.0);
@@ -782,7 +782,7 @@ class RDKIT_MOLDRAW2D_EXPORT MolDraw2D {
// adds LaTeX-like annotation for super- and sub-script.
std::pair<std::string, OrientType> getAtomSymbolAndOrientation(
const Atom &atom) const;
std::string getAtomSymbol(const Atom &atom) const;
std::string getAtomSymbol(const Atom &atom, OrientType orientation) const;
OrientType getAtomOrientation(const Atom &atom) const;
// things used by calculateScale.

View File

@@ -928,7 +928,8 @@ std::string get_coords_block(const ROMol &mol,
std::string get_atom_props_block(const ROMol &mol,
const std::vector<unsigned int> &atomOrder) {
std::vector<std::string> skip = {common_properties::atomLabel,
common_properties::molFileValue};
common_properties::molFileValue,
common_properties::molParity};
std::string res = "";
unsigned int which = 0;
for (auto idx : atomOrder) {

View File

@@ -0,0 +1,19 @@
/*
*
* Copyright (c) 2020, Greg Landrum and T5 Informatics GmbH
* All rights reserved.
*
* This file is part of the RDKit.
* The contents are covered by the terms of the BSD license
* which is included in the file license.txt, found at the root
* of the RDKit source tree.
*
*/
%{
#include <GraphMol/Abbreviations/Abbreviations.h>
%}
%template(AbbreviationDefinition_Vect) std::vector<RDKit::Abbreviations::AbbreviationDefinition>;
%template(AbbreviationMatch_Vect) std::vector<RDKit::Abbreviations::AbbreviationMatch>;
%include <GraphMol/Abbreviations/Abbreviations.h>

View File

@@ -20,7 +20,7 @@ if(RDK_BUILD_INCHI_SUPPORT)
set(swigRDKitLibList "${swigRDKitLibList}RDInchiLib;${INCHI_LIBRARIES};")
endif(RDK_BUILD_INCHI_SUPPORT)
set(swigRDKitLibList "${swigRDKitLibList}"
"ScaffoldNetwork;MolHash;RGroupDecomposition;SubstructLibrary;TautomerQuery;"
"Abbreviations;ScaffoldNetwork;MolHash;RGroupDecomposition;SubstructLibrary;TautomerQuery;"
"MolEnumerator;"
"MolStandardize;FilterCatalog;Catalogs;FMCS;MolDraw2D;FileParsers;SmilesParse;"
"Depictor;SubstructMatch;ChemReactions;Fingerprints;ChemTransforms;"

View File

@@ -256,6 +256,7 @@ typedef unsigned long long int uintmax_t;
%include "../TautomerQuery.i"
%include "../SubstanceGroup.i"
%include "../MolHash.i"
%include "../Abbreviations.i"
%include "../Streams.i"

View File

@@ -374,6 +374,11 @@ ADD_TEST(JavaMolHashTest
-cp "${JUNIT_JAR}${PATH_SEP}${CMAKE_JAVA_TEST_OUTDIR}${PATH_SEP}${CMAKE_CURRENT_SOURCE_DIR}/org.RDKit.jar"
org.RDKit.MolHashTest)
ADD_TEST(JavaAbbreviationsTests
java -Djava.library.path=${CMAKE_CURRENT_SOURCE_DIR}
-cp "${JUNIT_JAR}${PATH_SEP}${CMAKE_JAVA_TEST_OUTDIR}${PATH_SEP}${CMAKE_CURRENT_SOURCE_DIR}/org.RDKit.jar"
org.RDKit.AbbreviationsTests)
ADD_TEST(JavaDiversityPickerTests
java -Djava.library.path=${CMAKE_CURRENT_SOURCE_DIR}
-cp "${JUNIT_JAR}${PATH_SEP}${CMAKE_JAVA_TEST_OUTDIR}${PATH_SEP}${CMAKE_CURRENT_SOURCE_DIR}/org.RDKit.jar"

View File

@@ -238,6 +238,7 @@ typedef unsigned long long int uintmax_t;
%include "../SubstanceGroup.i"
%include "../MolEnumerator.i"
%include "../MolHash.i"
%include "../Abbreviations.i"
%include "../Streams.i"
// Create a class to throw various sorts of errors for testing. Required for unit tests in ErrorHandlingTests.java

View File

@@ -0,0 +1,72 @@
/*
*
* Copyright (c) 2019 Greg Landrum and T5 Informatics GmbH
* All rights reserved.
*
* This file is part of the RDKit.
* The contents are covered by the terms of the BSD license
* which is included in the file license.txt, found at the root
* of the RDKit source tree.
*/
package org.RDKit;
import static org.junit.Assert.*;
import org.junit.*;
/**
 * Tests for the abbreviation functions exposed through {@code RDKFuncs}:
 * condensing with the default abbreviations and linkers, and the two-step
 * find-matches / apply-matches workflow.
 *
 * All assertions use JUnit's {@code assertEquals(expected, actual)} argument
 * order so that failure messages report the values the right way round.
 */
public class AbbreviationsTests extends GraphMolTest {
    @Before public void setUp() {
    }

    /** Condenses a CF3 group using the default abbreviations. */
    @Test
    public void test1Basics() {
        AbbreviationDefinition_Vect abbrevs = RDKFuncs.getDefaultAbbreviations();
        RWMol mol = RWMol.MolFromSmiles("C1CCC1C(F)(F)F");
        assertEquals(8, mol.getNumAtoms());
        RDKFuncs.condenseMolAbbreviations(mol, abbrevs);
        // no changes here due to the threshold
        assertEquals(8, mol.getNumAtoms());
        RDKFuncs.condenseMolAbbreviations(mol, abbrevs, 1.0);
        assertEquals(5, mol.getNumAtoms());
        assertEquals("*C1CCC1 |$CF3;;;;$|", RDKFuncs.MolToCXSmiles(mol));
    }

    /** Condenses a PEG4 chain using the default linkers. */
    @Test
    public void test2LinkerBasics() {
        AbbreviationDefinition_Vect abbrevs = RDKFuncs.getDefaultLinkers();
        RWMol mol = RWMol.MolFromSmiles("COCCOCCOCCOCCCl");
        assertEquals(14, mol.getNumAtoms());
        RDKFuncs.condenseMolAbbreviations(mol, abbrevs);
        // no changes here due to the threshold
        assertEquals(14, mol.getNumAtoms());
        RDKFuncs.condenseMolAbbreviations(mol, abbrevs, 1.0);
        assertEquals(3, mol.getNumAtoms());
        assertEquals("C*Cl |$;PEG4;$|", RDKFuncs.MolToCXSmiles(mol));
    }

    /** Finds the applicable matches first, then applies them explicitly. */
    @Test
    public void test3Matching() {
        AbbreviationDefinition_Vect abbrevs = RDKFuncs.getDefaultAbbreviations();
        RWMol mol = RWMol.MolFromSmiles("C1CCC1C(F)(F)F");
        assertEquals(8, mol.getNumAtoms());
        AbbreviationMatch_Vect matches = RDKFuncs.findApplicableAbbreviationMatches(mol, abbrevs, 1.0);
        assertEquals(1, matches.size());
        assertEquals("CF3", matches.get(0).getAbbrev().getLabel());
        RDKFuncs.applyMatches(mol, matches);
        assertEquals(5, mol.getNumAtoms());
        assertEquals("*C1CCC1 |$CF3;;;;$|", RDKFuncs.MolToCXSmiles(mol));
    }

    public static void main(String args[]) {
        org.junit.runner.JUnitCore.main("org.RDKit.AbbreviationsTests");
    }
}

View File

@@ -9,7 +9,7 @@ if(RDK_BUILD_FREETYPE_SUPPORT)
endif()
endif()
add_executable(RDKit_minimal jswrapper.cpp minilib.cpp)
target_link_libraries(RDKit_minimal CIPLabeler_static MolDraw2D_static Depictor_static RDInchiLib_static SubstructMatch_static FileParsers_static
target_link_libraries(RDKit_minimal Abbreviations_static CIPLabeler_static MolDraw2D_static Depictor_static RDInchiLib_static SubstructMatch_static FileParsers_static
SmilesParse_static GraphMol_static RDGeometryLib_static RDGeneral_static)
set_target_properties(RDKit_minimal PROPERTIES LINK_FLAGS "--bind")

View File

@@ -90,7 +90,9 @@ EMSCRIPTEN_BINDINGS(RDKit_minimal) {
class_<JSMol>("Mol")
.function("is_valid", &JSMol::is_valid)
.function("get_smiles", &JSMol::get_smiles)
.function("get_cxsmiles", &JSMol::get_cxsmiles)
.function("get_molblock", &JSMol::get_molblock)
.function("get_v3Kmolblock", &JSMol::get_v3Kmolblock)
.function("get_inchi", &JSMol::get_inchi)
.function("get_svg",
select_overload<std::string() const>(&JSMol::get_svg))
@@ -122,6 +124,11 @@ EMSCRIPTEN_BINDINGS(RDKit_minimal) {
select_overload<std::string() const>(&JSMol::get_new_coords))
.function("get_new_coords", select_overload<std::string(bool) const>(
&JSMol::get_new_coords))
.function("condense_abbreviations",
select_overload<std::string()>(&JSMol::condense_abbreviations))
.function("condense_abbreviations",
select_overload<std::string(double, bool)>(
&JSMol::condense_abbreviations))
.function("add_hs", &JSMol::add_hs)
.function("remove_hs", &JSMol::remove_hs);

View File

@@ -27,6 +27,7 @@
#include <GraphMol/Fingerprints/MorganFingerprints.h>
#include <GraphMol/Depictor/RDDepictor.h>
#include <GraphMol/CIPLabeler/CIPLabeler.h>
#include <GraphMol/Abbreviations/Abbreviations.h>
#include <DataStructs/BitOps.h>
#include <INCHI-API/inchi.h>
@@ -105,7 +106,7 @@ std::string process_details(const std::string &details, unsigned int &width,
}
namespace {
ROMol *mol_from_input(const std::string &input) {
RWMol *mol_from_input(const std::string &input) {
RWMol *res = nullptr;
if (input.find("M END") != std::string::npos) {
bool sanitize = false;
@@ -127,7 +128,7 @@ ROMol *mol_from_input(const std::string &input) {
return res;
}
ROMol *qmol_from_input(const std::string &input) {
RWMol *qmol_from_input(const std::string &input) {
RWMol *res = nullptr;
if (input.find("M END") != std::string::npos) {
bool sanitize = false;
@@ -169,6 +170,10 @@ std::string JSMol::get_smiles() const {
if (!d_mol) return "";
return MolToSmiles(*d_mol);
}
// Returns the CXSMILES of the wrapped molecule, or an empty string when no
// molecule is held.
std::string JSMol::get_cxsmiles() const {
  if (d_mol) {
    return MolToCXSmiles(*d_mol);
  }
  return "";
}
std::string JSMol::get_svg(unsigned int w, unsigned int h) const {
if (!d_mol) return "";
return svg_(*d_mol, w, h);
@@ -190,6 +195,10 @@ std::string JSMol::get_molblock() const {
if (!d_mol) return "";
return MolToMolBlock(*d_mol);
}
// Returns the V3000 mol block of the wrapped molecule, or an empty string
// when no molecule is held.
std::string JSMol::get_v3Kmolblock() const {
  if (d_mol) {
    return MolToV3KMolBlock(*d_mol);
  }
  return "";
}
namespace {
void get_sss_json(const ROMol *d_mol, const ROMol *q_mol,
@@ -413,17 +422,49 @@ std::string JSMol::add_hs() const {
return MolToMolBlock(molCopy, includeStereo, confId, kekulize);
}
// Condenses abbreviations in the wrapped molecule in place, using either the
// default linker definitions (useLinkers == true) or the default abbreviation
// definitions. Always returns an empty string; nothing is done when no
// molecule is held.
std::string JSMol::condense_abbreviations(double maxCoverage, bool useLinkers) {
  if (!d_mol) {
    return "";
  }
  if (useLinkers) {
    Abbreviations::condenseMolAbbreviations(
        *d_mol, Abbreviations::Utils::getDefaultLinkers(), maxCoverage);
  } else {
    Abbreviations::condenseMolAbbreviations(
        *d_mol, Abbreviations::Utils::getDefaultAbbreviations(), maxCoverage);
  }
  return "";
}
// Condenses abbreviations in the wrapped molecule in place using
// caller-supplied definitions.
//
// \param definitions  text block of abbreviation definitions to parse
// \param maxCoverage  forwarded to Abbreviations::condenseMolAbbreviations()
// \param areLinkers   if true the definitions are parsed with
//                     removeExtraDummies and allowConnectionToDummies set
//
// \return "" on success (or when no molecule is held), or
//         "cannot parse abbreviations" if the definitions cannot be parsed.
//
// The most recently parsed definitions are cached in function-local statics
// so repeated calls with the same text do not re-parse it.
std::string JSMol::condense_abbreviations_from_defs(
    const std::string &definitions, double maxCoverage, bool areLinkers) {
  // nothing to condense; also avoids dereferencing a null d_mol below
  if (!d_mol) return "";
  static std::string lastDefs = "";
  static std::vector<Abbreviations::AbbreviationDefinition> abbrevs;
  if (definitions != lastDefs) {
    // yes, we are making the assumption that the "areLinkers" argument remains
    // the same if the definitions are the same
    bool removeExtraDummies = areLinkers;
    bool allowConnectionToDummies = areLinkers;
    try {
      abbrevs = Abbreviations::Utils::parseAbbreviations(
          definitions, removeExtraDummies, allowConnectionToDummies);
    } catch (...) {
      return "cannot parse abbreviations";
    }
    // only remember the text once it has parsed successfully; otherwise a
    // later call with the same bad text would skip parsing and silently
    // apply the previous definitions
    lastDefs = definitions;
  }
  Abbreviations::condenseMolAbbreviations(*d_mol, abbrevs, maxCoverage);
  // every path of a non-void function must return a value
  return "";
}
// Thin wrapper: forwards `input` to InchiToInchiKey() and returns its result
// unchanged.
std::string get_inchikey_for_inchi(const std::string &input) {
return InchiToInchiKey(input);
}
JSMol *get_mol(const std::string &input) {
ROMol *mol = mol_from_input(input);
RWMol *mol = mol_from_input(input);
return new JSMol(mol);
}
JSMol *get_qmol(const std::string &input) {
ROMol *mol = qmol_from_input(input);
RWMol *mol = qmol_from_input(input);
return new JSMol(mol);
}

View File

@@ -14,9 +14,11 @@
class JSMol {
public:
JSMol() : d_mol(nullptr){};
JSMol(RDKit::ROMol *mol) : d_mol(mol){};
JSMol(RDKit::RWMol *mol) : d_mol(mol){};
std::string get_smiles() const;
std::string get_cxsmiles() const;
std::string get_molblock() const;
std::string get_v3Kmolblock() const;
std::string get_inchi() const;
std::string get_svg(unsigned int width, unsigned int height) const;
std::string get_svg() const {
@@ -28,6 +30,13 @@ class JSMol {
std::string get_descriptors() const;
std::string get_morgan_fp(unsigned int radius, unsigned int len) const;
std::string get_morgan_fp() const { return get_morgan_fp(2, 2048); };
std::string condense_abbreviations(double maxCoverage, bool useLinkers);
std::string condense_abbreviations() {
return condense_abbreviations(0.4, false);
};
std::string condense_abbreviations_from_defs(const std::string &definitions,
double maxCoverage,
bool areLinkers);
bool is_valid() const { return d_mol.get() != nullptr; };
@@ -40,7 +49,7 @@ class JSMol {
std::string remove_hs() const;
std::string add_hs() const;
std::unique_ptr<RDKit::ROMol> d_mol;
std::unique_ptr<RDKit::RWMol> d_mol;
static constexpr unsigned int d_defaultWidth = 250;
static constexpr unsigned int d_defaultHeight = 200;
};

View File

@@ -82,11 +82,23 @@ function test_sketcher_services2(){
assert(molb2.search(" H ")<0);
}
// Checks condense_abbreviations(): the no-argument call leaves this molecule
// unchanged, while a maxCoverage of 1.0 condenses the CF3 group.
function test_abbreviations(){
    const mol = Module.get_mol("C1CCC1C(F)(F)F");
    assert.equal(mol.is_valid(),1);

    // no-argument form
    mol.condense_abbreviations();
    const before = mol.get_cxsmiles();
    assert.equal(before,"FC(F)(F)C1CCC1");

    // explicit coverage limit, abbreviations rather than linkers
    mol.condense_abbreviations(1.0,false);
    const after = mol.get_cxsmiles();
    assert.equal(after,"*C1CCC1 |$CF3;;;;$|");
}
// Test driver: assigned as the module's onRuntimeInitialized callback. Logs
// the library version, runs each test function in order, and logs a final
// message once all of them have returned.
Module.onRuntimeInitialized = () => {
console.log(Module.version());
test_basics();
test_sketcher_services();
test_sketcher_services2();
test_abbreviations();
console.log("Tests finished successfully");
};

View File

@@ -127,6 +127,10 @@ const std::string atomNote = "atomNote";
const std::string bondNote = "bondNote";
const std::string _isotopicHs = "_isotopicHs";
// molecule drawing
const std::string _displayLabel = "_displayLabel";
const std::string _displayLabelW = "_displayLabelW";
} // namespace common_properties
const double MAX_DOUBLE = std::numeric_limits<double>::max();

View File

@@ -222,6 +222,10 @@ RDKIT_RDGENERAL_EXPORT extern const std::string
_TriposAtomType; // string Mol2FileParser
// missing defs for _TriposAtomName//_TriposPartialCharge...
// molecule drawing
RDKIT_RDGENERAL_EXPORT extern const std::string _displayLabel; // string
RDKIT_RDGENERAL_EXPORT extern const std::string _displayLabelW; // string
///////////////////////////////////////////////////////////////
// misc props
RDKIT_RDGENERAL_EXPORT extern const std::string

View File

@@ -736,7 +736,7 @@ of threads allowed on your computer.
The original 2D->3D conversion provided with the RDKit was not intended
to be a replacement for a “real” conformational analysis tool; it
merely provides quick 3D structures for cases when they are
required. We believe, however, that the newer ETKDG method[#riniker2]_ should be
required. We believe, however, that the newer ETKDG method [#riniker2]_ should be
adequate for most purposes.
@@ -899,12 +899,12 @@ data/test_multi_colours.py, which produces the somewhat garish
As of version 2020.03, it is possible to add arbitrary small strings
to annotate atoms and bonds in the drawing. The strings are added as
properties 'atomNote' and
'bondNote' and they will be placed automatically
properties ``atomNote`` and
``bondNote`` and they will be placed automatically
close to the atom or bond in question in a manner intended to minimise
their clash with the rest of the drawing. For convenience, there are 3
flags in
`MolDraw2DOptions` that will add stereo information (R/S to atoms, E/Z
``MolDraw2DOptions`` that will add stereo information (R/S to atoms, E/Z
to bonds) and atom and bond sequence numbers.
.. doctest::
@@ -917,13 +917,70 @@ to bonds) and atom and bond sequence numbers.
>>> d.drawOptions().addAtomIndices = True
>>> d.DrawMolecule(mol)
>>> d.FinishDrawing()
>>> with open('atom_annotation_1.png', 'wb') as f: # doctest: +SKIP
... f.write(d.GetDrawingText())
>>> d.WriteDrawingText('atom_annotation_1.png') # doctest: +SKIP
will produce
.. image:: images/atom_annotation_1.png
If atoms have an ``atomLabel`` property set, this will be used when drawing them:
.. doctest::
>>> smi = 'c1nc(*)ccc1* |$;;;R1;;;;R2$|'
>>> mol = Chem.MolFromSmiles(smi)
>>> mol.GetAtomWithIdx(3).GetProp("atomLabel")
'R1'
>>> mol.GetAtomWithIdx(7).GetProp("atomLabel")
'R2'
>>> d = rdMolDraw2D.MolDraw2DCairo(250, 250)
>>> rdMolDraw2D.PrepareAndDrawMolecule(d,mol)
>>> d.WriteDrawingText("./images/atom_labels_1.png") # doctest: +SKIP
gives:
.. image:: images/atom_labels_1.png
Since the ``atomLabel`` property is also used for other things (for example in CXSMILES as demonstrated),
if you want to provide your own atom labels, it's better to use the ``_displayLabel`` property:
>>> smi = 'c1nc(*)ccc1* |$;;;R1;;;;R2$|'
>>> mol = Chem.MolFromSmiles(smi)
>>> mol.GetAtomWithIdx(3).SetProp("_displayLabel","R<sub>1</sub>")
>>> mol.GetAtomWithIdx(7).SetProp("_displayLabel","R<sub>2</sub>")
>>> d = rdMolDraw2D.MolDraw2DCairo(250, 250)
>>> rdMolDraw2D.PrepareAndDrawMolecule(d,mol)
>>> d.WriteDrawingText("./images/atom_labels_2.png") # doctest: +SKIP
this gives:
.. image:: images/atom_labels_2.png
Note that you can use ``<sup>`` and ``<sub>`` in these labels to provide super- and subscripts.
Finally, if you have atom labels which should be displayed differently when the bond comes
into them from the right (i.e. when the label is drawn with West orientation), you can also set the ``_displayLabelW`` property:
.. doctest::
>>> smi = 'c1nc(*)ccc1* |$;;;R1;;;;R2$|'
>>> mol = Chem.MolFromSmiles(smi)
>>> mol.GetAtomWithIdx(3).SetProp("_displayLabel","CO<sub>2</sub>H")
>>> mol.GetAtomWithIdx(3).SetProp("_displayLabelW","HO<sub>2</sub>C")
>>> mol.GetAtomWithIdx(7).SetProp("_displayLabel","CO<sub>2</sub><sup>-</sup>")
>>> mol.GetAtomWithIdx(7).SetProp("_displayLabelW","<sup>-</sup>OOC")
>>> d = rdMolDraw2D.MolDraw2DCairo(250, 250)
>>> rdMolDraw2D.PrepareAndDrawMolecule(d,mol)
>>> d.WriteDrawingText("./images/atom_labels_3.png") # doctest: +SKIP
this gives:
.. image:: images/atom_labels_3.png
Metadata in Molecule Images
===========================

Binary file not shown.

After

Width:  |  Height:  |  Size: 5.8 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 5.8 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 6.0 KiB

View File

@@ -1,45 +1,46 @@
# Test manifest: each entry is (executable, script, options).
# The first group lists the stand-alone unit-test scripts in this directory;
# the second group lists the test_list.py of each sub-directory, selected
# through the 'dir' option.
tests = [("python", script, {}) for script in (
    "UnitTestChem.py",
    "UnitTestChemv2.py",
    "UnitTestChemAtom.py",
    "UnitTestChemBond.py",
    "UnitTestChemSmarts.py",
    "UnitTestFragmentDescriptors.py",
    "UnitTestGraphDescriptors_2.py",
    "UnitTestLipinski.py",
    "UnitTestMCS.py",
    "UnitTestOldBugs.py",
    "UnitTestSATIS.py",
    "UnitTestSmiles.py",
    "UnitTestSuppliers.py",
    "UnitTestSurf.py",
    "UnitTestMol3D.py",
    "UnitTestCatalog.py",
    "UnitTestDescriptors.py",
    "UnitTestInchi.py",
    "UnitTestFunctionalGroups.py",
    "UnitTestCrippen.py",
    "UnitTestPandasTools.py",
    "UnitTestDocTestsChem.py",
    "UnitTestFeatFinderCLI.py",
    "UnitTestQED.py",
    "UnitTestSaltRemover.py",
)] + [("python", "test_list.py", {'dir': subdir}) for subdir in (
    'AtomPairs',
    'ChemUtils',
    'EState',
    'FeatMaps',
    'Fingerprints',
    'Pharm2D',
    'Pharm3D',
    'Subshape',
    'Suppliers',
    'Scaffolds',
    'Draw',
    'Fraggle',
    'SimpleEnum',
    'Features',
    'MolStandardize',
)]
# Test manifest: each entry is (executable, script, options).
# Stand-alone unit-test scripts that live in this directory.
tests = [
    ("python", "UnitTestChem.py", {}),
    ("python", "UnitTestChemv2.py", {}),
    ("python", "UnitTestChemAtom.py", {}),
    ("python", "UnitTestChemBond.py", {}),
    ("python", "UnitTestChemSmarts.py", {}),
    ("python", "UnitTestFragmentDescriptors.py", {}),
    ("python", "UnitTestGraphDescriptors_2.py", {}),
    ("python", "UnitTestLipinski.py", {}),
    ("python", "UnitTestMCS.py", {}),
    ("python", "UnitTestOldBugs.py", {}),
    ("python", "UnitTestSATIS.py", {}),
    ("python", "UnitTestSmiles.py", {}),
    ("python", "UnitTestSuppliers.py", {}),
    ("python", "UnitTestSurf.py", {}),
    ("python", "UnitTestMol3D.py", {}),
    ("python", "UnitTestCatalog.py", {}),
    ("python", "UnitTestDescriptors.py", {}),
    ("python", "UnitTestInchi.py", {}),
    ("python", "UnitTestFunctionalGroups.py", {}),
    ("python", "UnitTestCrippen.py", {}),
    ("python", "UnitTestPandasTools.py", {}),
    ("python", "UnitTestDocTestsChem.py", {}),
    ("python", "UnitTestFeatFinderCLI.py", {}),
    ("python", "UnitTestQED.py", {}),
    ("python", "UnitTestSaltRemover.py", {}),
]
# Nested suites: the test_list.py of each sub-directory, selected via 'dir'.
tests += [("python", "test_list.py", {'dir': subdir}) for subdir in (
    'AtomPairs', 'ChemUtils', 'EState', 'FeatMaps', 'Fingerprints',
    'Pharm2D', 'Pharm3D', 'Subshape', 'Suppliers', 'Scaffolds',
    'Draw', 'Fraggle', 'SimpleEnum', 'Features', 'MolStandardize',
)]
# only attempt the MolKey tests if we have the pre-reqs:
try: