mirror of
https://github.com/rdkit/rdkit.git
synced 2026-06-03 21:44:30 +08:00
Add support for abbreviations (#3406)
* support read-only access to cstates from python * expose GetBrackets * expose getAttachPoints too remove vestigial SubstanceGroupCState_VECT * backup * backup * basics working * backup * add label_mol_abbreviations * fix a bug in the chirality handling * add linkers, needs more testing * add another peptide test * sanitize results by default * just need rings * getting started with the C++ form of abbreviations * a bit of error handling * add findApplicableMatches * actually apply the abbreviations * make the getDefault functions more efficient * add labeling (creating s groups) * docs * basic python wrappers (maybe this is enough?) * add _displayLabel and _displayLabelW support to MolDraw2D update the docs for that * use displayLabel props * add more default abbrevs * change default linker defns add parseLinkers convenience function * make sure attachment point atoms aren't aromatic * change the color of dummies to be darker gray * remove python implementation * support abbreviations in the java wrappers * add abbreviations to the csharp wrappers * add abbreviations to the js wrappers * add molParity to the list of atom props not written to CXSMILES * support condensing SUP substance groups * add that to the python wrappers * Update testAbbreviations.py * clear ring info if we added it * document that the molecules with abbreviations removed have not been sanitized
This commit is contained in:
264
Code/GraphMol/Abbreviations/Abbreviations.cpp
Normal file
264
Code/GraphMol/Abbreviations/Abbreviations.cpp
Normal file
@@ -0,0 +1,264 @@
|
||||
//
|
||||
// Copyright (C) 2020 Greg Landrum and T5 Informatics GmbH
|
||||
//
|
||||
// @@ All Rights Reserved @@
|
||||
// This file is part of the RDKit.
|
||||
// The contents are covered by the terms of the BSD license
|
||||
// which is included in the file license.txt, found at the root
|
||||
// of the RDKit source tree.
|
||||
//
|
||||
#include "Abbreviations.h"
|
||||
#include <GraphMol/RDKitBase.h>
|
||||
#include <GraphMol/Substruct/SubstructMatch.h>
|
||||
#include <RDGeneral/types.h>
|
||||
#include <RDGeneral/Invariant.h>
|
||||
|
||||
#include <boost/dynamic_bitset.hpp>
|
||||
#include <iostream>
|
||||
|
||||
namespace RDKit {
|
||||
|
||||
namespace Abbreviations {
|
||||
|
||||
// Collapses every match in \c matches onto a single atom of \c mol.
//
// For each match the atom at position 1 is turned into a labelled dummy
// (atomic number 0) carrying the abbreviation's label and display labels; the
// atoms at positions >= 2 are removed once all matches have been processed.
// Position 0 (the dummy in the query) is never modified.
//
// Throws (via CHECK_INVARIANT) if a match has fewer than two entries or if two
// matches want to remove the same atom.
void applyMatches(RWMol& mol, const std::vector<AbbreviationMatch>& matches) {
  // every match needs at least the dummy (position 0) and the atom that
  // becomes the abbreviation (position 1). Verify that before the molecule is
  // touched so that a malformed match can neither index past the end of the
  // match nor leave the molecule half-modified.
  for (const auto& amatch : matches) {
    CHECK_INVARIANT(amatch.match.size() > 1, "bad match size");
  }

  // atoms are only removed at the very end, so the atom count (and with it
  // the size of this bitset) stays valid for the whole loop below
  boost::dynamic_bitset<> toRemove(mol.getNumAtoms());
  for (const auto& amatch : matches) {
    // throughout this remember that atom 0 in the match is the dummy

    // convert atom 1 to be the abbreviation so that we don't have to
    // worry about messing up chirality, etc.
    auto connectIdx = amatch.match[1].second;
    auto connectingAtom = mol.getAtomWithIdx(connectIdx);
    connectingAtom->setProp(RDKit::common_properties::atomLabel,
                            amatch.abbrev.label);
    if (!amatch.abbrev.displayLabel.empty()) {
      connectingAtom->setProp(RDKit::common_properties::_displayLabel,
                              amatch.abbrev.displayLabel);
    }
    if (!amatch.abbrev.displayLabelW.empty()) {
      connectingAtom->setProp(RDKit::common_properties::_displayLabelW,
                              amatch.abbrev.displayLabelW);
    }

    connectingAtom->setFormalCharge(0);
    connectingAtom->setAtomicNum(0);
    connectingAtom->setIsotope(0);
    connectingAtom->setIsAromatic(false);

    // set the hybridization so these are drawn linearly
    connectingAtom->setHybridization(Atom::HybridizationType::SP);

    for (unsigned int i = 2; i < amatch.match.size(); ++i) {
      const auto& pr = amatch.match[i];
      CHECK_INVARIANT(!toRemove[pr.second], "overlapping matches");
      toRemove.set(pr.second);
      // if there's a molecule associated with the match, check to see if
      // additional bonds need to be formed
      if (amatch.abbrev.mol &&
          mol.getAtomWithIdx(pr.second)->getDegree() >
              amatch.abbrev.mol->getAtomWithIdx(pr.first)->getDegree()) {
        // the matched atom has more neighbors than its query counterpart:
        // re-attach every neighbor outside of the match to the abbreviation
        // atom so that the connection survives the removal of this atom
        for (const auto& nbri : boost::make_iterator_range(
                 mol.getAtomNeighbors(mol.getAtomWithIdx(pr.second)))) {
          const auto& nbr = mol[nbri];
          auto nbrIdx = nbr->getIdx();
          // if this neighbor isn't in the match:
          if (!std::any_of(amatch.match.begin(), amatch.match.end(),
                           [&](const std::pair<int, int>& tpr) {
                             return tpr.second == rdcast<int>(nbrIdx);
                           })) {
            mol.addBond(nbrIdx, connectIdx, Bond::BondType::SINGLE);
          }
        }
      }
    }
    // make connections between any extraAttachAtoms and the connection point
    for (auto oaidx : amatch.abbrev.extraAttachAtoms) {
      mol.addBond(oaidx, connectIdx, Bond::BondType::SINGLE);
    }
  }
  // remove from the highest index downwards; presumably this keeps the
  // indices of the atoms still waiting to be removed unchanged
  for (unsigned int i = toRemove.size(); i > 0; --i) {
    if (toRemove[i - 1]) {
      mol.removeAtom(i - 1);
    }
  }
}
|
||||
|
||||
// Adds one "SUP" SubstanceGroup to \c mol for every match in \c matches.
//
// The group's LABEL is the abbreviation label, its atoms are the matched atoms
// from position 1 onwards, and its single bond / attachment point is the bond
// between the dummy (position 0) and the first abbreviation atom (position 1).
//
// Throws (via CHECK_INVARIANT) if a match has fewer than two entries or if the
// bond between positions 0 and 1 does not exist in \c mol.
void labelMatches(RWMol& mol, const std::vector<AbbreviationMatch>& matches) {
  for (const auto& amatch : matches) {
    // throughout this remember that atom 0 in the match is the dummy;
    // positions 0 and 1 are both indexed below, so make sure they exist
    CHECK_INVARIANT(amatch.match.size() > 1, "bad match size");

    SubstanceGroup sg(&mol, "SUP");
    sg.setProp("LABEL", amatch.abbrev.label);

    // the dummy itself is not part of the group
    for (unsigned int i = 1; i < amatch.match.size(); ++i) {
      const auto& pr = amatch.match[i];
      sg.addAtomWithIdx(pr.second);
    }
    auto bnd =
        mol.getBondBetweenAtoms(amatch.match[0].second, amatch.match[1].second);
    CHECK_INVARIANT(bnd, "bond to attachment point not found");
    sg.addBondWithIdx(bnd->getIdx());
    sg.addAttachPoint(amatch.match[1].second, amatch.match[0].second, "1");
    addSubstanceGroup(mol, sg);
  }
}
|
||||
|
||||
std::vector<AbbreviationMatch> findApplicableAbbreviationMatches(
|
||||
const ROMol& mol, const std::vector<AbbreviationDefinition>& abbrevs,
|
||||
double maxCoverage) {
|
||||
std::vector<AbbreviationMatch> res;
|
||||
auto nAtoms = mol.getNumAtoms();
|
||||
if (!nAtoms || abbrevs.empty()) {
|
||||
return res;
|
||||
}
|
||||
|
||||
bool hasRings = mol.getRingInfo()->isInitialized();
|
||||
if(!hasRings) {
|
||||
MolOps::fastFindRings(mol);
|
||||
}
|
||||
|
||||
std::vector<AbbreviationMatch> tres;
|
||||
boost::dynamic_bitset<> dummies(mol.getNumAtoms());
|
||||
boost::dynamic_bitset<> firstAts(mol.getNumAtoms());
|
||||
boost::dynamic_bitset<> covered(mol.getNumAtoms());
|
||||
|
||||
for (const auto& abbrev : abbrevs) {
|
||||
CHECK_INVARIANT(abbrev.mol, "molecule is null");
|
||||
if (maxCoverage > 0) {
|
||||
unsigned int nDummies;
|
||||
abbrev.mol->getProp(common_properties::numDummies, nDummies);
|
||||
if (double(abbrev.mol->getNumAtoms() - nDummies) / nAtoms >=
|
||||
maxCoverage) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
auto matches = SubstructMatch(mol, *abbrev.mol);
|
||||
for (const auto& match : matches) {
|
||||
CHECK_INVARIANT(match.size() > 1, "bad match size");
|
||||
// if we've already covered the first non-dummy atom or used it as a first
|
||||
// atom skip this.
|
||||
if (firstAts[match[1].second] || covered[match[1].second]) {
|
||||
continue;
|
||||
}
|
||||
bool keepIt = true;
|
||||
for (unsigned int i = 2; i < match.size(); ++i) {
|
||||
const auto& pr = match[i];
|
||||
if (covered[pr.second]) {
|
||||
keepIt = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!keepIt) {
|
||||
continue;
|
||||
}
|
||||
for (unsigned int i = 1; i < match.size(); ++i) {
|
||||
const auto& pr = match[i];
|
||||
covered.set(pr.second);
|
||||
}
|
||||
dummies.set(match[0].second);
|
||||
firstAts.set(match[1].second);
|
||||
if (!firstAts[match[0].second]) {
|
||||
tres.emplace_back(match, abbrev);
|
||||
}
|
||||
}
|
||||
}
|
||||
for (const auto& itm : tres) {
|
||||
// if the dummy in this wasn't a first atom anywhere
|
||||
if (!firstAts[itm.match[0].second]) {
|
||||
res.push_back(std::move(itm));
|
||||
}
|
||||
}
|
||||
|
||||
// if we added ring info, go ahead and remove it
|
||||
if(!hasRings){
|
||||
mol.getRingInfo()->reset();
|
||||
}
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
// Convenience wrapper: finds the applicable abbreviation matches in \c mol
// and collapses them in place. When \c sanitize is true,
// MolOps::symmetrizeSSSR() is called on the modified molecule afterwards.
void condenseMolAbbreviations(
    RWMol& mol, const std::vector<AbbreviationDefinition>& abbrevs,
    double maxCoverage, bool sanitize) {
  // the search has to be finished before the molecule is modified
  const auto found =
      findApplicableAbbreviationMatches(mol, abbrevs, maxCoverage);
  applyMatches(mol, found);
  if (!sanitize) {
    return;
  }
  MolOps::symmetrizeSSSR(mol);
}
|
||||
|
||||
// Convenience wrapper: finds the applicable abbreviation matches in \c mol
// and records each of them on the molecule as a "SUP" SubstanceGroup.
void labelMolAbbreviations(RWMol& mol,
                           const std::vector<AbbreviationDefinition>& abbrevs,
                           double maxCoverage) {
  const auto found =
      findApplicableAbbreviationMatches(mol, abbrevs, maxCoverage);
  labelMatches(mol, found);
}
|
||||
|
||||
// Collapses every "SUP" SubstanceGroup of \c mol into a single labelled atom.
//
// For each SUP group a match record is built in the layout applyMatches()
// expects: position 0 is the atom outside the group at the other end of the
// first usable group bond, position 1 is the group atom on that bond, and the
// remaining group atoms follow. The outside atoms of any further group bonds
// are stored as extraAttachAtoms. The group's LABEL property (or "abbrev" if
// it has none) becomes the abbreviation label.
//
// Groups without a usable attachment bond are skipped with a warning: without
// an outside atom at position 0 the match layout would be wrong.
RDKIT_ABBREVIATIONS_EXPORT void condenseAbbreviationSubstanceGroups(
    RWMol& mol) {
  auto& molSGroups = getSubstanceGroups(mol);
  std::vector<AbbreviationMatch> abbrevMatches;
  for (const auto& sg : molSGroups) {
    if (sg.getProp<std::string>("TYPE") != "SUP") {
      continue;
    }
    AbbreviationMatch abbrevMatch;
    std::string label = "abbrev";
    sg.getPropIfPresent("LABEL", label);
    abbrevMatch.abbrev.label = label;
    // work on copies: the atom list is reordered below
    auto ats = sg.getAtoms();
    auto bnds = sg.getBonds();
    if (bnds.empty()) {
      BOOST_LOG(rdWarningLog) << "SUP group without any bonds" << std::endl;
      // no attachment point, so there is nothing to collapse onto
      continue;
    }

    bool firstAttachFound = false;
    for (unsigned int i = 0; i < bnds.size(); ++i) {
      auto bnd = mol.getBondWithIdx(bnds[i]);
      unsigned int mAt;  // sgroup atom in the match
      unsigned int oAt;  // atom at the other end of the bond
      if (std::find(ats.begin(), ats.end(), bnd->getBeginAtomIdx()) !=
          ats.end()) {
        oAt = bnd->getEndAtomIdx();
        mAt = bnd->getBeginAtomIdx();
      } else if (std::find(ats.begin(), ats.end(), bnd->getEndAtomIdx()) !=
                 ats.end()) {
        oAt = bnd->getBeginAtomIdx();
        mAt = bnd->getEndAtomIdx();
      } else {
        BOOST_LOG(rdWarningLog) << "SUP group includes bond not connected "
                                   "to any of the abbreviation atoms"
                                << std::endl;
        continue;
      }

      if (!firstAttachFound) {
        // make sure the atom connected to the first attachment point
        // is the first one in the match
        if (*ats.begin() != mAt) {
          ats.erase(std::find(ats.begin(), ats.end(), mAt));
          ats.insert(ats.begin(), mAt);
        }
        // add the first attachment point to the beginning of the atom list
        ats.insert(ats.begin(), oAt);
        firstAttachFound = true;
      } else {
        abbrevMatch.abbrev.extraAttachAtoms.push_back(oAt);
      }
    }
    if (!firstAttachFound) {
      // none of the bonds touched a group atom (each was warned about above)
      continue;
    }

    // create a match record:
    abbrevMatch.match.reserve(ats.size());
    for (unsigned int i = 0; i < ats.size(); ++i) {
      abbrevMatch.match.push_back({i, ats[i]});
    }
    abbrevMatches.push_back(abbrevMatch);
  }
  if (!abbrevMatches.empty()) {
    applyMatches(mol, abbrevMatches);
  } else {
    BOOST_LOG(rdWarningLog) << "no suitable SubstanceGroups found" << std::endl;
  }
}
|
||||
|
||||
} // namespace Abbreviations
|
||||
} // namespace RDKit
|
||||
131
Code/GraphMol/Abbreviations/Abbreviations.h
Normal file
131
Code/GraphMol/Abbreviations/Abbreviations.h
Normal file
@@ -0,0 +1,131 @@
|
||||
//
|
||||
// Copyright (C) 2020 Greg Landrum and T5 Informatics GmbH
|
||||
//
|
||||
// @@ All Rights Reserved @@
|
||||
// This file is part of the RDKit.
|
||||
// The contents are covered by the terms of the BSD license
|
||||
// which is included in the file license.txt, found at the root
|
||||
// of the RDKit source tree.
|
||||
//
|
||||
#include <RDGeneral/export.h>
|
||||
#ifndef RD_ABBREVIATIONS_H
|
||||
#define RD_ABBREVIATIONS_H
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include <memory>
|
||||
|
||||
namespace RDKit {
|
||||
class ROMol;
|
||||
class RWMol;
|
||||
|
||||
namespace Abbreviations {
|
||||
RDKIT_ABBREVIATIONS_EXPORT struct AbbreviationDefinition {
|
||||
std::string label;
|
||||
std::string displayLabel;
|
||||
std::string displayLabelW;
|
||||
std::string smarts;
|
||||
std::shared_ptr<ROMol> mol; //! optional
|
||||
std::vector<unsigned int> extraAttachAtoms; //! optional
|
||||
bool operator==(const AbbreviationDefinition& other) const {
|
||||
return label == other.label && displayLabel == other.displayLabel &&
|
||||
displayLabelW == other.displayLabelW && smarts == other.smarts;
|
||||
}
|
||||
bool operator!=(const AbbreviationDefinition& other) const {
|
||||
return !(*this == other);
|
||||
}
|
||||
};
|
||||
RDKIT_ABBREVIATIONS_EXPORT struct AbbreviationMatch {
|
||||
std::vector<std::pair<int, int>> match;
|
||||
AbbreviationDefinition abbrev;
|
||||
AbbreviationMatch(const std::vector<std::pair<int, int>>& matchArg,
|
||||
const AbbreviationDefinition& abbrevArg)
|
||||
: match(matchArg), abbrev(abbrevArg){};
|
||||
AbbreviationMatch() : match(), abbrev(){};
|
||||
bool operator==(const AbbreviationMatch& other) const {
|
||||
return abbrev == other.abbrev && match == other.match;
|
||||
}
|
||||
bool operator!=(const AbbreviationMatch& other) const {
|
||||
return !(*this == other);
|
||||
}
|
||||
};
|
||||
namespace common_properties {
|
||||
RDKIT_ABBREVIATIONS_EXPORT extern const std::string numDummies;
|
||||
}
|
||||
namespace Utils {
|
||||
//! returns the default set of abbreviation definitions
|
||||
RDKIT_ABBREVIATIONS_EXPORT std::vector<AbbreviationDefinition>
|
||||
getDefaultAbbreviations();
|
||||
//! returns the default set of linker definitions
|
||||
RDKIT_ABBREVIATIONS_EXPORT std::vector<AbbreviationDefinition>
|
||||
getDefaultLinkers();
|
||||
|
||||
//! parses a string describing abbreviation matches and returns the result
|
||||
/*!
|
||||
|
||||
\param text the data to be parsed, see below for the format
|
||||
\param removeExtraDummies controls whether or not dummy atoms beyond atom 0 are
|
||||
removed. Set this to true to create abbreviations for linkers
|
||||
\param allowConnectionToDummies allows abbreviations to directly connect to
|
||||
abbreviations. set this to true for linkers
|
||||
|
||||
Format of the text data:
|
||||
A series of lines, each of which contains:
|
||||
|
||||
label SMARTS displayLabel displayLabelW
|
||||
|
||||
the "displayLabel" and "displayLabelW" fields are optional.
|
||||
where label is the label used for the abbreviation,
|
||||
SMARTS is the SMARTS definition of the abbreviation.
|
||||
displayLabel is used in drawings to render the abbreviations.
|
||||
displayLabelW is the display label if a bond comes in from the right
|
||||
|
||||
Use dummies to indicate attachment points. The assumption is that the first
|
||||
atom is a dummy (one will be added if this is not true) and that the second
|
||||
atom is the surrogate for the rest of the group.
|
||||
|
||||
*/
|
||||
RDKIT_ABBREVIATIONS_EXPORT std::vector<AbbreviationDefinition>
|
||||
parseAbbreviations(const std::string& text, bool removeExtraDummies = false,
|
||||
bool allowConnectionToDummies = false);
|
||||
//! \brief equivalent to calling \c parseAbbreviations(text,true,true)
|
||||
inline std::vector<AbbreviationDefinition> parseLinkers(
|
||||
const std::string& text) {
|
||||
return parseAbbreviations(text, true, true);
|
||||
};
|
||||
} // namespace Utils
|
||||
|
||||
//! returns all matches for the abbreviations across the molecule
|
||||
/*!
|
||||
|
||||
\param abbrevs the abbreviations to look for. This list is used in order.
|
||||
\param maxCoverage any abbreviation that covers at least this fraction
|
||||
of the molecule's atoms (not counting dummies) will not be returned.
|
||||
*/
|
||||
RDKIT_ABBREVIATIONS_EXPORT std::vector<AbbreviationMatch>
|
||||
findApplicableAbbreviationMatches(
|
||||
const ROMol& mol, const std::vector<AbbreviationDefinition>& abbrevs,
|
||||
double maxCoverage = 0.4);
|
||||
//! applies the abbreviation matches to a molecule, modifying it in place.
|
||||
//! the modified molecule is not sanitized
|
||||
RDKIT_ABBREVIATIONS_EXPORT void applyMatches(
|
||||
RWMol& mol, const std::vector<AbbreviationMatch>& matches);
|
||||
//! creates "SUP" SubstanceGroups on the molecule describing the abbreviation
|
||||
RDKIT_ABBREVIATIONS_EXPORT void labelMatches(
|
||||
RWMol& mol, const std::vector<AbbreviationMatch>& matches);
|
||||
//! convenience function for finding and applying abbreviations
|
||||
//! the modified molecule is not sanitized
|
||||
RDKIT_ABBREVIATIONS_EXPORT void condenseMolAbbreviations(
|
||||
RWMol& mol, const std::vector<AbbreviationDefinition>& abbrevs,
|
||||
double maxCoverage = 0.4, bool sanitize = true);
|
||||
//! convenience function for finding and labeling abbreviations as SUP
|
||||
//! SubstanceGroups
|
||||
RDKIT_ABBREVIATIONS_EXPORT void labelMolAbbreviations(
|
||||
RWMol& mol, const std::vector<AbbreviationDefinition>& abbrevs,
|
||||
double maxCoverage = 0.4);
|
||||
//! collapses abbreviation (i.e. "SUP") substance groups
|
||||
//! the modified molecule is not sanitized
|
||||
RDKIT_ABBREVIATIONS_EXPORT void condenseAbbreviationSubstanceGroups(RWMol& mol);
|
||||
|
||||
} // namespace Abbreviations
|
||||
} // namespace RDKit
|
||||
#endif
|
||||
221
Code/GraphMol/Abbreviations/AbbreviationsUtils.cpp
Normal file
221
Code/GraphMol/Abbreviations/AbbreviationsUtils.cpp
Normal file
@@ -0,0 +1,221 @@
|
||||
//
|
||||
// Copyright (C) 2020 Greg Landrum and T5 Informatics GmbH
|
||||
//
|
||||
// @@ All Rights Reserved @@
|
||||
// This file is part of the RDKit.
|
||||
// The contents are covered by the terms of the BSD license
|
||||
// which is included in the file license.txt, found at the root
|
||||
// of the RDKit source tree.
|
||||
//
|
||||
#include "Abbreviations.h"
|
||||
#include <GraphMol/RDKitBase.h>
|
||||
#include <GraphMol/SmilesParse/SmilesParse.h>
|
||||
#include <GraphMol/RDKitQueries.h>
|
||||
#include <boost/tokenizer.hpp>
|
||||
|
||||
using tokenizer = boost::tokenizer<boost::char_separator<char>>;
|
||||
|
||||
namespace RDKit {
|
||||
|
||||
namespace Abbreviations {
|
||||
|
||||
namespace common_properties {
|
||||
const std::string numDummies = "_numDummies";
|
||||
}
|
||||
|
||||
namespace Utils {
|
||||
namespace data {
|
||||
/*
|
||||
Translations of superatom labels to SMILES.
|
||||
|
||||
First atom of SMILES string should be the one connected to the rest of
|
||||
the molecule.
|
||||
|
||||
ADAPTED FROM: https://github.com/openbabel/superatoms/blob/master/superatom.txt
|
||||
|
||||
Originally from http://cactus.nci.nih.gov/osra/
|
||||
|
||||
The left-aligned form is the one recognized in MDL alias lines;
|
||||
the right-aligned form may be used in 2D depiction.
|
||||
|
||||
label smiles display_label display_label_w
|
||||
*/
|
||||
const std::string defaultAbbreviations =
|
||||
R"ABBREVS(CO2Et C(=O)OCC CO<sub>2</sub>Et EtO<sub>2</sub>C
|
||||
COOEt C(=O)OCC CO<sub>2</sub>Et EtO<sub>2</sub>C
|
||||
OiBu OCC(C)C OiBu iBuO
|
||||
nDec CCCCCCCCCC nDec
|
||||
nNon CCCCCCCCC nNon
|
||||
nOct CCCCCCCC nOct
|
||||
nHept CCCCCCC nHept
|
||||
nHex CCCCCC nHex
|
||||
nPent CCCCC nPent
|
||||
iPent C(C)CCC iPent
|
||||
tBu C(C)(C)C tBu
|
||||
iBu C(C)CC iBu
|
||||
nBu CCCC nBu
|
||||
iPr C(C)C iPr
|
||||
nPr CCC nPr
|
||||
Et CC Et
|
||||
NCF3 NC(F)(F)F NCF<sub>3</sub> F<sub>3</sub>CN
|
||||
CF3 C(F)(F)F CF<sub>3</sub> F<sub>3</sub>C
|
||||
CCl3 C(Cl)(Cl)Cl CCl<sub>3</sub> Cl<sub>3</sub>C
|
||||
CN C#N CN NC
|
||||
NC [N+]#[C-] NC CN
|
||||
N(OH)CH3 N([OH])C N(OH)CH<sub>3</sub> CH<sub>3</sub>(OH)N
|
||||
NO2 [N+](=O)[O-] NO<sub>2</sub> O<sub>2</sub>N
|
||||
NO N=O NO ON
|
||||
SO3H S(=O)(=O)[OH] SO<sub>3</sub>H HO<sub>3</sub>S
|
||||
CO2H C(=O)[OH] CO<sub>2</sub>H HO<sub>2</sub>C
|
||||
COOH C(=O)[OH] COOH HOOC
|
||||
OEt OCC OEt EtO
|
||||
OAc OC(=O)C OAc AcO
|
||||
NHAc NC(=O)C NHAc AcNH
|
||||
Ac C(=O)C Ac
|
||||
CHO C=O CHO OHC
|
||||
NMe NC NMe MeN
|
||||
SMe SC SMe MeS
|
||||
OMe OC OMe MeO
|
||||
CO2- C(=O)[O-] COO<sup>-</sup> <sup>-</sup>OOC
|
||||
COO- C(=O)[O-] COO<sup>-</sup> <sup>-</sup>OOC)ABBREVS";
|
||||
|
||||
/*
|
||||
Translations of linker superatom labels to SMILES.
|
||||
|
||||
First atom of SMILES string should be a dummy connected to the rest of
|
||||
the molecule. The other linker dummy/dummies show the other attachments
|
||||
|
||||
*/
|
||||
const std::string defaultLinkers =
|
||||
R"ABBREVS(PEG6 *OCCOCCOCCOCCOCCOCC* PEG6
|
||||
PEG5 *OCCOCCOCCOCCOCC* PEG5
|
||||
PEG4 *OCCOCCOCCOCC* PEG4
|
||||
PEG3 *OCCOCCOCC* PEG3
|
||||
Dec *CCCCCCCCCC*
|
||||
Non *CCCCCCCCC*
|
||||
Oct *CCCCCCCC*
|
||||
Hept *CCCCCCC*)ABBREVS";
|
||||
// other possible abbreviations that might be useful:
|
||||
/*
|
||||
PEG6 *OCCOCCOCCOCCOCC* PEG6
|
||||
PEG5 *OCCOCCOCCOCCOCC* PEG5
|
||||
PEG4 *OCCOCCOCCOCC* PEG4
|
||||
PEG3 *OCCOCCOCC* PEG3
|
||||
Dec *CCCCCCCCCC*
|
||||
Non *CCCCCCCCC*
|
||||
Oct *CCCCCCCC*
|
||||
Hept *CCCCCCC*
|
||||
Hex *CCCCCC*
|
||||
Pent *CCCCC*
|
||||
Cy *C1CCC(*)CC1 Cy
|
||||
ala *N[C@@H](C)C(=O)* ala
|
||||
arg *N[C@@H](CCCNC(N)=[NH])C(=O)* arg
|
||||
asn *N[C@@H](CC(N)=O)C(=O)* asn
|
||||
asp *N[C@@H](CC(O)=O)C(=O)* asp
|
||||
cys *N[C@@H](CS)C(=O)* cys
|
||||
gln *N[C@@H](CCC(N)=O)C(=O)* gln
|
||||
glu *N[C@@H](CCC(O)=O)C(=O)* glu
|
||||
gly *NCC(=O)* gly
|
||||
his *N[C@@H](Cc1c[nH]cn1)C(=O)* his
|
||||
ile *N[C@@H](C(C)CC)C(=O)* ile
|
||||
leu *N[C@@H](CC(C)C)C(=O)* leu
|
||||
lys *N[C@@H](CCCCN)C(=O)* lys
|
||||
met *N[C@@H](CCSC)C(=O)* met
|
||||
phe *N[C@@H](Cc1ccccc1)C(=O)* phe
|
||||
pro *N1[C@@H](CCC1)C(=O)* pro
|
||||
ser *N[C@@H](CO)C(=O)* ser
|
||||
thr *N[C@@H](C(O)C)C(=O)* thr
|
||||
trp *N[C@@H](Cc1c[nH]c2ccccc21)C(=O)* trp
|
||||
tyr *N[C@@H](Cc1ccc(O)cc1)C(=O)* tyr
|
||||
val *N[C@@H](C(C)C)C(=O)* val
|
||||
*/
|
||||
} // namespace data
|
||||
|
||||
namespace detail {
|
||||
// Builds the query molecule for one abbreviation from its SMARTS.
//
// \param txt                       the SMARTS of the abbreviation; a leading
//                                  '*' is added if it is not already there
// \param removeExtraDummies        remove the atoms after atom 0 whose query
//                                  description is "AtomNull"
// \param allowConnectionToDummies  when false, atom 0 is additionally
//                                  required not to have atomic number 0
//
// \return a new molecule owned by the caller, or nullptr if the SMARTS could
//         not be converted or describes fewer than two atoms. The number of
//         '*' characters in the SMARTS (minus the atoms removed) is stored on
//         the result as the numDummies property.
ROMol *createAbbreviationMol(const std::string &txt, bool removeExtraDummies,
                             bool allowConnectionToDummies) {
  std::string smarts;
  if (txt[0] != '*') {
    smarts = "*" + txt;
  } else {
    smarts = txt;
  }
  // owned by a smart pointer until it is handed to the caller so that it is
  // released on every early return and if one of the calls below throws
  std::unique_ptr<RWMol> q(SmartsToMol(smarts));
  if (!q) {
    return nullptr;
  }
  if (q->getNumAtoms() < 2) {
    BOOST_LOG(rdErrorLog) << "abbreviation with <2 atoms ignored" << std::endl;
    return nullptr;
  }
  MolOps::AdjustQueryParameters ps;
  ps.adjustDegree = true;
  ps.adjustDegreeFlags = MolOps::AdjustQueryWhichFlags::ADJUST_IGNOREDUMMIES;
  ps.adjustRingCount = true;
  ps.adjustRingCountFlags = MolOps::AdjustQueryWhichFlags::ADJUST_IGNOREDUMMIES;
  MolOps::adjustQueryProperties(*q, &ps);
  if (!allowConnectionToDummies) {
    // NOTE(review): qry is a raw pointer; presumably expandQuery() takes
    // ownership of it - confirm
    auto qry = makeAtomNumQuery(0);
    qry->setNegation(true);
    q->getAtomWithIdx(0)->expandQuery(qry);
  }
  unsigned int nDummies = std::count_if(smarts.begin(), smarts.end(),
                                        [](char c) { return c == '*'; });
  if (removeExtraDummies) {
    // atom 0 is never a candidate; walk backwards so that a removal does not
    // disturb the indices that are still to be visited
    for (unsigned int i = q->getNumAtoms() - 1; i > 0; --i) {
      auto at = q->getAtomWithIdx(i);
      if (at->hasQuery() && at->getQuery()->getDescription() == "AtomNull") {
        q->removeAtom(i);
        --nDummies;
      }
    }
  }
  q->setProp(common_properties::numDummies, nDummies);
  return q.release();
}
|
||||
} // namespace detail
|
||||
|
||||
std::vector<AbbreviationDefinition> parseAbbreviations(
|
||||
const std::string &text, bool removeExtraDummies,
|
||||
bool allowConnectionToDummies) {
|
||||
std::vector<AbbreviationDefinition> res;
|
||||
boost::char_separator<char> lineSep("\n");
|
||||
tokenizer lines(text, lineSep);
|
||||
boost::char_separator<char> fieldSep(" \t");
|
||||
for (const auto line : lines) {
|
||||
AbbreviationDefinition defn;
|
||||
tokenizer fields(line, fieldSep);
|
||||
tokenizer::iterator field = fields.begin();
|
||||
defn.label = *field;
|
||||
++field;
|
||||
defn.smarts = *field;
|
||||
++field;
|
||||
if (field != fields.end()) {
|
||||
defn.displayLabel = *field;
|
||||
++field;
|
||||
if (field != fields.end()) {
|
||||
defn.displayLabelW = *field;
|
||||
}
|
||||
}
|
||||
defn.mol.reset(detail::createAbbreviationMol(
|
||||
defn.smarts, removeExtraDummies, allowConnectionToDummies));
|
||||
if (defn.mol) {
|
||||
res.push_back(defn);
|
||||
}
|
||||
}
|
||||
|
||||
return res;
|
||||
}
|
||||
std::vector<AbbreviationDefinition> getDefaultAbbreviations() {
|
||||
static auto defs = parseAbbreviations(data::defaultAbbreviations);
|
||||
return defs;
|
||||
}
|
||||
std::vector<AbbreviationDefinition> getDefaultLinkers() {
|
||||
static auto defs = parseAbbreviations(data::defaultLinkers, true, true);
|
||||
return defs;
|
||||
}
|
||||
} // namespace Utils
|
||||
|
||||
} // namespace Abbreviations
|
||||
} // namespace RDKit
|
||||
15
Code/GraphMol/Abbreviations/CMakeLists.txt
Normal file
15
Code/GraphMol/Abbreviations/CMakeLists.txt
Normal file
@@ -0,0 +1,15 @@
|
||||
rdkit_library(Abbreviations
|
||||
Abbreviations.cpp
|
||||
AbbreviationsUtils.cpp
|
||||
LINK_LIBRARIES SmilesParse
|
||||
SubstructMatch GraphMol RDGeneral)
|
||||
target_compile_definitions(Abbreviations PRIVATE RDKIT_ABBREVIATIONS_BUILD)
|
||||
|
||||
rdkit_headers(Abbreviations.h DEST GraphMol/Abbreviations)
|
||||
|
||||
rdkit_catch_test(testAbbreviations ../catch_main.cpp catch_tests.cpp
|
||||
LINK_LIBRARIES Abbreviations SmilesParse FileParsers )
|
||||
|
||||
if(RDK_BUILD_PYTHON_WRAPPERS)
|
||||
add_subdirectory(Wrap)
|
||||
endif()
|
||||
8
Code/GraphMol/Abbreviations/Wrap/CMakeLists.txt
Normal file
8
Code/GraphMol/Abbreviations/Wrap/CMakeLists.txt
Normal file
@@ -0,0 +1,8 @@
|
||||
rdkit_python_extension(rdAbbreviations
|
||||
rdAbbreviations.cpp
|
||||
DEST Chem
|
||||
LINK_LIBRARIES Abbreviations
|
||||
GraphMol )
|
||||
|
||||
add_pytest(pyAbbreviations ${CMAKE_CURRENT_SOURCE_DIR}/testAbbreviations.py)
|
||||
|
||||
100
Code/GraphMol/Abbreviations/Wrap/rdAbbreviations.cpp
Normal file
100
Code/GraphMol/Abbreviations/Wrap/rdAbbreviations.cpp
Normal file
@@ -0,0 +1,100 @@
|
||||
//
|
||||
// Copyright (C) 2020 Greg Landrum and T5 Informatics GmbH
|
||||
//
|
||||
// @@ All Rights Reserved @@
|
||||
// This file is part of the RDKit.
|
||||
// The contents are covered by the terms of the BSD license
|
||||
// which is included in the file license.txt, found at the root
|
||||
// of the RDKit source tree.
|
||||
//
|
||||
|
||||
#include <RDBoost/python.h>
|
||||
#include <boost/python/suite/indexing/vector_indexing_suite.hpp>
|
||||
#include <GraphMol/GraphMol.h>
|
||||
#include <RDBoost/Wrap.h>
|
||||
|
||||
#include <GraphMol/Abbreviations/Abbreviations.h>
|
||||
|
||||
namespace python = boost::python;
|
||||
using namespace RDKit;
|
||||
|
||||
namespace {
|
||||
|
||||
// Python-facing wrapper around Abbreviations::condenseMolAbbreviations():
// works on a copy of \c mol and returns that copy; the caller owns the result.
ROMol *condenseMolAbbreviationsHelper(const ROMol *mol,
                                      python::object pyabbrevs,
                                      double maxCoverage, bool sanitize) {
  // keep the copy in a smart pointer so that it is released if the
  // conversion or the condensing throws
  std::unique_ptr<RWMol> res(new RWMol(*mol));
  auto abbrevs =
      pythonObjectToVect<Abbreviations::AbbreviationDefinition>(pyabbrevs);
  // the converter hands back a pointer; presumably it is null for None or an
  // empty sequence, so fall back to an empty list rather than dereference it
  std::vector<Abbreviations::AbbreviationDefinition> noAbbrevs;
  Abbreviations::condenseMolAbbreviations(
      *res, abbrevs ? *abbrevs : noAbbrevs, maxCoverage, sanitize);
  return rdcast<ROMol *>(res.release());
}
|
||||
|
||||
// Python-facing wrapper around
// Abbreviations::condenseAbbreviationSubstanceGroups(): works on a copy of
// \c mol and returns that copy; the caller owns the result.
ROMol *condenseAbbreviationSGroupHelper(const ROMol *mol) {
  // keep the copy in a smart pointer so that it is released if the
  // condensing throws
  std::unique_ptr<RWMol> res(new RWMol(*mol));
  Abbreviations::condenseAbbreviationSubstanceGroups(*res);
  return rdcast<ROMol *>(res.release());
}
|
||||
|
||||
// Python-facing wrapper around Abbreviations::labelMolAbbreviations():
// works on a copy of \c mol and returns that copy; the caller owns the result.
ROMol *labelMolAbbreviationsHelper(const ROMol *mol, python::object pyabbrevs,
                                   double maxCoverage) {
  // keep the copy in a smart pointer so that it is released if the
  // conversion or the labeling throws
  std::unique_ptr<RWMol> res(new RWMol(*mol));
  auto abbrevs =
      pythonObjectToVect<Abbreviations::AbbreviationDefinition>(pyabbrevs);
  // the converter hands back a pointer; presumably it is null for None or an
  // empty sequence, so fall back to an empty list rather than dereference it
  std::vector<Abbreviations::AbbreviationDefinition> noAbbrevs;
  Abbreviations::labelMolAbbreviations(*res, abbrevs ? *abbrevs : noAbbrevs,
                                       maxCoverage);
  return rdcast<ROMol *>(res.release());
}
|
||||
} // namespace
|
||||
|
||||
// Python module definition: exposes AbbreviationDefinition, the parsing and
// default-definition helpers, and the three molecule-level entry points. The
// molecule-level functions return new molecules that Python takes ownership
// of (manage_new_object).
BOOST_PYTHON_MODULE(rdAbbreviations) {
  python::scope().attr("__doc__") =
      "Module containing functions for working with molecular abbreviations";
  // RegisterVectorConverter<Abbreviations::AbbreviationMatch>();
  RegisterVectorConverter<Abbreviations::AbbreviationDefinition>();

  // the attribute docs follow the field descriptions in Abbreviations.h
  python::class_<Abbreviations::AbbreviationDefinition>(
      "AbbreviationDefinition", "Abbreviation Definition", python::init<>())
      .def_readwrite("label", &Abbreviations::AbbreviationDefinition::label,
                     "the label")
      .def_readwrite(
          "displayLabel", &Abbreviations::AbbreviationDefinition::displayLabel,
          "the label used to render the abbreviation in a drawing")
      .def_readwrite(
          "displayLabelW",
          &Abbreviations::AbbreviationDefinition::displayLabelW,
          "the label in a drawing when the bond comes in from the right")
      .def_readwrite(
          "mol", &Abbreviations::AbbreviationDefinition::mol,
          "the query molecule (should have a dummy as the first atom)");

  python::def("GetDefaultAbbreviations",
              &Abbreviations::Utils::getDefaultAbbreviations,
              "returns a list of the default abbreviation definitions");
  python::def("GetDefaultLinkers", &Abbreviations::Utils::getDefaultLinkers,
              "returns a list of the default linker definitions");
  python::def("ParseAbbreviations", &Abbreviations::Utils::parseAbbreviations,
              (python::arg("text"), python::arg("removeExtraDummies") = false,
               python::arg("allowConnectionToDummies") = false),
              "returns a set of abbreviation definitions from a string");
  python::def("ParseLinkers", &Abbreviations::Utils::parseLinkers,
              (python::arg("text")),
              "returns a set of linker definitions from a string");
  python::def(
      "CondenseMolAbbreviations", &condenseMolAbbreviationsHelper,
      (python::arg("mol"), python::arg("abbrevs"),
       python::arg("maxCoverage") = 0.4, python::arg("sanitize") = true),
      python::return_value_policy<python::manage_new_object>(),
      "Finds and replaces abbreviations in a molecule. The result is not sanitized.");
  python::def("LabelMolAbbreviations", &labelMolAbbreviationsHelper,
              (python::arg("mol"), python::arg("abbrevs"),
               python::arg("maxCoverage") = 0.4),
              python::return_value_policy<python::manage_new_object>(),
              "Finds abbreviations and adds them to a molecule as \"SUP\" "
              "SubstanceGroups");
  python::def(
      "CondenseAbbreviationSubstanceGroups", &condenseAbbreviationSGroupHelper,
      (python::arg("mol")),
      python::return_value_policy<python::manage_new_object>(),
      "Finds and replaces abbreviation (i.e. \"SUP\") substance groups in a "
      "molecule. The result is not sanitized.");
}
|
||||
131
Code/GraphMol/Abbreviations/Wrap/testAbbreviations.py
Normal file
131
Code/GraphMol/Abbreviations/Wrap/testAbbreviations.py
Normal file
@@ -0,0 +1,131 @@
|
||||
#
|
||||
# Copyright (C) 2020 Greg Landrum and T5 Informatics GmbH
|
||||
# @@ All Rights Reserved @@
|
||||
#
|
||||
# This file is part of the RDKit.
|
||||
# The contents are covered by the terms of the BSD license
|
||||
# which is included in the file license.txt, found at the root
|
||||
# of the RDKit source tree.
|
||||
|
||||
#
|
||||
from rdkit import Chem
|
||||
from rdkit.Chem import rdAbbreviations
|
||||
|
||||
import unittest
|
||||
|
||||
|
||||
class TestCase(unittest.TestCase):
  """Tests for the rdAbbreviations Python wrappers."""

  def setUp(self):
    # definition sets shared by the individual tests
    linkerDefns = '''PEG3 *OCCOCCOCC* PEG3
Pent *CCCCC*
Cy *C1CCC(*)CC1 Cy'''
    self.defaultAbbrevs = rdAbbreviations.GetDefaultAbbreviations()
    self.defaultLinkers = rdAbbreviations.GetDefaultLinkers()
    self.customLinkers = rdAbbreviations.ParseLinkers(linkerDefns)

  def testParsingAbbrevs(self):
    # user-supplied definitions: one "label SMARTS" pair per line
    defn = '''CO2Et C(=O)OCC
COOEt C(=O)OCC
OiBu OCC(C)C
tBu C(C)(C)C'''
    parsed = rdAbbreviations.ParseAbbreviations(defn)
    mol = Chem.MolFromSmiles('CCC(=O)OCC')
    condensed = rdAbbreviations.CondenseMolAbbreviations(mol, parsed, maxCoverage=1.0)
    self.assertEqual(Chem.MolToCXSmiles(condensed), '*CC |$CO2Et;;$|')

  def testCondense(self):
    mol = Chem.MolFromSmiles('FC(F)(F)CC(=O)O')
    condensed = rdAbbreviations.CondenseMolAbbreviations(mol, self.defaultAbbrevs,
                                                         maxCoverage=1.0)
    self.assertEqual(Chem.MolToCXSmiles(condensed), '*C* |$CF3;;CO2H$|')

    # this call relies on the default maxCoverage value
    mol = Chem.MolFromSmiles('CCC(F)(F)F')
    condensed = rdAbbreviations.CondenseMolAbbreviations(mol, self.defaultAbbrevs)
    self.assertEqual(Chem.MolToCXSmiles(condensed), '*C(F)(F)F |$Et;;;;$|')

    # make sure we don't mess up chirality
    mol = Chem.MolFromSmiles('FC(F)(F)[C@](Cl)(F)I')
    condensed = rdAbbreviations.CondenseMolAbbreviations(mol, self.defaultAbbrevs,
                                                         maxCoverage=1.0)
    self.assertEqual(Chem.MolToCXSmiles(condensed), '*[C@@](F)(Cl)I |$CF3;;;;$|')

  def testLabel(self):
    mol = Chem.MolFromSmiles('CC(C)CC(F)(F)F')
    labelled = rdAbbreviations.LabelMolAbbreviations(mol, self.defaultAbbrevs,
                                                     maxCoverage=1.0)
    sgs = Chem.GetMolSubstanceGroups(labelled)
    self.assertEqual(len(sgs), 2)

    # one row per expected substance group:
    # (label, atoms, bonds, attach-point aIdx, attach-point lvIdx)
    expected = (
      ("iPr", [1, 0, 2], [2], 1, 3),
      ("CF3", [4, 5, 6, 7], [3], 4, 3),
    )
    for which, (label, atoms, bonds, aIdx, lvIdx) in enumerate(expected):
      self.assertEqual(sgs[which].GetProp('TYPE'), "SUP")
      self.assertEqual(sgs[which].GetProp('LABEL'), label)
      self.assertEqual(list(sgs[which].GetAtoms()), atoms)
      self.assertEqual(list(sgs[which].GetBonds()), bonds)
      aps = sgs[which].GetAttachPoints()
      self.assertEqual(len(aps), 1)
      self.assertEqual(aps[0].aIdx, aIdx)
      self.assertEqual(aps[0].lvIdx, lvIdx)

  def testCondenseLinkers(self):
    mol = Chem.MolFromSmiles('FCOCCOCCOCCCCCCCCCCl')
    condensed = rdAbbreviations.CondenseMolAbbreviations(mol, self.defaultLinkers,
                                                         maxCoverage=1.0)
    self.assertEqual(Chem.MolToCXSmiles(condensed), 'FC**Cl |$;;PEG3;Hept;$|')

    mol = Chem.MolFromSmiles('COC1CCC(C)CC1')
    condensed = rdAbbreviations.CondenseMolAbbreviations(mol, self.customLinkers,
                                                         maxCoverage=1.0)
    self.assertEqual(Chem.MolToCXSmiles(condensed), 'C*OC |$;Cy;;$|')

  def testAbbreviationsAndLinkers(self):
    mol = Chem.MolFromSmiles('COC1CCC(C)CC1')

    # wouldn't normally do this in this order:
    step = rdAbbreviations.CondenseMolAbbreviations(mol, self.defaultAbbrevs, maxCoverage=1.0)
    self.assertEqual(Chem.MolToCXSmiles(step), '*C1CCC(C)CC1 |$OMe;;;;;;;$|')
    step = rdAbbreviations.CondenseMolAbbreviations(step, self.customLinkers, maxCoverage=1.0)
    self.assertEqual(Chem.MolToCXSmiles(step), '**C |$OMe;Cy;$|')

    # This is a more logical order
    step = rdAbbreviations.CondenseMolAbbreviations(mol, self.customLinkers, maxCoverage=1.0)
    self.assertEqual(Chem.MolToCXSmiles(step), 'C*OC |$;Cy;;$|')
    step = rdAbbreviations.CondenseMolAbbreviations(step, self.defaultAbbrevs, maxCoverage=1.0)
    self.assertEqual(Chem.MolToCXSmiles(step), 'C*OC |$;Cy;;$|')

  def testAbbreviationsSubstanceGroups(self):
    # input already carries a "SUP" substance group labelled CF3
    molblock = '''
  Mrv2014 09152006492D

  0  0  0     0  0            999 V3000
M  V30 BEGIN CTAB
M  V30 COUNTS 7 7 1 0 0
M  V30 BEGIN ATOM
M  V30 1 C 5.25 -5.9858 0 0
M  V30 2 C 4.48 -7.3196 0 0
M  V30 3 C 6.02 -7.3196 0 0
M  V30 4 F 8.6873 -8.8596 0 0
M  V30 5 C 7.3537 -8.0896 0 0
M  V30 6 F 6.02 -8.8596 0 0
M  V30 7 F 7.3537 -6.5496 0 0
M  V30 END ATOM
M  V30 BEGIN BOND
M  V30 1 1 1 2
M  V30 2 1 3 1
M  V30 3 1 2 3
M  V30 4 1 3 5
M  V30 5 1 4 5
M  V30 6 1 5 6
M  V30 7 1 5 7
M  V30 END BOND
M  V30 BEGIN SGROUP
M  V30 1 SUP 0 ATOMS=(4 4 5 6 7) SAP=(3 5 3 1) XBONDS=(1 4) LABEL=CF3
M  V30 END SGROUP
M  V30 END CTAB
M  END'''
    mol = Chem.MolFromMolBlock(molblock)
    condensed = rdAbbreviations.CondenseAbbreviationSubstanceGroups(mol)
    condensed.RemoveAllConformers()  # avoid coords in CXSMILES
    self.assertEqual(Chem.MolToCXSmiles(condensed), '*C1CC1 |$CF3;;;$|')


if __name__ == '__main__':  # pragma: nocover
  unittest.main()
|
||||
530
Code/GraphMol/Abbreviations/catch_tests.cpp
Normal file
530
Code/GraphMol/Abbreviations/catch_tests.cpp
Normal file
@@ -0,0 +1,530 @@
|
||||
//
|
||||
// Copyright (C) 2020 Greg Landrum and T5 Informatics GmbH
|
||||
//
|
||||
// @@ All Rights Reserved @@
|
||||
// This file is part of the RDKit.
|
||||
// The contents are covered by the terms of the BSD license
|
||||
// which is included in the file license.txt, found at the root
|
||||
// of the RDKit source tree.
|
||||
//
|
||||
|
||||
#include "catch.hpp"
|
||||
#include "RDGeneral/test.h"
|
||||
#include <GraphMol/RDKitBase.h>
|
||||
#include <GraphMol/Abbreviations/Abbreviations.h>
|
||||
#include <GraphMol/SmilesParse/SmilesWrite.h>
|
||||
#include <GraphMol/SmilesParse/SmilesParse.h>
|
||||
#include <GraphMol/FileParsers/SequenceParsers.h>
|
||||
#include <GraphMol/FileParsers/FileParsers.h>
|
||||
|
||||
using namespace RDKit;
|
||||
|
||||
// Checks the contents of the default abbreviation/linker definitions and the
// handling of a definition whose SMILES cannot be parsed.
TEST_CASE("parsing") {
  SECTION("abbreviations") {
    auto abbrevs = Abbreviations::Utils::getDefaultAbbreviations();
    CHECK(abbrevs.size() == 37);
    // abbrevs[0] is inspected below: stop here instead of indexing an empty
    // vector
    REQUIRE(!abbrevs.empty());
    CHECK(abbrevs[0].label == "CO2Et");
    CHECK(abbrevs[0].displayLabel == "CO<sub>2</sub>Et");
    CHECK(abbrevs[0].displayLabelW == "EtO<sub>2</sub>C");
    CHECK(abbrevs[0].smarts == "C(=O)OCC");
    REQUIRE(abbrevs[0].mol);
    CHECK(abbrevs[0].mol->getNumAtoms() == 6);
    unsigned int nDummies = 0;
    CHECK(abbrevs[0].mol->getPropIfPresent(
        Abbreviations::common_properties::numDummies, nDummies));
    CHECK(nDummies == 1);
  }
  SECTION("linkers") {
    auto abbrevs = Abbreviations::Utils::getDefaultLinkers();
    CHECK(abbrevs.size() == 8);
    // abbrevs[0] is inspected below: stop here instead of indexing an empty
    // vector
    REQUIRE(!abbrevs.empty());
    CHECK(abbrevs[0].label == "PEG6");
    CHECK(abbrevs[0].displayLabel == "PEG6");
    CHECK(abbrevs[0].displayLabelW.empty());
    CHECK(abbrevs[0].smarts == "*OCCOCCOCCOCCOCCOCC*");
    REQUIRE(abbrevs[0].mol);
    CHECK(abbrevs[0].mol->getNumAtoms() == 19);
    unsigned int nDummies = 0;
    CHECK(abbrevs[0].mol->getPropIfPresent(
        Abbreviations::common_properties::numDummies, nDummies));
    CHECK(nDummies == 1);
  }
  SECTION("bad SMILES in defintions") {
    // the second entry ("fail") is expected to be dropped from the result
    const std::string defns = R"ABBREVS(CO2Et C(=O)OCC
COOEt fail
OiBu OCC(C)C)ABBREVS";
    auto abbrevs = Abbreviations::Utils::parseAbbreviations(defns);
    REQUIRE(abbrevs.size() == 2);
    CHECK(abbrevs[0].label == "CO2Et");
    CHECK(abbrevs[1].label == "OiBu");
  }
}
|
||||
|
||||
// Exercises Abbreviations::findApplicableAbbreviationMatches() with the
// default abbreviation definitions.
TEST_CASE("findApplicableMatches") {
  auto abbrevs = Abbreviations::Utils::getDefaultAbbreviations();
  SECTION("basics") {
    auto m = "NCCC(F)(F)F"_smiles;
    REQUIRE(m);
    {
      double maxCoverage = 0.4;
      auto matches = Abbreviations::findApplicableAbbreviationMatches(
          *m, abbrevs, maxCoverage);
      CHECK(matches.empty());
    }
    {
      double maxCoverage = 1.0;
      auto matches = Abbreviations::findApplicableAbbreviationMatches(
          *m, abbrevs, maxCoverage);
      // REQUIRE rather than CHECK: matches[0] is dereferenced below
      REQUIRE(matches.size() == 1);
      CHECK(matches[0].abbrev.label == "CF3");
      CHECK(matches[0].match[0].second == 2);
      CHECK(matches[0].match[1].second == 3);
    }
  }
  SECTION("multiple abbreviations") {
    {
      auto m = "FC(F)(F)CC(=O)O"_smiles;
      REQUIRE(m);
      double maxCoverage = 1.0;
      auto matches = Abbreviations::findApplicableAbbreviationMatches(
          *m, abbrevs, maxCoverage);
      // REQUIRE rather than CHECK: matches[0] and matches[1] are
      // dereferenced below
      REQUIRE(matches.size() == 2);
      CHECK(matches[0].abbrev.label == "CF3");
      CHECK(matches[1].abbrev.label == "CO2H");
    }
    {  // overlapping
      auto m = "FC(F)(F)C(=O)O"_smiles;
      REQUIRE(m);
      double maxCoverage = 1.0;
      auto matches = Abbreviations::findApplicableAbbreviationMatches(
          *m, abbrevs, maxCoverage);
      CHECK(matches.empty());
    }
    {  // overlapping
      auto m = "FC(F)(F)C(F)(F)F"_smiles;
      REQUIRE(m);
      double maxCoverage = 1.0;
      auto matches = Abbreviations::findApplicableAbbreviationMatches(
          *m, abbrevs, maxCoverage);
      CHECK(matches.empty());
    }
    {  // overlapping, one is too big, so there is an abbreviation for the other
      auto m = "CCC(F)(F)F"_smiles;
      REQUIRE(m);
      double maxCoverage = 0.4;
      auto matches = Abbreviations::findApplicableAbbreviationMatches(
          *m, abbrevs, maxCoverage);
      // REQUIRE rather than CHECK: matches[0] is dereferenced below
      REQUIRE(matches.size() == 1);
      CHECK(matches[0].abbrev.label == "Et");
      // remove the size constraint and there's no abbreviation:
      maxCoverage = 1.0;
      matches = Abbreviations::findApplicableAbbreviationMatches(*m, abbrevs,
                                                                 maxCoverage);
      CHECK(matches.empty());
    }
  }
}
|
||||
// Exercises Abbreviations::findApplicableAbbreviationMatches() with the
// default linker definitions.
TEST_CASE("findApplicableMatches linkers") {
  auto linkers = Abbreviations::Utils::getDefaultLinkers();
  SECTION("basics") {
    {
      auto m = "FCOCCOCCOCCNCCCCCCCCl"_smiles;
      REQUIRE(m);
      double maxCoverage = 1.0;
      auto matches = Abbreviations::findApplicableAbbreviationMatches(
          *m, linkers, maxCoverage);
      // REQUIRE rather than CHECK: matches[0] and matches[1] are
      // dereferenced below
      REQUIRE(matches.size() == 2);
      CHECK(matches[0].abbrev.label == "PEG3");
      CHECK(matches[1].abbrev.label == "Hept");
    }
    {  // directly connected
      auto m = "FCOCCOCCOCCCCCCCCCCl"_smiles;
      REQUIRE(m);
      double maxCoverage = 1.0;
      auto matches = Abbreviations::findApplicableAbbreviationMatches(
          *m, linkers, maxCoverage);
      // REQUIRE rather than CHECK: matches[0] and matches[1] are
      // dereferenced below
      REQUIRE(matches.size() == 2);
      CHECK(matches[0].abbrev.label == "PEG3");
      CHECK(matches[1].abbrev.label == "Hept");
      CHECK(matches[0].match[9].second == 10);
      CHECK(matches[1].match[0].second == 10);
    }
  }
}
|
||||
|
||||
// Finds the applicable default abbreviations in a molecule and applies them
// in place with Abbreviations::applyMatches().
TEST_CASE("applyMatches") {
  auto defaultAbbrevs = Abbreviations::Utils::getDefaultAbbreviations();
  SECTION("basics") {
    const double fullCoverage = 1.0;
    auto mol = "FC(F)(F)CC(=O)O"_smiles;
    REQUIRE(mol);
    auto found = Abbreviations::findApplicableAbbreviationMatches(
        *mol, defaultAbbrevs, fullCoverage);
    CHECK(found.size() == 2);
    Abbreviations::applyMatches(*mol, found);
    // each matched group has been collapsed
    CHECK(mol->getNumAtoms() == 3);
    CHECK(MolToCXSmiles(*mol) == "*C* |$CF3;;CO2H$|");
  }
}
|
||||
|
||||
// Same as "applyMatches", but using a custom set of linker definitions.
TEST_CASE("applyMatches linkers") {
  auto customLinkers =
      Abbreviations::Utils::parseLinkers(R"ABBREV(PEG3 *OCCOCCOCC* PEG3
Pent *CCCCC*
Cy *C1CCC(*)CC1 Cy)ABBREV");
  SECTION("basics") {
    const double fullCoverage = 1.0;
    {
      // two linkers in a chain
      auto mol = "FCOCCOCCOCCCCCCCCl"_smiles;
      REQUIRE(mol);
      auto found = Abbreviations::findApplicableAbbreviationMatches(
          *mol, customLinkers, fullCoverage);
      CHECK(found.size() == 2);
      Abbreviations::applyMatches(*mol, found);
      CHECK(mol->getNumAtoms() == 5);
      CHECK(MolToCXSmiles(*mol) == "FC**Cl |$;;PEG3;Pent;$|");
    }
    {
      // ring linker
      auto mol = "COC1CCC(C)CC1"_smiles;
      REQUIRE(mol);
      auto found = Abbreviations::findApplicableAbbreviationMatches(
          *mol, customLinkers, fullCoverage);
      CHECK(found.size() == 1);
      Abbreviations::applyMatches(*mol, found);
      CHECK(mol->getNumAtoms() == 4);
      CHECK(MolToCXSmiles(*mol) == "C*OC |$;Cy;;$|");
    }
  }
}
|
||||
|
||||
// Single-call variant: condenseMolAbbreviations() finds and applies the
// default abbreviations in place.
TEST_CASE("condense abbreviations") {
  auto defaultAbbrevs = Abbreviations::Utils::getDefaultAbbreviations();
  SECTION("basics") {
    const double fullCoverage = 1.0;
    auto mol = "FC(F)(F)CC(=O)O"_smiles;
    REQUIRE(mol);
    Abbreviations::condenseMolAbbreviations(*mol, defaultAbbrevs,
                                            fullCoverage);
    CHECK(MolToCXSmiles(*mol) == "*C* |$CF3;;CO2H$|");
  }
}
|
||||
|
||||
// condenseMolAbbreviations() with the default linkers and with a custom
// linker set that also contains amino-acid residues.
TEST_CASE("condense abbreviations linkers") {
  auto defaultLinkers = Abbreviations::Utils::getDefaultLinkers();
  auto customLinkers =
      Abbreviations::Utils::parseLinkers(R"ABBREV(PEG3 *OCCOCCOCC* PEG3
Pent *CCCCC*
Cy *C1CCC(*)CC1 Cy
ala *N[C@@H](C)C(=O)* ala
arg *N[C@@H](CCCNC(N)=[NH])C(=O)* arg
asn *N[C@@H](CC(N)=O)C(=O)* asn
asp *N[C@@H](CC(O)=O)C(=O)* asp
cys *N[C@@H](CS)C(=O)* cys
gln *N[C@@H](CCC(N)=O)C(=O)* gln
glu *N[C@@H](CCC(O)=O)C(=O)* glu
gly *NCC(=O)* gly
his *N[C@@H](Cc1c[nH]cn1)C(=O)* his
ile *N[C@@H](C(C)CC)C(=O)* ile
leu *N[C@@H](CC(C)C)C(=O)* leu
lys *N[C@@H](CCCCN)C(=O)* lys
met *N[C@@H](CCSC)C(=O)* met
phe *N[C@@H](Cc1ccccc1)C(=O)* phe
pro *N1[C@@H](CCC1)C(=O)* pro
ser *N[C@@H](CO)C(=O)* ser
thr *N[C@@H](C(O)C)C(=O)* thr
trp *N[C@@H](Cc1c[nH]c2ccccc21)C(=O)* trp
tyr *N[C@@H](Cc1ccc(O)cc1)C(=O)* tyr
val *N[C@@H](C(C)C)C(=O)* val)ABBREV");
  const double fullCoverage = 1.0;
  SECTION("basics") {
    {
      auto mol = "FCOCCOCCOCCCCCCCCCCl"_smiles;
      REQUIRE(mol);
      Abbreviations::condenseMolAbbreviations(*mol, defaultLinkers,
                                              fullCoverage);
      CHECK(mol->getNumAtoms() == 5);
      CHECK(MolToCXSmiles(*mol) == "FC**Cl |$;;PEG3;Hept;$|");
    }
    {
      auto mol = "COC1CCC(C)CC1"_smiles;
      REQUIRE(mol);
      Abbreviations::condenseMolAbbreviations(*mol, customLinkers,
                                              fullCoverage);
      CHECK(mol->getNumAtoms() == 4);
      CHECK(MolToCXSmiles(*mol) == "C*OC |$;Cy;;$|");
    }
  }
  SECTION("peptides") {
    // molecule built from a one-letter peptide sequence
    std::unique_ptr<RWMol> peptide(SequenceToMol("GYTKC"));
    REQUIRE(peptide);
    Abbreviations::condenseMolAbbreviations(*peptide, customLinkers,
                                            fullCoverage);
    CHECK(MolToCXSmiles(*peptide) == "NCC(=O)****O |$;;;;tyr;thr;lys;cys;$|");
  }
}
|
||||
|
||||
// Applies abbreviations and linkers one after the other, in both orders, and
// pins the result of each step.
TEST_CASE("abbreviations and linkers") {
  auto defaultAbbrevs = Abbreviations::Utils::getDefaultAbbreviations();
  auto cyLinker = Abbreviations::Utils::parseLinkers(
      R"ABBREV(Cy *C1CCC(*)CC1 Cy)ABBREV");
  SECTION("basics") {
    const double fullCoverage = 1.0;
    {  // this isn't the order we'd normally do this in:
      auto mol = "COC1CCC(C)CC1"_smiles;
      REQUIRE(mol);
      Abbreviations::condenseMolAbbreviations(*mol, defaultAbbrevs,
                                              fullCoverage);
      CHECK(mol->getNumAtoms() == 8);
      CHECK(MolToCXSmiles(*mol) == "*C1CCC(C)CC1 |$OMe;;;;;;;$|");
      Abbreviations::condenseMolAbbreviations(*mol, cyLinker, fullCoverage);
      CHECK(mol->getNumAtoms() == 3);
      CHECK(MolToCXSmiles(*mol) == "**C |$OMe;Cy;$|");
    }
    {  // a more sensible order
      auto mol = "COC1CCC(C)CC1"_smiles;
      REQUIRE(mol);
      Abbreviations::condenseMolAbbreviations(*mol, cyLinker, fullCoverage);
      CHECK(mol->getNumAtoms() == 4);
      CHECK(MolToCXSmiles(*mol) == "C*OC |$;Cy;;$|");
      // the second pass is expected to leave the molecule unchanged
      Abbreviations::condenseMolAbbreviations(*mol, defaultAbbrevs,
                                              fullCoverage);
      CHECK(mol->getNumAtoms() == 4);
      CHECK(MolToCXSmiles(*mol) == "C*OC |$;Cy;;$|");
    }
  }
}
|
||||
|
||||
// Abbreviations::labelMatches() keeps every atom and records each match as a
// "SUP" substance group on the molecule.
TEST_CASE("labelMatches") {
  auto abbrevs = Abbreviations::Utils::getDefaultAbbreviations();
  SECTION("basics") {
    {
      auto m = "CC(C)CC(F)(F)F"_smiles;
      REQUIRE(m);
      double maxCoverage = 1.0;
      auto matches = Abbreviations::findApplicableAbbreviationMatches(
          *m, abbrevs, maxCoverage);
      CHECK(matches.size() == 2);
      Abbreviations::labelMatches(*m, matches);
      // labelling does not remove atoms
      CHECK(m->getNumAtoms() == 8);
      const auto &sgs = getSubstanceGroups(*m);
      REQUIRE(sgs.size() == 2);
      CHECK(sgs[0].getProp<std::string>("TYPE") == "SUP");
      CHECK(sgs[0].getProp<std::string>("LABEL") == "iPr");
      CHECK(sgs[0].getBonds() == std::vector<unsigned int>({2}));
      CHECK(sgs[0].getAtoms() == std::vector<unsigned int>({1, 0, 2}));
      // REQUIRE rather than CHECK: the first attach point is read below
      REQUIRE(sgs[0].getAttachPoints().size() == 1);
      CHECK(sgs[0].getAttachPoints()[0].aIdx == 1);
      CHECK(sgs[0].getAttachPoints()[0].lvIdx == 3);

      CHECK(sgs[1].getProp<std::string>("TYPE") == "SUP");
      CHECK(sgs[1].getProp<std::string>("LABEL") == "CF3");
      CHECK(sgs[1].getBonds() == std::vector<unsigned int>({3}));
      CHECK(sgs[1].getAtoms() == std::vector<unsigned int>({4, 5, 6, 7}));
      // REQUIRE rather than CHECK: the first attach point is read below
      REQUIRE(sgs[1].getAttachPoints().size() == 1);
      CHECK(sgs[1].getAttachPoints()[0].aIdx == 4);
      CHECK(sgs[1].getAttachPoints()[0].lvIdx == 3);
    }
  }
}
|
||||
|
||||
// Single-call variant of "labelMatches": labelMolAbbreviations() finds the
// matches and records them as "SUP" substance groups.
TEST_CASE("labelMolAbbreviations") {
  auto abbrevs = Abbreviations::Utils::getDefaultAbbreviations();
  SECTION("basics") {
    {
      auto m = "CC(C)CC(F)(F)F"_smiles;
      REQUIRE(m);
      double maxCoverage = 1.0;
      Abbreviations::labelMolAbbreviations(*m, abbrevs, maxCoverage);
      // labelling does not remove atoms
      CHECK(m->getNumAtoms() == 8);
      const auto &sgs = getSubstanceGroups(*m);
      REQUIRE(sgs.size() == 2);
      CHECK(sgs[0].getProp<std::string>("TYPE") == "SUP");
      CHECK(sgs[0].getProp<std::string>("LABEL") == "iPr");
      CHECK(sgs[0].getBonds() == std::vector<unsigned int>({2}));
      CHECK(sgs[0].getAtoms() == std::vector<unsigned int>({1, 0, 2}));
      // REQUIRE rather than CHECK: the first attach point is read below
      REQUIRE(sgs[0].getAttachPoints().size() == 1);
      CHECK(sgs[0].getAttachPoints()[0].aIdx == 1);
      CHECK(sgs[0].getAttachPoints()[0].lvIdx == 3);

      CHECK(sgs[1].getProp<std::string>("TYPE") == "SUP");
      CHECK(sgs[1].getProp<std::string>("LABEL") == "CF3");
      CHECK(sgs[1].getBonds() == std::vector<unsigned int>({3}));
      CHECK(sgs[1].getAtoms() == std::vector<unsigned int>({4, 5, 6, 7}));
      // REQUIRE rather than CHECK: the first attach point is read below
      REQUIRE(sgs[1].getAttachPoints().size() == 1);
      CHECK(sgs[1].getAttachPoints()[0].aIdx == 4);
      CHECK(sgs[1].getAttachPoints()[0].lvIdx == 3);
    }
  }
}
|
||||
|
||||
// Reads V3000 mol blocks that already carry "SUP" substance groups and checks
// that condenseAbbreviationSubstanceGroups() collapses each group.
TEST_CASE("condenseAbbreviationSubstanceGroups") {
  SECTION("abbreviations") {
    auto mol = R"CTAB(
  ACCLDraw09152005292D

  0  0  0     0  0            999 V3000
M  V30 BEGIN CTAB
M  V30 COUNTS 10 10 2 0 0
M  V30 BEGIN ATOM
M  V30 1 C 12.8333 -9.32 0 0 CFG=3
M  V30 2 C 13.8565 -8.7293 0 0
M  V30 3 O 14.8802 -9.3201 0 0
M  V30 4 O 13.8565 -7.5471 0 0
M  V30 5 C 11.6489 -9.32 0 0
M  V30 6 C 12.241 -10.3432 0 0 CFG=3
M  V30 7 C 12.241 -11.5253 0 0 CFG=3
M  V30 8 F 12.241 -12.5874 0 0
M  V30 9 F 11.0366 -11.5253 0 0
M  V30 10 F 13.4231 -11.5253 0 0
M  V30 END ATOM
M  V30 BEGIN BOND
M  V30 1 2 2 4
M  V30 2 1 2 3
M  V30 3 1 1 2
M  V30 4 1 5 6
M  V30 5 1 5 1
M  V30 6 1 1 6
M  V30 7 1 7 10
M  V30 8 1 7 9
M  V30 9 1 7 8
M  V30 10 1 6 7
M  V30 END BOND
M  V30 BEGIN SGROUP
M  V30 1 SUP 1 ATOMS=(3 2 3 4) XBONDS=(1 3) CSTATE=(4 3 -1.02 -0.59 0) LABEL=-
M  V30 CO2H
M  V30 2 SUP 2 ATOMS=(4 7 8 9 10) XBONDS=(1 10) CSTATE=(4 10 0 1.18 0) LABEL=-
M  V30 CF3
M  V30 END SGROUP
M  V30 END CTAB
M  END)CTAB"_ctab;
    REQUIRE(mol);
    CHECK(mol->getNumAtoms() == 10);
    Abbreviations::condenseAbbreviationSubstanceGroups(*mol);
    CHECK(mol->getNumAtoms() == 5);
    // remove the conformer before generating CXSMILES
    mol->clearConformers();
    CHECK(MolToCXSmiles(*mol) == "*C1CC1* |$CO2H;;;;CF3$|");
  }
  SECTION("abbreviations MRV") {
    auto mol = R"CTAB(
  Mrv2014 09152006492D

  0  0  0     0  0            999 V3000
M  V30 BEGIN CTAB
M  V30 COUNTS 7 7 1 0 0
M  V30 BEGIN ATOM
M  V30 1 C 5.25 -5.9858 0 0
M  V30 2 C 4.48 -7.3196 0 0
M  V30 3 C 6.02 -7.3196 0 0
M  V30 4 F 8.6873 -8.8596 0 0
M  V30 5 C 7.3537 -8.0896 0 0
M  V30 6 F 6.02 -8.8596 0 0
M  V30 7 F 7.3537 -6.5496 0 0
M  V30 END ATOM
M  V30 BEGIN BOND
M  V30 1 1 1 2
M  V30 2 1 3 1
M  V30 3 1 2 3
M  V30 4 1 3 5
M  V30 5 1 4 5
M  V30 6 1 5 6
M  V30 7 1 5 7
M  V30 END BOND
M  V30 BEGIN SGROUP
M  V30 1 SUP 0 ATOMS=(4 4 5 6 7) SAP=(3 5 3 1) XBONDS=(1 4) LABEL=CF3
M  V30 END SGROUP
M  V30 END CTAB
M  END
)CTAB"_ctab;
    REQUIRE(mol);
    CHECK(mol->getNumAtoms() == 7);
    Abbreviations::condenseAbbreviationSubstanceGroups(*mol);
    CHECK(mol->getNumAtoms() == 4);
    // remove the conformer before generating CXSMILES
    mol->clearConformers();
    CHECK(MolToCXSmiles(*mol) == "*C1CC1 |$CF3;;;$|");
  }

  SECTION("linker") {
    auto mol = R"CTAB(
  ACCLDraw09152006102D

  0  0  0     0  0            999 V3000
M  V30 BEGIN CTAB
M  V30 COUNTS 8 7 1 0 0
M  V30 BEGIN ATOM
M  V30 1 C 7.2482 -5.1911 0 0
M  V30 2 O 5.8143 -6.2327 0 0
M  V30 3 C 6.77 -5.5382 0 0
M  V30 4 C 7.8494 -6.0186 0 0
M  V30 5 O 8.8052 -5.3241 0 0
M  V30 6 C 9.8845 -5.8046 0 0
M  V30 7 C 10.8403 -5.1101 0 0
M  V30 8 C 9.4066 -6.1518 0 0
M  V30 END ATOM
M  V30 BEGIN BOND
M  V30 1 1 1 2
M  V30 2 1 2 3
M  V30 3 1 3 4
M  V30 4 1 4 5
M  V30 5 1 5 6
M  V30 6 1 6 7
M  V30 7 1 7 8
M  V30 END BOND
M  V30 BEGIN SGROUP
M  V30 1 SUP 1 ATOMS=(6 2 3 4 5 6 7) XBONDS=(2 1 7) CSTATE=(4 1 -1.08 0.48 0) -
M  V30 CSTATE=(4 7 1.08 -0.48 0) LABEL=PEG2
M  V30 END SGROUP
M  V30 END CTAB
M  END)CTAB"_ctab;
    REQUIRE(mol);
    CHECK(mol->getNumAtoms() == 8);
    Abbreviations::condenseAbbreviationSubstanceGroups(*mol);
    CHECK(mol->getNumAtoms() == 3);
    // remove the conformer before generating CXSMILES
    mol->clearConformers();
    CHECK(MolToCXSmiles(*mol) == "C*C |$;PEG2;$|");
  }
  SECTION("linker MRV") {
    auto mol = R"CTAB(
  Mrv2014 09152006522D

  0  0  0     0  0            999 V3000
M  V30 BEGIN CTAB
M  V30 COUNTS 8 7 1 0 0
M  V30 BEGIN ATOM
M  V30 1 C 1.625 -8.9167 0 0
M  V30 2 O 2.9587 -8.1467 0 0
M  V30 3 C 4.2924 -8.9167 0 0
M  V30 4 C 5.626 -8.1467 0 0
M  V30 5 O 6.9597 -8.9167 0 0
M  V30 6 C 8.2934 -8.1467 0 0
M  V30 7 C 9.6271 -8.9167 0 0
M  V30 8 C 10.9608 -8.1467 0 0
M  V30 END ATOM
M  V30 BEGIN BOND
M  V30 1 1 1 2
M  V30 2 1 2 3
M  V30 3 1 3 4
M  V30 4 1 4 5
M  V30 5 1 5 6
M  V30 6 1 6 7
M  V30 7 1 7 8
M  V30 END BOND
M  V30 BEGIN SGROUP
M  V30 1 SUP 0 ATOMS=(6 2 3 4 5 6 7) SAP=(3 2 1 1) SAP=(3 7 8 2) XBONDS=(2 1 -
M  V30 7) LABEL=PEG2 ESTATE=E
M  V30 END SGROUP
M  V30 END CTAB
M  END
)CTAB"_ctab;
    REQUIRE(mol);
    CHECK(mol->getNumAtoms() == 8);
    Abbreviations::condenseAbbreviationSubstanceGroups(*mol);
    CHECK(mol->getNumAtoms() == 3);
    // remove the conformer before generating CXSMILES
    mol->clearConformers();
    CHECK(MolToCXSmiles(*mol) == "C*C |$;PEG2;$|");
  }
}
|
||||
@@ -109,6 +109,7 @@ endif()
|
||||
add_subdirectory(MolStandardize)
|
||||
add_subdirectory(ScaffoldNetwork)
|
||||
add_subdirectory(MolEnumerator)
|
||||
add_subdirectory(Abbreviations)
|
||||
|
||||
|
||||
rdkit_test(graphmolTest1 test1.cpp LINK_LIBRARIES FileParsers SmilesParse GraphMol
|
||||
|
||||
@@ -3018,13 +3018,14 @@ void MolDraw2D::adjustBondEndForLabel(int atnum, const Point2D &nbr_cds,
|
||||
pair<string, OrientType> MolDraw2D::getAtomSymbolAndOrientation(
|
||||
const Atom &atom) const {
|
||||
OrientType orient = getAtomOrientation(atom);
|
||||
string symbol = getAtomSymbol(atom);
|
||||
string symbol = getAtomSymbol(atom, orient);
|
||||
|
||||
return std::make_pair(symbol, orient);
|
||||
}
|
||||
|
||||
// ****************************************************************************
|
||||
string MolDraw2D::getAtomSymbol(const RDKit::Atom &atom) const {
|
||||
string MolDraw2D::getAtomSymbol(const RDKit::Atom &atom,
|
||||
OrientType orientation) const {
|
||||
// adds XML-like annotation for super- and sub-script, in the same manner
|
||||
// as MolDrawing.py. My first thought was for a LaTeX-like system,
|
||||
// obviously...
|
||||
@@ -3037,6 +3038,25 @@ string MolDraw2D::getAtomSymbol(const RDKit::Atom &atom) const {
|
||||
// specified labels are trump: no matter what else happens we will show
|
||||
// them.
|
||||
symbol = drawOptions().atomLabels.find(atom.getIdx())->second;
|
||||
} else if (atom.hasProp(common_properties::_displayLabel) ||
|
||||
atom.hasProp(common_properties::_displayLabelW)) {
|
||||
// logic here: if either _displayLabel or _displayLabelW is set, we will
|
||||
// definitely use one of those. if only one is set, we'll use that one if
|
||||
// both are set and the orientation is W then we'll use _displayLabelW,
|
||||
// otherwise _displayLabel
|
||||
|
||||
std::string lbl;
|
||||
std::string lblw;
|
||||
atom.getPropIfPresent(common_properties::_displayLabel, lbl);
|
||||
atom.getPropIfPresent(common_properties::_displayLabelW, lblw);
|
||||
if (lbl.empty()) {
|
||||
lbl = lblw;
|
||||
}
|
||||
if (orientation == OrientType::W && !lblw.empty()) {
|
||||
symbol = lblw;
|
||||
} else {
|
||||
symbol = lbl;
|
||||
}
|
||||
} else if (atom.hasProp(common_properties::atomLabel)) {
|
||||
symbol = atom.getProp<std::string>(common_properties::atomLabel);
|
||||
} else if (drawOptions().dummiesAreAttachments && atom.getAtomicNum() == 0 &&
|
||||
@@ -3120,7 +3140,7 @@ string MolDraw2D::getAtomSymbol(const RDKit::Atom &atom) const {
|
||||
}
|
||||
// cout << "Atom symbol " << atom.getIdx() << " : " << symbol << endl;
|
||||
return symbol;
|
||||
}
|
||||
} // namespace RDKit
|
||||
|
||||
// ****************************************************************************
|
||||
OrientType MolDraw2D::getAtomOrientation(const RDKit::Atom &atom) const {
|
||||
|
||||
@@ -151,7 +151,7 @@ typedef std::vector<unsigned int> DashPattern;
|
||||
inline void assignDefaultPalette(ColourPalette &palette) {
|
||||
palette.clear();
|
||||
palette[-1] = DrawColour(0, 0, 0);
|
||||
palette[0] = DrawColour(0.5, 0.5, 0.5);
|
||||
palette[0] = DrawColour(0.1, 0.1, 0.1);
|
||||
palette[1] = palette[6] = DrawColour(0.0, 0.0, 0.0);
|
||||
palette[7] = DrawColour(0.0, 0.0, 1.0);
|
||||
palette[8] = DrawColour(1.0, 0.0, 0.0);
|
||||
@@ -782,7 +782,7 @@ class RDKIT_MOLDRAW2D_EXPORT MolDraw2D {
|
||||
// adds LaTeX-like annotation for super- and sub-script.
|
||||
std::pair<std::string, OrientType> getAtomSymbolAndOrientation(
|
||||
const Atom &atom) const;
|
||||
std::string getAtomSymbol(const Atom &atom) const;
|
||||
std::string getAtomSymbol(const Atom &atom, OrientType orientation) const;
|
||||
OrientType getAtomOrientation(const Atom &atom) const;
|
||||
|
||||
// things used by calculateScale.
|
||||
|
||||
@@ -928,7 +928,8 @@ std::string get_coords_block(const ROMol &mol,
|
||||
std::string get_atom_props_block(const ROMol &mol,
|
||||
const std::vector<unsigned int> &atomOrder) {
|
||||
std::vector<std::string> skip = {common_properties::atomLabel,
|
||||
common_properties::molFileValue};
|
||||
common_properties::molFileValue,
|
||||
common_properties::molParity};
|
||||
std::string res = "";
|
||||
unsigned int which = 0;
|
||||
for (auto idx : atomOrder) {
|
||||
|
||||
19
Code/JavaWrappers/Abbreviations.i
Normal file
19
Code/JavaWrappers/Abbreviations.i
Normal file
@@ -0,0 +1,19 @@
|
||||
/*
|
||||
*
|
||||
* Copyright (c) 2020, Greg Landrum and T5 Informatics GmbH
|
||||
* All rights reserved.
|
||||
*
|
||||
* This file is part of the RDKit.
|
||||
* The contents are covered by the terms of the BSD license
|
||||
* which is included in the file license.txt, found at the root
|
||||
* of the RDKit source tree.
|
||||
*
|
||||
*/
|
||||
%{
|
||||
#include <GraphMol/Abbreviations/Abbreviations.h>
|
||||
%}
|
||||
|
||||
|
||||
%template(AbbreviationDefinition_Vect) std::vector<RDKit::Abbreviations::AbbreviationDefinition>;
|
||||
%template(AbbreviationMatch_Vect) std::vector<RDKit::Abbreviations::AbbreviationMatch>;
|
||||
%include <GraphMol/Abbreviations/Abbreviations.h>
|
||||
@@ -20,7 +20,7 @@ if(RDK_BUILD_INCHI_SUPPORT)
|
||||
set(swigRDKitLibList "${swigRDKitLibList}RDInchiLib;${INCHI_LIBRARIES};")
|
||||
endif(RDK_BUILD_INCHI_SUPPORT)
|
||||
set(swigRDKitLibList "${swigRDKitLibList}"
|
||||
"ScaffoldNetwork;MolHash;RGroupDecomposition;SubstructLibrary;TautomerQuery;"
|
||||
"Abbreviations;ScaffoldNetwork;MolHash;RGroupDecomposition;SubstructLibrary;TautomerQuery;"
|
||||
"MolEnumerator;"
|
||||
"MolStandardize;FilterCatalog;Catalogs;FMCS;MolDraw2D;FileParsers;SmilesParse;"
|
||||
"Depictor;SubstructMatch;ChemReactions;Fingerprints;ChemTransforms;"
|
||||
|
||||
@@ -256,6 +256,7 @@ typedef unsigned long long int uintmax_t;
|
||||
%include "../TautomerQuery.i"
|
||||
%include "../SubstanceGroup.i"
|
||||
%include "../MolHash.i"
|
||||
%include "../Abbreviations.i"
|
||||
%include "../Streams.i"
|
||||
|
||||
|
||||
|
||||
@@ -374,6 +374,11 @@ ADD_TEST(JavaMolHashTest
|
||||
-cp "${JUNIT_JAR}${PATH_SEP}${CMAKE_JAVA_TEST_OUTDIR}${PATH_SEP}${CMAKE_CURRENT_SOURCE_DIR}/org.RDKit.jar"
|
||||
org.RDKit.MolHashTest)
|
||||
|
||||
ADD_TEST(JavaAbbreviationsTests
|
||||
java -Djava.library.path=${CMAKE_CURRENT_SOURCE_DIR}
|
||||
-cp "${JUNIT_JAR}${PATH_SEP}${CMAKE_JAVA_TEST_OUTDIR}${PATH_SEP}${CMAKE_CURRENT_SOURCE_DIR}/org.RDKit.jar"
|
||||
org.RDKit.AbbreviationsTests)
|
||||
|
||||
ADD_TEST(JavaDiversityPickerTests
|
||||
java -Djava.library.path=${CMAKE_CURRENT_SOURCE_DIR}
|
||||
-cp "${JUNIT_JAR}${PATH_SEP}${CMAKE_JAVA_TEST_OUTDIR}${PATH_SEP}${CMAKE_CURRENT_SOURCE_DIR}/org.RDKit.jar"
|
||||
|
||||
@@ -238,6 +238,7 @@ typedef unsigned long long int uintmax_t;
|
||||
%include "../SubstanceGroup.i"
|
||||
%include "../MolEnumerator.i"
|
||||
%include "../MolHash.i"
|
||||
%include "../Abbreviations.i"
|
||||
%include "../Streams.i"
|
||||
|
||||
// Create a class to throw various sorts of errors for testing. Required for unit tests in ErrorHandlingTests.java
|
||||
|
||||
@@ -0,0 +1,72 @@
|
||||
/*
|
||||
*
|
||||
* Copyright (c) 2019 Greg Landrum and T5 Informatics GmbH
|
||||
* All rights reserved.
|
||||
*
|
||||
* This file is part of the RDKit.
|
||||
* The contents are covered by the terms of the BSD license
|
||||
* which is included in the file license.txt, found at the root
|
||||
* of the RDKit source tree.
|
||||
*/
|
||||
package org.RDKit;
|
||||
|
||||
import static org.junit.Assert.*;
|
||||
import org.junit.*;
|
||||
|
||||
public class AbbreviationsTests extends GraphMolTest {
|
||||
|
||||
@Before public void setUp() {
|
||||
}
|
||||
|
||||
@Test
|
||||
public void test1Basics() {
|
||||
AbbreviationDefinition_Vect abbrevs = RDKFuncs.getDefaultAbbreviations();
|
||||
RWMol mol = RWMol.MolFromSmiles("C1CCC1C(F)(F)F");
|
||||
assertEquals(mol.getNumAtoms(),8);
|
||||
|
||||
RDKFuncs.condenseMolAbbreviations(mol,abbrevs);
|
||||
// no changes here due to the threshold
|
||||
assertEquals(mol.getNumAtoms(),8);
|
||||
|
||||
RDKFuncs.condenseMolAbbreviations(mol,abbrevs, 1.0);
|
||||
assertEquals(mol.getNumAtoms(),5);
|
||||
assertEquals(RDKFuncs.MolToCXSmiles(mol),"*C1CCC1 |$CF3;;;;$|");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void test2LinkerBasics() {
|
||||
AbbreviationDefinition_Vect abbrevs = RDKFuncs.getDefaultLinkers();
|
||||
RWMol mol = RWMol.MolFromSmiles("COCCOCCOCCOCCCl");
|
||||
assertEquals(mol.getNumAtoms(),14);
|
||||
|
||||
RDKFuncs.condenseMolAbbreviations(mol,abbrevs);
|
||||
// no changes here due to the threshold
|
||||
assertEquals(mol.getNumAtoms(),14);
|
||||
|
||||
RDKFuncs.condenseMolAbbreviations(mol,abbrevs, 1.0);
|
||||
assertEquals(mol.getNumAtoms(),3);
|
||||
assertEquals(RDKFuncs.MolToCXSmiles(mol),"C*Cl |$;PEG4;$|");
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void test3Matching() {
|
||||
AbbreviationDefinition_Vect abbrevs = RDKFuncs.getDefaultAbbreviations();
|
||||
RWMol mol = RWMol.MolFromSmiles("C1CCC1C(F)(F)F");
|
||||
assertEquals(mol.getNumAtoms(),8);
|
||||
|
||||
AbbreviationMatch_Vect matches = RDKFuncs.findApplicableAbbreviationMatches(mol,abbrevs,1.0);
|
||||
assertEquals(matches.size(),1);
|
||||
assertEquals(matches.get(0).getAbbrev().getLabel(),"CF3");
|
||||
|
||||
RDKFuncs.applyMatches(mol,matches);
|
||||
assertEquals(mol.getNumAtoms(),5);
|
||||
assertEquals(RDKFuncs.MolToCXSmiles(mol),"*C1CCC1 |$CF3;;;;$|");
|
||||
|
||||
}
|
||||
|
||||
public static void main(String args[]) {
|
||||
org.junit.runner.JUnitCore.main("org.RDKit.AbbreviationsTests");
|
||||
}
|
||||
|
||||
}
|
||||
@@ -9,7 +9,7 @@ if(RDK_BUILD_FREETYPE_SUPPORT)
|
||||
endif()
|
||||
endif()
|
||||
add_executable(RDKit_minimal jswrapper.cpp minilib.cpp)
|
||||
target_link_libraries(RDKit_minimal CIPLabeler_static MolDraw2D_static Depictor_static RDInchiLib_static SubstructMatch_static FileParsers_static
|
||||
target_link_libraries(RDKit_minimal Abbreviations_static CIPLabeler_static MolDraw2D_static Depictor_static RDInchiLib_static SubstructMatch_static FileParsers_static
|
||||
SmilesParse_static GraphMol_static RDGeometryLib_static RDGeneral_static)
|
||||
|
||||
set_target_properties(RDKit_minimal PROPERTIES LINK_FLAGS "--bind")
|
||||
|
||||
@@ -90,7 +90,9 @@ EMSCRIPTEN_BINDINGS(RDKit_minimal) {
|
||||
class_<JSMol>("Mol")
|
||||
.function("is_valid", &JSMol::is_valid)
|
||||
.function("get_smiles", &JSMol::get_smiles)
|
||||
.function("get_cxsmiles", &JSMol::get_cxsmiles)
|
||||
.function("get_molblock", &JSMol::get_molblock)
|
||||
.function("get_v3Kmolblock", &JSMol::get_v3Kmolblock)
|
||||
.function("get_inchi", &JSMol::get_inchi)
|
||||
.function("get_svg",
|
||||
select_overload<std::string() const>(&JSMol::get_svg))
|
||||
@@ -122,6 +124,11 @@ EMSCRIPTEN_BINDINGS(RDKit_minimal) {
|
||||
select_overload<std::string() const>(&JSMol::get_new_coords))
|
||||
.function("get_new_coords", select_overload<std::string(bool) const>(
|
||||
&JSMol::get_new_coords))
|
||||
.function("condense_abbreviations",
|
||||
select_overload<std::string()>(&JSMol::condense_abbreviations))
|
||||
.function("condense_abbreviations",
|
||||
select_overload<std::string(double, bool)>(
|
||||
&JSMol::condense_abbreviations))
|
||||
.function("add_hs", &JSMol::add_hs)
|
||||
.function("remove_hs", &JSMol::remove_hs);
|
||||
|
||||
|
||||
@@ -27,6 +27,7 @@
|
||||
#include <GraphMol/Fingerprints/MorganFingerprints.h>
|
||||
#include <GraphMol/Depictor/RDDepictor.h>
|
||||
#include <GraphMol/CIPLabeler/CIPLabeler.h>
|
||||
#include <GraphMol/Abbreviations/Abbreviations.h>
|
||||
#include <DataStructs/BitOps.h>
|
||||
|
||||
#include <INCHI-API/inchi.h>
|
||||
@@ -105,7 +106,7 @@ std::string process_details(const std::string &details, unsigned int &width,
|
||||
}
|
||||
|
||||
namespace {
|
||||
ROMol *mol_from_input(const std::string &input) {
|
||||
RWMol *mol_from_input(const std::string &input) {
|
||||
RWMol *res = nullptr;
|
||||
if (input.find("M END") != std::string::npos) {
|
||||
bool sanitize = false;
|
||||
@@ -127,7 +128,7 @@ ROMol *mol_from_input(const std::string &input) {
|
||||
return res;
|
||||
}
|
||||
|
||||
ROMol *qmol_from_input(const std::string &input) {
|
||||
RWMol *qmol_from_input(const std::string &input) {
|
||||
RWMol *res = nullptr;
|
||||
if (input.find("M END") != std::string::npos) {
|
||||
bool sanitize = false;
|
||||
@@ -169,6 +170,10 @@ std::string JSMol::get_smiles() const {
|
||||
if (!d_mol) return "";
|
||||
return MolToSmiles(*d_mol);
|
||||
}
|
||||
std::string JSMol::get_cxsmiles() const {
|
||||
if (!d_mol) return "";
|
||||
return MolToCXSmiles(*d_mol);
|
||||
}
|
||||
std::string JSMol::get_svg(unsigned int w, unsigned int h) const {
|
||||
if (!d_mol) return "";
|
||||
return svg_(*d_mol, w, h);
|
||||
@@ -190,6 +195,10 @@ std::string JSMol::get_molblock() const {
|
||||
if (!d_mol) return "";
|
||||
return MolToMolBlock(*d_mol);
|
||||
}
|
||||
std::string JSMol::get_v3Kmolblock() const {
|
||||
if (!d_mol) return "";
|
||||
return MolToV3KMolBlock(*d_mol);
|
||||
}
|
||||
|
||||
namespace {
|
||||
void get_sss_json(const ROMol *d_mol, const ROMol *q_mol,
|
||||
@@ -413,17 +422,49 @@ std::string JSMol::add_hs() const {
|
||||
return MolToMolBlock(molCopy, includeStereo, confId, kekulize);
|
||||
}
|
||||
|
||||
// Condenses abbreviations in the wrapped molecule in place.
//
// useLinkers selects the default linker definitions instead of the default
// abbreviation definitions; maxCoverage is forwarded unchanged to
// Abbreviations::condenseMolAbbreviations(). The return value is always an
// empty string, including when no molecule is held.
std::string JSMol::condense_abbreviations(double maxCoverage, bool useLinkers) {
  if (!d_mol) {
    return "";
  }
  if (useLinkers) {
    Abbreviations::condenseMolAbbreviations(
        *d_mol, Abbreviations::Utils::getDefaultLinkers(), maxCoverage);
  } else {
    Abbreviations::condenseMolAbbreviations(
        *d_mol, Abbreviations::Utils::getDefaultAbbreviations(), maxCoverage);
  }
  return "";
}
|
||||
|
||||
std::string JSMol::condense_abbreviations_from_defs(
|
||||
const std::string &definitions, double maxCoverage, bool areLinkers) {
|
||||
static std::string lastDefs = "";
|
||||
static std::vector<Abbreviations::AbbreviationDefinition> abbrevs;
|
||||
if (definitions != lastDefs) {
|
||||
// yes, we are making the assumption that the "areLinkers" argument remains
|
||||
// the same if the definitions are the same
|
||||
bool removeExtraDummies = areLinkers;
|
||||
bool allowConnectionToDummies = areLinkers;
|
||||
lastDefs = definitions;
|
||||
try {
|
||||
abbrevs = Abbreviations::Utils::parseAbbreviations(
|
||||
definitions, removeExtraDummies, allowConnectionToDummies);
|
||||
} catch (...) {
|
||||
return "cannot parse abbreviations";
|
||||
}
|
||||
}
|
||||
Abbreviations::condenseMolAbbreviations(*d_mol, abbrevs, maxCoverage);
|
||||
}
|
||||
|
||||
std::string get_inchikey_for_inchi(const std::string &input) {
|
||||
return InchiToInchiKey(input);
|
||||
}
|
||||
|
||||
JSMol *get_mol(const std::string &input) {
|
||||
ROMol *mol = mol_from_input(input);
|
||||
RWMol *mol = mol_from_input(input);
|
||||
return new JSMol(mol);
|
||||
}
|
||||
|
||||
JSMol *get_qmol(const std::string &input) {
|
||||
ROMol *mol = qmol_from_input(input);
|
||||
RWMol *mol = qmol_from_input(input);
|
||||
return new JSMol(mol);
|
||||
}
|
||||
|
||||
|
||||
@@ -14,9 +14,11 @@
|
||||
class JSMol {
|
||||
public:
|
||||
JSMol() : d_mol(nullptr){};
|
||||
JSMol(RDKit::ROMol *mol) : d_mol(mol){};
|
||||
JSMol(RDKit::RWMol *mol) : d_mol(mol){};
|
||||
std::string get_smiles() const;
|
||||
std::string get_cxsmiles() const;
|
||||
std::string get_molblock() const;
|
||||
std::string get_v3Kmolblock() const;
|
||||
std::string get_inchi() const;
|
||||
std::string get_svg(unsigned int width, unsigned int height) const;
|
||||
std::string get_svg() const {
|
||||
@@ -28,6 +30,13 @@ class JSMol {
|
||||
std::string get_descriptors() const;
|
||||
std::string get_morgan_fp(unsigned int radius, unsigned int len) const;
|
||||
std::string get_morgan_fp() const { return get_morgan_fp(2, 2048); };
|
||||
std::string condense_abbreviations(double maxCoverage, bool useLinkers);
|
||||
std::string condense_abbreviations() {
|
||||
return condense_abbreviations(0.4, false);
|
||||
};
|
||||
std::string condense_abbreviations_from_defs(const std::string &definitions,
|
||||
double maxCoverage,
|
||||
bool areLinkers);
|
||||
|
||||
bool is_valid() const { return d_mol.get() != nullptr; };
|
||||
|
||||
@@ -40,7 +49,7 @@ class JSMol {
|
||||
std::string remove_hs() const;
|
||||
std::string add_hs() const;
|
||||
|
||||
std::unique_ptr<RDKit::ROMol> d_mol;
|
||||
std::unique_ptr<RDKit::RWMol> d_mol;
|
||||
static constexpr unsigned int d_defaultWidth = 250;
|
||||
static constexpr unsigned int d_defaultHeight = 200;
|
||||
};
|
||||
|
||||
@@ -82,11 +82,23 @@ function test_sketcher_services2(){
|
||||
assert(molb2.search(" H ")<0);
|
||||
}
|
||||
|
||||
|
||||
// Checks both overloads of Mol.condense_abbreviations() on a CF3-substituted
// cyclobutane.
function test_abbreviations(){
    const mol = Module.get_mol("C1CCC1C(F)(F)F");
    assert.equal(mol.is_valid(),1);

    // the no-argument overload leaves this molecule unchanged
    mol.condense_abbreviations();
    assert.equal(mol.get_cxsmiles(),"FC(F)(F)C1CCC1");

    // with maxCoverage = 1.0 the CF3 group is replaced by a labelled dummy
    mol.condense_abbreviations(1.0,false);
    assert.equal(mol.get_cxsmiles(),"*C1CCC1 |$CF3;;;;$|");
}
|
||||
|
||||
|
||||
// Entry point: once the runtime is ready, print the library version and run
// each test function in order.
Module.onRuntimeInitialized = () => {
    console.log(Module.version());
    const testFunctions = [
        test_basics,
        test_sketcher_services,
        test_sketcher_services2,
        test_abbreviations,
    ];
    for (const runTest of testFunctions) {
        runTest();
    }
    console.log("Tests finished successfully");
};
|
||||
|
||||
|
||||
@@ -127,6 +127,10 @@ const std::string atomNote = "atomNote";
|
||||
const std::string bondNote = "bondNote";
|
||||
const std::string _isotopicHs = "_isotopicHs";
|
||||
|
||||
// molecule drawing
|
||||
const std::string _displayLabel = "_displayLabel";
|
||||
const std::string _displayLabelW = "_displayLabelW";
|
||||
|
||||
} // namespace common_properties
|
||||
|
||||
const double MAX_DOUBLE = std::numeric_limits<double>::max();
|
||||
|
||||
@@ -222,6 +222,10 @@ RDKIT_RDGENERAL_EXPORT extern const std::string
|
||||
_TriposAtomType; // string Mol2FileParser
|
||||
// missing defs for _TriposAtomName//_TriposPartialCharge...
|
||||
|
||||
// molecule drawing
|
||||
RDKIT_RDGENERAL_EXPORT extern const std::string _displayLabel; // string
|
||||
RDKIT_RDGENERAL_EXPORT extern const std::string _displayLabelW; // string
|
||||
|
||||
///////////////////////////////////////////////////////////////
|
||||
// misc props
|
||||
RDKIT_RDGENERAL_EXPORT extern const std::string
|
||||
|
||||
@@ -736,7 +736,7 @@ of threads allowed on your computer.
|
||||
The original 2D->3D conversion provided with the RDKit was not intended
|
||||
to be a replacement for a “real” conformational analysis tool; it
|
||||
merely provides quick 3D structures for cases when they are
|
||||
required. We believe, however, that the newer ETKDG method[#riniker2]_ should be
|
||||
required. We believe, however, that the newer ETKDG method [#riniker2]_ should be
|
||||
adequate for most purposes.
|
||||
|
||||
|
||||
@@ -899,12 +899,12 @@ data/test_multi_colours.py, which produces the somewhat garish
|
||||
|
||||
As of version 2020.03, it is possible to add arbitrary small strings
|
||||
to annotate atoms and bonds in the drawing. The strings are added as
|
||||
properties 'atomNote' and
|
||||
'bondNote' and they will be placed automatically
|
||||
properties ``atomNote`` and
|
||||
``bondNote`` and they will be placed automatically
|
||||
close to the atom or bond in question in a manner intended to minimise
|
||||
their clash with the rest of the drawing. For convenience, there are 3
|
||||
flags in
|
||||
`MolDraw2DOptions` that will add stereo information (R/S to atoms, E/Z
|
||||
``MolDraw2DOptions`` that will add stereo information (R/S to atoms, E/Z
|
||||
to bonds) and atom and bond sequence numbers.
|
||||
|
||||
.. doctest::
|
||||
@@ -917,13 +917,70 @@ to bonds) and atom and bond sequence numbers.
|
||||
>>> d.drawOptions().addAtomIndices = True
|
||||
>>> d.DrawMolecule(mol)
|
||||
>>> d.FinishDrawing()
|
||||
>>> with open('atom_annotation_1.png', 'wb') as f: # doctest: +SKIP
|
||||
... f.write(d.GetDrawingText())
|
||||
>>> d.WriteDrawingText('atom_annotation_1.png') # doctest: +SKIP
|
||||
|
||||
will produce
|
||||
|
||||
.. image:: images/atom_annotation_1.png
|
||||
|
||||
If atoms have an ``atomLabel`` property set, this will be used when drawing them:
|
||||
|
||||
.. doctest::
|
||||
|
||||
>>> smi = 'c1nc(*)ccc1* |$;;;R1;;;;R2$|'
|
||||
>>> mol = Chem.MolFromSmiles(smi)
|
||||
>>> mol.GetAtomWithIdx(3).GetProp("atomLabel")
|
||||
'R1'
|
||||
>>> mol.GetAtomWithIdx(7).GetProp("atomLabel")
|
||||
'R2'
|
||||
>>> d = rdMolDraw2D.MolDraw2DCairo(250, 250)
|
||||
>>> rdMolDraw2D.PrepareAndDrawMolecule(d,mol)
|
||||
>>> d.WriteDrawingText("./images/atom_labels_1.png") # doctest: +SKIP
|
||||
|
||||
gives:
|
||||
|
||||
.. image:: images/atom_labels_1.png
|
||||
|
||||
Since the ``atomLabel`` property is also used for other things (for example in CXSMILES as demonstrated),
|
||||
if you want to provide your own atom labels, it's better to use the ``_displayLabel`` property:
|
||||
|
||||
>>> smi = 'c1nc(*)ccc1* |$;;;R1;;;;R2$|'
|
||||
>>> mol = Chem.MolFromSmiles(smi)
|
||||
>>> mol.GetAtomWithIdx(3).SetProp("_displayLabel","R<sub>1</sub>")
|
||||
>>> mol.GetAtomWithIdx(7).SetProp("_displayLabel","R<sub>2</sub>")
|
||||
>>> d = rdMolDraw2D.MolDraw2DCairo(250, 250)
|
||||
>>> rdMolDraw2D.PrepareAndDrawMolecule(d,mol)
|
||||
>>> d.WriteDrawingText("./images/atom_labels_2.png") # doctest: +SKIP
|
||||
|
||||
this gives:
|
||||
|
||||
.. image:: images/atom_labels_2.png
|
||||
|
||||
Note that you can use ``<sup>`` and ``<sub>`` in these labels to provide super- and subscripts.
|
||||
|
||||
Finally, if you have atom labels which should be displayed differently when the bond comes
into them from the right (so that the label extends to the West of the atom), you can also set the ``_displayLabelW`` property:
|
||||
|
||||
|
||||
.. doctest::
|
||||
|
||||
>>> smi = 'c1nc(*)ccc1* |$;;;R1;;;;R2$|'
|
||||
>>> mol = Chem.MolFromSmiles(smi)
|
||||
>>> mol.GetAtomWithIdx(3).SetProp("_displayLabel","CO<sub>2</sub>H")
|
||||
>>> mol.GetAtomWithIdx(3).SetProp("_displayLabelW","HO<sub>2</sub>C")
|
||||
>>> mol.GetAtomWithIdx(7).SetProp("_displayLabel","CO<sub>2</sub><sup>-</sup>")
|
||||
>>> mol.GetAtomWithIdx(7).SetProp("_displayLabelW","<sup>-</sup>OOC")
|
||||
>>> d = rdMolDraw2D.MolDraw2DCairo(250, 250)
|
||||
>>> rdMolDraw2D.PrepareAndDrawMolecule(d,mol)
|
||||
>>> d.WriteDrawingText("./images/atom_labels_3.png") # doctest: +SKIP
|
||||
|
||||
this gives:
|
||||
|
||||
.. image:: images/atom_labels_3.png
|
||||
|
||||
|
||||
|
||||
|
||||
Metadata in Molecule Images
|
||||
===========================
|
||||
|
||||
|
||||
BIN
Docs/Book/images/atom_labels_1.png
Normal file
BIN
Docs/Book/images/atom_labels_1.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 5.8 KiB |
BIN
Docs/Book/images/atom_labels_2.png
Normal file
BIN
Docs/Book/images/atom_labels_2.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 5.8 KiB |
BIN
Docs/Book/images/atom_labels_3.png
Normal file
BIN
Docs/Book/images/atom_labels_3.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 6.0 KiB |
@@ -1,45 +1,46 @@
|
||||
tests = [
|
||||
("python", "UnitTestChem.py", {}),
|
||||
("python", "UnitTestChemv2.py", {}),
|
||||
("python", "UnitTestChemAtom.py", {}),
|
||||
("python", "UnitTestChemBond.py", {}),
|
||||
("python", "UnitTestChemSmarts.py", {}),
|
||||
("python", "UnitTestFragmentDescriptors.py", {}),
|
||||
("python", "UnitTestGraphDescriptors_2.py", {}),
|
||||
("python", "UnitTestLipinski.py", {}),
|
||||
("python", "UnitTestMCS.py", {}),
|
||||
("python", "UnitTestOldBugs.py", {}),
|
||||
("python", "UnitTestSATIS.py", {}),
|
||||
("python", "UnitTestSmiles.py", {}),
|
||||
("python", "UnitTestSuppliers.py", {}),
|
||||
("python", "UnitTestSurf.py", {}),
|
||||
("python", "UnitTestMol3D.py", {}),
|
||||
("python", "UnitTestCatalog.py", {}),
|
||||
("python", "UnitTestDescriptors.py", {}),
|
||||
("python", "UnitTestInchi.py", {}),
|
||||
("python", "UnitTestFunctionalGroups.py", {}),
|
||||
("python", "UnitTestCrippen.py", {}),
|
||||
("python", "UnitTestPandasTools.py", {}),
|
||||
("python", "UnitTestDocTestsChem.py", {}),
|
||||
("python", "UnitTestFeatFinderCLI.py", {}),
|
||||
("python", "UnitTestQED.py", {}),
|
||||
("python", "UnitTestSaltRemover.py", {}),
|
||||
("python", "test_list.py", {'dir': 'AtomPairs'}),
|
||||
("python", "test_list.py", {'dir': 'ChemUtils'}),
|
||||
("python", "test_list.py", {'dir': 'EState'}),
|
||||
("python", "test_list.py", {'dir': 'FeatMaps'}),
|
||||
("python", "test_list.py", {'dir': 'Fingerprints'}),
|
||||
("python", "test_list.py", {'dir': 'Pharm2D'}),
|
||||
("python", "test_list.py", {'dir': 'Pharm3D'}),
|
||||
("python", "test_list.py", {'dir': 'Subshape'}),
|
||||
("python", "test_list.py", {'dir': 'Suppliers'}),
|
||||
("python", "test_list.py", {'dir': 'Scaffolds'}),
|
||||
("python", "test_list.py", {'dir': 'Draw'}),
|
||||
("python", "test_list.py", {'dir': 'Fraggle'}),
|
||||
("python", "test_list.py", {'dir': 'SimpleEnum'}),
|
||||
("python", "test_list.py", {'dir': 'Features'}),
|
||||
("python", "test_list.py", {'dir': 'MolStandardize'})
|
||||
]
|
||||
tests = [("python", "UnitTestChem.py", {}), ("python", "UnitTestChemv2.py", {}),
|
||||
("python", "UnitTestChemAtom.py", {}), ("python", "UnitTestChemBond.py", {}),
|
||||
("python", "UnitTestChemSmarts.py", {}), ("python", "UnitTestFragmentDescriptors.py", {}),
|
||||
("python", "UnitTestGraphDescriptors_2.py", {}), ("python", "UnitTestLipinski.py", {}),
|
||||
("python", "UnitTestMCS.py", {}), ("python", "UnitTestOldBugs.py", {}),
|
||||
("python", "UnitTestSATIS.py", {}), ("python", "UnitTestSmiles.py", {}),
|
||||
("python", "UnitTestSuppliers.py", {}), ("python", "UnitTestSurf.py", {}),
|
||||
("python", "UnitTestMol3D.py", {}), ("python", "UnitTestCatalog.py", {}),
|
||||
("python", "UnitTestDescriptors.py", {}), ("python", "UnitTestInchi.py", {}),
|
||||
("python", "UnitTestFunctionalGroups.py", {}), ("python", "UnitTestCrippen.py", {}),
|
||||
("python", "UnitTestPandasTools.py", {}), ("python", "UnitTestDocTestsChem.py", {}),
|
||||
("python", "UnitTestFeatFinderCLI.py", {}), ("python", "UnitTestQED.py", {}),
|
||||
("python", "UnitTestSaltRemover.py", {}), ("python", "test_list.py", {
|
||||
'dir': 'AtomPairs'
|
||||
}), ("python", "test_list.py", {
|
||||
'dir': 'ChemUtils'
|
||||
}), ("python", "test_list.py", {
|
||||
'dir': 'EState'
|
||||
}), ("python", "test_list.py", {
|
||||
'dir': 'FeatMaps'
|
||||
}), ("python", "test_list.py", {
|
||||
'dir': 'Fingerprints'
|
||||
}), ("python", "test_list.py", {
|
||||
'dir': 'Pharm2D'
|
||||
}), ("python", "test_list.py", {
|
||||
'dir': 'Pharm3D'
|
||||
}), ("python", "test_list.py", {
|
||||
'dir': 'Subshape'
|
||||
}), ("python", "test_list.py", {
|
||||
'dir': 'Suppliers'
|
||||
}), ("python", "test_list.py", {
|
||||
'dir': 'Scaffolds'
|
||||
}), ("python", "test_list.py", {
|
||||
'dir': 'Draw'
|
||||
}), ("python", "test_list.py", {
|
||||
'dir': 'Fraggle'
|
||||
}), ("python", "test_list.py", {
|
||||
'dir': 'SimpleEnum'
|
||||
}), ("python", "test_list.py", {
|
||||
'dir': 'Features'
|
||||
}), ("python", "test_list.py", {
|
||||
'dir': 'MolStandardize'
|
||||
})]
|
||||
|
||||
# only attempt the MolKey tests if we have the pre-reqs:
|
||||
try:
|
||||
|
||||
Reference in New Issue
Block a user