Extend RDKit::MolStandardize with a validation and standardization Pipeline (#7582)

* Extend RDKit::MolStandardize with a validation and standardization Pipeline

* suggested changes

* apply clang-format

* apply yapf

* MolStandardize::FeaturesValidation optionally disallow dative bonds

* add allowDativeBondType to MolStandardize::PipelineOptions

* apply clang-format

* make the API of other validation classes more consistent with MolStandardize::FeaturesValidation

* apply clang-format

* PipelineStage to enum class
remove virtual functions from Pipeline class
be explicit about enums

* light refactoring to avoid what I think is an unnecessary call to `parse`

* a bit of modernization

* make the pipeline configurable

* make parse and serialize configurable too

* switch to storing pipeline stages using uints

* add a simple test for providing a pipeline

* update pointer alignment for clang-format

* test modifying the parser and serializer

* update swig requirement

* changes in response to review

* changes in response to review

* rename PipelineResult's *MolBlock members to *MolData

* upgrade swig to 4.2 in the CI environments

* add a few missing export directives

---------

Co-authored-by: greg landrum <greg.landrum@gmail.com>
This commit is contained in:
Riccardo Vianello
2024-07-30 17:09:16 +02:00
committed by GitHub
parent 138bdc8d58
commit 3f7caf0147
20 changed files with 6494 additions and 203 deletions

View File

@@ -9,7 +9,7 @@ steps:
conda create --name rdkit_build -c conda-forge cmake \
libboost=$(boost_version) \
libboost-devel=$(boost_version) \
swig=4.1
swig=4.2
displayName: Setup build environment
- bash: |
source ${CONDA}/etc/profile.d/conda.sh

View File

@@ -21,7 +21,7 @@ steps:
libcxx=$(compiler_version) cmake=3.26 \
libboost=$(boost_version) \
libboost-devel=$(boost_version) \
cairo eigen swig=4.1
cairo eigen swig=4.2
conda activate rdkit_build
displayName: Setup build environment
- bash: |

View File

@@ -15,7 +15,7 @@ steps:
cmake=3.26 ^
libboost=$(boost_version) ^
libboost-devel=$(boost_version) ^
cairo eigen swig=4.1
cairo eigen swig=4.2
call activate rdkit_build
displayName: Install dependencies
- script: |

View File

@@ -23,7 +23,7 @@ BinPackArguments: true
ColumnLimit: 80
ConstructorInitializerAllOnOneLineOrOnePerLine: true
ConstructorInitializerIndentWidth: 4
DerivePointerAlignment: true
DerivePointerAlignment: false
ExperimentalAutoDetectBinPacking: false
IndentCaseLabels: true
IndentWrappedFunctionNames: false
@@ -40,7 +40,7 @@ PenaltyBreakString: 1000
PenaltyBreakFirstLessLess: 120
PenaltyExcessCharacter: 1000000
PenaltyReturnTypeOnItsOwnLine: 200
PointerAlignment: Left
PointerAlignment: Right
SpacesBeforeTrailingComments: 2
Cpp11BracedListStyle: true
Standard: Cpp11

View File

@@ -1,55 +1,61 @@
rdkit_library(MolStandardize
MolStandardize.cpp
Metal.cpp
Normalize.cpp
Validate.cpp
Charge.cpp
Tautomer.cpp
Fragment.cpp
FragmentCatalog/FragmentCatalogEntry.cpp
FragmentCatalog/FragmentCatalogParams.cpp
FragmentCatalog/FragmentCatalogUtils.cpp
AcidBaseCatalog/AcidBaseCatalogEntry.cpp
AcidBaseCatalog/AcidBaseCatalogParams.cpp
AcidBaseCatalog/AcidBaseCatalogUtils.cpp
TransformCatalog/TransformCatalogEntry.cpp
TransformCatalog/TransformCatalogParams.cpp
TransformCatalog/TransformCatalogUtils.cpp
TautomerCatalog/TautomerCatalogEntry.cpp
TautomerCatalog/TautomerCatalogParams.cpp
TautomerCatalog/TautomerCatalogUtils.cpp
LINK_LIBRARIES ChemReactions ChemTransforms SmilesParse SubstructMatch Descriptors GraphMol )
Pipeline.cpp
MolStandardize.cpp
Metal.cpp
Normalize.cpp
Validate.cpp
Charge.cpp
Tautomer.cpp
Fragment.cpp
FragmentCatalog/FragmentCatalogEntry.cpp
FragmentCatalog/FragmentCatalogParams.cpp
FragmentCatalog/FragmentCatalogUtils.cpp
AcidBaseCatalog/AcidBaseCatalogEntry.cpp
AcidBaseCatalog/AcidBaseCatalogParams.cpp
AcidBaseCatalog/AcidBaseCatalogUtils.cpp
TransformCatalog/TransformCatalogEntry.cpp
TransformCatalog/TransformCatalogParams.cpp
TransformCatalog/TransformCatalogUtils.cpp
TautomerCatalog/TautomerCatalogEntry.cpp
TautomerCatalog/TautomerCatalogParams.cpp
TautomerCatalog/TautomerCatalogUtils.cpp
LINK_LIBRARIES ChemReactions ChemTransforms SmilesParse SubstructMatch Descriptors GraphMol )
target_compile_definitions(MolStandardize PRIVATE RDKIT_MOLSTANDARDIZE_BUILD)
rdkit_headers(MolStandardize.h
Metal.h
Normalize.h
Validate.h
Charge.h
Tautomer.h
Fragment.h
DEST GraphMol/MolStandardize)
rdkit_headers(
Pipeline.h
MolStandardize.h
Metal.h
Normalize.h
Validate.h
Charge.h
Tautomer.h
Fragment.h
DEST GraphMol/MolStandardize)
rdkit_headers(FragmentCatalog/FragmentCatalogEntry.h
FragmentCatalog/FragmentCatalogParams.h
FragmentCatalog/FragmentCatalogUtils.h
DEST GraphMol/MolStandardize/FragmentCatalog)
rdkit_headers(
FragmentCatalog/FragmentCatalogEntry.h
FragmentCatalog/FragmentCatalogParams.h
FragmentCatalog/FragmentCatalogUtils.h
DEST GraphMol/MolStandardize/FragmentCatalog)
rdkit_headers(AcidBaseCatalog/AcidBaseCatalogEntry.h
AcidBaseCatalog/AcidBaseCatalogParams.h
AcidBaseCatalog/AcidBaseCatalogUtils.h
DEST GraphMol/MolStandardize/AcidBaseCatalog)
rdkit_headers(
AcidBaseCatalog/AcidBaseCatalogEntry.h
AcidBaseCatalog/AcidBaseCatalogParams.h
AcidBaseCatalog/AcidBaseCatalogUtils.h
DEST GraphMol/MolStandardize/AcidBaseCatalog)
rdkit_headers(TransformCatalog/TransformCatalogEntry.h
TransformCatalog/TransformCatalogParams.h
TransformCatalog/TransformCatalogUtils.h
DEST GraphMol/MolStandardize/TransformCatalog)
rdkit_headers(
TransformCatalog/TransformCatalogEntry.h
TransformCatalog/TransformCatalogParams.h
TransformCatalog/TransformCatalogUtils.h
DEST GraphMol/MolStandardize/TransformCatalog)
rdkit_headers(TautomerCatalog/TautomerCatalogEntry.h
TautomerCatalog/TautomerCatalogParams.h
TautomerCatalog/TautomerCatalogUtils.h
DEST GraphMol/MolStandardize/TautomerCatalog)
rdkit_headers(
TautomerCatalog/TautomerCatalogEntry.h
TautomerCatalog/TautomerCatalogParams.h
TautomerCatalog/TautomerCatalogUtils.h
DEST GraphMol/MolStandardize/TautomerCatalog)
if(RDK_BUILD_PYTHON_WRAPPERS)
add_subdirectory(Wrap)
@@ -63,5 +69,6 @@ rdkit_test(molTautomerTest testTautomer.cpp LINK_LIBRARIES MolStandardize )
rdkit_test(molStandardizeSmallTest test2.cpp LINK_LIBRARIES MolStandardize )
rdkit_test(molFragmentTest testFragment.cpp LINK_LIBRARIES MolStandardize )
rdkit_catch_test(molStandardizeCatchTest catch_tests.cpp LINK_LIBRARIES MolStandardize )
rdkit_catch_test(molStandardizePipelineTest testPipeline.cpp LINK_LIBRARIES MolStandardize)

View File

@@ -538,7 +538,7 @@ void Uncharger::unchargeInPlace(RWMol &mol) {
}
}
}
} // namespace MolStandardize
}
} // namespace MolStandardize
} // namespace RDKit

View File

@@ -0,0 +1,587 @@
//
// Copyright (C) 2023 Novartis Biomedical Research
//
// @@ All Rights Reserved @@
// This file is part of the RDKit.
// The contents are covered by the terms of the BSD license
// which is included in the file license.txt, found at the root
// of the RDKit source tree.
//
#include <cmath>
#include <regex>
#include <sstream>
#include "Pipeline.h"
#include "Validate.h"
#include "Metal.h"
#include "Normalize.h"
#include "Charge.h"
#include "Fragment.h"
#include <RDGeneral/FileParseException.h>
#include <GraphMol/FileParsers/FileParsers.h>
#include <GraphMol/SmilesParse/SmilesParse.h>
#include <GraphMol/SmilesParse/SmilesWrite.h>
#include <GraphMol/Chirality.h>
namespace RDKit {
namespace MolStandardize {
// Registers a new event on this result: the flag is OR-ed into the cumulative
// `status` bitmask and a {flag, message} entry is appended to `log`.
void PipelineResult::append(PipelineStatus newStatus, const std::string &info) {
  const auto accumulated = status | newStatus;
  status = static_cast<PipelineStatus>(accumulated);
  log.push_back({newStatus, info});
}
// Executes the configured pipeline on the input text and returns the
// collected result (status flags, log, last stage reached and, on success,
// the serialized output and parent structures).
//
// The sequence is: parse -> validation steps (on a copy of the parsed
// molecule) -> standardization steps -> optional parent construction ->
// serialization. The run is interrupted as soon as a step returns a null
// molecule, or when an error flag was recorded and
// `options.reportAllFailures` is false.
PipelineResult Pipeline::run(const std::string &molblock) const {
  PipelineResult result;
  result.status = NO_EVENT;
  result.inputMolData = molblock;

  const bool failFast = !options.reportAllFailures;
  // true when an error has been recorded and we are not collecting them all
  const auto mustStop = [&result, failFast]() {
    return failFast && (result.status & PIPELINE_ERROR) != NO_EVENT;
  };
  // records the built-in stage that is about to be executed
  const auto enter = [&result](PipelineStage stage) {
    result.stage = static_cast<uint32_t>(stage);
  };

  // parse the input into an RWMol instance
  enter(PipelineStage::PARSING_INPUT);
  RWMOL_SPTR mol = parse(molblock, result, options);
  if (!mol || mustStop()) {
    return result;
  }

  RWMOL_SPTR_PAIR output;
  const bool acceptedEmptyInput =
      mol->getNumAtoms() == 0 && options.allowEmptyMolecules;
  if (acceptedEmptyInput) {
    // nothing to validate or standardize
    output = {mol, mol};
  } else {
    // sanitization and validation operate on a copy, so that the molecule
    // as parsed from the input stays available for the standardization
    RWMOL_SPTR molCopy{new RWMol(*mol)};
    for (const auto &step : validationSteps) {
      result.stage = step.first;
      molCopy = step.second(molCopy, result, options);
      if (!molCopy || mustStop()) {
        return result;
      }
    }

    for (const auto &step : standardizationSteps) {
      result.stage = step.first;
      mol = step.second(mol, result, options);
      if (!mol || mustStop()) {
        return result;
      }
    }

    if (!makeParent) {
      // no parent operation configured: the output doubles as its own parent
      output = {mol, mol};
    } else {
      enter(PipelineStage::MAKE_PARENT);
      output = makeParent(mol, result, options);
      if (!output.first || !output.second || mustStop()) {
        return result;
      }
    }
  }

  // serialize the output and parent structures
  enter(PipelineStage::SERIALIZING_OUTPUT);
  serialize(output, result, options);
  if (mustStop()) {
    return result;
  }

  enter(PipelineStage::COMPLETED);
  return result;
}
namespace Operations {
// Builds a molecule from the input MolBlock text.
//
// The molecule is neither sanitized nor stripped of its hydrogens at this
// stage; strict parsing follows `options.strictParsing`. Any parsing problem
// is recorded on `result` as INPUT_ERROR, and a null pointer is returned when
// no molecule could be instantiated.
RWMOL_SPTR parse(const std::string &molblock, PipelineResult &result,
                 const PipelineOptions &options) {
  v2::FileParsers::MolFileParserParams parserParams;
  // sanitization is deferred to the later pipeline stages
  parserParams.sanitize = false;
  // Hs would not be removed anyway from an unsanitized molecule
  parserParams.removeHs = false;
  // strictness is controlled by the pipeline options
  parserParams.strictParsing = options.strictParsing;

  RWMOL_SPTR parsed{};
  try {
    auto owned = v2::FileParsers::MolFromMolBlock(molblock, parserParams);
    parsed.reset(owned.release());
  } catch (const FileParseException &e) {
    result.append(INPUT_ERROR, e.what());
  }

  if (parsed) {
    return parsed;
  }
  result.append(INPUT_ERROR,
                "Could not instantiate a valid molecule from input");
  return parsed;
}
// Writes the output/parent pair into `result.outputMolData` and
// `result.parentMolData`.
//
// The V3000 MolBlock format is used unless `options.outputV2000` is set.
// Failures are never propagated to the caller: they are recorded on `result`
// as OUTPUT_ERROR. This also applies when either molecule of the pair is
// missing, so that the function is safe to call directly with a pair that was
// not vetted by Pipeline::run.
void serialize(RWMOL_SPTR_PAIR output, PipelineResult &result,
               const PipelineOptions &options) {
  // guard against null pointers before dereferencing the pair
  if (!output.first || !output.second) {
    result.append(OUTPUT_ERROR,
                  "Can't write molecule to output format: "
                  "no structure is available for serialization.");
    return;
  }
  const ROMol &outputMol = *output.first;
  const ROMol &parentMol = *output.second;

  try {
    if (!options.outputV2000) {
      result.outputMolData = MolToV3KMolBlock(outputMol);
      result.parentMolData = MolToV3KMolBlock(parentMol);
    } else {
      // a dedicated handler provides a V2000-specific message for the value
      // errors raised by the V2000 writer
      try {
        result.outputMolData = MolToV2KMolBlock(outputMol);
        result.parentMolData = MolToV2KMolBlock(parentMol);
      } catch (const ValueErrorException &e) {
        result.append(OUTPUT_ERROR,
                      "Can't write molecule to V2000 output format: " +
                          std::string(e.what()));
      }
    }
  } catch (const std::exception &e) {
    result.append(OUTPUT_ERROR, "Can't write molecule to output format: " +
                                    std::string(e.what()));
  } catch (...) {
    result.append(
        OUTPUT_ERROR,
        "An unexpected error occurred while serializing the output structures.");
  }
}
// Applies the minimal preprocessing required by the validation stage.
//
// A MolSanitizeException raised along the way is recorded on `result` as
// PREPARE_FOR_VALIDATION_ERROR; the (possibly partially processed) molecule is
// returned in any case.
RWMOL_SPTR prepareForValidation(RWMOL_SPTR mol, PipelineResult &result,
                                const PipelineOptions &) {
  try {
    // The intention is to validate the original input, and the sanitization
    // is therefore kept to a minimum. It's however not very useful to record
    // a valence validation error for issues like a badly drawn nitro group
    // that would be fixed later by the normalization step.
    //
    // Some sanitization is also needed in order to assign the
    // stereochemistry (which must happen before reapplying the wedging, see
    // below), and radicals need to be found in order to support the
    // corresponding validation criterion.
    constexpr unsigned int minimalSanitization =
        (MolOps::SANITIZE_CLEANUP | MolOps::SANITIZE_SYMMRINGS |
         MolOps::SANITIZE_CLEANUP_ORGANOMETALLICS |
         MolOps::SANITIZE_FINDRADICALS);
    unsigned int opThatFailed = 0;
    MolOps::sanitizeMol(*mol, opThatFailed, minimalSanitization);

    // Restoring the original MolBlock wedging may in some cases overwrite the
    // ENDDOWNRIGHT/ENDUPRIGHT info describing the configuration of double
    // bonds adjacent to stereocenters. The stereochemistry is therefore
    // assigned first, and the wedging is restored afterwards.
    MolOps::assignStereochemistry(*mol, /*cleanIt=*/true, /*force=*/true,
                                  /*flagPossible=*/true);
    Chirality::reapplyMolBlockWedging(*mol);
  } catch (MolSanitizeException &) {
    result.append(
        PREPARE_FOR_VALIDATION_ERROR,
        "An error occurred while preparing the molecule for validation.");
  }

  return mol;
}
namespace {
// The error messages from the ValidationMethod classes include some metadata
// in a string prefix ("ERROR: [SomeValidation] " or "INFO: [SomeValidation] ")
// that is not particularly useful within the context of this Pipeline. The
// function below removes that prefix.
//
// The bracketed part is matched with a negated character class rather than
// with a greedy ".+": a greedy match would extend up to the last "] " found in
// the message, and would therefore also strip part of the actual message text
// whenever the latter contains square brackets.
static const std::regex prefix("^(ERROR|INFO): \\[[^\\]]+\\] ");

// Returns `message` without its leading severity/validation-name prefix.
// Messages that don't start with such a prefix are returned unchanged.
std::string removeErrorPrefix(const std::string &message) {
  return std::regex_replace(message, prefix, "");
}
}  // namespace
// Runs the sequence of validation checks on `mol` and records every reported
// issue on `result`, using the status flag associated with each check.
//
// When `options.reportAllFailures` is false the sequence stops right after
// the first check that reports any issue. The input molecule is returned in
// all cases.
RWMOL_SPTR validate(RWMOL_SPTR mol, PipelineResult &result,
                    const PipelineOptions &options) {
  const bool collectAll = options.reportAllFailures;

  // Applies one validation method, logs its findings (stripped of their
  // metadata prefix) under `flag`, and tells whether the sequence must stop.
  const auto stopsAfter = [&mol, &result, collectAll](
                              const ValidationMethod &method,
                              PipelineStatus flag) -> bool {
    const auto findings = method.validate(*mol, collectAll);
    for (const auto &finding : findings) {
      result.append(flag, removeErrorPrefix(finding));
    }
    return !findings.empty() && !collectAll;
  };

  // undesired features in the input molecule (e.g., query atoms/bonds)
  FeaturesValidation featuresCheck(options.allowEnhancedStereo,
                                   options.allowAromaticBondType,
                                   options.allowDativeBondType);
  if (stopsAfter(featuresCheck, FEATURES_VALIDATION_ERROR)) {
    return mol;
  }

  // number of atoms and valence status
  RDKitValidation rdkitCheck;
  if (stopsAfter(rdkitCheck, BASIC_VALIDATION_ERROR)) {
    return mol;
  }

  // radicals are disallowed
  DisallowedRadicalValidation radicalCheck;
  if (stopsAfter(radicalCheck, BASIC_VALIDATION_ERROR)) {
    return mol;
  }

  // isotopic numbers (if any are specified)
  IsotopeValidation isotopeCheck(true);
  if (stopsAfter(isotopeCheck, BASIC_VALIDATION_ERROR)) {
    return mol;
  }

  // the input must be a 2D structure
  Is2DValidation is2DCheck(options.is2DZeroThreshold);
  if (stopsAfter(is2DCheck, IS2D_VALIDATION_ERROR)) {
    return mol;
  }

  // 2D layout (clashing atoms and abnormally long bonds)
  Layout2DValidation layoutCheck(
      options.atomClashLimit, options.bondLengthLimit,
      options.allowLongBondsInRings, options.allowAtomBondClashExemption,
      options.minMedianBondLength);
  if (stopsAfter(layoutCheck, LAYOUT2D_VALIDATION_ERROR)) {
    return mol;
  }

  // formal correctness of the specified stereochemistry; this is the last
  // check, there is nothing left to skip afterwards
  StereoValidation stereoCheck;
  stopsAfter(stereoCheck, STEREO_VALIDATION_ERROR);

  return mol;
}
// Sanitizes `mol` (with the default sanitization settings) ahead of the
// standardization stage. A MolSanitizeException is recorded on `result` as
// PREPARE_FOR_STANDARDIZATION_ERROR; the molecule is returned in any case.
RWMOL_SPTR prepareForStandardization(RWMOL_SPTR mol, PipelineResult &result,
                                     const PipelineOptions &) {
  try {
    RWMol &target = *mol;
    MolOps::sanitizeMol(target);
  } catch (MolSanitizeException &) {
    result.append(
        PREPARE_FOR_STANDARDIZATION_ERROR,
        "An error occurred while preparing the molecule for standardization.");
  }

  return mol;
}
// Applies the standardization transformations to `mol`, in this order:
//   1. disconnection of the bonds to metals (patterns from
//      `options.metalNof` / `options.metalNon`)
//   2. normalization of functional groups (transformations from
//      `options.normalizerData`, or the Normalizer defaults when empty)
//   3. selection of the largest fragment
// and finally reassigns the stereochemistry.
//
// After each step the SMILES is compared with the one from before the step;
// a difference is recorded on `result` with the matching informational flag
// (METALS_DISCONNECTED, NORMALIZATION_APPLIED, FRAGMENTS_REMOVED).
//
// If a step fails, the corresponding *_STANDARDIZATION_ERROR is recorded and
// the molecule is returned immediately, skipping the remaining steps.
RWMOL_SPTR standardize(RWMOL_SPTR mol, PipelineResult &result,
                       const PipelineOptions &options) {
  auto smiles = MolToSmiles(*mol);
  auto reference = smiles;

  // bonding to metals
  try {
    MetalDisconnectorOptions mdOpts;
    MetalDisconnector metalDisconnector(mdOpts);
    std::unique_ptr<ROMol> metalNof{SmartsToMol(options.metalNof)};
    std::unique_ptr<ROMol> metalNon{SmartsToMol(options.metalNon)};
    // SmartsToMol may return a null pointer when a pattern can't be parsed.
    // The patterns come from user-configurable options, so they must be
    // checked before being dereferenced (a null dereference would not be
    // intercepted by the catch clause below).
    if (!metalNof || !metalNon) {
      result.append(METAL_STANDARDIZATION_ERROR,
                    "The SMARTS patterns configured for the disconnection of "
                    "metal species could not be parsed.");
      return mol;
    }
    metalDisconnector.setMetalNof(*metalNof);
    metalDisconnector.setMetalNon(*metalNon);
    metalDisconnector.disconnectInPlace(*mol);
  } catch (...) {
    result.append(
        METAL_STANDARDIZATION_ERROR,
        "An error occurred while processing the bonding of metal species.");
    return mol;
  }

  smiles = MolToSmiles(*mol);
  if (smiles != reference) {
    result.append(METALS_DISCONNECTED,
                  "One or more metal atoms were disconnected.");
  }
  reference = smiles;

  // functional groups
  try {
    std::unique_ptr<Normalizer> normalizer{};
    if (options.normalizerData.empty()) {
      normalizer.reset(new Normalizer);
    } else {
      std::istringstream sstr(options.normalizerData);
      normalizer.reset(new Normalizer(sstr, options.normalizerMaxRestarts));
    }

    // normalizeInPlace() may return an ill-formed molecule if
    // the sanitization of a transformed structure failed
    // => use normalize() instead (also see GitHub #7189)
    mol.reset(static_cast<RWMol *>(normalizer->normalize(*mol)));
    mol->updatePropertyCache(false);
  } catch (...) {
    result.append(
        NORMALIZER_STANDARDIZATION_ERROR,
        "An error occurred while normalizing the representation of some functional groups");
    return mol;
  }

  smiles = MolToSmiles(*mol);
  if (smiles != reference) {
    result.append(NORMALIZATION_APPLIED,
                  "The representation of some functional groups was adjusted.");
  }
  reference = smiles;

  // keep the largest fragment
  try {
    LargestFragmentChooser fragmentChooser;
    fragmentChooser.chooseInPlace(*mol);
  } catch (...) {
    result.append(
        FRAGMENT_STANDARDIZATION_ERROR,
        "An error occurred while removing the disconnected fragments");
    return mol;
  }

  smiles = MolToSmiles(*mol);
  if (smiles != reference) {
    result.append(
        FRAGMENTS_REMOVED,
        "One or more disconnected fragments (e.g., counterions) were removed.");
  }

  // The stereochemistry is not assigned until after we are done modifying the
  // molecular graph:
  constexpr bool cleanIt = true;
  constexpr bool force = true;
  constexpr bool flagPossible = true;
  MolOps::assignStereochemistry(*mol, cleanIt, force, flagPossible);

  return mol;
}
// Restores the bond wedging from the input MolBlock, while avoiding wavy
// bonds because of their ambiguity in some configurations.
//
// The procedure has three parts:
//   1. the MolBlock wedging is reapplied
//   2. double bonds that were STEREOANY beforehand, came out as STEREONONE and
//      have an adjacent bond with direction UNKNOWN are set back to STEREOANY
//   3. every bond with direction UNKNOWN is changed into a plain bond
//      (direction NONE)
// The changes made in parts 2 and 3 are recorded on `result` as
// NORMALIZATION_APPLIED, one log entry per affected bond.
RWMOL_SPTR reapplyWedging(RWMOL_SPTR mol, PipelineResult &result,
                          const PipelineOptions &) {
  // keep track of the stereo label each bond has before touching the wedging
  std::map<unsigned int, Bond::BondStereo> stereoBefore;
  for (auto bond : mol->bonds()) {
    stereoBefore[bond->getIdx()] = bond->getStereo();
  }

  // 1) restore the original wedging from the input MolBlock
  Chirality::reapplyMolBlockWedging(*mol);

  // tells whether any other bond on either end of `doubleBond` is wavy
  const auto touchesWavyBond = [&mol](Bond *doubleBond) -> bool {
    for (auto endAtom :
         {doubleBond->getBeginAtom(), doubleBond->getEndAtom()}) {
      for (auto neighborBond : mol->atomBonds(endAtom)) {
        if (neighborBond != doubleBond &&
            neighborBond->getBondDir() == Bond::UNKNOWN) {
          return true;
        }
      }
    }
    return false;
  };

  // 2) revert the changes related to double bonds with stereo type "either":
  // restore the STEREOANY label of double bonds that have a substituent
  // with direction UNKNOWN and are now STEREONONE
  for (auto bond : mol->bonds()) {
    if (bond->getBondType() != Bond::DOUBLE) {
      continue;
    }
    const bool lostEitherLabel =
        stereoBefore[bond->getIdx()] == Bond::STEREOANY &&
        bond->getStereo() == Bond::STEREONONE;
    if (lostEitherLabel && touchesWavyBond(bond)) {
      bond->setStereo(Bond::STEREOANY);
      result.append(
          NORMALIZATION_APPLIED,
          "Double bond " + std::to_string(bond->getIdx()) +
              " was assigned an undefined/unknown stereochemical configuration");
    }
  }

  // 3) set the bond direction to NONE for bonds with direction UNKNOWN
  for (auto bond : mol->bonds()) {
    if (bond->getBondDir() == Bond::UNKNOWN) {
      bond->setBondDir(Bond::NONE);
      result.append(NORMALIZATION_APPLIED, "The \"wavy\" style of bond " +
                                               std::to_string(bond->getIdx()) +
                                               " was removed");
    }
  }

  return mol;
}
// Rescales the atom coordinates so that the median bond length becomes
// `options.scaledMedianBondLength`, and sets the z coordinates to 0 while
// doing so (some z coords may be non-null albeit smaller than the validation
// threshold - these noisy coords may in some cases also interfere with the
// perception of stereochemistry by some tools, e.g., inchi).
//
// The molecule is returned untouched when the target length is not positive,
// when there are no conformers, or when the current median bond length does
// not exceed `options.minMedianBondLength`.
RWMOL_SPTR cleanup2D(RWMOL_SPTR mol, PipelineResult & /*result*/,
                     const PipelineOptions &options) {
  if (!(options.scaledMedianBondLength > 0.) || !mol->getNumConformers()) {
    return mol;
  }

  auto &conf = mol->getConformer();
  const double currentMedian =
      sqrt(Layout2DValidation::squaredMedianBondLength(*mol, conf));
  if (!(currentMedian > options.minMedianBondLength)) {
    return mol;
  }

  const double factor = options.scaledMedianBondLength / currentMedian;
  for (unsigned int idx = 0, count = conf.getNumAtoms(); idx < count; ++idx) {
    auto scaled = conf.getAtomPos(idx) * factor;
    scaled.z = 0.;
    conf.setAtomPos(idx, scaled);
  }

  return mol;
}
namespace {
// Converts every dative bond of `mol` into a single bond between formally
// charged atoms: the begin atom (treated as the donor) gains a positive
// charge, the end atom (the acceptor) gains a negative one. The property
// cache is refreshed (non-strict) only if at least one bond was converted.
void replaceDativeBonds(RWMOL_SPTR mol) {
  unsigned int numConverted = 0;
  for (auto bond : mol->bonds()) {
    if (bond->getBondType() == Bond::BondType::DATIVE) {
      auto donor = bond->getBeginAtom();
      donor->setFormalCharge(donor->getFormalCharge() + 1);
      auto acceptor = bond->getEndAtom();
      acceptor->setFormalCharge(acceptor->getFormalCharge() - 1);
      bond->setBondType(Bond::BondType::SINGLE);
      ++numConverted;
    }
  }
  if (numConverted != 0) {
    mol->updatePropertyCache(false);
  }
}
// Removes from the molecular graph the hydrogen atoms of degree 1 that are
// bonded to a positively charged neighbor. For each removed hydrogen, the
// explicit H count of its neighbor is incremented by one. The property cache
// is refreshed (non-strict) only if at least one atom was removed.
void removeHsAtProtonatedSites(RWMOL_SPTR mol) {
  // indices of the hydrogens to be removed, in ascending order
  std::vector<unsigned int> protonIndices;
  for (auto atom : mol->atoms()) {
    if (atom->getAtomicNum() == 1 && atom->getDegree() == 1) {
      bool onChargedSite = false;
      for (auto neighbor : mol->atomNeighbors(atom)) {
        if (neighbor->getFormalCharge() > 0) {
          onChargedSite = true;
        }
      }
      if (onChargedSite) {
        protonIndices.push_back(atom->getIdx());
      }
    }
  }
  if (protonIndices.empty()) {
    return;
  }

  // remove from the highest index down, so that the indices that are still
  // to be processed are not shifted by the removals
  for (auto it = protonIndices.rbegin(); it != protonIndices.rend(); ++it) {
    auto atom = mol->getAtomWithIdx(*it);
    for (auto bond : mol->atomBonds(atom)) {
      auto neighbor = bond->getOtherAtom(atom);
      neighbor->setNumExplicitHs(neighbor->getNumExplicitHs() + 1);
      break;  // there are no other bonds anyways
    }
    mol->removeAtom(atom);
  }
  mol->updatePropertyCache(false);
}
} // namespace
// Builds the {output, parent} pair for the standardized molecule.
//
// A "parent" structure is constructed here, in order to provide a
// representation of the original input that may be more suitable for
// identification purposes even though it may not reflect the most stable
// physical state or nicest representation for the compound.
//
// The two steps that are currently implemented for this procedure consist in
// normalizing the overall charge status and replacing any explicit dative
// bonds.
//
// If the input was submitted in an unsuitable protonation status, the
// neutralized parent structure may become the actual output from the
// standardization (PROTONATION_CHANGED is then recorded on `result` when the
// SMILES differs from the initial one).
//
// A pair of null pointers is returned, and CHARGE_STANDARDIZATION_ERROR is
// recorded, if the charge normalization fails.
RWMOL_SPTR_PAIR makeParent(RWMOL_SPTR mol, PipelineResult &result,
                           const PipelineOptions &) {
  const auto initialSmiles = MolToSmiles(*mol);

  RWMOL_SPTR parent{new RWMol(*mol)};

  // overall charge status
  try {
    // The Uncharger implementation wouldn't identify the positively
    // charged sites with adjacent explicit Hs correctly (it's a quite
    // unlikely configuration, but potentially possible considering that
    // the pipeline operates on unsanitized input).
    //
    // If present, these Hs are therefore removed from the molecular graph
    // prior to neutralization.
    removeHsAtProtonatedSites(parent);

    constexpr bool canonicalOrdering = false;
    constexpr bool force = true;
    constexpr bool protonationOnly = true;
    Uncharger uncharger(canonicalOrdering, force, protonationOnly);
    uncharger.unchargeInPlace(*parent);
  } catch (...) {
    result.append(
        CHARGE_STANDARDIZATION_ERROR,
        "An error occurred while normalizing the compound's charge status");
    return {{}, {}};
  }

  // sum of the formal charges over all atoms
  const auto netCharge = [](const RWMol &m) {
    int total{};
    for (auto atom : m.atoms()) {
      total += atom->getFormalCharge();
    }
    return total;
  };

  // Check if `mol` was submitted in a suitable ionization state
  const int parentCharge = netCharge(*parent);
  const int molCharge = netCharge(*mol);

  // If mol is neutral or in a protonation state that partially or fully
  // balances the non-neutralizable charged sites in the parent structure,
  // then mol is accepted. Otherwise, it is replaced by its parent.
  const bool excessPositive = molCharge > 0 && molCharge > parentCharge;
  const bool excessNegative = molCharge < 0 && molCharge < parentCharge;
  if (excessPositive || excessNegative) {
    mol = parent;
  }

  if (MolToSmiles(*mol) != initialSmiles) {
    result.append(PROTONATION_CHANGED, "The protonation state was adjusted.");
  }

  // normalize the dative bonds
  replaceDativeBonds(parent);

  return {mol, parent};
}
} // namespace Operations
} // namespace MolStandardize
} // namespace RDKit

View File

@@ -0,0 +1,234 @@
//
// Copyright (C) 2023 Novartis Biomedical Research
//
// @@ All Rights Reserved @@
// This file is part of the RDKit.
// The contents are covered by the terms of the BSD license
// which is included in the file license.txt, found at the root
// of the RDKit source tree.
//
#ifndef RD_MOLSTANDARDIZE_PIPELINE_H
#define RD_MOLSTANDARDIZE_PIPELINE_H
#include <RDGeneral/export.h>
#include <GraphMol/RWMol.h>
#include <memory>
#include <string>
#include <utility>
#include <vector>
namespace RDKit {
namespace MolStandardize {
// Configuration settings shared by all the pipeline operations. Every
// operation receives the same options instance and reads the members that are
// relevant to its stage.
struct RDKIT_MOLSTANDARDIZE_EXPORT PipelineOptions {
// parsing
// forwarded to the MolBlock parser's `strictParsing` parameter
bool strictParsing{false};
// validation
// when false, the pipeline and the validation sequence stop at the first
// recorded error; when true, processing continues and all errors are collected
bool reportAllFailures{true};
// when true, an input with zero atoms skips validation/standardization and is
// passed directly to serialization
bool allowEmptyMolecules{false};
// the three flags below are forwarded to FeaturesValidation
bool allowEnhancedStereo{false};
bool allowAromaticBondType{false};
bool allowDativeBondType{false};
// forwarded to Is2DValidation
double is2DZeroThreshold{1e-3};
// the settings below are forwarded to Layout2DValidation;
// minMedianBondLength is also the minimum median bond length required for the
// coordinates to be rescaled by the 2D cleanup
double atomClashLimit{0.03};
double minMedianBondLength{1e-3};
double bondLengthLimit{100.};
bool allowLongBondsInRings{true};
bool allowAtomBondClashExemption{true};
// cleanup/standardization
// metal disconnector options
// SMARTS patterns passed to MetalDisconnector::setMetalNof/setMetalNon
std::string metalNof{"[Li,Na,K,Rb,Cs,Fr]~[#7,#8,F]"};
std::string metalNon{};
// normalizer options
// tab-separated "name<TAB>SMIRKS" transformations, one per line; when this
// string is empty a default-constructed Normalizer is used instead
std::string normalizerData{
"// Name\tSMIRKS\n"
"Nitro to N+(O-)=O\t[N,P,As,Sb;X3:1](=[O,S,Se,Te:2])=[O,S,Se,Te:3]>>[*+1:1]([*-1:2])=[*:3]\n"
"Sulfone to S(=O)(=O)\t[S+2:1]([O-:2])([O-:3])>>[S+0:1](=[O-0:2])(=[O-0:3])\n"
"Pyridine oxide to n+O-\t[nH0+0:1]=[OH0+0:2]>>[n+:1][O-:2]\n"
"Azide to N=N+=N-\t[*:1][N:2]=[N:3]#[N:4]>>[*:1][N:2]=[N+:3]=[N-:4]\n"
"Diazo/azo to =N+=N-\t[*:1]=[N:2]#[N:3]>>[*:1]=[N+:2]=[N-:3]\n"
// Note: the sulfoxide transformation that is included by default in the
// Normalizer configuration was removed.
// Note: the transformation below was ported from STRUCHK and it's not part
// of the default Normalizer configuration.
"[SH](=O)(=O) to S(=O)O\t[c,C,N,O,F,Cl,Br,I:1][SH+0:2](=[O:3])=[O:4]>>[*:1][*:2]([*:3])=[*:4]\n"
// Note: the two transformations below replace the default Phosphate
// normalization in order to ensure that, if an O is available, the double
// bond is placed between P and O
"Phosphate to P(O-)=O\t[O-:1][P+;D4:2][O,S,Se,Te;-1:3]>>[O+0:1]=[P+0;D5:2][*-1:3]\n"
"Generalized phosphate to P(X-)=Y\t[S,Se,Te;-1:1][P+;D4:2][S,Se,Te;-1:3]>>[*+0:1]=[P+0;D5:2][*-1:3]\n"
"C/S+N to C/S=N+\t[C,S&!$([S+]-[O-]);X3+1:1]([NX3:2])[NX3!H0:3]>>[*+0:1]([N:2])=[N+:3]\n"
"P+N to P=N+\t[P;X4+1:1]([NX3:2])[NX3!H0:3]>>[*+0:1]([N:2])=[N+:3]\n"
"Recombine 1,3-separated charges\t[N,P,As,Sb,O,S,Se,Te;-1:1]-[A+0:2]=[N,P,As,Sb,O,S,Se,Te;+1:3]>>[*-0:1]=[*:2]-[*+0:3]\n"
"Recombine 1,3-separated charges\t[n,o,p,s;-1:1]:[a:2]=[N,O,P,S;+1:3]>>[*-0:1]:[*:2]-[*+0:3]\n"
"Recombine 1,3-separated charges\t[N,O,P,S;-1:1]-[a+0:2]:[n,o,p,s;+1:3]>>[*-0:1]=[*:2]:[*+0:3]\n"
"Recombine 1,5-separated charges\t[N,P,As,Sb,O,S,Se,Te;-1:1]-[A+0:2]=[A:3]-[A:4]=[N,P,As,Sb,O,S,Se,Te;+1:5]>>[*-0:1]=[*:2]-[*:3]=[*:4]-[*+0:5]\n"
"Recombine 1,5-separated charges\t[n,o,p,s;-1:1]:[a:2]:[a:3]:[c:4]=[N,O,P,S;+1:5]>>[*-0:1]:[*:2]:[*:3]:[c:4]-[*+0:5]\n"
"Recombine 1,5-separated charges\t[N,O,P,S;-1:1]-[c:2]:[a:3]:[a:4]:[n,o,p,s;+1:5]>>[*-0:1]=[c:2]:[*:3]:[*:4]:[*+0:5]\n"
// Note: four transformations were added to the normalization of aliphatic
// conjugated cations in order to favor the positioning of new double bonds
// within rings
"Normalize 1,3 conjugated cation\t[N;+0!H0:1]@-[A:2]=[N!$(*~[N,O,P,S;-1]),O;+1H0:3]>>[*+1:1]=[*:2]-[*+0:3]\n"
"Normalize 1,5 conjugated cation\t[N;+0!H0:1]@-[A:2]=[A:3]@-[A:4]=[N!$(*~[N,O,P,S;-1]),O;+1H0:5]>>[*+1:1]=[*:2]-[*:3]=[*:4]-[*+0:5]\n"
"Normalize 1,3 conjugated cation\t[N,O!$(*N);+0!H0:1]-[A:2]=[N!$(*~[N,O,P,S;-1]),O;+1H0:3]>>[*+1:1]=[*:2]-[*+0:3]\n"
"Normalize 1,3 conjugated cation\t[n;+0!H0:1]:[c:2]=[N!$(*~[N,O,P,S;-1]),O;+1H0:3]>>[*+1:1]:[*:2]-[*+0:3]\n"
"Normalize 1,5 conjugated cation\t[N;+0!H0:1]@-[A:2]=[A:3]-[A:4]=[N!$(*~[N,O,P,S;-1]),O;+1H0:5]>>[*+1:1]=[*:2]-[*:3]=[*:4]-[*+0:5]\n"
"Normalize 1,5 conjugated cation\t[N,O!$(*N);+0!H0:1]-[A:2]=[A:3]@-[A:4]=[N!$(*~[N,O,P,S;-1]),O;+1H0:5]>>[*+1:1]=[*:2]-[*:3]=[*:4]-[*+0:5]\n"
"Normalize 1,5 conjugated cation\t[N,O!$(*N);+0!H0:1]-[A:2]=[A:3]-[A:4]=[N!$(*~[N,O,P,S;-1]),O;+1H0:5]>>[*+1:1]=[*:2]-[*:3]=[*:4]-[*+0:5]\n"
"Normalize 1,5 conjugated cation\t[n;+0!H0:1]:[a:2]:[a:3]:[c:4]=[N!$(*~[N,O,P,S;-1]),O;+1H0:5]>>[n+1:1]:[*:2]:[*:3]:[*:4]-[*+0:5]\n"
"Charge normalization\t[F,Cl,Br,I,At;-1:1]=[O:2]>>[*-0:1][O-:2]\n"
"Charge recombination\t[N,P,As,Sb;-1:1]=[C+;v3:2]>>[*+0:1]#[C+0:2]\n"};
// passed to the Normalizer constructor together with normalizerData (it is
// not used when normalizerData is empty)
unsigned int normalizerMaxRestarts{200};
// target median bond length for the 2D cleanup; the coordinates are not
// rescaled unless this value is positive
double scaledMedianBondLength{1.};
// serialization
// when true the output is written as V2000 MolBlocks, otherwise as V3000
bool outputV2000{false};
};
// Bit flags describing the events recorded while running the pipeline.
// Individual flags are OR-ed into PipelineResult::status, and each log entry
// carries the single flag it was recorded with. The *_ERROR aggregate values
// are masks that combine the related individual flags.
enum RDKIT_MOLSTANDARDIZE_EXPORT PipelineStatus {
NO_EVENT = 0,
// errors (bits 0-12)
INPUT_ERROR = (1 << 0),
PREPARE_FOR_VALIDATION_ERROR = (1 << 1),
FEATURES_VALIDATION_ERROR = (1 << 2),
BASIC_VALIDATION_ERROR = (1 << 3),
IS2D_VALIDATION_ERROR = (1 << 4),
LAYOUT2D_VALIDATION_ERROR = (1 << 5),
STEREO_VALIDATION_ERROR = (1 << 6),
// mask: any validation error
VALIDATION_ERROR = (FEATURES_VALIDATION_ERROR | BASIC_VALIDATION_ERROR |
IS2D_VALIDATION_ERROR | LAYOUT2D_VALIDATION_ERROR |
STEREO_VALIDATION_ERROR),
PREPARE_FOR_STANDARDIZATION_ERROR = (1 << 7),
METAL_STANDARDIZATION_ERROR = (1 << 8),
NORMALIZER_STANDARDIZATION_ERROR = (1 << 9),
FRAGMENT_STANDARDIZATION_ERROR = (1 << 10),
CHARGE_STANDARDIZATION_ERROR = (1 << 11),
// mask: any standardization error
STANDARDIZATION_ERROR =
(METAL_STANDARDIZATION_ERROR | NORMALIZER_STANDARDIZATION_ERROR |
FRAGMENT_STANDARDIZATION_ERROR | CHARGE_STANDARDIZATION_ERROR),
OUTPUT_ERROR = (1 << 12),
// mask: any error; this is the mask tested by Pipeline::run to decide
// whether to stop when reportAllFailures is disabled
PIPELINE_ERROR = (INPUT_ERROR | PREPARE_FOR_VALIDATION_ERROR |
VALIDATION_ERROR | PREPARE_FOR_STANDARDIZATION_ERROR |
STANDARDIZATION_ERROR | OUTPUT_ERROR),
// informational flags (bits 23-26): the structure was modified
METALS_DISCONNECTED = (1 << 23),
NORMALIZATION_APPLIED = (1 << 24),
FRAGMENTS_REMOVED = (1 << 25),
PROTONATION_CHANGED = (1 << 26),
// mask: any structure modification
STRUCTURE_MODIFICATION = (METALS_DISCONNECTED | NORMALIZATION_APPLIED |
FRAGMENTS_REMOVED | PROTONATION_CHANGED)
};
// Identifiers of the built-in pipeline stages, listed in execution order.
// The value of the stage being executed is stored (converted to
// std::uint32_t) in PipelineResult::stage; COMPLETED is set only when the
// pipeline ran through to the end.
enum class RDKIT_MOLSTANDARDIZE_EXPORT PipelineStage : std::uint32_t {
NOT_STARTED = 0,
PARSING_INPUT,
PREPARE_FOR_VALIDATION,
VALIDATION,
PREPARE_FOR_STANDARDIZATION,
STANDARDIZATION,
REAPPLY_WEDGING,
CLEANUP_2D,
MAKE_PARENT,
SERIALIZING_OUTPUT,
COMPLETED
};
// A single event recorded by the pipeline.
struct RDKIT_MOLSTANDARDIZE_EXPORT PipelineLogEntry {
// the flag the event was recorded with
PipelineStatus status;
// human-readable description of the event
std::string detail;
};
// The sequence of recorded events, in the order they were appended.
using PipelineLog = std::vector<PipelineLogEntry>;
// The outcome of a pipeline run.
struct RDKIT_MOLSTANDARDIZE_EXPORT PipelineResult {
  // Bitwise OR of the PipelineStatus flags of all the recorded events.
  // Value-initialized to 0 (i.e. NO_EVENT): append() reads this member, so a
  // default-constructed result must not hold an indeterminate value when it
  // is passed directly to one of the pipeline operations.
  PipelineStatus status{};
  // Identifier of the last stage that was entered.
  // Value-initialized to 0 (i.e. PipelineStage::NOT_STARTED).
  std::uint32_t stage{};
  // The recorded events, in the order they occurred.
  PipelineLog log;
  // The input text, as passed to the pipeline.
  std::string inputMolData;
  // The serialized output structure (filled in by the serialization stage).
  std::string outputMolData;
  // The serialized parent structure (filled in by the serialization stage).
  std::string parentMolData;

  // Records an event: ORs `newStatus` into `status` and appends a log entry
  // holding `newStatus` and `info`.
  void append(PipelineStatus newStatus, const std::string &info);
};
using RWMOL_SPTR_PAIR = std::pair<RWMOL_SPTR, RWMOL_SPTR>;
// The built-in pipeline operations, the function pointer types used to
// configure a Pipeline, and the default sequences of steps.
namespace Operations {
// Molecule-processing steps. They all share the same signature: each takes
// the molecule, the result where events are recorded and the pipeline
// options, and returns the molecule to be handed to the next step.
RDKIT_MOLSTANDARDIZE_EXPORT RWMOL_SPTR prepareForValidation(
RWMOL_SPTR mol, PipelineResult &result, const PipelineOptions &options);
RDKIT_MOLSTANDARDIZE_EXPORT RWMOL_SPTR validate(RWMOL_SPTR mol,
PipelineResult &result,
const PipelineOptions &options);
RDKIT_MOLSTANDARDIZE_EXPORT RWMOL_SPTR prepareForStandardization(
RWMOL_SPTR mol, PipelineResult &result, const PipelineOptions &options);
RDKIT_MOLSTANDARDIZE_EXPORT RWMOL_SPTR standardize(
RWMOL_SPTR mol, PipelineResult &result, const PipelineOptions &options);
RDKIT_MOLSTANDARDIZE_EXPORT RWMOL_SPTR reapplyWedging(
RWMOL_SPTR mol, PipelineResult &result, const PipelineOptions &options);
RDKIT_MOLSTANDARDIZE_EXPORT RWMOL_SPTR cleanup2D(
RWMOL_SPTR mol, PipelineResult &result, const PipelineOptions &options);
// Returns the {output, parent} pair built from the standardized molecule.
RDKIT_MOLSTANDARDIZE_EXPORT RWMOL_SPTR_PAIR makeParent(
RWMOL_SPTR mol, PipelineResult &result, const PipelineOptions &options);
// Builds a molecule from the input text.
RDKIT_MOLSTANDARDIZE_EXPORT RWMOL_SPTR parse(const std::string &molblock,
PipelineResult &result,
const PipelineOptions &options);
// Writes the {output, parent} pair into the result's output/parent data.
RDKIT_MOLSTANDARDIZE_EXPORT void serialize(RWMOL_SPTR_PAIR output,
PipelineResult &result,
const PipelineOptions &options);
// Function pointer types, derived from the signatures declared above.
using ParseOperation = decltype(&parse);
using SerializeOperation = decltype(&serialize);
using Operation = decltype(&prepareForValidation);
using ParentOperation = decltype(&makeParent);
// An ordered list of {stage identifier, operation} pairs. The identifier is
// the value stored in PipelineResult::stage while the operation runs.
using PipelineVector = std::vector<std::pair<std::uint32_t, Operation>>;
// Default validation sequence.
const PipelineVector validationSteps{
// input sanitization and cleanup
{static_cast<uint32_t>(PipelineStage::PREPARE_FOR_VALIDATION),
&prepareForValidation},
// validate the structure
{static_cast<uint32_t>(PipelineStage::VALIDATION), &validate}};
// Default standardization sequence.
const PipelineVector standardizationSteps{
{static_cast<uint32_t>(PipelineStage::PREPARE_FOR_STANDARDIZATION),
&prepareForStandardization},
{static_cast<uint32_t>(PipelineStage::STANDARDIZATION), &standardize},
{static_cast<uint32_t>(PipelineStage::REAPPLY_WEDGING), &reapplyWedging},
{static_cast<uint32_t>(PipelineStage::CLEANUP_2D), &cleanup2D}};
} // namespace Operations
class RDKIT_MOLSTANDARDIZE_EXPORT Pipeline {
private:
PipelineOptions options;
Operations::ParseOperation parse = Operations::parse;
Operations::SerializeOperation serialize = Operations::serialize;
Operations::PipelineVector validationSteps = Operations::validationSteps;
Operations::PipelineVector standardizationSteps =
Operations::standardizationSteps;
Operations::ParentOperation makeParent = Operations::makeParent;
public:
Pipeline() = default;
explicit Pipeline(const PipelineOptions &o) : options(o){};
~Pipeline() = default;
PipelineResult run(const std::string &molblock) const;
void setValidationSteps(const Operations::PipelineVector &steps) {
validationSteps = steps;
}
void setStandardizationSteps(const Operations::PipelineVector &steps) {
standardizationSteps = steps;
}
void setMakeParent(Operations::ParentOperation op) { makeParent = op; }
void setParse(Operations::ParseOperation op) { parse = op; }
void setSerialize(Operations::SerializeOperation op) { serialize = op; }
private:
};
} // namespace MolStandardize
} // namespace RDKit
#endif

View File

@@ -11,18 +11,20 @@
#include "Fragment.h"
#include <GraphMol/RDKitBase.h>
#include <GraphMol/ROMol.h>
#include <GraphMol/QueryOps.h>
#include <GraphMol/MolStandardize/FragmentCatalog/FragmentCatalogParams.h>
#include <GraphMol/Substruct/SubstructMatch.h>
#include <GraphMol/PeriodicTable.h>
#include <algorithm>
#include <cmath>
#include <iostream>
#include <vector>
#include <string>
#include <utility>
#include <vector>
#include <GraphMol/SmilesParse/SmilesParse.h>
#include <GraphMol/SmilesParse/SmilesWrite.h>
using namespace std;
using namespace RDKit;
namespace RDKit {
class RWMol;
class ROMol;
@@ -30,10 +32,9 @@ class ROMol;
namespace MolStandardize {
std::vector<ValidationErrorInfo> CompositeValidation::validate(
const ROMol &mol, bool reportAllFailures) const
{
const ROMol &mol, bool reportAllFailures) const {
std::vector<ValidationErrorInfo> errors;
for (const auto & method : validations) {
for (const auto &method : validations) {
auto partial = method->validate(mol, reportAllFailures);
if (!partial.empty()) {
std::copy(partial.begin(), partial.end(), std::back_inserter(errors));
@@ -73,8 +74,8 @@ std::vector<ValidationErrorInfo> RDKitValidation::validate(
return errors;
}
std::vector<ValidationErrorInfo>
NoAtomValidation::validate(const ROMol &mol, bool /*reportAllFailures*/) const {
std::vector<ValidationErrorInfo> NoAtomValidation::validate(
const ROMol &mol, bool /*reportAllFailures*/) const {
std::vector<ValidationErrorInfo> errors;
unsigned int na = mol.getNumAtoms();
if (!na) {
@@ -83,8 +84,8 @@ NoAtomValidation::validate(const ROMol &mol, bool /*reportAllFailures*/) const {
return errors;
}
std::vector<ValidationErrorInfo>
FragmentValidation::validate(const ROMol &mol, bool reportAllFailures) const {
std::vector<ValidationErrorInfo> FragmentValidation::validate(
const ROMol &mol, bool reportAllFailures) const {
std::vector<ValidationErrorInfo> errors;
// REVIEW: reportAllFailures is not being used here. is that correct?
RDUNUSED_PARAM(reportAllFailures);
@@ -145,8 +146,8 @@ FragmentValidation::validate(const ROMol &mol, bool reportAllFailures) const {
return errors;
}
std::vector<ValidationErrorInfo>
NeutralValidation::validate(const ROMol &mol, bool /*reportAllFailures*/) const {
std::vector<ValidationErrorInfo> NeutralValidation::validate(
const ROMol &mol, bool /*reportAllFailures*/) const {
std::vector<ValidationErrorInfo> errors;
int charge = RDKit::MolOps::getFormalCharge(mol);
if (charge != 0) {
@@ -162,51 +163,48 @@ NeutralValidation::validate(const ROMol &mol, bool /*reportAllFailures*/) const
return errors;
}
std::vector<ValidationErrorInfo>
IsotopeValidation::validate(const ROMol &mol, bool reportAllFailures) const {
std::vector<ValidationErrorInfo> IsotopeValidation::validate(
const ROMol &mol, bool reportAllFailures) const {
std::vector<ValidationErrorInfo> errors;
unsigned int na = mol.getNumAtoms();
std::set<string> isotopes;
// loop over atoms
for (size_t i = 0; i < na; ++i) {
if (!reportAllFailures) {
if (errors.size() >= 1) {
break;
}
}
const Atom *atom = mol.getAtomWithIdx(i);
for (auto atom : mol.atoms()) {
unsigned int isotope = atom->getIsotope();
if (isotope != 0) {
std::string symbol = atom->getSymbol();
isotopes.insert(std::to_string(isotope) + symbol);
if (isotope == 0) {
continue;
}
}
for (auto &isotope : isotopes) {
errors.push_back("INFO: [IsotopeValidation] Molecule contains isotope " +
isotope);
std::string symbol = atom->getSymbol();
unsigned int atomicNum = atom->getAtomicNum();
if (atomicNum && strict) {
PeriodicTable *periodicTable = PeriodicTable::getTable();
double mass = periodicTable->getMassForIsotope(atomicNum, isotope);
if (mass == 0.0) {
errors.push_back(
"ERROR: [IsotopeValidation] The molecule contains an unknown isotope: " +
std::to_string(isotope) + symbol);
}
} else {
errors.push_back("INFO: [IsotopeValidation] Molecule contains isotope " +
std::to_string(isotope) + symbol);
}
if (!errors.empty() && !reportAllFailures) {
break;
}
}
return errors;
}
// constructor
MolVSValidation::MolVSValidation()
: CompositeValidation({
std::make_shared<NoAtomValidation>(),
std::make_shared<FragmentValidation>(),
std::make_shared<NeutralValidation>(),
std::make_shared<IsotopeValidation>()
})
{
}
: CompositeValidation({std::make_shared<NoAtomValidation>(),
std::make_shared<FragmentValidation>(),
std::make_shared<NeutralValidation>(),
std::make_shared<IsotopeValidation>()}) {}
// overloaded constructor
MolVSValidation::MolVSValidation(
const std::vector<std::shared_ptr<ValidationMethod>> & validations)
: CompositeValidation(validations)
{
}
const std::vector<std::shared_ptr<ValidationMethod>> &validations)
: CompositeValidation(validations) {}
std::vector<ValidationErrorInfo> AllowedAtomsValidation::validate(
const ROMol &mol, bool reportAllFailures) const {
@@ -267,6 +265,735 @@ std::vector<ValidationErrorInfo> DisallowedAtomsValidation::validate(
return errors;
}
std::vector<ValidationErrorInfo> DisallowedRadicalValidation::validate(
const ROMol &mol, bool reportAllFailures) const {
std::vector<ValidationErrorInfo> errors;
for (auto atom : mol.atoms()) {
unsigned int numRadicalElectrons = atom->getNumRadicalElectrons();
if (numRadicalElectrons == 0) {
continue;
}
unsigned int atomicNum = atom->getAtomicNum();
unsigned int degree = atom->getDegree();
if ((atomicNum == 7 || atomicNum == 8) && numRadicalElectrons == 1 &&
degree == 1) {
unsigned int neighborAtomicNum = 0;
Bond::BondType bondType = Bond::BondType::UNSPECIFIED;
for (auto neighbor : mol.atomNeighbors(atom)) {
// only one iteration is performed, because degree == 1
neighborAtomicNum = neighbor->getAtomicNum();
bondType = mol.getBondBetweenAtoms(atom->getIdx(), neighbor->getIdx())
->getBondType();
}
if (atomicNum == 7 && neighborAtomicNum == 8 &&
bondType == Bond::BondType::DOUBLE) {
// nitric oxide
continue;
}
if (atomicNum == 8 && neighborAtomicNum == 7 &&
bondType == Bond::BondType::SINGLE) {
// aminoxyl
continue;
}
}
errors.push_back(
"ERROR: [DisallowedRadicalValidation] The radical at atom " +
std::to_string(atom->getIdx()) + " is not allowed");
if (!reportAllFailures) {
break;
}
}
return errors;
}
//! Reports the features of the input representation that were configured as
//! not allowed: query atoms/bonds, dummy atoms, atom aliases, aromatic or
//! dative bond types and enhanced stereochemistry.
/*!
  \param mol                the molecule to check
  \param reportAllFailures  when false, return right after the first failure
  \return the list of error messages
*/
std::vector<ValidationErrorInfo> FeaturesValidation::validate(
    const ROMol &mol, bool reportAllFailures) const {
  std::vector<ValidationErrorInfo> errors;

  // Stores a failure and tells the caller whether the scan has to stop.
  const auto rejectAndStop = [&errors, reportAllFailures](std::string message) {
    errors.push_back(std::move(message));
    return !reportAllFailures;
  };

  // Atom-level features: queries, dummies and aliases.
  // A query atom that was already reported is not also tested as a dummy.
  for (auto atom : mol.atoms()) {
    if (!allowQueries && atom->hasQuery()) {
      if (rejectAndStop("ERROR: [FeaturesValidation] Query atom " +
                        std::to_string(atom->getIdx()) + " is not allowed")) {
        return errors;
      }
    } else if (!allowDummies && isAtomDummy(atom)) {
      if (rejectAndStop("ERROR: [FeaturesValidation] Dummy atom " +
                        std::to_string(atom->getIdx()) + " is not allowed")) {
        return errors;
      }
    }
    if (!allowAtomAliases && atom->hasProp(common_properties::molFileAlias)) {
      if (rejectAndStop(
              "ERROR: [FeaturesValidation] Atom " +
              std::to_string(atom->getIdx()) + " with alias '" +
              atom->getProp<std::string>(common_properties::molFileAlias) +
              "' is not allowed")) {
        return errors;
      }
    }
  }

  // Bond-level features: queries, aromatic and dative bond types.
  for (auto bond : mol.bonds()) {
    if (!allowQueries && bond->hasQuery()) {
      if (rejectAndStop("ERROR: [FeaturesValidation] Query bond " +
                        std::to_string(bond->getIdx()) + " is not allowed")) {
        return errors;
      }
    }
    const auto kind = bond->getBondType();
    if (!allowAromaticBondType && kind == Bond::BondType::AROMATIC) {
      if (rejectAndStop("ERROR: [FeaturesValidation] Bond " +
                        std::to_string(bond->getIdx()) +
                        " of aromatic type is not allowed")) {
        return errors;
      }
    }
    if (!allowDativeBondType && kind == Bond::BondType::DATIVE) {
      if (rejectAndStop("ERROR: [FeaturesValidation] Bond " +
                        std::to_string(bond->getIdx()) +
                        " of dative type is not allowed")) {
        return errors;
      }
    }
  }

  // Molecule-level feature: enhanced stereochemistry.
  if (!allowEnhancedStereo && !mol.getStereoGroups().empty()) {
    errors.emplace_back(
        "ERROR: [FeaturesValidation] Enhanced stereochemistry features are not allowed");
  }

  return errors;
}
//! Verifies that the molecule comes with a usable 2D conformation.
/*!
  The following conditions are reported as errors:
    - the molecule has no conformer
    - the conformer is flagged as 3D
    - the largest |z| coordinate exceeds `threshold`
    - there are at least two atoms and the extent of the coordinates along
      both x and y is smaller than `threshold` (i.e. all atoms are placed at
      the same position)

  \param mol                the molecule to check
  \param reportAllFailures  when false, return right after the first failure
  \return the list of error messages
*/
std::vector<ValidationErrorInfo> Is2DValidation::validate(
    const ROMol &mol, bool reportAllFailures) const {
  std::vector<ValidationErrorInfo> errors;

  if (!mol.getNumConformers()) {
    errors.emplace_back(
        "ERROR: [Is2DValidation] The molecule has no coordinates");
    return errors;
  }

  const auto &conf = mol.getConformer();

  if (conf.is3D()) {
    errors.emplace_back(
        "ERROR: [Is2DValidation] The molecule includes non-null Z coordinates");
    return errors;
  }

  // conf.is3D() is assigned by the mol format parser based on the input
  // mol block designation, but also taking into account the presence of
  // non-null Z coordinates or stereobonds.
  //
  // the following test is in this sense probably redundant, but it's still
  // implemented in case molecules are built by other means.
  double max_absz{};
  for (const auto &p : conf.getPositions()) {
    max_absz = std::max(std::abs(p.z), max_absz);
  }
  if (max_absz > threshold) {
    errors.emplace_back(
        "ERROR: [Is2DValidation] The molecule includes non-null Z coordinates");
    if (!reportAllFailures) {
      return errors;
    }
  }

  if (conf.getNumAtoms() < 2) {
    // there is nothing else to check here, if there is at most one atom.
    return errors;
  }

  // verify that the atoms are not all in the same position (this often happens
  // because no coordinates were assigned and all atoms appear to be placed in
  // the origin)
  //
  // The running maxima must start from the most negative finite value:
  // numeric_limits<double>::min() is the smallest *positive* normalized value
  // and would make the extent wrong whenever all coordinates are negative.
  double min_x = std::numeric_limits<double>::max();
  double max_x = std::numeric_limits<double>::lowest();
  double min_y = std::numeric_limits<double>::max();
  double max_y = std::numeric_limits<double>::lowest();
  for (const auto &p : conf.getPositions()) {
    min_x = std::min(p.x, min_x);
    max_x = std::max(p.x, max_x);
    min_y = std::min(p.y, min_y);
    max_y = std::max(p.y, max_y);
  }
  auto delta_x = max_x - min_x;
  auto delta_y = max_y - min_y;
  auto max_delta = std::max(delta_x, delta_y);
  if (max_delta < threshold) {
    errors.emplace_back(
        "ERROR: [Is2DValidation] All atoms have the same (x,y) coordinates");
    if (!reportAllFailures) {
      return errors;
    }
  }

  return errors;
}
//! Returns the median of the squared bond lengths measured on `conf`.
/*!
  Bonds of null length are left out of the statistics. With an even number of
  contributing bonds the result is the mean of the two central squared
  lengths. The result is 0 when the molecule has no bonds or when all bonds
  have null length.
*/
double Layout2DValidation::squaredMedianBondLength(const ROMol &mol,
                                                   const Conformer &conf) {
  const unsigned int bondCount = mol.getNumBonds();
  if (!bondCount) {
    return 0.0;
  }

  // collect the squared length of every bond that is not degenerate
  std::vector<double> squaredLengths;
  squaredLengths.reserve(bondCount);
  for (const auto &bond : mol.bonds()) {
    const auto &beginPos = conf.getAtomPos(bond->getBeginAtomIdx());
    const auto &endPos = conf.getAtomPos(bond->getEndAtomIdx());
    const auto squared = (beginPos - endPos).lengthSq();
    if (squared > 0.) {
      squaredLengths.push_back(squared);
    }
  }
  if (squaredLengths.empty()) {
    return 0.0;
  }

  std::sort(squaredLengths.begin(), squaredLengths.end());
  const unsigned int count = squaredLengths.size();
  const unsigned int middle = count / 2;
  if (count % 2) {
    return squaredLengths[middle];
  }
  return 0.5 * (squaredLengths[middle - 1] + squaredLengths[middle]);
}
//! Checks the quality of the 2D layout of a molecule.
/*!
  All distance limits are expressed relative to the median squared bond length
  of the molecule. The following conditions are reported as errors:
    - the molecule has no conformer
    - the median bond length is smaller than `minMedianBondLength`
    - two atoms are closer than `clashLimit` times the median bond length
    - a bond is longer than `bondLengthLimit` times the median bond length
      (ring bonds are skipped when `allowLongBondsInRings` is set)
    - an atom is too close to a bond it doesn't belong to (ring bonds longer
      than 5 times the median bond length are skipped when
      `allowAtomBondClashExemption` is set)

  \param mol                the molecule to check
  \param reportAllFailures  when false, return right after the first failure
  \return the list of error messages
*/
std::vector<ValidationErrorInfo> Layout2DValidation::validate(
    const ROMol &mol, bool reportAllFailures) const {
  // prevents dividing by zero in extreme cases
  static constexpr double EPS{1.e-7};

  std::vector<ValidationErrorInfo> errors;

  if (!mol.getNumConformers()) {
    errors.emplace_back(
        "ERROR: [Layout2DValidation] The molecule has no coordinates");
    return errors;
  }

  const auto &conf = mol.getConformer();
  const unsigned int atomCount = conf.getNumAtoms();
  if (atomCount < 2) {
    // a single atom can't clash with anything
    return errors;
  }

  // the median squared bond length is the yardstick for all thresholds
  const auto referenceSq = squaredMedianBondLength(mol, conf);
  if (referenceSq < minMedianBondLength * minMedianBondLength) {
    errors.emplace_back(
        "ERROR: [Layout2DValidation] The median bond length is smaller than the configured limit");
    if (!reportAllFailures) {
      return errors;
    }
  }

  // atom-atom clashes
  const auto clashThresholdSq = clashLimit * clashLimit * referenceSq;
  for (unsigned int first = 0; first + 1 < atomCount; ++first) {
    const auto &firstPos = conf.getAtomPos(first);
    for (unsigned int second = first + 1; second < atomCount; ++second) {
      const auto &secondPos = conf.getAtomPos(second);
      if ((firstPos - secondPos).lengthSq() < clashThresholdSq) {
        errors.push_back("ERROR: [Layout2DValidation] Atom " +
                         std::to_string(first) + " is too close to atom " +
                         std::to_string(second));
        if (!reportAllFailures) {
          return errors;
        }
      }
    }
  }

  // the ring-related exemptions need the ring information
  if ((allowLongBondsInRings || allowAtomBondClashExemption) &&
      !mol.getRingInfo()->isInitialized()) {
    RDKit::MolOps::fastFindRings(mol);
  }

  const auto maxBondLengthSq = bondLengthLimit * bondLengthLimit * referenceSq;
  const auto clashExemptionSq = 5. * 5. * referenceSq;

  for (auto bond : mol.bonds()) {
    const unsigned int beginIdx = bond->getBeginAtomIdx();
    const auto &beginPos = conf.getAtomPos(beginIdx);
    const unsigned int endIdx = bond->getEndAtomIdx();
    const auto &endPos = conf.getAtomPos(endIdx);
    const auto bondLengthSq = (beginPos - endPos).lengthSq();

    // exceedingly long bonds (ring bonds may be configured as exempted)
    if ((!allowLongBondsInRings ||
         mol.getRingInfo()->numBondRings(bond->getIdx()) == 0) &&
        bondLengthSq > maxBondLengthSq) {
      errors.push_back("ERROR: [Layout2DValidation] The length of bond " +
                       std::to_string(bond->getIdx()) + " between atoms " +
                       std::to_string(beginIdx) + " and " +
                       std::to_string(endIdx) + " exceeds a configured limit");
      if (!reportAllFailures) {
        return errors;
      }
    }

    // long ring bonds may be exempted from the atom-bond collision detection
    if (allowAtomBondClashExemption && (bondLengthSq > clashExemptionSq) &&
        mol.getRingInfo()->numBondRings(bond->getIdx()) != 0) {
      continue;
    }

    // atoms clashing with this bond
    /*
                  k
                 /
               r/
               /
              /
             i---------------j
                     b
    */
    auto bondVector = endPos - beginPos;
    auto bb = bondVector.lengthSq();
    for (unsigned int other = 0; other < atomCount; ++other) {
      if (other == beginIdx || other == endIdx) {
        continue;
      }
      auto toOther = conf.getAtomPos(other) - beginPos;
      auto rr = toOther.lengthSq();
      auto rb = toOther.dotProduct(bondVector);
      // squared distance of the atom from the line through the bond
      auto distanceSq = (rr * bb - rb * rb) / (bb + EPS);
      const bool projectsOntoBond = (rb >= 0.) && /* cos alpha > 0 */
                                    (rb <= bb); /* projection within b */
      if (projectsOntoBond && distanceSq < clashThresholdSq) {
        errors.push_back("ERROR: [Layout2DValidation] Atom " +
                         std::to_string(other) + " too close to bond " +
                         std::to_string(bond->getIdx()));
        if (!reportAllFailures) {
          return errors;
        }
      }
    }
  }

  return errors;
}
namespace {
// Tells whether at least one bond that begins at `atom` is marked as wedged,
// dashed or unknown ("either"). Direction marks on bonds that begin at a
// neighboring atom are not considered.
bool hasStereoBond(const ROMol &mol, const Atom *atom) {
  for (auto bond : mol.atomBonds(atom)) {
    if (bond->getBeginAtom() == atom) {
      switch (bond->getBondDir()) {
        case Bond::BondDir::BEGINDASH:
        case Bond::BondDir::BEGINWEDGE:
        case Bond::BondDir::UNKNOWN:
          return true;
        default:
          break;
      }
    }
  }
  return false;
}
// Per-bond data collected for the atom under examination.
struct BondInfo {
  // the bond itself (not owned)
  const Bond *bond{nullptr};
  // direction mark of the bond, or NONE if not recorded
  Bond::BondDir bondDir{Bond::BondDir::NONE};
  // signed angle formed with the first bond of the list
  double angle{0.};
};
// Number of bonds around an atom, grouped by direction mark.
struct BondDirCount {
  unsigned int wedge{0};
  unsigned int dash{0};
  unsigned int unknown{0};
  // any direction mark that is not expected around a stereocenter
  unsigned int other{0};
};
// Bonds incident to an atom, with their direction marks counted by kind.
struct NeighborsInfo {
// collects the bonds of `atom` in `mol` and fills both members
NeighborsInfo(const ROMol &mol, const Atom *atom);
// one entry per bond incident to the atom
std::vector<BondInfo> bonds;
// how many of the bonds carry each kind of direction mark
BondDirCount dirCount;
};
// Collects the bonds incident to `atom`, counts their direction marks and
// orders them (all but the first) by the signed angle they form with the
// first one.
//
// NOTE(review): the atom is expected to have at least one bond and the
// molecule at least one conformer; neither is checked here.
NeighborsInfo::NeighborsInfo(const ROMol &mol, const Atom *atom) {
  for (auto bond : mol.atomBonds(atom)) {
    BondInfo entry;
    entry.bond = bond;
    // the direction marks of bonds that begin from neighboring atoms
    // are not considered
    if (bond->getBeginAtom() == atom) {
      entry.bondDir = bond->getBondDir();
    }

    const Bond::BondDir dir = entry.bondDir;
    if (dir == Bond::BondDir::BEGINWEDGE) {
      ++dirCount.wedge;
    } else if (dir == Bond::BondDir::BEGINDASH) {
      ++dirCount.dash;
    } else if (dir == Bond::BondDir::UNKNOWN) {
      ++dirCount.unknown;
    } else if (dir != Bond::BondDir::NONE && dir != Bond::ENDUPRIGHT &&
               dir != Bond::ENDDOWNRIGHT) {
      // bonds with unspecified direction are fine to ignore, and so are the
      // marks that may describe the configuration of an adjacent double
      // bond. anything else is unexpected.
      ++dirCount.other;
    }

    bonds.push_back(entry);
  }

  const auto &conf = mol.getConformer();
  const auto &center = conf.getAtomPos(atom->getIdx());

  // the first bond is the reference direction for measuring the angles
  const auto referenceAtom = bonds[0].bond->getOtherAtom(atom);
  const auto referenceVector = conf.getAtomPos(referenceAtom->getIdx()) - center;
  const auto bondCount = bonds.size();
  for (unsigned int pos = 1; pos < bondCount; ++pos) {
    const auto neighbor = bonds[pos].bond->getOtherAtom(atom);
    const auto neighborVector = conf.getAtomPos(neighbor->getIdx()) - center;
    bonds[pos].angle = referenceVector.signedAngleTo(neighborVector);
  }

  std::sort(bonds.begin() + 1, bonds.end(),
            [](const BondInfo &lhs, const BondInfo &rhs) {
              return lhs.angle < rhs.angle;
            });
}
void check3CoordinatedStereo(const ROMol &mol, const Atom *atom,
const NeighborsInfo &neighborsInfo,
bool /*reportAllFailures*/,
std::vector<ValidationErrorInfo> &errors) {
auto numStereoBonds =
neighborsInfo.dirCount.dash + neighborsInfo.dirCount.wedge;
if (numStereoBonds == 1) {
// identify the stereo bond
unsigned int i;
for (i = 0; i < 3; ++i) {
Bond::BondDir bondDir = neighborsInfo.bonds[i].bondDir;
if (bondDir == Bond::BondDir::BEGINDASH ||
bondDir == Bond::BondDir::BEGINWEDGE) {
break;
}
}
// check for the colinearity of the stereocenter and the other two ligands.
const auto &conf = mol.getConformer();
const auto &p = conf.getAtomPos(atom->getIdx());
const auto atoma =
neighborsInfo.bonds[(i + 1) % 3].bond->getOtherAtom(atom);
const auto va = conf.getAtomPos(atoma->getIdx()) - p;
const auto atomb =
neighborsInfo.bonds[(i + 2) % 3].bond->getOtherAtom(atom);
const auto vb = conf.getAtomPos(atomb->getIdx()) - p;
auto angle = va.angleTo(vb);
static constexpr auto ANGLE_EPSILON = (M_PI * 5. / 180.); // 5 degrees
if (angle < ANGLE_EPSILON || (M_PI - angle) < ANGLE_EPSILON) {
errors.push_back(
"ERROR: [StereoValidation] Colinearity of non-stereo bonds at atom " +
std::to_string(atom->getIdx()));
}
} else {
// configurations with multiple stereo bonds may be formally ambiguous or
// unambiguos depending on their wedged/dashed direction and relative
// orientation on the plane. those cases that are formally unambiguous are
// still most often discouraged or also classified as not acceptable by
// IUPAC guidelines due to lack of clarity.
// The AvalonTools' struchk implementation simply doesn't allow multiple
// stereo bonds on stereo centers with 3 explicit ligands. The validations
// criteria for this sub-case could be in principle refined, but for now the
// same policy is implemented.
errors.push_back("ERROR: [StereoValidation] Atom " +
std::to_string(atom->getIdx()) +
" has 3 explicit substituents and multiple stereo bonds");
}
}
// Validates the stereo bonds at an atom with 4 explicit substituents.
//
// `neighborsInfo.bonds` lists the four bonds in angular order around the
// atom (positions 1..3 are sorted by the angle they form with position 0).
//
// The following conditions are reported as errors:
//   - more than two bonds with the same wedged (or dashed) direction
//   - opposing bonds with different wedged/dashed direction
//   - adjacent bonds with the same wedged/dashed direction
//   - with a single stereo bond: all three other bonds lying on the opposite
//     side ("umbrella"), or the three other ligands violating the
//     colinearity/triangle rule
//
// When `reportAllFailures` is false the function returns right after the
// first reported error; it never returns early without having reported one,
// so that the set of molecules passing the validation doesn't depend on the
// value of this flag.
void check4CoordinatedStereo(const ROMol &mol, const Atom *atom,
                             const NeighborsInfo &neighborsInfo,
                             bool reportAllFailures,
                             std::vector<ValidationErrorInfo> &errors) {
  if (neighborsInfo.dirCount.dash > 2 || neighborsInfo.dirCount.wedge > 2) {
    // this condition would anyway trigger an "adjacent bonds with like
    // orientation" alert, but this test could be clearer / more explicit.
    errors.push_back("ERROR: [StereoValidation] Atom " +
                     std::to_string(atom->getIdx()) +
                     " has too many stereo bonds with like orientation");
    if (!reportAllFailures) {
      return;
    }
  }

  // opposing bonds (positions i and i+2) with different direction
  for (unsigned int i = 0; i < 2; ++i) {
    if ((neighborsInfo.bonds[i].bondDir == Bond::BondDir::BEGINDASH &&
         neighborsInfo.bonds[i + 2].bondDir == Bond::BondDir::BEGINWEDGE) ||
        (neighborsInfo.bonds[i].bondDir == Bond::BondDir::BEGINWEDGE &&
         neighborsInfo.bonds[i + 2].bondDir == Bond::BondDir::BEGINDASH)) {
      errors.push_back(
          "ERROR: [StereoValidation] Atom " + std::to_string(atom->getIdx()) +
          " has opposing stereo bonds with different up/down orientation");
      if (!reportAllFailures) {
        return;
      }
    }
  }

  // adjacent bonds (positions i and i+1, cyclically) with like direction
  for (unsigned int i = 0; i < 4; ++i) {
    if ((neighborsInfo.bonds[i].bondDir == Bond::BondDir::BEGINDASH &&
         neighborsInfo.bonds[(i + 1) % 4].bondDir ==
             Bond::BondDir::BEGINDASH) ||
        (neighborsInfo.bonds[i].bondDir == Bond::BondDir::BEGINWEDGE &&
         neighborsInfo.bonds[(i + 1) % 4].bondDir ==
             Bond::BondDir::BEGINWEDGE)) {
      errors.push_back("ERROR: [StereoValidation] Atom " +
                       std::to_string(atom->getIdx()) +
                       " has adjacent stereo bonds with like orientation");
      if (!reportAllFailures) {
        return;
      }
      // it doesn't make sense to output this alert multiple times for the same
      // atom, we therefore exit the loop also when reportAllFailures is set.
      break;
    }
  }

  if (neighborsInfo.dirCount.dash + neighborsInfo.dirCount.wedge == 1) {
    // there is only one wedged/dashed bond. check for 'umbrellas' and
    // other geometric violations. we need the conformation here.
    const auto &conf = mol.getConformer();

    // identify the bond index for the stereo bond with specified direction.
    for (unsigned int i = 0; i < 4; ++i) {
      Bond::BondDir bondDir = neighborsInfo.bonds[i].bondDir;
      if (bondDir == Bond::BondDir::BEGINDASH ||
          bondDir == Bond::BondDir::BEGINWEDGE) {
        // count how many of the other bonds lie on the opposite half-plane,
        // i.e. form an angle of more than 95 degrees with the stereo bond.
        unsigned int opposed = 0;
        const auto &p = conf.getAtomPos(atom->getIdx());
        const auto bondi = neighborsInfo.bonds[i].bond;
        const auto atomi = bondi->getOtherAtom(atom);
        const auto vi = conf.getAtomPos(atomi->getIdx()) - p;
        for (unsigned int j = 0; j < 4; ++j) {
          if (j == i) {
            continue;
          }
          const auto bondj = neighborsInfo.bonds[j].bond;
          const auto atomj = bondj->getOtherAtom(atom);
          const auto vj = conf.getAtomPos(atomj->getIdx()) - p;
          if (vi.angleTo(vj) > 95. * M_PI / 180.) {
            ++opposed;
          }
        }
        if (opposed == 3) {
          errors.push_back(
              "ERROR: [StereoValidation] Atom " +
              std::to_string(atom->getIdx()) +
              " has a potentially ambiguous representation: all non-stereo bonds" +
              " opposite to the only stereo bond");
          // stop here only if an error was actually reported: returning
          // unconditionally would skip the colinearity check below.
          if (!reportAllFailures) {
            return;
          }
        }
        // there is only one stereo bond, which means we can exit the
        // outer loop on the first execution of this block.
        break;
      }
    }

    // check for collinearity violations and/or cases where the
    // the middle non-stereo bond is badly positioned (i.e., too short
    // compared to the other two on its sides).
    for (unsigned int i = 0; i < 4; i++) {
      Bond::BondDir bondDir = neighborsInfo.bonds[i].bondDir;
      if (bondDir == Bond::BondDir::BEGINDASH ||
          bondDir == Bond::BondDir::BEGINWEDGE) {
        // j, k, l: the three non-stereo bonds, following the stereo bond in
        // angular order. k is the middle one.
        auto j = (i + 1) % 4;
        auto k = (i + 2) % 4;
        auto l = (i + 3) % 4;
        const auto atomj = neighborsInfo.bonds[j].bond->getOtherAtom(atom);
        const auto atomk = neighborsInfo.bonds[k].bond->getOtherAtom(atom);
        const auto atoml = neighborsInfo.bonds[l].bond->getOtherAtom(atom);
        const auto &pj = conf.getAtomPos(atomj->getIdx());
        const auto &pk = conf.getAtomPos(atomk->getIdx());
        const auto &pl = conf.getAtomPos(atoml->getIdx());
        // signed angle formed at the middle ligand by the other two
        const auto v1 = pj - pk;
        const auto v2 = pl - pk;
        auto angle = v1.signedAngleTo(v2);
        if (angle < 185. * M_PI / 180.) {
          errors.push_back(
              "ERROR: [StereoValidation] Colinearity or triangle rule violation of "
              "non-stereo bonds at atom " +
              std::to_string(atom->getIdx()));
          if (!reportAllFailures) {
            return;
          }
        }
        // there is only one stereo bond, which means we can exit the
        // outer loop on the first execution of this block.
        break;
      }
    }
  }
}
void checkStereo(const ROMol &mol, const Atom *atom, bool reportAllFailures,
std::vector<ValidationErrorInfo> &errors) {
NeighborsInfo neighborsInfo(mol, atom);
if (neighborsInfo.dirCount.other) {
errors.push_back(
"ERROR: [StereoValidation] one or more bonds incident to atom " +
std::to_string(atom->getIdx()) + " have unexpected direction settings");
// this is an unlikely condition and it would make little sense to
// continue the analysis also when reportAllFailures were set.
return;
}
if (neighborsInfo.dirCount.unknown) {
if (neighborsInfo.dirCount.dash || neighborsInfo.dirCount.wedge) {
errors.push_back("ERROR: [StereoValidation] Atom " +
std::to_string(atom->getIdx()) +
" has both unknown and wedged/dashed stereo bonds.");
}
// else: if the only stereo bonds have either/unknown direction,
// we can return here.
return;
}
for (const auto &bondInfo : neighborsInfo.bonds) {
bool isStereo = bondInfo.bondDir == Bond::BondDir::BEGINDASH ||
bondInfo.bondDir == Bond::BondDir::BEGINWEDGE ||
bondInfo.bondDir == Bond::BondDir::UNKNOWN;
if (isStereo && !canHaveDirection(*bondInfo.bond)) {
errors.push_back("ERROR: [StereoValidation] Bond " +
std::to_string(bondInfo.bond->getIdx()) +
" has assigned stereo type, but unexpected bond order.");
if (!reportAllFailures) {
return;
}
}
}
// The validation is currently limited to some specific categories of
// stereocenters
bool multipleBondFound{}, possibleAllene{};
for (auto bond : mol.atomBonds(atom)) {
auto bondType = bond->getBondType();
if (bondType != Bond::BondType::SINGLE) {
multipleBondFound = true;
const Atom *otherAtom = bond->getOtherAtom(atom);
if (otherAtom->getDegree() == 2) {
int doubleBondCount{};
for (auto otherBond : mol.atomBonds(otherAtom)) {
if (otherBond->getBondType() == Bond::BondType::DOUBLE) {
++doubleBondCount;
}
}
if (doubleBondCount == 2) {
possibleAllene = true;
}
}
}
}
auto atomicNum = atom->getAtomicNum();
if (possibleAllene || (multipleBondFound && atomicNum == 15)) {
// Allenes and P compounds are not validated at this time.
return;
}
if (multipleBondFound && atomicNum != 16) {
// A stereo bond was found at an unsaturated atom. This condition used to
// trigger as error in STRUCHK, but there are valid use cases for it (e.g.,
// wavy bonds incident to double bonds of undefined/unknown configuration,
// and atropisomers).
//
// Validation of these use cases is not currently implemented.
return;
}
switch (atom->getDegree()) {
case 1:
case 2:
errors.push_back(
"ERROR: [StereoValidation] Atom " + std::to_string(atom->getIdx()) +
" has stereo bonds, but less than 3 explicit substituents.");
break;
case 3:
check3CoordinatedStereo(mol, atom, neighborsInfo, reportAllFailures,
errors);
break;
case 4:
check4CoordinatedStereo(mol, atom, neighborsInfo, reportAllFailures,
errors);
break;
default:;
}
}
} // namespace
//! Validates the wedged/dashed/unknown bonds found in the molecule.
/*!
  Every atom that is the begin atom of at least one bond carrying such a
  direction mark is examined.

  \param mol                the molecule to check
  \param reportAllFailures  when false, stop at the first atom that produced
                            an error
  \return the list of error messages
*/
// NOTE(review): the per-atom checks read the coordinates via
// mol.getConformer() and no check for the presence of a conformer is made
// here - confirm that callers always provide one.
std::vector<ValidationErrorInfo> StereoValidation::validate(
    const ROMol &mol, bool reportAllFailures) const {
  std::vector<ValidationErrorInfo> errors;

  for (auto atom : mol.atoms()) {
    if (!hasStereoBond(mol, atom)) {
      continue;
    }
    checkStereo(mol, atom, reportAllFailures, errors);
    if (!reportAllFailures && !errors.empty()) {
      break;
    }
  }

  return errors;
}
std::vector<ValidationErrorInfo> validateSmiles(const std::string &smiles) {
RWMOL_SPTR mol(SmilesToMol(smiles));
if (!mol) {

View File

@@ -30,6 +30,7 @@
namespace RDKit {
class RWMol;
class ROMol;
class Conformer;
namespace MolStandardize {
@@ -51,11 +52,12 @@ class RDKIT_MOLSTANDARDIZE_EXPORT ValidationMethod {
//! The CompositeValidation class provides a simple way to apply a collection of
// ValidationMethod instances in sequence
class RDKIT_MOLSTANDARDIZE_EXPORT CompositeValidation : public ValidationMethod {
class RDKIT_MOLSTANDARDIZE_EXPORT CompositeValidation
: public ValidationMethod {
public:
CompositeValidation(
const std::vector<std::shared_ptr<ValidationMethod>> & validations)
: validations(validations) {};
const std::vector<std::shared_ptr<ValidationMethod>> &validations)
: validations(validations){};
std::vector<ValidationErrorInfo> validate(
const ROMol &mol, bool reportAllFailures) const override;
@@ -65,7 +67,7 @@ class RDKIT_MOLSTANDARDIZE_EXPORT CompositeValidation : public ValidationMethod
}
private:
std::vector<std::shared_ptr<ValidationMethod>> validations;
std::vector<std::shared_ptr<ValidationMethod>> validations;
};
//! The RDKitValidation class throws an error when there are no atoms in the
@@ -95,7 +97,7 @@ class RDKIT_MOLSTANDARDIZE_EXPORT RDKitValidation : public ValidationMethod {
class RDKIT_MOLSTANDARDIZE_EXPORT NoAtomValidation : public ValidationMethod {
public:
std::vector<ValidationErrorInfo> validate(
const ROMol &mol, bool reportAllFailures) const override;
const ROMol &mol, bool reportAllFailures) const override;
std::shared_ptr<ValidationMethod> copy() const override {
return std::make_shared<NoAtomValidation>(*this);
@@ -106,7 +108,7 @@ class RDKIT_MOLSTANDARDIZE_EXPORT NoAtomValidation : public ValidationMethod {
class RDKIT_MOLSTANDARDIZE_EXPORT FragmentValidation : public ValidationMethod {
public:
std::vector<ValidationErrorInfo> validate(
const ROMol &mol, bool reportAllFailures) const override;
const ROMol &mol, bool reportAllFailures) const override;
std::shared_ptr<ValidationMethod> copy() const override {
return std::make_shared<FragmentValidation>(*this);
@@ -117,7 +119,7 @@ class RDKIT_MOLSTANDARDIZE_EXPORT FragmentValidation : public ValidationMethod {
class RDKIT_MOLSTANDARDIZE_EXPORT NeutralValidation : public ValidationMethod {
public:
std::vector<ValidationErrorInfo> validate(
const ROMol &mol, bool reportAllFailures) const override;
const ROMol &mol, bool reportAllFailures) const override;
std::shared_ptr<ValidationMethod> copy() const override {
return std::make_shared<NeutralValidation>(*this);
@@ -125,14 +127,24 @@ class RDKIT_MOLSTANDARDIZE_EXPORT NeutralValidation : public ValidationMethod {
};
//! The IsotopeValidation class logs if molecule contains isotopes.
/*!
<b>Notes:</b>
- By default, this class will return an error every time an isotopic
number is specified. When the `strict` constructor parameter is passed a
`true` argument, an error is returned only if the specified isotopic number
is not found in the RDKit periodic table.
*/
class RDKIT_MOLSTANDARDIZE_EXPORT IsotopeValidation : public ValidationMethod {
public:
IsotopeValidation(bool strict = false) : strict(strict){};
std::vector<ValidationErrorInfo> validate(
const ROMol &mol, bool reportAllFailures) const override;
const ROMol &mol, bool reportAllFailures) const override;
std::shared_ptr<ValidationMethod> copy() const override {
return std::make_shared<IsotopeValidation>(*this);
}
bool strict;
};
////////////////////////////////
@@ -146,7 +158,7 @@ class RDKIT_MOLSTANDARDIZE_EXPORT MolVSValidation : public CompositeValidation {
MolVSValidation();
//! overloaded constructor to take in a user-defined list of ValidationMethod
MolVSValidation(
const std::vector<std::shared_ptr<ValidationMethod>> & validations);
const std::vector<std::shared_ptr<ValidationMethod>> &validations);
std::shared_ptr<ValidationMethod> copy() const override {
return std::make_shared<MolVSValidation>(*this);
@@ -154,8 +166,7 @@ class RDKIT_MOLSTANDARDIZE_EXPORT MolVSValidation : public CompositeValidation {
};
//! The AllowedAtomsValidation class lets the user input a list of atoms,
//! anything not on
/// the list throws an error.
//! anything not on the list throws an error.
class RDKIT_MOLSTANDARDIZE_EXPORT AllowedAtomsValidation
: public ValidationMethod {
public:
@@ -173,8 +184,7 @@ class RDKIT_MOLSTANDARDIZE_EXPORT AllowedAtomsValidation
};
//! The DisallowedAtomsValidation class lets the user input a list of atoms and
//! as long
/// as there are no atoms from the list it is deemed acceptable.
//! as long as there are no atoms from the list it is deemed acceptable.
class RDKIT_MOLSTANDARDIZE_EXPORT DisallowedAtomsValidation
: public ValidationMethod {
public:
@@ -191,6 +201,108 @@ class RDKIT_MOLSTANDARDIZE_EXPORT DisallowedAtomsValidation
std::vector<std::shared_ptr<Atom>> d_disallowedList;
};
//! Validation that reports an error when the molecule contains radical
/// atoms regarded as unstable.
/// The radicals that are tolerated are [N]=O and [O]-N.
class RDKIT_MOLSTANDARDIZE_EXPORT DisallowedRadicalValidation
    : public ValidationMethod {
 public:
  //! Returns a shared pointer to a copy-constructed duplicate of this
  //! validator.
  std::shared_ptr<ValidationMethod> copy() const override {
    return std::make_shared<DisallowedRadicalValidation>(*this);
  }

  //! Overrides ValidationMethod::validate (defined out of line).
  std::vector<ValidationErrorInfo> validate(
      const ROMol &mol, bool reportAllFailures) const override;
};
//! Validation that reports an error when the input molecule representation
/// carries features the caller has not opted in to.
/// Each feature has its own public flag; all flags default to false.
class RDKIT_MOLSTANDARDIZE_EXPORT FeaturesValidation : public ValidationMethod {
 public:
  //! Every argument initializes the public flag of the same name.
  FeaturesValidation(bool allowEnhancedStereo = false,
                     bool allowAromaticBondType = false,
                     bool allowDativeBondType = false,
                     bool allowQueries = false, bool allowDummies = false,
                     bool allowAtomAliases = false)
      : allowEnhancedStereo{allowEnhancedStereo},
        allowAromaticBondType{allowAromaticBondType},
        allowDativeBondType{allowDativeBondType},
        allowQueries{allowQueries},
        allowDummies{allowDummies},
        allowAtomAliases{allowAtomAliases} {}

  //! Returns a shared pointer to a copy-constructed duplicate of this
  //! validator (flags included).
  std::shared_ptr<ValidationMethod> copy() const override {
    return std::make_shared<FeaturesValidation>(*this);
  }

  //! Overrides ValidationMethod::validate (defined out of line).
  std::vector<ValidationErrorInfo> validate(
      const ROMol &mol, bool reportAllFailures) const override;

  //! Accept enhanced stereochemistry features.
  bool allowEnhancedStereo;
  //! Accept bonds of aromatic type.
  bool allowAromaticBondType;
  //! Accept bonds of dative type.
  bool allowDativeBondType;
  //! Accept query atoms.
  bool allowQueries;
  //! Accept dummy atoms (presumably atoms without a real element - confirm
  //! in the implementation).
  bool allowDummies;
  //! Accept atom aliases.
  bool allowAtomAliases;
};
//! Validation of the 2D character of the input coordinates.
/// An error is reported if the molecule representation is designated as 3D,
/// if it includes non-null Z coordinates, or if all atoms are assigned the
/// same coordinates.
class RDKIT_MOLSTANDARDIZE_EXPORT Is2DValidation : public ValidationMethod {
 public:
  //! \param threshold  value stored in the public \c threshold member
  //!                   (defaults to 1.e-3)
  Is2DValidation(double threshold = 1.e-3) : threshold{threshold} {}

  //! Returns a shared pointer to a copy-constructed duplicate of this
  //! validator.
  std::shared_ptr<ValidationMethod> copy() const override {
    return std::make_shared<Is2DValidation>(*this);
  }

  //! Overrides ValidationMethod::validate (defined out of line).
  std::vector<ValidationErrorInfo> validate(
      const ROMol &mol, bool reportAllFailures) const override;

  //! Tolerance used by the check; presumably the magnitude below which a
  //! coordinate is treated as zero - confirm in the implementation.
  double threshold;
};
//! Validation of the quality of a 2D layout.
/// An error is reported if any atoms are too close to any other atoms or
/// bonds, and in case any bonds are too long.
class RDKIT_MOLSTANDARDIZE_EXPORT Layout2DValidation : public ValidationMethod {
 public:
  //! Every argument initializes the public member of the same name.
  Layout2DValidation(double clashLimit = 0.15, double bondLengthLimit = 25.,
                     bool allowLongBondsInRings = true,
                     bool allowAtomBondClashExemption = true,
                     double minMedianBondLength = 1e-3)
      : clashLimit{clashLimit},
        bondLengthLimit{bondLengthLimit},
        allowLongBondsInRings{allowLongBondsInRings},
        allowAtomBondClashExemption{allowAtomBondClashExemption},
        minMedianBondLength{minMedianBondLength} {}

  //! Returns a shared pointer to a copy-constructed duplicate of this
  //! validator.
  std::shared_ptr<ValidationMethod> copy() const override {
    return std::make_shared<Layout2DValidation>(*this);
  }

  //! Overrides ValidationMethod::validate (defined out of line).
  std::vector<ValidationErrorInfo> validate(
      const ROMol &mol, bool reportAllFailures) const override;

  //! Static helper defined out of line; judging by its name it yields the
  //! square of the median bond length of \c mol in \c conf - confirm in the
  //! implementation.
  static double squaredMedianBondLength(const ROMol &mol,
                                        const Conformer &conf);

  //! Limit used by the "too close" check (units/scaling not visible here).
  double clashLimit;
  //! Limit used by the "bond too long" check (units/scaling not visible here).
  double bondLengthLimit;
  //! Whether long bonds are tolerated when they are part of rings.
  bool allowLongBondsInRings;
  //! Whether an exemption to the atom/bond clash check is enabled; the
  //! exemption criteria live in the implementation.
  bool allowAtomBondClashExemption;
  //! Lower bound related to the median bond length; its exact use is defined
  //! in the implementation.
  double minMedianBondLength;
};
//! Validation of "syntactic" constraints on the use of stereo bonds.
/// The checks concern centers with 4 or 3 substituents and try to make sure
/// that the associated stereochemical configuration can be interpreted
/// unambiguously.
/// The criteria were ported from the AvalonTools STRUCHK software.
class RDKIT_MOLSTANDARDIZE_EXPORT StereoValidation : public ValidationMethod {
 public:
  //! Returns a shared pointer to a copy-constructed duplicate of this
  //! validator.
  std::shared_ptr<ValidationMethod> copy() const override {
    return std::make_shared<StereoValidation>(*this);
  }

  //! Overrides ValidationMethod::validate (defined out of line).
  std::vector<ValidationErrorInfo> validate(
      const ROMol &mol, bool reportAllFailures) const override;
};
//! A convenience function for quickly validating a single SMILES string.
RDKIT_MOLSTANDARDIZE_EXPORT std::vector<ValidationErrorInfo> validateSmiles(
const std::string &smiles);

View File

@@ -1,6 +1,6 @@
remove_definitions(-DRDKIT_MOLSTANDARDIZE_BUILD)
rdkit_python_extension(rdMolStandardize rdMolStandardize.cpp Validate.cpp
Charge.cpp Fragment.cpp Normalize.cpp Metal.cpp Tautomer.cpp
Charge.cpp Fragment.cpp Normalize.cpp Metal.cpp Tautomer.cpp Pipeline.cpp
DEST Chem/MolStandardize
LINK_LIBRARIES
LINK_LIBRARIES MolStandardize )

View File

@@ -0,0 +1,143 @@
//
// Copyright (C) 2023 Novartis Biomedical Research
//
// @@ All Rights Reserved @@
// This file is part of the RDKit.
// The contents are covered by the terms of the BSD license
// which is included in the file license.txt, found at the root
// of the RDKit source tree.
//
#include <RDBoost/Wrap.h>
#include <GraphMol/RDKitBase.h>
#include <GraphMol/MolStandardize/Pipeline.h>
namespace RDKit {
namespace MolStandardize {
//! Member-wise equality of two pipeline log entries: both the status and the
//! detail text have to match.
// NOTE(review): presumably provided because the vector_indexing_suite used
// for PipelineLog in this file needs equality-comparable elements - confirm.
bool operator==(const PipelineLogEntry &lhs, const PipelineLogEntry &rhs) {
  if (!(lhs.status == rhs.status)) {
    return false;
  }
  return lhs.detail == rhs.detail;
}
} // namespace MolStandardize
} // namespace RDKit
namespace python = boost::python;
using namespace RDKit;
void wrap_pipeline() {
python::class_<MolStandardize::PipelineOptions>("PipelineOptions")
.def_readwrite("strictParsing",
&MolStandardize::PipelineOptions::strictParsing)
.def_readwrite("reportAllFailures",
&MolStandardize::PipelineOptions::reportAllFailures)
.def_readwrite("allowEmptyMolecules",
&MolStandardize::PipelineOptions::allowEmptyMolecules)
.def_readwrite("allowEnhancedStereo",
&MolStandardize::PipelineOptions::allowEnhancedStereo)
.def_readwrite("allowAromaticBondType",
&MolStandardize::PipelineOptions::allowAromaticBondType)
.def_readwrite("allowDativeBondType",
&MolStandardize::PipelineOptions::allowDativeBondType)
.def_readwrite("is2DZeroThreshold",
&MolStandardize::PipelineOptions::is2DZeroThreshold)
.def_readwrite("atomClashLimit",
&MolStandardize::PipelineOptions::atomClashLimit)
.def_readwrite("minMedianBondLength",
&MolStandardize::PipelineOptions::minMedianBondLength)
.def_readwrite("bondLengthLimit",
&MolStandardize::PipelineOptions::bondLengthLimit)
.def_readwrite("allowLongBondsInRings",
&MolStandardize::PipelineOptions::allowLongBondsInRings)
.def_readwrite(
"allowAtomBondClashExemption",
&MolStandardize::PipelineOptions::allowAtomBondClashExemption)
.def_readwrite("metalNof", &MolStandardize::PipelineOptions::metalNof)
.def_readwrite("metalNon", &MolStandardize::PipelineOptions::metalNon)
.def_readwrite("normalizerData",
&MolStandardize::PipelineOptions::normalizerData)
.def_readwrite("normalizerMaxRestarts",
&MolStandardize::PipelineOptions::normalizerMaxRestarts)
.def_readwrite("scaledMedianBondLength",
&MolStandardize::PipelineOptions::scaledMedianBondLength)
.def_readwrite("outputV2000",
&MolStandardize::PipelineOptions::outputV2000);
python::enum_<MolStandardize::PipelineStatus>("PipelineStatus")
.value("NO_EVENT", MolStandardize::PipelineStatus::NO_EVENT)
.value("INPUT_ERROR", MolStandardize::PipelineStatus::INPUT_ERROR)
.value("PREPARE_FOR_VALIDATION_ERROR",
MolStandardize::PipelineStatus::PREPARE_FOR_VALIDATION_ERROR)
.value("FEATURES_VALIDATION_ERROR",
MolStandardize::PipelineStatus::FEATURES_VALIDATION_ERROR)
.value("BASIC_VALIDATION_ERROR",
MolStandardize::PipelineStatus::BASIC_VALIDATION_ERROR)
.value("IS2D_VALIDATION_ERROR",
MolStandardize::PipelineStatus::IS2D_VALIDATION_ERROR)
.value("LAYOUT2D_VALIDATION_ERROR",
MolStandardize::PipelineStatus::LAYOUT2D_VALIDATION_ERROR)
.value("STEREO_VALIDATION_ERROR",
MolStandardize::PipelineStatus::STEREO_VALIDATION_ERROR)
.value("VALIDATION_ERROR",
MolStandardize::PipelineStatus::VALIDATION_ERROR)
.value("PREPARE_FOR_STANDARDIZATION_ERROR",
MolStandardize::PipelineStatus::PREPARE_FOR_STANDARDIZATION_ERROR)
.value("METAL_STANDARDIZATION_ERROR",
MolStandardize::PipelineStatus::METAL_STANDARDIZATION_ERROR)
.value("NORMALIZER_STANDARDIZATION_ERROR",
MolStandardize::PipelineStatus::NORMALIZER_STANDARDIZATION_ERROR)
.value("FRAGMENT_STANDARDIZATION_ERROR",
MolStandardize::PipelineStatus::FRAGMENT_STANDARDIZATION_ERROR)
.value("CHARGE_STANDARDIZATION_ERROR",
MolStandardize::PipelineStatus::CHARGE_STANDARDIZATION_ERROR)
.value("STANDARDIZATION_ERROR",
MolStandardize::PipelineStatus::STANDARDIZATION_ERROR)
.value("OUTPUT_ERROR", MolStandardize::PipelineStatus::OUTPUT_ERROR)
.value("PIPELINE_ERROR", MolStandardize::PipelineStatus::PIPELINE_ERROR)
.value("METALS_DISCONNECTED",
MolStandardize::PipelineStatus::METALS_DISCONNECTED)
.value("NORMALIZATION_APPLIED",
MolStandardize::PipelineStatus::NORMALIZATION_APPLIED)
.value("FRAGMENTS_REMOVED",
MolStandardize::PipelineStatus::FRAGMENTS_REMOVED)
.value("PROTONATION_CHANGED",
MolStandardize::PipelineStatus::PROTONATION_CHANGED)
.value("STRUCTURE_MODIFICATION",
MolStandardize::PipelineStatus::STRUCTURE_MODIFICATION);
python::enum_<MolStandardize::PipelineStage>("PipelineStage")
.value("PARSING_INPUT", MolStandardize::PipelineStage::PARSING_INPUT)
.value("PREPARE_FOR_VALIDATION",
MolStandardize::PipelineStage::PREPARE_FOR_VALIDATION)
.value("VALIDATION", MolStandardize::PipelineStage::VALIDATION)
.value("PREPARE_FOR_STANDARDIZATION",
MolStandardize::PipelineStage::PREPARE_FOR_STANDARDIZATION)
.value("STANDARDIZATION", MolStandardize::PipelineStage::STANDARDIZATION)
.value("SERIALIZING_OUTPUT",
MolStandardize::PipelineStage::SERIALIZING_OUTPUT)
.value("COMPLETED", MolStandardize::PipelineStage::COMPLETED);
python::class_<MolStandardize::PipelineLogEntry>("PipelineLogEntry",
python::no_init)
.def_readonly("status", &MolStandardize::PipelineLogEntry::status)
.def_readonly("detail", &MolStandardize::PipelineLogEntry::detail);
python::class_<MolStandardize::PipelineLog>("PipelineLog", python::no_init)
.def(python::vector_indexing_suite<MolStandardize::PipelineLog>());
python::class_<MolStandardize::PipelineResult>("PipelineResult",
python::no_init)
.def_readonly("status", &MolStandardize::PipelineResult::status)
.def_readonly("stage", &MolStandardize::PipelineResult::stage)
.def_readonly("log", &MolStandardize::PipelineResult::log)
.def_readonly("inputMolData",
&MolStandardize::PipelineResult::inputMolData)
.def_readonly("outputMolData",
&MolStandardize::PipelineResult::outputMolData)
.def_readonly("parentMolData",
&MolStandardize::PipelineResult::parentMolData);
python::class_<MolStandardize::Pipeline>("Pipeline")
.def(python::init<const MolStandardize::PipelineOptions &>())
.def("run", &MolStandardize::Pipeline::run);
}

View File

@@ -17,25 +17,23 @@ using namespace RDKit;
namespace {
struct ValidationMethodWrap : MolStandardize::ValidationMethod, python::wrapper<MolStandardize::ValidationMethod>
{
std::vector<MolStandardize::ValidationErrorInfo> validate(
const ROMol &mol, bool reportAllFailures) const override
{
return this->get_override("validate")(mol, reportAllFailures);
}
struct ValidationMethodWrap
: MolStandardize::ValidationMethod,
python::wrapper<MolStandardize::ValidationMethod> {
std::vector<MolStandardize::ValidationErrorInfo> validate(
const ROMol &mol, bool reportAllFailures) const override {
return this->get_override("validate")(mol, reportAllFailures);
}
std::shared_ptr<MolStandardize::ValidationMethod> copy() const override
{
return this->get_override("copy")();
}
std::shared_ptr<MolStandardize::ValidationMethod> copy() const override {
return this->get_override("copy")();
}
};
// Wrap ValidationMethod::validate and convert the returned
// vector into a python list of strings
python::list pythonValidateMethod(
const MolStandardize::ValidationMethod & self, const ROMol &mol,
bool reportAllFailures) {
python::list pythonValidateMethod(const MolStandardize::ValidationMethod &self,
const ROMol &mol, bool reportAllFailures) {
python::list res;
std::vector<MolStandardize::ValidationErrorInfo> errout =
self.validate(mol, reportAllFailures);
@@ -104,62 +102,111 @@ struct validate_wrapper {
std::string docString = "";
python::class_<ValidationMethodWrap, boost::noncopyable>("ValidationMethod")
.def("validate", pythonValidateMethod,
(python::arg("self"), python::arg("mol"),
python::arg("reportAllFailures") = false),
"")
;
.def("validate", pythonValidateMethod,
(python::arg("self"), python::arg("mol"),
python::arg("reportAllFailures") = false),
"");
python::class_<
MolStandardize::RDKitValidation,
python::bases<MolStandardize::ValidationMethod>,
boost::noncopyable>("RDKitValidation")
;
python::class_<MolStandardize::RDKitValidation,
python::bases<MolStandardize::ValidationMethod>,
boost::noncopyable>("RDKitValidation");
python::class_<
MolStandardize::NoAtomValidation,
python::bases<MolStandardize::ValidationMethod>,
boost::noncopyable>("NoAtomValidation")
;
python::class_<MolStandardize::NoAtomValidation,
python::bases<MolStandardize::ValidationMethod>,
boost::noncopyable>("NoAtomValidation");
python::class_<
MolStandardize::FragmentValidation,
python::bases<MolStandardize::ValidationMethod>,
boost::noncopyable>("FragmentValidation")
;
python::class_<MolStandardize::FragmentValidation,
python::bases<MolStandardize::ValidationMethod>,
boost::noncopyable>("FragmentValidation");
python::class_<
MolStandardize::NeutralValidation,
python::bases<MolStandardize::ValidationMethod>,
boost::noncopyable>("NeutralValidation")
;
python::class_<MolStandardize::NeutralValidation,
python::bases<MolStandardize::ValidationMethod>,
boost::noncopyable>("NeutralValidation");
python::class_<
MolStandardize::IsotopeValidation,
python::bases<MolStandardize::ValidationMethod>,
boost::noncopyable>("IsotopeValidation")
;
python::class_<MolStandardize::IsotopeValidation,
python::bases<MolStandardize::ValidationMethod>,
boost::noncopyable>("IsotopeValidation")
.def(python::init<bool>(python::arg("strict") = false))
.def_readwrite("strict", &MolStandardize::IsotopeValidation::strict);
python::class_<
MolStandardize::MolVSValidation,
python::bases<MolStandardize::ValidationMethod>,
boost::noncopyable>("MolVSValidation")
.def("__init__", python::make_constructor(&getMolVSValidation))
;
python::class_<MolStandardize::MolVSValidation,
python::bases<MolStandardize::ValidationMethod>,
boost::noncopyable>("MolVSValidation")
.def("__init__", python::make_constructor(&getMolVSValidation));
python::class_<
MolStandardize::AllowedAtomsValidation,
python::bases<MolStandardize::ValidationMethod>,
boost::noncopyable>("AllowedAtomsValidation", python::no_init)
.def("__init__", python::make_constructor(&getAllowedAtomsValidation))
;
python::class_<MolStandardize::AllowedAtomsValidation,
python::bases<MolStandardize::ValidationMethod>,
boost::noncopyable>("AllowedAtomsValidation",
python::no_init)
.def("__init__", python::make_constructor(&getAllowedAtomsValidation));
python::class_<
MolStandardize::DisallowedAtomsValidation,
python::bases<MolStandardize::ValidationMethod>,
boost::noncopyable>("DisallowedAtomsValidation", python::no_init)
.def("__init__", python::make_constructor(&getDisallowedAtomsValidation))
;
python::class_<MolStandardize::DisallowedAtomsValidation,
python::bases<MolStandardize::ValidationMethod>,
boost::noncopyable>("DisallowedAtomsValidation",
python::no_init)
.def("__init__",
python::make_constructor(&getDisallowedAtomsValidation));
// Python binding for MolStandardize::FeaturesValidation.
// The keyword names of the constructor match the C++ parameter names and the
// read/write attributes registered below. The fifth keyword used to be
// spelled "allowDummmies", which made `allowDummies=...` fail as a keyword
// argument even though the attribute of that name exists.
python::class_<MolStandardize::FeaturesValidation,
               python::bases<MolStandardize::ValidationMethod>>(
    "FeaturesValidation")
    .def(python::init<bool, bool, bool, bool, bool, bool>(
        (python::arg("allowEnhancedStereo") = false,
         python::arg("allowAromaticBondType") = false,
         python::arg("allowDativeBondType") = false,
         python::arg("allowQueries") = false,
         python::arg("allowDummies") = false,
         python::arg("allowAtomAliases") = false)))
    .def_readwrite("allowEnhancedStereo",
                   &MolStandardize::FeaturesValidation::allowEnhancedStereo)
    .def_readwrite(
        "allowAromaticBondType",
        &MolStandardize::FeaturesValidation::allowAromaticBondType)
    .def_readwrite("allowDativeBondType",
                   &MolStandardize::FeaturesValidation::allowDativeBondType)
    .def_readwrite("allowQueries",
                   &MolStandardize::FeaturesValidation::allowQueries)
    .def_readwrite("allowDummies",
                   &MolStandardize::FeaturesValidation::allowDummies)
    .def_readwrite("allowAtomAliases",
                   &MolStandardize::FeaturesValidation::allowAtomAliases);
python::class_<MolStandardize::DisallowedRadicalValidation,
python::bases<MolStandardize::ValidationMethod>,
boost::noncopyable>("DisallowedRadicalValidation");
python::class_<MolStandardize::Is2DValidation,
python::bases<MolStandardize::ValidationMethod>,
boost::noncopyable>("Is2DValidation")
.def(python::init<double>(python::arg("threshold") = 1e-3))
.def_readwrite("threshold", &MolStandardize::Is2DValidation::threshold);
// Python binding for MolStandardize::Layout2DValidation.
// The keyword defaults mirror the defaults of the C++ constructor
// (0.15, 25., true, true, 1e-3). minMedianBondLength is a double: its default
// used to be written as `false`, which made the Python-side default 0.0
// instead of 1e-3.
python::class_<MolStandardize::Layout2DValidation,
               python::bases<MolStandardize::ValidationMethod>,
               boost::noncopyable>("Layout2DValidation")
    .def(python::init<double, double, bool, bool, double>(
        (python::arg("clashLimit") = 0.15,
         python::arg("bondLengthLimit") = 25.,
         python::arg("allowLongBondsInRings") = true,
         python::arg("allowAtomBondClashExemption") = true,
         python::arg("minMedianBondLength") = 1e-3)))
    .def_readwrite("clashLimit",
                   &MolStandardize::Layout2DValidation::clashLimit)
    .def_readwrite("bondLengthLimit",
                   &MolStandardize::Layout2DValidation::bondLengthLimit)
    .def_readwrite(
        "allowLongBondsInRings",
        &MolStandardize::Layout2DValidation::allowLongBondsInRings)
    .def_readwrite(
        "allowAtomBondClashExemption",
        &MolStandardize::Layout2DValidation::allowAtomBondClashExemption)
    .def_readwrite(
        "minMedianBondLength",
        &MolStandardize::Layout2DValidation::minMedianBondLength);
python::class_<MolStandardize::StereoValidation,
python::bases<MolStandardize::ValidationMethod>,
boost::noncopyable>("StereoValidation");
python::def("ValidateSmiles", standardizeSmilesHelper, (python::arg("mol")),
docString.c_str());

View File

@@ -195,8 +195,7 @@ template <typename FUNCTYPE>
void mtinPlaceHelper2(python::object pymols, int numThreads,
python::object params, bool skip_standardize,
FUNCTYPE func) {
const auto *ps =
&RDKit::MolStandardize::defaultCleanupParameters;
const auto *ps = &RDKit::MolStandardize::defaultCleanupParameters;
if (params) {
ps = python::extract<RDKit::MolStandardize::CleanupParameters *>(params);
}
@@ -387,6 +386,7 @@ void wrap_metal();
void wrap_fragment();
void wrap_normalize();
void wrap_tautomer();
void wrap_pipeline();
BOOST_PYTHON_MODULE(rdMolStandardize) {
python::scope().attr("__doc__") =
@@ -668,4 +668,5 @@ BOOST_PYTHON_MODULE(rdMolStandardize) {
wrap_fragment();
wrap_normalize();
wrap_tautomer();
wrap_pipeline();
}

View File

@@ -269,9 +269,9 @@ class TestCase(unittest.TestCase):
mol = Chem.MolFromSmiles("CO(C)C", sanitize=False)
msg = vm.validate(mol)
self.assertEqual(len(msg), 1)
self.assertEqual
("""INFO: [ValenceValidation] Explicit valence for atom # 1 O, 3, is greater than permitted""",
msg[0])
self.assertEqual(
"""INFO: [ValenceValidation] Explicit valence for atom # 1 O, 3, is greater than permitted""",
msg[0])
vm2 = rdMolStandardize.MolVSValidation([rdMolStandardize.FragmentValidation()])
# with no argument it also works
@@ -279,17 +279,14 @@ class TestCase(unittest.TestCase):
mol2 = Chem.MolFromSmiles("COc1cccc(C=N[N-]C(N)=O)c1[O-].O.O.O.O=[U+2]=O")
msg2 = vm2.validate(mol2)
self.assertEqual(len(msg2), 1)
self.assertEqual
("""INFO: [FragmentValidation] water/hydroxide is present""", msg2[0])
self.assertEqual("""INFO: [FragmentValidation] water/hydroxide is present""", msg2[0])
vm3 = rdMolStandardize.MolVSValidation()
mol3 = Chem.MolFromSmiles("C1COCCO1.O=C(NO)NO")
msg3 = vm3.validate(mol3)
self.assertEqual(len(msg3), 2)
self.assertEqual
("""INFO: [FragmentValidation] 1,2-dimethoxyethane is present""", msg3[0])
self.assertEqual
("""INFO: [FragmentValidation] 1,4-dioxane is present""", msg3[1])
self.assertEqual("""INFO: [FragmentValidation] 1,2-dimethoxyethane is present""", msg3[0])
self.assertEqual("""INFO: [FragmentValidation] 1,4-dioxane is present""", msg3[1])
atomic_no = [6, 7, 8]
allowed_atoms = [Atom(i) for i in atomic_no]
@@ -297,22 +294,32 @@ class TestCase(unittest.TestCase):
mol4 = Chem.MolFromSmiles("CC(=O)CF")
msg4 = vm4.validate(mol4)
self.assertEqual(len(msg4), 1)
self.assertEqual
("""INFO: [AllowedAtomsValidation] Atom F is not in allowedAtoms list""", msg4[0])
self.assertEqual("""INFO: [AllowedAtomsValidation] Atom F is not in allowedAtoms list""",
msg4[0])
atomic_no = [9, 17, 35]
disallowed_atoms = [Atom(i) for i in atomic_no]
vm5 = rdMolStandardize.DisallowedAtomsValidation(disallowed_atoms)
mol5 = Chem.MolFromSmiles("CC(=O)CF")
msg5 = vm4.validate(mol5)
msg5 = vm5.validate(mol5)
self.assertEqual(len(msg5), 1)
self.assertEqual
("""INFO: [DisallowedAtomsValidation] Atom F is in disallowedAtoms list""", msg5[0])
self.assertEqual("""INFO: [DisallowedAtomsValidation] Atom F is in disallowedAtoms list""",
msg5[0])
msg6 = rdMolStandardize.ValidateSmiles("ClCCCl.c1ccccc1O")
self.assertEqual(len(msg6), 1)
self.assertEqual
("""INFO: [FragmentValidation] 1,2-dichloroethane is present""", msg6[0])
mol6 = Chem.MolFromSmiles("[3CH4]")
vm6a = rdMolStandardize.IsotopeValidation()
msg6a = vm6a.validate(mol6)
self.assertEqual(len(msg6a), 1)
self.assertEqual("INFO: [IsotopeValidation] Molecule contains isotope 3C", msg6a[0])
vm6b = rdMolStandardize.IsotopeValidation(True)
msg6b = vm6b.validate(mol6)
self.assertEqual(len(msg6b), 1)
self.assertEqual("ERROR: [IsotopeValidation] The molecule contains an unknown isotope: 3C",
msg6b[0])
msg999 = rdMolStandardize.ValidateSmiles("ClCCCl.c1ccccc1O")
self.assertEqual(len(msg999), 1)
self.assertEqual("""INFO: [FragmentValidation] 1,2-dichloroethane is present""", msg999[0])
def test10NormalizeFromData(self):
data = """// Name SMIRKS
@@ -1147,6 +1154,624 @@ chlorine [Cl]
rdMolStandardize.SuperParentInPlace(ms, 4)
self.assertEqual([Chem.MolToSmiles(m) for m in ms], [y for x, y in ind])
def test33MolBlockValidation(self):
# Exercises the individual validation classes on molecules parsed from V3000
# mol blocks, checking the number and the text of the reported messages.
# featuresValidation
# Case 1: an R-group atom is reported as a query atom by a default validator.
mol = Chem.MolFromMolBlock(
'''
Mrv2311 01162413552D
0 0 0 0 0 999 V3000
M V30 BEGIN CTAB
M V30 COUNTS 2 1 0 0 0
M V30 BEGIN ATOM
M V30 1 R# -17.3747 6.9367 0 0 RGROUPS=(1 0)
M V30 2 C -18.7083 6.1667 0 0
M V30 END ATOM
M V30 BEGIN BOND
M V30 1 1 2 1
M V30 END BOND
M V30 END CTAB
M END
''', sanitize=False)
validator = rdMolStandardize.FeaturesValidation()
errinfo = validator.validate(mol)
self.assertEqual(len(errinfo), 1)
self.assertEqual(errinfo[0], "ERROR: [FeaturesValidation] Query atom 0 is not allowed")
# Once dummies and queries are allowed through the attributes, the same
# molecule passes.
validator.allowDummies = True
validator.allowQueries = True
errinfo = validator.validate(mol)
self.assertEqual(len(errinfo), 0)
# Case 2: a molecule carrying an enhanced stereo collection (STERAC1).
mol = Chem.MolFromMolBlock('''
Mrv2311 01162411552D
0 0 0 0 0 999 V3000
M V30 BEGIN CTAB
M V30 COUNTS 4 3 0 0 0
M V30 BEGIN ATOM
M V30 1 C -18.208 8.52 0 0 CFG=2
M V30 2 F -19.5417 7.75 0 0
M V30 3 C -16.8743 7.75 0 0
M V30 4 Cl -18.208 10.06 0 0
M V30 END ATOM
M V30 BEGIN BOND
M V30 1 1 1 3 CFG=1
M V30 2 1 2 1
M V30 3 1 1 4
M V30 END BOND
M V30 BEGIN COLLECTION
M V30 MDLV30/STERAC1 ATOMS=(1 1)
M V30 END COLLECTION
M V30 END CTAB
M END
''')
# enhanced stereo features are by default disallowed
validator = rdMolStandardize.FeaturesValidation()
errinfo = validator.validate(mol, True)
self.assertEqual(len(errinfo), 1)
self.assertEqual(
errinfo[0], "ERROR: [FeaturesValidation] Enhanced stereochemistry features are not allowed")
# allow enhanced stereo
validator = rdMolStandardize.FeaturesValidation(True)
errinfo = validator.validate(mol, True)
self.assertEqual(len(errinfo), 0)
# NOTE(review): the validator above was already built with
# allowEnhancedStereo=True, so this assignment looks redundant - confirm
# whether a default-constructed validator was intended for this check.
validator.allowEnhancedStereo = True
errinfo = validator.validate(mol)
self.assertEqual(len(errinfo), 0)
# Case 3: a six-membered ring drawn with bond type 4 (aromatic) on all six
# ring bonds.
mol = Chem.MolFromMolBlock(
'''
Mrv2311 02272411562D
0 0 0 0 0 999 V3000
M V30 BEGIN CTAB
M V30 COUNTS 7 7 0 0 0
M V30 BEGIN ATOM
M V30 1 C -10.3542 4.29 0 0
M V30 2 C -11.6879 3.52 0 0
M V30 3 C -11.6879 1.9798 0 0
M V30 4 N -10.3542 1.21 0 0
M V30 5 C -9.0204 1.9798 0 0
M V30 6 C -9.0204 3.52 0 0
M V30 7 C -10.3542 5.83 0 0
M V30 END ATOM
M V30 BEGIN BOND
M V30 1 4 1 2
M V30 2 4 1 6
M V30 3 4 2 3
M V30 4 4 5 6
M V30 5 1 1 7
M V30 6 4 3 4
M V30 7 4 4 5
M V30 END BOND
M V30 END CTAB
M END
''', sanitize=False)
# aromatic bonds are by default disallowed
# (with reportAllFailures=True one message per aromatic bond is expected)
validator = rdMolStandardize.FeaturesValidation()
errinfo = validator.validate(mol, True)
self.assertEqual(len(errinfo), 6)
self.assertEqual(errinfo[0],
"ERROR: [FeaturesValidation] Bond 0 of aromatic type is not allowed")
validator.allowAromaticBondType = True
errinfo = validator.validate(mol)
self.assertEqual(len(errinfo), 0)
# allow aromatic bonds
# (this time through the second positional constructor argument)
validator = rdMolStandardize.FeaturesValidation(False, True)
errinfo = validator.validate(mol, True)
self.assertEqual(len(errinfo), 0)
# disallowedRadicalValidation
# The first carbon carries RAD=2 and is expected to be rejected.
mol = Chem.MolFromMolBlock(
'''
Mrv2311 02082417212D
0 0 0 0 0 999 V3000
M V30 BEGIN CTAB
M V30 COUNTS 2 1 0 0 0
M V30 BEGIN ATOM
M V30 1 C -20.9372 7.145 0 0 RAD=2
M V30 2 C -22.2708 6.375 0 0
M V30 END ATOM
M V30 BEGIN BOND
M V30 1 1 2 1
M V30 END BOND
M V30 END CTAB
M END
''', sanitize=False)
validator = rdMolStandardize.DisallowedRadicalValidation()
errinfo = validator.validate(mol)
self.assertEqual(len(errinfo), 1)
self.assertEqual(errinfo[0],
"ERROR: [DisallowedRadicalValidation] The radical at atom 0 is not allowed")
# is2DValidation
# A flat two-atom molecule passes with the default threshold.
mol = Chem.MolFromMolBlock(
'''
2D
0 0 0 0 0 999 V3000
M V30 BEGIN CTAB
M V30 COUNTS 2 1 0 0 0
M V30 BEGIN ATOM
M V30 1 C 0.8753 4.9367 0 0
M V30 2 C -0.4583 4.1667 0 0
M V30 END ATOM
M V30 BEGIN BOND
M V30 1 1 2 1
M V30 END BOND
M V30 END CTAB
M END
''', sanitize=False)
validator = rdMolStandardize.Is2DValidation()
errinfo = validator.validate(mol)
self.assertEqual(len(errinfo), 0)
# Move the second atom out of plane (z = 0.1) and validate again.
conf = mol.GetConformer()
pos = conf.GetAtomPosition(1)
self.assertEqual(pos.z, 0.0)
pos.z = 0.1
conf.SetAtomPosition(1, pos)
validator = rdMolStandardize.Is2DValidation()
errinfo = validator.validate(mol)
self.assertEqual(len(errinfo), 1)
self.assertEqual(errinfo[0],
"ERROR: [Is2DValidation] The molecule includes non-null Z coordinates")
# With a threshold of 0.2 the 0.1 Z coordinate is accepted.
validator = rdMolStandardize.Is2DValidation(0.2)
errinfo = validator.validate(mol)
self.assertEqual(len(errinfo), 0)
# Same check, with the non-zero Z coordinate (0.2) coming from the mol block.
mol = Chem.MolFromMolBlock(
'''
2D
0 0 0 0 0 999 V3000
M V30 BEGIN CTAB
M V30 COUNTS 2 1 0 0 0
M V30 BEGIN ATOM
M V30 1 C 0.8753 4.9367 0 0
M V30 2 C -0.4583 4.1667 0.2 0
M V30 END ATOM
M V30 BEGIN BOND
M V30 1 1 2 1
M V30 END BOND
M V30 END CTAB
M END
''', sanitize=False)
validator = rdMolStandardize.Is2DValidation()
errinfo = validator.validate(mol)
self.assertEqual(len(errinfo), 1)
self.assertEqual(errinfo[0],
"ERROR: [Is2DValidation] The molecule includes non-null Z coordinates")
# Layout2DValidation (atom clash check): the last two atoms of the block
# (indices 4 and 5) are only 0.1 apart in y.
mol = Chem.MolFromMolBlock(
'''
2D
0 0 0 0 0 999 V3000
M V30 BEGIN CTAB
M V30 COUNTS 6 5 0 0 0
M V30 BEGIN ATOM
M V30 1 C -1.6667 6.2067 0 0
M V30 2 C -3.0004 5.4367 0 0
M V30 3 C -3.0004 3.8965 0 0
M V30 4 C -1.6667 3.1267 0 0
M V30 5 C -0.3329 4.6000 0 0
M V30 6 C -0.3329 4.7000 0 0
M V30 END ATOM
M V30 BEGIN BOND
M V30 1 1 1 2
M V30 2 1 1 6
M V30 3 1 2 3
M V30 4 1 3 4
M V30 5 1 4 5
M V30 END BOND
M V30 END CTAB
M END
''', sanitize=False)
validator = rdMolStandardize.Layout2DValidation()
errinfo = validator.validate(mol)
self.assertEqual(len(errinfo), 1)
self.assertEqual(errinfo[0], "ERROR: [Layout2DValidation] Atom 4 is too close to atom 5")
# The first positional argument is clashLimit; with 1e-3 the clash is no
# longer reported.
validator = rdMolStandardize.Layout2DValidation(1e-3)
errinfo = validator.validate(mol)
self.assertEqual(len(errinfo), 0)
# StereoValidation: atom 2 of the block (index 1) carries two stereo bonds
# with different CFG values (1 and 3).
mol = Chem.MolFromMolBlock(
'''
10052311582D
0 0 0 0 0 999 V3000
M V30 BEGIN CTAB
M V30 COUNTS 5 4 0 0 0
M V30 BEGIN ATOM
M V30 1 Br 0.0003 7.27 0 0
M V30 2 C -1.3333 6.5 0 0
M V30 3 F -2.667 7.27 0 0
M V30 4 O -1.3333 4.96 0 0
M V30 5 C 0.0003 5.73 0 0
M V30 END ATOM
M V30 BEGIN BOND
M V30 1 1 2 5 CFG=1
M V30 2 1 2 3 CFG=3
M V30 3 1 2 1
M V30 4 1 2 4
M V30 END BOND
M V30 END CTAB
M END
''', sanitize=False)
# NOTE(review): presumably the wedging from the mol block has to be
# re-applied so that StereoValidation sees the drawn stereo bonds - confirm.
Chem.ReapplyMolBlockWedging(mol)
validator = rdMolStandardize.StereoValidation()
errinfo = validator.validate(mol)
self.assertEqual(len(errinfo), 1)
self.assertEqual(
errinfo[0],
"ERROR: [StereoValidation] Atom 1 has opposing stereo bonds with different up/down orientation"
)
def test24Pipeline(self):
pipeline = rdMolStandardize.Pipeline()
# invalid input molblock
molblock = '''
sldfj;ldskfj sldkjfsd;lkf
M V30 BEGIN CTAB
'''
result = pipeline.run(molblock)
self.assertEqual(result.stage, rdMolStandardize.PipelineStage.PARSING_INPUT)
self.assertNotEqual(result.status, rdMolStandardize.PipelineStatus.NO_EVENT)
self.assertTrue(result.status & rdMolStandardize.PipelineStatus.INPUT_ERROR)
# R group
molblock = '''
Mrv2311 01162413552D
0 0 0 0 0 999 V3000
M V30 BEGIN CTAB
M V30 COUNTS 2 1 0 0 0
M V30 BEGIN ATOM
M V30 1 R# -17.3747 6.9367 0 0 RGROUPS=(1 0)
M V30 2 C -18.7083 6.1667 0 0
M V30 END ATOM
M V30 BEGIN BOND
M V30 1 1 2 1
M V30 END BOND
M V30 END CTAB
M END
'''
result = pipeline.run(molblock)
self.assertEqual(result.stage, rdMolStandardize.PipelineStage.COMPLETED)
self.assertNotEqual(result.status, rdMolStandardize.PipelineStatus.NO_EVENT)
self.assertTrue(result.status & rdMolStandardize.PipelineStatus.VALIDATION_ERROR)
self.assertTrue(result.status & rdMolStandardize.PipelineStatus.FEATURES_VALIDATION_ERROR)
# no atoms
molblock = '''
10052313452D
0 0 0 0 0 999 V3000
M V30 BEGIN CTAB
M V30 COUNTS 0 0 0 0 0
M V30 END CTAB
M END
'''
result = pipeline.run(molblock)
self.assertEqual(result.stage, rdMolStandardize.PipelineStage.COMPLETED)
self.assertNotEqual(result.status, rdMolStandardize.PipelineStatus.NO_EVENT)
self.assertTrue(result.status & rdMolStandardize.PipelineStatus.VALIDATION_ERROR)
self.assertTrue(result.status & rdMolStandardize.PipelineStatus.BASIC_VALIDATION_ERROR)
# neutral quaternary N
molblock = '''
10242314442D
0 0 0 0 0 999 V3000
M V30 BEGIN CTAB
M V30 COUNTS 5 4 0 0 0
M V30 BEGIN ATOM
M V30 1 C -1.6247 7.5825 0 0
M V30 2 N -2.9583 6.8125 0 0
M V30 3 C -4.292 7.5825 0 0
M V30 4 C -2.9583 5.2725 0 0
M V30 5 C -1.6247 6.0425 0 0
M V30 END ATOM
M V30 BEGIN BOND
M V30 1 1 2 1
M V30 2 1 2 3
M V30 3 1 2 4
M V30 4 1 2 5
M V30 END BOND
M V30 END CTAB
M END
'''
result = pipeline.run(molblock)
self.assertEqual(result.stage, rdMolStandardize.PipelineStage.COMPLETED)
self.assertNotEqual(result.status, rdMolStandardize.PipelineStatus.NO_EVENT)
self.assertTrue(result.status & rdMolStandardize.PipelineStatus.VALIDATION_ERROR)
#self.assertTrue(result.status & rdMolStandardize.PipelineStatus.STANDARDIZATION_ERROR)
self.assertEqual(
result.status,
(
rdMolStandardize.PipelineStatus.BASIC_VALIDATION_ERROR
| rdMolStandardize.PipelineStatus.PREPARE_FOR_STANDARDIZATION_ERROR #|
#rdMolStandardize.PipelineStatus.NORMALIZER_STANDARDIZATION_ERROR
))
molblock = '''
2D
0 0 0 0 0 999 V3000
M V30 BEGIN CTAB
M V30 COUNTS 2 1 0 0 0
M V30 BEGIN ATOM
M V30 1 C 0.8753 4.9367 0 0
M V30 2 C -0.4583 4.1667 0.2 0
M V30 END ATOM
M V30 BEGIN BOND
M V30 1 1 2 1
M V30 END BOND
M V30 END CTAB
M END
'''
result = pipeline.run(molblock)
self.assertEqual(result.stage, rdMolStandardize.PipelineStage.COMPLETED)
self.assertNotEqual(result.status, rdMolStandardize.PipelineStatus.NO_EVENT)
self.assertTrue(result.status & rdMolStandardize.PipelineStatus.VALIDATION_ERROR)
self.assertTrue(result.status & rdMolStandardize.PipelineStatus.IS2D_VALIDATION_ERROR)
molblock = '''
2D
0 0 0 0 0 999 V3000
M V30 BEGIN CTAB
M V30 COUNTS 4 3 0 0 0
M V30 BEGIN ATOM
M V30 1 C -3.05 5.48 0 0
M V30 2 C -4.4167 4.6875 0 0
M V30 3 C -4.3289 6.3627 0 0
M V30 4 C -3.0 5.5 0 0
M V30 END ATOM
M V30 BEGIN BOND
M V30 1 1 2 1
M V30 2 1 1 3
M V30 3 1 3 4
M V30 END BOND
M V30 END CTAB
M END
'''
result = pipeline.run(molblock)
self.assertEqual(result.stage, rdMolStandardize.PipelineStage.COMPLETED)
self.assertNotEqual(result.status, rdMolStandardize.PipelineStatus.NO_EVENT)
self.assertTrue(result.status & rdMolStandardize.PipelineStatus.VALIDATION_ERROR)
self.assertTrue(result.status & rdMolStandardize.PipelineStatus.LAYOUT2D_VALIDATION_ERROR)
molblock = '''
2D
0 0 0 0 0 999 V3000
M V30 BEGIN CTAB
M V30 COUNTS 5 4 0 0 0
M V30 BEGIN ATOM
M V30 1 C -1.583 5.7075 0 0
M V30 2 C -2.9167 4.9375 0 0
M V30 3 C -1.583 7.2475 0 0
M V30 4 C -0.2493 4.9375 0.5 0
M V30 5 C -1.583 4.1675 0 0
M V30 END ATOM
M V30 BEGIN BOND
M V30 1 1 1 2 CFG=1
M V30 2 1 1 3 CFG=1
M V30 3 1 1 4
M V30 4 1 1 5
M V30 END BOND
M V30 END CTAB
M END
'''
result = pipeline.run(molblock)
self.assertEqual(result.stage, rdMolStandardize.PipelineStage.COMPLETED)
self.assertNotEqual(result.status, rdMolStandardize.PipelineStatus.NO_EVENT)
self.assertTrue(result.status & rdMolStandardize.PipelineStatus.VALIDATION_ERROR)
self.assertEqual(
result.status, rdMolStandardize.PipelineStatus.IS2D_VALIDATION_ERROR
| rdMolStandardize.PipelineStatus.STEREO_VALIDATION_ERROR)
molblock = '''
10282320572D
0 0 0 0 0 999 V3000
M V30 BEGIN CTAB
M V30 COUNTS 5 4 0 0 0
M V30 BEGIN ATOM
M V30 1 C -1.0413 5.4992 0 0
M V30 2 C -2.375 4.7292 0 0
M V30 3 O -1.0413 7.0392 0 0
M V30 4 O 0.2924 4.7292 0 0
M V30 5 Na 0.2924 3.1892 0 0
M V30 END ATOM
M V30 BEGIN BOND
M V30 1 1 2 1
M V30 2 1 1 4
M V30 3 2 1 3
M V30 4 1 4 5
M V30 END BOND
M V30 END CTAB
M END
'''
result = pipeline.run(molblock)
self.assertEqual(result.stage, rdMolStandardize.PipelineStage.COMPLETED)
self.assertEqual((result.status & rdMolStandardize.PipelineStatus.PIPELINE_ERROR),
rdMolStandardize.PipelineStatus.NO_EVENT)
self.assertNotEqual((result.status & rdMolStandardize.PipelineStatus.STRUCTURE_MODIFICATION),
rdMolStandardize.PipelineStatus.STRUCTURE_MODIFICATION)
self.assertEqual((result.status & rdMolStandardize.PipelineStatus.STRUCTURE_MODIFICATION),
(rdMolStandardize.PipelineStatus.METALS_DISCONNECTED
| rdMolStandardize.PipelineStatus.FRAGMENTS_REMOVED
| rdMolStandardize.PipelineStatus.PROTONATION_CHANGED))
parentMol = Chem.MolFromMolBlock(result.parentMolData, sanitize=False)
parentSmiles = Chem.MolToSmiles(parentMol)
self.assertEqual(parentSmiles, "CC(=O)O")
outputMol = Chem.MolFromMolBlock(result.outputMolData, sanitize=False)
outputSmiles = Chem.MolToSmiles(outputMol)
self.assertEqual(outputSmiles, "CC(=O)O")
molblock = '''
10282320572D
0 0 0 0 0 999 V3000
M V30 BEGIN CTAB
M V30 COUNTS 4 3 0 0 0
M V30 BEGIN ATOM
M V30 1 N -1.0413 5.4992 0 0
M V30 2 C -2.375 4.7292 0 0
M V30 3 O -1.0413 7.0392 0 0
M V30 4 O 0.2924 4.7292 0 0
M V30 END ATOM
M V30 BEGIN BOND
M V30 1 1 2 1
M V30 2 2 1 4
M V30 3 2 1 3
M V30 END BOND
M V30 END CTAB
M END
'''
result = pipeline.run(molblock)
self.assertEqual(result.stage, rdMolStandardize.PipelineStage.COMPLETED)
# nitro groups are cleaned-up in a pre-standardization step
self.assertEqual((result.status & rdMolStandardize.PipelineStatus.PIPELINE_ERROR),
rdMolStandardize.PipelineStatus.NO_EVENT)
self.assertEqual((result.status & rdMolStandardize.PipelineStatus.STRUCTURE_MODIFICATION),
rdMolStandardize.PipelineStatus.NO_EVENT)
parentMol = Chem.MolFromMolBlock(result.parentMolData, sanitize=False)
parentSmiles = Chem.MolToSmiles(parentMol)
self.assertEqual(parentSmiles, "C[N+](=O)[O-]")
outputMol = Chem.MolFromMolBlock(result.outputMolData, sanitize=False)
outputSmiles = Chem.MolToSmiles(outputMol)
self.assertEqual(outputSmiles, "C[N+](=O)[O-]")
molblock = '''
10282320572D
0 0 0 0 0 999 V3000
M V30 BEGIN CTAB
M V30 COUNTS 6 5 0 0 0
M V30 BEGIN ATOM
M V30 1 C -1.0413 5.4992 0 0
M V30 2 C -2.375 4.7292 0 0
M V30 3 O -1.0413 7.0392 0 0
M V30 4 O 0.2924 4.7292 0 0
M V30 5 N -3.7087 5.4992 0 0 CHG=1
M V30 6 Na 0.2924 3.1892 0 0
M V30 END ATOM
M V30 BEGIN BOND
M V30 1 1 2 1
M V30 2 1 1 4
M V30 3 2 1 3
M V30 4 1 2 5
M V30 5 1 4 6
M V30 END BOND
M V30 END CTAB
M END
'''
result = pipeline.run(molblock)
self.assertEqual(result.stage, rdMolStandardize.PipelineStage.COMPLETED)
self.assertEqual((result.status & rdMolStandardize.PipelineStatus.PIPELINE_ERROR),
rdMolStandardize.PipelineStatus.NO_EVENT)
self.assertNotEqual((result.status & rdMolStandardize.PipelineStatus.STRUCTURE_MODIFICATION),
rdMolStandardize.PipelineStatus.STRUCTURE_MODIFICATION)
self.assertEqual((result.status & rdMolStandardize.PipelineStatus.STRUCTURE_MODIFICATION),
(rdMolStandardize.PipelineStatus.METALS_DISCONNECTED
| rdMolStandardize.PipelineStatus.FRAGMENTS_REMOVED))
parentMol = Chem.MolFromMolBlock(result.parentMolData, sanitize=False)
parentSmiles = Chem.MolToSmiles(parentMol)
self.assertEqual(parentSmiles, "NCC(=O)O")
outputMol = Chem.MolFromMolBlock(result.outputMolData, sanitize=False)
outputSmiles = Chem.MolToSmiles(outputMol)
self.assertEqual(outputSmiles, "[NH3+]CC(=O)[O-]")
# Checks Pipeline behaviour when PipelineOptions.normalizerData is set to an
# empty string: a sulfoxide drawn with an S=O double bond must come out in the
# charge-separated form, and the only structure modification reported must be
# NORMALIZATION_APPLIED.
def test25PipelineNormalizerOptions(self):
options = rdMolStandardize.PipelineOptions()
# run the pipeline w/ the RDKit default normalizer transforms
options.normalizerData = ''
pipeline = rdMolStandardize.Pipeline(options)
# input: S bonded to two C atoms (single bonds) and to one O atom (double bond)
molblock = '''
Mrv2311 02072415362D
0 0 0 0 0 999 V3000
M V30 BEGIN CTAB
M V30 COUNTS 4 3 0 0 0
M V30 BEGIN ATOM
M V30 1 S -10.3538 4.27 0 0
M V30 2 C -11.6875 3.5 0 0
M V30 3 O -10.3538 5.81 0 0
M V30 4 C -9.0201 3.5 0 0
M V30 END ATOM
M V30 BEGIN BOND
M V30 1 1 2 1
M V30 2 1 1 4
M V30 3 2 1 3
M V30 END BOND
M V30 END CTAB
M END
'''
result = pipeline.run(molblock)
# the pipeline ran to the end ...
self.assertEqual(result.stage, rdMolStandardize.PipelineStage.COMPLETED)
# ... without setting any of the PIPELINE_ERROR bits
self.assertEqual((result.status & rdMolStandardize.PipelineStatus.PIPELINE_ERROR),
rdMolStandardize.PipelineStatus.NO_EVENT)
# not every STRUCTURE_MODIFICATION bit is set ...
self.assertNotEqual((result.status & rdMolStandardize.PipelineStatus.STRUCTURE_MODIFICATION),
rdMolStandardize.PipelineStatus.STRUCTURE_MODIFICATION)
# ... the only one present is NORMALIZATION_APPLIED
self.assertEqual((result.status & rdMolStandardize.PipelineStatus.STRUCTURE_MODIFICATION),
rdMolStandardize.PipelineStatus.NORMALIZATION_APPLIED)
# the output molblock is re-read without sanitization and compared as SMILES
outputMol = Chem.MolFromMolBlock(result.outputMolData, sanitize=False)
outputSmiles = Chem.MolToSmiles(outputMol)
self.assertEqual(outputSmiles, "C[S+](C)[O-]")
# Checks that, with PipelineOptions.allowEmptyMolecules set to True, a molblock
# declaring zero atoms and zero bonds is processed to completion without any
# status flag being raised.
def test26PipelineAllowEmptyMoleculesOption(self):
options = rdMolStandardize.PipelineOptions()
options.allowEmptyMolecules = True
pipeline = rdMolStandardize.Pipeline(options)
# no atoms
molblock = '''
10052313452D
0 0 0 0 0 999 V3000
M V30 BEGIN CTAB
M V30 COUNTS 0 0 0 0 0
M V30 END CTAB
M END
'''
result = pipeline.run(molblock)
self.assertEqual(result.stage, rdMolStandardize.PipelineStage.COMPLETED)
# NO_EVENT: neither validation errors nor structure modifications were recorded
self.assertEqual(result.status, rdMolStandardize.PipelineStatus.NO_EVENT)
# Run the unittest test runner when this file is executed as a script.
if __name__ == "__main__":
unittest.main()

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -1,4 +1,4 @@
# The two find_package() lines below are the removed/added pair of this diff
# hunk: the requested SWIG version changes from 4.1 to 4.2. REQUIRED makes the
# configure step fail when no suitable SWIG is found.
find_package(SWIG 4.1 REQUIRED)
find_package(SWIG 4.2 REQUIRED)
include(${SWIG_USE_FILE})
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})

View File

@@ -1,5 +1,27 @@
%{
#include <GraphMol/MolStandardize/MolStandardize.h>
#include <GraphMol/MolStandardize/Pipeline.h>
namespace RDKit {
namespace MolStandardize {
// Comparison operators for PipelineLogEntry. They sit inside the %{ ... %}
// section of this interface file, so they are emitted into the generated
// wrapper source only.
// NOTE(review): presumably required because some SWIG std::vector wrappers
// compare elements (see the PipelineLog %template) - confirm.

// Two log entries are equal when both their status and their detail match.
bool operator==(const PipelineLogEntry & lhs, const PipelineLogEntry & rhs) {
  return (lhs.status == rhs.status) && (lhs.detail == rhs.detail);
}

// Negation of operator== above.
bool operator!=(const PipelineLogEntry & lhs, const PipelineLogEntry & rhs) {
  return !(lhs == rhs);
}
}  // namespace MolStandardize
}  // namespace RDKit
%}
// SWIG library support for std::vector, used by the %template below.
%include <std_vector.i>
namespace std {
// Wrap std::vector<PipelineLogEntry> in the target language under the name
// PipelineLog.
%template(PipelineLog) std::vector<RDKit::MolStandardize::PipelineLogEntry>;
}
%include <GraphMol/MolStandardize/MolStandardize.h>
// Java only: switch to SWIG's "type unsafe" enum typemaps before Pipeline.h is
// processed, so that the enums declared there are wrapped as integer constants.
// NOTE(review): presumably done so that PipelineStatus flags can be tested with
// bitwise operators from Java - confirm.
#if defined SWIGJAVA
%include "enumtypeunsafe.swg"
#endif
%include <GraphMol/MolStandardize/Pipeline.h>

View File

@@ -14,6 +14,53 @@ import org.junit.*;
public void testStandardize1() {
    // Standardizing a covalently drawn sodium benzoate is expected to give the
    // benzoate anion and a separate sodium cation.
    // JUnit argument order is (message, expected, actual).
    assertEquals("fail", "O=C([O-])c1ccccc1.[Na+]", RDKFuncs.standardizeSmiles("[Na]OC(=O)c1ccccc1"));
}
/**
 * Feeds the pipeline a text that is not a valid molblock and checks that the
 * result is reported in the PARSING_INPUT stage with the INPUT_ERROR flag set.
 */
@Test
public void testPipelineBadInput() {
    Pipeline pipeline = new Pipeline();
    try {
        PipelineResult result = pipeline.run(
            "\n" +
            " sldfj;ldskfj sldkjfsd;lkf\n" +
            "M V30 BEGIN CTAB"
        );
        try {
            // JUnit argument order is (expected, actual).
            assertEquals(PipelineStage.PARSING_INPUT, result.getStage());
            assertFalse(result.getStatus() == PipelineStatus.NO_EVENT);
            assertTrue((result.getStatus() & PipelineStatus.INPUT_ERROR) != PipelineStatus.NO_EVENT);
        } finally {
            // Release the native result even when an assertion fails.
            result.delete();
        }
    } finally {
        // Release the native pipeline even when run() or an assertion fails.
        pipeline.delete();
    }
}
/**
 * Runs the pipeline on a V3000 molblock containing an R-group atom and checks
 * that processing completes while flagging a features validation error.
 */
@Test
public void testPipelineUnsupportedFeatures() {
    Pipeline pipeline = new Pipeline();
    try {
        PipelineResult result = pipeline.run(
            "\n" +
            " Mrv2311 01162413552D \n" +
            "\n" +
            " 0 0 0 0 0 999 V3000\n" +
            "M V30 BEGIN CTAB\n" +
            "M V30 COUNTS 2 1 0 0 0\n" +
            "M V30 BEGIN ATOM\n" +
            "M V30 1 R# -17.3747 6.9367 0 0 RGROUPS=(1 0)\n" +
            "M V30 2 C -18.7083 6.1667 0 0\n" +
            "M V30 END ATOM\n" +
            "M V30 BEGIN BOND\n" +
            "M V30 1 1 2 1\n" +
            "M V30 END BOND\n" +
            "M V30 END CTAB\n" +
            "M END"
        );
        try {
            // JUnit argument order is (expected, actual).
            assertEquals(PipelineStage.COMPLETED, result.getStage());
            assertFalse(result.getStatus() == PipelineStatus.NO_EVENT);
            // Both the generic validation flag and the features-specific one must be set.
            assertTrue((result.getStatus() & PipelineStatus.VALIDATION_ERROR) != PipelineStatus.NO_EVENT);
            assertTrue((result.getStatus() & PipelineStatus.FEATURES_VALIDATION_ERROR) != PipelineStatus.NO_EVENT);
        } finally {
            // Release the native result even when an assertion fails.
            result.delete();
        }
    } finally {
        // Release the native pipeline even when run() or an assertion fails.
        pipeline.delete();
    }
}
/** Command-line entry point: hands this test class to the JUnit core runner. */
public static void main(String[] args) {
    final String testClassName = "org.RDKit.MolStandardizeTest";
    org.junit.runner.JUnitCore.main(testClassName);
}