mirror of
https://github.com/rdkit/rdkit.git
synced 2026-06-03 21:44:30 +08:00
Extend RDKit::MolStandardize with a validation and standardization Pipeline (#7582)
* Extend RDKit::MolStandardize with a validation and standardization Pipeline * suggested changes * apply clang-format * apply yapf * MolStandardize::FeaturesValidation optionally disallow dative bonds * add allowDativeBondType to MolStandardize::PipelineOptions * apply clang-format * make the API of other validation classes more consistent with MolStandardize::FeaturesValidation * apply clang-format * PipelineStage to enum class remove virtual functions from Pipeline class be explicit about enums * light refactoring to avoid what I think is an unnecessary call to `parse` * a bit of modernization * make the pipeline configurable * make parse and serialize configurable too * switch to storing pipeline stages using uints * add a simple test for providing a pipeline * update pointer alignment for clang-format * test modifying the parser and serializer * update swig requirement * changes in response to review * changes in response to review * rename PipelineResult's *MolBlock members to *MolData * upgrade swig to 4.2 in the CI environments * add a few missing export directives --------- Co-authored-by: greg landrum <greg.landrum@gmail.com>
This commit is contained in:
committed by
GitHub
parent
138bdc8d58
commit
3f7caf0147
@@ -9,7 +9,7 @@ steps:
|
||||
conda create --name rdkit_build -c conda-forge cmake \
|
||||
libboost=$(boost_version) \
|
||||
libboost-devel=$(boost_version) \
|
||||
swig=4.1
|
||||
swig=4.2
|
||||
displayName: Setup build environment
|
||||
- bash: |
|
||||
source ${CONDA}/etc/profile.d/conda.sh
|
||||
|
||||
@@ -21,7 +21,7 @@ steps:
|
||||
libcxx=$(compiler_version) cmake=3.26 \
|
||||
libboost=$(boost_version) \
|
||||
libboost-devel=$(boost_version) \
|
||||
cairo eigen swig=4.1
|
||||
cairo eigen swig=4.2
|
||||
conda activate rdkit_build
|
||||
displayName: Setup build environment
|
||||
- bash: |
|
||||
|
||||
@@ -15,7 +15,7 @@ steps:
|
||||
cmake=3.26 ^
|
||||
libboost=$(boost_version) ^
|
||||
libboost-devel=$(boost_version) ^
|
||||
cairo eigen swig=4.1
|
||||
cairo eigen swig=4.2
|
||||
call activate rdkit_build
|
||||
displayName: Install dependencies
|
||||
- script: |
|
||||
|
||||
@@ -23,7 +23,7 @@ BinPackArguments: true
|
||||
ColumnLimit: 80
|
||||
ConstructorInitializerAllOnOneLineOrOnePerLine: true
|
||||
ConstructorInitializerIndentWidth: 4
|
||||
DerivePointerAlignment: true
|
||||
DerivePointerAlignment: false
|
||||
ExperimentalAutoDetectBinPacking: false
|
||||
IndentCaseLabels: true
|
||||
IndentWrappedFunctionNames: false
|
||||
@@ -40,7 +40,7 @@ PenaltyBreakString: 1000
|
||||
PenaltyBreakFirstLessLess: 120
|
||||
PenaltyExcessCharacter: 1000000
|
||||
PenaltyReturnTypeOnItsOwnLine: 200
|
||||
PointerAlignment: Left
|
||||
PointerAlignment: Right
|
||||
SpacesBeforeTrailingComments: 2
|
||||
Cpp11BracedListStyle: true
|
||||
Standard: Cpp11
|
||||
|
||||
@@ -1,55 +1,61 @@
|
||||
|
||||
rdkit_library(MolStandardize
|
||||
MolStandardize.cpp
|
||||
Metal.cpp
|
||||
Normalize.cpp
|
||||
Validate.cpp
|
||||
Charge.cpp
|
||||
Tautomer.cpp
|
||||
Fragment.cpp
|
||||
FragmentCatalog/FragmentCatalogEntry.cpp
|
||||
FragmentCatalog/FragmentCatalogParams.cpp
|
||||
FragmentCatalog/FragmentCatalogUtils.cpp
|
||||
AcidBaseCatalog/AcidBaseCatalogEntry.cpp
|
||||
AcidBaseCatalog/AcidBaseCatalogParams.cpp
|
||||
AcidBaseCatalog/AcidBaseCatalogUtils.cpp
|
||||
TransformCatalog/TransformCatalogEntry.cpp
|
||||
TransformCatalog/TransformCatalogParams.cpp
|
||||
TransformCatalog/TransformCatalogUtils.cpp
|
||||
TautomerCatalog/TautomerCatalogEntry.cpp
|
||||
TautomerCatalog/TautomerCatalogParams.cpp
|
||||
TautomerCatalog/TautomerCatalogUtils.cpp
|
||||
LINK_LIBRARIES ChemReactions ChemTransforms SmilesParse SubstructMatch Descriptors GraphMol )
|
||||
Pipeline.cpp
|
||||
MolStandardize.cpp
|
||||
Metal.cpp
|
||||
Normalize.cpp
|
||||
Validate.cpp
|
||||
Charge.cpp
|
||||
Tautomer.cpp
|
||||
Fragment.cpp
|
||||
FragmentCatalog/FragmentCatalogEntry.cpp
|
||||
FragmentCatalog/FragmentCatalogParams.cpp
|
||||
FragmentCatalog/FragmentCatalogUtils.cpp
|
||||
AcidBaseCatalog/AcidBaseCatalogEntry.cpp
|
||||
AcidBaseCatalog/AcidBaseCatalogParams.cpp
|
||||
AcidBaseCatalog/AcidBaseCatalogUtils.cpp
|
||||
TransformCatalog/TransformCatalogEntry.cpp
|
||||
TransformCatalog/TransformCatalogParams.cpp
|
||||
TransformCatalog/TransformCatalogUtils.cpp
|
||||
TautomerCatalog/TautomerCatalogEntry.cpp
|
||||
TautomerCatalog/TautomerCatalogParams.cpp
|
||||
TautomerCatalog/TautomerCatalogUtils.cpp
|
||||
LINK_LIBRARIES ChemReactions ChemTransforms SmilesParse SubstructMatch Descriptors GraphMol )
|
||||
target_compile_definitions(MolStandardize PRIVATE RDKIT_MOLSTANDARDIZE_BUILD)
|
||||
|
||||
rdkit_headers(MolStandardize.h
|
||||
Metal.h
|
||||
Normalize.h
|
||||
Validate.h
|
||||
Charge.h
|
||||
Tautomer.h
|
||||
Fragment.h
|
||||
DEST GraphMol/MolStandardize)
|
||||
rdkit_headers(
|
||||
Pipeline.h
|
||||
MolStandardize.h
|
||||
Metal.h
|
||||
Normalize.h
|
||||
Validate.h
|
||||
Charge.h
|
||||
Tautomer.h
|
||||
Fragment.h
|
||||
DEST GraphMol/MolStandardize)
|
||||
|
||||
rdkit_headers(FragmentCatalog/FragmentCatalogEntry.h
|
||||
FragmentCatalog/FragmentCatalogParams.h
|
||||
FragmentCatalog/FragmentCatalogUtils.h
|
||||
DEST GraphMol/MolStandardize/FragmentCatalog)
|
||||
rdkit_headers(
|
||||
FragmentCatalog/FragmentCatalogEntry.h
|
||||
FragmentCatalog/FragmentCatalogParams.h
|
||||
FragmentCatalog/FragmentCatalogUtils.h
|
||||
DEST GraphMol/MolStandardize/FragmentCatalog)
|
||||
|
||||
rdkit_headers(AcidBaseCatalog/AcidBaseCatalogEntry.h
|
||||
AcidBaseCatalog/AcidBaseCatalogParams.h
|
||||
AcidBaseCatalog/AcidBaseCatalogUtils.h
|
||||
DEST GraphMol/MolStandardize/AcidBaseCatalog)
|
||||
rdkit_headers(
|
||||
AcidBaseCatalog/AcidBaseCatalogEntry.h
|
||||
AcidBaseCatalog/AcidBaseCatalogParams.h
|
||||
AcidBaseCatalog/AcidBaseCatalogUtils.h
|
||||
DEST GraphMol/MolStandardize/AcidBaseCatalog)
|
||||
|
||||
rdkit_headers(TransformCatalog/TransformCatalogEntry.h
|
||||
TransformCatalog/TransformCatalogParams.h
|
||||
TransformCatalog/TransformCatalogUtils.h
|
||||
DEST GraphMol/MolStandardize/TransformCatalog)
|
||||
rdkit_headers(
|
||||
TransformCatalog/TransformCatalogEntry.h
|
||||
TransformCatalog/TransformCatalogParams.h
|
||||
TransformCatalog/TransformCatalogUtils.h
|
||||
DEST GraphMol/MolStandardize/TransformCatalog)
|
||||
|
||||
rdkit_headers(TautomerCatalog/TautomerCatalogEntry.h
|
||||
TautomerCatalog/TautomerCatalogParams.h
|
||||
TautomerCatalog/TautomerCatalogUtils.h
|
||||
DEST GraphMol/MolStandardize/TautomerCatalog)
|
||||
rdkit_headers(
|
||||
TautomerCatalog/TautomerCatalogEntry.h
|
||||
TautomerCatalog/TautomerCatalogParams.h
|
||||
TautomerCatalog/TautomerCatalogUtils.h
|
||||
DEST GraphMol/MolStandardize/TautomerCatalog)
|
||||
|
||||
if(RDK_BUILD_PYTHON_WRAPPERS)
|
||||
add_subdirectory(Wrap)
|
||||
@@ -63,5 +69,6 @@ rdkit_test(molTautomerTest testTautomer.cpp LINK_LIBRARIES MolStandardize )
|
||||
rdkit_test(molStandardizeSmallTest test2.cpp LINK_LIBRARIES MolStandardize )
|
||||
rdkit_test(molFragmentTest testFragment.cpp LINK_LIBRARIES MolStandardize )
|
||||
rdkit_catch_test(molStandardizeCatchTest catch_tests.cpp LINK_LIBRARIES MolStandardize )
|
||||
rdkit_catch_test(molStandardizePipelineTest testPipeline.cpp LINK_LIBRARIES MolStandardize)
|
||||
|
||||
|
||||
|
||||
@@ -538,7 +538,7 @@ void Uncharger::unchargeInPlace(RWMol &mol) {
|
||||
}
|
||||
}
|
||||
}
|
||||
} // namespace MolStandardize
|
||||
}
|
||||
|
||||
} // namespace MolStandardize
|
||||
} // namespace RDKit
|
||||
|
||||
587
Code/GraphMol/MolStandardize/Pipeline.cpp
Normal file
587
Code/GraphMol/MolStandardize/Pipeline.cpp
Normal file
@@ -0,0 +1,587 @@
|
||||
//
|
||||
// Copyright (C) 2023 Novartis Biomedical Research
|
||||
//
|
||||
// @@ All Rights Reserved @@
|
||||
// This file is part of the RDKit.
|
||||
// The contents are covered by the terms of the BSD license
|
||||
// which is included in the file license.txt, found at the root
|
||||
// of the RDKit source tree.
|
||||
//
|
||||
|
||||
#include <cmath>
|
||||
#include <regex>
|
||||
#include <sstream>
|
||||
#include "Pipeline.h"
|
||||
#include "Validate.h"
|
||||
#include "Metal.h"
|
||||
#include "Normalize.h"
|
||||
#include "Charge.h"
|
||||
#include "Fragment.h"
|
||||
#include <RDGeneral/FileParseException.h>
|
||||
#include <GraphMol/FileParsers/FileParsers.h>
|
||||
#include <GraphMol/SmilesParse/SmilesParse.h>
|
||||
#include <GraphMol/SmilesParse/SmilesWrite.h>
|
||||
#include <GraphMol/Chirality.h>
|
||||
|
||||
namespace RDKit {
|
||||
namespace MolStandardize {
|
||||
|
||||
// Record a pipeline event: the event's status flag is OR-ed into the
// cumulative status of the result, and the (status, message) pair is added at
// the end of the log.
void PipelineResult::append(PipelineStatus newStatus, const std::string &info) {
  const auto combined = status | newStatus;
  status = static_cast<PipelineStatus>(combined);
  log.push_back({newStatus, info});
}
|
||||
|
||||
// Run the complete pipeline on one input record.
//
// The input text is parsed; a copy of the parsed molecule goes through the
// validation steps, while the parsed molecule itself goes through the
// standardization steps and (when configured) the parent generation. The
// resulting (output, parent) pair is finally serialized into the returned
// PipelineResult.
//
// `result.stage` is updated before each step is executed, so an early return
// leaves it pointing at the step where processing stopped. It is set to
// COMPLETED only when the end of the pipeline is reached.
PipelineResult Pipeline::run(const std::string &molblock) const {
  PipelineResult result;
  result.status = NO_EVENT;
  result.inputMolData = molblock;

  // Processing is interrupted when an error was recorded, unless the options
  // request that all failures are reported.
  const auto mustStop = [&]() {
    const bool errorRecorded = (result.status & PIPELINE_ERROR) != NO_EVENT;
    return errorRecorded && !options.reportAllFailures;
  };

  // turn the input text into an RWMol instance
  result.stage = static_cast<uint32_t>(PipelineStage::PARSING_INPUT);
  RWMOL_SPTR mol = parse(molblock, result, options);
  if (!mol || mustStop()) {
    return result;
  }

  // an empty molecule, when allowed, goes straight to the serialization
  const bool passThrough =
      mol->getNumAtoms() == 0 && options.allowEmptyMolecules;

  if (!passThrough) {
    // sanitization and validation operate on a copy, so that the original
    // input molecule stays available for the standardization
    RWMOL_SPTR validated{new RWMol(*mol)};
    for (const auto &[stepStage, step] : validationSteps) {
      result.stage = stepStage;
      validated = step(validated, result, options);
      if (!validated || mustStop()) {
        return result;
      }
    }

    for (const auto &[stepStage, step] : standardizationSteps) {
      result.stage = stepStage;
      mol = step(mol, result, options);
      if (!mol || mustStop()) {
        return result;
      }
    }
  }

  RWMOL_SPTR_PAIR output;
  if (!passThrough && makeParent) {
    result.stage = static_cast<uint32_t>(PipelineStage::MAKE_PARENT);
    output = makeParent(mol, result, options);
    const bool incomplete = !output.first || !output.second;
    if (incomplete || mustStop()) {
      return result;
    }
  } else {
    // without a parent generation step, the molecule is its own parent
    output = {mol, mol};
  }

  // write the output structures into the result
  result.stage = static_cast<uint32_t>(PipelineStage::SERIALIZING_OUTPUT);
  serialize(output, result, options);
  if (mustStop()) {
    return result;
  }

  result.stage = static_cast<uint32_t>(PipelineStage::COMPLETED);
  return result;
}
|
||||
|
||||
namespace Operations {
|
||||
// Parse the input MolBlock into an RWMol, without sanitization.
//
// A FileParseException raised by the parser is recorded as an INPUT_ERROR
// with the exception's message. Whenever no molecule could be produced, an
// additional INPUT_ERROR is recorded and a null pointer is returned.
RWMOL_SPTR parse(const std::string &molblock, PipelineResult &result,
                 const PipelineOptions &options) {
  v2::FileParsers::MolFileParserParams parserParams;
  // sanitization is not wanted at this stage
  parserParams.sanitize = false;
  // Hs would not be removed anyway, since the mol is not sanitized
  parserParams.removeHs = false;
  // the strictness of the parser is controlled by the pipeline options
  parserParams.strictParsing = options.strictParsing;

  RWMOL_SPTR parsed{};
  try {
    auto fromParser = v2::FileParsers::MolFromMolBlock(molblock, parserParams);
    parsed.reset(fromParser.release());
  } catch (const FileParseException &parseError) {
    result.append(INPUT_ERROR, parseError.what());
  }

  if (!parsed) {
    result.append(INPUT_ERROR,
                  "Could not instantiate a valid molecule from input");
  }

  return parsed;
}
|
||||
|
||||
// Serialize the output and parent structures as MolBlocks and store them in
// `result.outputMolData` and `result.parentMolData`.
//
// The V3000 format is used unless `options.outputV2000` is set. Any failure is
// recorded in `result` as an OUTPUT_ERROR; no exception leaves this function.
void serialize(RWMOL_SPTR_PAIR output, PipelineResult &result,
               const PipelineOptions &options) {
  // This operation is replaceable and may be invoked from a custom pipeline
  // configuration: dereferencing a null member of the pair below would be
  // undefined behavior, so an incomplete pair is reported as an error.
  if (!output.first || !output.second) {
    result.append(OUTPUT_ERROR,
                  "Can't write molecule to output format: "
                  "no molecule was provided.");
    return;
  }

  const ROMol &outputMol = *output.first;
  const ROMol &parentMol = *output.second;

  try {
    if (!options.outputV2000) {
      result.outputMolData = MolToV3KMolBlock(outputMol);
      result.parentMolData = MolToV3KMolBlock(parentMol);
    } else {
      try {
        result.outputMolData = MolToV2KMolBlock(outputMol);
        result.parentMolData = MolToV2KMolBlock(parentMol);
      } catch (const ValueErrorException &e) {
        // reported with a dedicated message, because the structure cannot be
        // written in the requested V2000 format
        result.append(OUTPUT_ERROR,
                      "Can't write molecule to V2000 output format: " +
                          std::string(e.what()));
      }
    }
  } catch (const std::exception &e) {
    result.append(OUTPUT_ERROR, "Can't write molecule to output format: " +
                                    std::string(e.what()));
  } catch (...) {
    result.append(
        OUTPUT_ERROR,
        "An unexpected error occurred while serializing the output structures.");
  }
}
|
||||
// Prepare the molecule for the validation step: apply a partial sanitization,
// assign the stereochemistry and restore the wedging from the input MolBlock.
//
// A MolSanitizeException is recorded as a PREPARE_FOR_VALIDATION_ERROR. The
// molecule is returned in either case.
RWMOL_SPTR prepareForValidation(RWMOL_SPTR mol, PipelineResult &result,
                                const PipelineOptions &) {
  try {
    // The validation is meant to apply to the original input, and the
    // sanitization is therefore limited to the minimum. It is however not very
    // useful to record a valence validation error for issues like a badly
    // drawn nitro group, that would later be fixed by the normalization step.
    //
    // Some sanitization is also required in order to assign the
    // stereochemistry (which needs to happen before the wedging is reapplied,
    // see below), and radicals must be found, in order to support the
    // corresponding validation criterion.
    constexpr unsigned int partialSanitization =
        MolOps::SANITIZE_CLEANUP | MolOps::SANITIZE_SYMMRINGS |
        MolOps::SANITIZE_CLEANUP_ORGANOMETALLICS |
        MolOps::SANITIZE_FINDRADICALS;
    unsigned int opThatFailed = 0;
    MolOps::sanitizeMol(*mol, opThatFailed, partialSanitization);

    // The original MolBlock wedging should be restored, but doing so may in
    // some cases overwrite the ENDDOWNRIGHT/ENDUPRIGHT info that describes the
    // configuration of double bonds adjacent to stereocenters. The
    // stereochemistry is therefore assigned first, and the wedging is restored
    // afterwards.
    MolOps::assignStereochemistry(*mol, /*cleanIt=*/true, /*force=*/true,
                                  /*flagPossible=*/true);
    Chirality::reapplyMolBlockWedging(*mol);
  } catch (const MolSanitizeException &) {
    result.append(
        PREPARE_FOR_VALIDATION_ERROR,
        "An error occurred while preparing the molecule for validation.");
  }

  return mol;
}
|
||||
|
||||
namespace {
// The error messages from the ValidationMethod classes include some metadata
// in a string prefix ("ERROR: [...] " or "INFO: [...] ") that are not
// particularly useful within the context of this Pipeline. The function below
// removes that prefix.
//
// The bracketed part is matched non-greedily, so that the match ends at the
// first "] ". A greedy match would extend up to the last "] " and also remove
// part of any message text that happens to contain brackets.
const std::regex prefix("^(ERROR|INFO): \\[.+?\\] ");

// Return `message` without its leading metadata prefix. Messages that do not
// start with such a prefix are returned unchanged.
std::string removeErrorPrefix(const std::string &message) {
  return std::regex_replace(message, prefix, "");
}
}  // namespace
|
||||
|
||||
// Apply the sequence of validation checks to the molecule.
//
// Each finding is recorded in `result` with the status flag associated to the
// check that produced it (the metadata prefix of the message is removed).
// Unless `options.reportAllFailures` is set, the remaining checks are skipped
// after the first check that reports anything. The molecule is returned in
// either case.
RWMOL_SPTR validate(RWMOL_SPTR mol, PipelineResult &result,
                    const PipelineOptions &options) {
  // Runs one check, logs its findings, and tells whether the validation
  // should be interrupted.
  const auto interruptsValidation = [&](const ValidationMethod &check,
                                        PipelineStatus errorStatus) -> bool {
    const auto findings = check.validate(*mol, options.reportAllFailures);
    for (const auto &finding : findings) {
      result.append(errorStatus, removeErrorPrefix(finding));
    }
    return !findings.empty() && !options.reportAllFailures;
  };

  // undesired features in the input molecule (e.g., query atoms/bonds)
  FeaturesValidation featuresCheck(options.allowEnhancedStereo,
                                   options.allowAromaticBondType,
                                   options.allowDativeBondType);
  if (interruptsValidation(featuresCheck, FEATURES_VALIDATION_ERROR)) {
    return mol;
  }

  // number of atoms and valence status
  RDKitValidation rdkitCheck;
  if (interruptsValidation(rdkitCheck, BASIC_VALIDATION_ERROR)) {
    return mol;
  }

  // radicals are disallowed
  DisallowedRadicalValidation radicalCheck;
  if (interruptsValidation(radicalCheck, BASIC_VALIDATION_ERROR)) {
    return mol;
  }

  // isotopic numbers (if any are specified)
  IsotopeValidation isotopeCheck(true);
  if (interruptsValidation(isotopeCheck, BASIC_VALIDATION_ERROR)) {
    return mol;
  }

  // the input must be a 2D structure
  Is2DValidation is2DCheck(options.is2DZeroThreshold);
  if (interruptsValidation(is2DCheck, IS2D_VALIDATION_ERROR)) {
    return mol;
  }

  // 2D layout (clashing atoms and abnormally long bonds)
  Layout2DValidation layoutCheck(
      options.atomClashLimit, options.bondLengthLimit,
      options.allowLongBondsInRings, options.allowAtomBondClashExemption,
      options.minMedianBondLength);
  if (interruptsValidation(layoutCheck, LAYOUT2D_VALIDATION_ERROR)) {
    return mol;
  }

  // the specified stereochemistry must be formally correct; this is the last
  // check, so there is nothing left to skip
  StereoValidation stereoCheck;
  interruptsValidation(stereoCheck, STEREO_VALIDATION_ERROR);

  return mol;
}
|
||||
|
||||
// Prepare the molecule for the standardization step by sanitizing it.
//
// A MolSanitizeException is recorded as a PREPARE_FOR_STANDARDIZATION_ERROR.
// The molecule is returned in either case.
RWMOL_SPTR prepareForStandardization(RWMOL_SPTR mol, PipelineResult &result,
                                     const PipelineOptions &) {
  try {
    MolOps::sanitizeMol(*mol);
  } catch (const MolSanitizeException &) {
    result.append(
        PREPARE_FOR_STANDARDIZATION_ERROR,
        "An error occurred while preparing the molecule for standardization.");
  }

  return mol;
}
|
||||
|
||||
// Standardize the molecule in three consecutive steps: disconnect the metals,
// normalize the representation of functional groups, and keep the largest
// fragment. The stereochemistry is reassigned at the end.
//
// The SMILES of the molecule is compared before and after each step, and an
// informational event (METALS_DISCONNECTED, NORMALIZATION_APPLIED,
// FRAGMENTS_REMOVED) is recorded when the step changed it. When a step fails,
// the corresponding *_STANDARDIZATION_ERROR is recorded and the molecule is
// returned immediately, without running the following steps.
RWMOL_SPTR standardize(RWMOL_SPTR mol, PipelineResult &result,
                       const PipelineOptions &options) {
  auto smiles = MolToSmiles(*mol);
  auto reference = smiles;

  // bonding to metals
  try {
    MetalDisconnectorOptions mdOpts;
    MetalDisconnector metalDisconnector(mdOpts);
    std::unique_ptr<ROMol> metalNof{SmartsToMol(options.metalNof)};
    std::unique_ptr<ROMol> metalNon{SmartsToMol(options.metalNon)};
    // The patterns are user-configurable. A null result (presumably what
    // SmartsToMol returns for a pattern it cannot parse - to be confirmed)
    // must not be dereferenced: that would be undefined behavior, which the
    // catch-all handler below is unable to intercept.
    if (!metalNof || !metalNon) {
      result.append(METAL_STANDARDIZATION_ERROR,
                    "Could not parse the SMARTS patterns configured for the "
                    "disconnection of metal species.");
      return mol;
    }
    metalDisconnector.setMetalNof(*metalNof);
    metalDisconnector.setMetalNon(*metalNon);
    metalDisconnector.disconnectInPlace(*mol);
  } catch (...) {
    result.append(
        METAL_STANDARDIZATION_ERROR,
        "An error occurred while processing the bonding of metal species.");
    return mol;
  }

  smiles = MolToSmiles(*mol);
  if (smiles != reference) {
    result.append(METALS_DISCONNECTED,
                  "One or more metal atoms were disconnected.");
  }
  reference = smiles;

  // functional groups
  try {
    std::unique_ptr<Normalizer> normalizer{};
    if (options.normalizerData.empty()) {
      // no custom transformations were configured
      normalizer = std::make_unique<Normalizer>();
    } else {
      std::istringstream sstr(options.normalizerData);
      normalizer =
          std::make_unique<Normalizer>(sstr, options.normalizerMaxRestarts);
    }
    // normalizeInPlace() may return an ill-formed molecule if
    // the sanitization of a transformed structure failed
    // => use normalize() instead (also see GitHub #7189)
    mol.reset(static_cast<RWMol *>(normalizer->normalize(*mol)));
    mol->updatePropertyCache(false);
  } catch (...) {
    result.append(
        NORMALIZER_STANDARDIZATION_ERROR,
        "An error occurred while normalizing the representation of some functional groups");
    return mol;
  }

  smiles = MolToSmiles(*mol);
  if (smiles != reference) {
    result.append(NORMALIZATION_APPLIED,
                  "The representation of some functional groups was adjusted.");
  }
  reference = smiles;

  // keep the largest fragment
  try {
    LargestFragmentChooser fragmentChooser;
    fragmentChooser.chooseInPlace(*mol);
  } catch (...) {
    result.append(
        FRAGMENT_STANDARDIZATION_ERROR,
        "An error occurred while removing the disconnected fragments");
    return mol;
  }

  smiles = MolToSmiles(*mol);
  if (smiles != reference) {
    result.append(
        FRAGMENTS_REMOVED,
        "One or more disconnected fragments (e.g., counterions) were removed.");
  }

  // The stereochemistry is not assigned until after we are done modifying the
  // molecular graph:
  constexpr bool cleanIt = true;
  constexpr bool force = true;
  constexpr bool flagPossible = true;
  MolOps::assignStereochemistry(*mol, cleanIt, force, flagPossible);

  return mol;
}
|
||||
|
||||
// Restore the bond wedging of the input MolBlock, without keeping any wavy
// bonds.
//
// In general, the wedging from the input molblock should be restored, but
// wavy bonds are better avoided, because of their ambiguity in some
// configurations. The function therefore proceeds in steps: the molblock
// wedging is reapplied first, then the changes related to double bonds with
// undefined/unknown stereochemistry are reverted, and finally the single bonds
// with "unknown" direction are changed into plain single bonds. Each of the
// latter two adjustments is recorded as a NORMALIZATION_APPLIED event.
RWMOL_SPTR reapplyWedging(RWMOL_SPTR mol, PipelineResult &result,
                          const PipelineOptions &) {
  // remember the stereo setting that each bond has before the wedging is
  // reapplied (this is the only part of the previous bond configuration that
  // is consulted below)
  std::map<unsigned int, Bond::BondStereo> stereoBefore;
  for (auto bond : mol->bonds()) {
    stereoBefore[bond->getIdx()] = bond->getStereo();
  }

  // 1) restore the original wedging from the input MolBlock
  Chirality::reapplyMolBlockWedging(*mol);

  // tells whether any other bond on the atoms of `bond` has direction UNKNOWN
  const auto hasWavyNeighbor = [&mol](const Bond *bond) {
    for (auto atom : {bond->getBeginAtom(), bond->getEndAtom()}) {
      for (auto other : mol->atomBonds(atom)) {
        if (other != bond && other->getBondDir() == Bond::UNKNOWN) {
          return true;
        }
      }
    }
    return false;
  };

  // 2) revert the changes related to double bonds with stereo type "either":
  // restore the STEREOANY setting of the double bonds that have a substituent
  // with direction UNKNOWN and are now STEREONONE
  for (auto bond : mol->bonds()) {
    if (bond->getBondType() != Bond::DOUBLE) {
      continue;
    }
    const bool wasAny = stereoBefore[bond->getIdx()] == Bond::STEREOANY;
    const bool isNone = bond->getStereo() == Bond::STEREONONE;
    if (wasAny && isNone && hasWavyNeighbor(bond)) {
      bond->setStereo(Bond::STEREOANY);
      result.append(
          NORMALIZATION_APPLIED,
          "Double bond " + std::to_string(bond->getIdx()) +
              " was assigned an undefined/unknown stereochemical configuration");
    }
  }

  // 3) set the bond direction to NONE for bonds with direction UNKNOWN
  for (auto bond : mol->bonds()) {
    if (bond->getBondDir() == Bond::UNKNOWN) {
      bond->setBondDir(Bond::NONE);
      result.append(NORMALIZATION_APPLIED, "The \"wavy\" style of bond " +
                                               std::to_string(bond->getIdx()) +
                                               " was removed");
    }
  }

  return mol;
}
|
||||
|
||||
// Rescale the atom coordinates, so that the median bond length becomes
// `options.scaledMedianBondLength`, and set the z coordinates to 0.
//
// Some z coords may be non-null albeit smaller than the validation threshold,
// and these noisy coords may in some cases interfere with the perception of
// stereochemistry by some tools (e.g., inchi).
//
// The coordinates (z included) are left untouched when the requested median
// bond length is not positive, when the molecule has no conformers, or when
// the current median bond length does not exceed
// `options.minMedianBondLength`.
RWMOL_SPTR cleanup2D(RWMOL_SPTR mol, PipelineResult & /*result*/,
                     const PipelineOptions &options) {
  const bool scalingRequested = options.scaledMedianBondLength > 0.;
  if (!scalingRequested || !mol->getNumConformers()) {
    return mol;
  }

  auto &conformer = mol->getConformer();
  const double medianBondLength =
      std::sqrt(Layout2DValidation::squaredMedianBondLength(*mol, conformer));
  if (!(medianBondLength > options.minMedianBondLength)) {
    return mol;
  }

  const double scaleFactor = options.scaledMedianBondLength / medianBondLength;
  const unsigned int numAtoms = conformer.getNumAtoms();
  for (unsigned int atomIdx = 0; atomIdx < numAtoms; ++atomIdx) {
    auto scaled = conformer.getAtomPos(atomIdx) * scaleFactor;
    scaled.z = 0.;
    conformer.setAtomPos(atomIdx, scaled);
  }

  return mol;
}
|
||||
|
||||
namespace {
|
||||
void replaceDativeBonds(RWMOL_SPTR mol) {
|
||||
bool modified{false};
|
||||
for (auto bond : mol->bonds()) {
|
||||
if (bond->getBondType() != Bond::BondType::DATIVE) {
|
||||
continue;
|
||||
}
|
||||
auto donor = bond->getBeginAtom();
|
||||
donor->setFormalCharge(donor->getFormalCharge() + 1);
|
||||
auto acceptor = bond->getEndAtom();
|
||||
acceptor->setFormalCharge(acceptor->getFormalCharge() - 1);
|
||||
bond->setBondType(Bond::BondType::SINGLE);
|
||||
modified = true;
|
||||
}
|
||||
if (modified) {
|
||||
mol->updatePropertyCache(false);
|
||||
}
|
||||
}
|
||||
|
||||
void removeHsAtProtonatedSites(RWMOL_SPTR mol) {
|
||||
boost::dynamic_bitset<> protons{mol->getNumAtoms(), 0};
|
||||
for (auto atom : mol->atoms()) {
|
||||
if (atom->getAtomicNum() != 1 || atom->getDegree() != 1) {
|
||||
continue;
|
||||
}
|
||||
for (auto neighbor : mol->atomNeighbors(atom)) {
|
||||
if (neighbor->getFormalCharge() > 0) {
|
||||
protons.set(atom->getIdx());
|
||||
}
|
||||
}
|
||||
}
|
||||
if (protons.any()) {
|
||||
for (int idx = mol->getNumAtoms() - 1; idx >= 0; --idx) {
|
||||
if (!protons[idx]) {
|
||||
continue;
|
||||
}
|
||||
auto atom = mol->getAtomWithIdx(idx);
|
||||
for (auto bond : mol->atomBonds(atom)) {
|
||||
auto neighbor = bond->getOtherAtom(atom);
|
||||
neighbor->setNumExplicitHs(neighbor->getNumExplicitHs() + 1);
|
||||
break; // there are no other bonds anyways
|
||||
}
|
||||
mol->removeAtom(atom);
|
||||
}
|
||||
mol->updatePropertyCache(false);
|
||||
}
|
||||
}
|
||||
} // namespace
|
||||
|
||||
// Build the (output, parent) pair of structures for the input molecule.
//
// A "parent" structure is constructed here, in order to provide a
// representation of the original input that may be more suitable for
// identification purposes, even though it may not reflect the most stable
// physical state or nicest representation for the compound.
//
// The two steps that are currently implemented for this procedure consist in
// normalizing the overall charge status and replacing any explicit dative
// bonds.
//
// If the input was submitted in an unsuitable protonation status, the
// neutralized parent structure may become the actual output from the
// standardization; a PROTONATION_CHANGED event is recorded in that case, if
// the SMILES changed.
//
// If the charge normalization fails, a CHARGE_STANDARDIZATION_ERROR is
// recorded and a pair of null pointers is returned.
RWMOL_SPTR_PAIR makeParent(RWMOL_SPTR mol, PipelineResult &result,
                           const PipelineOptions &) {
  const auto smilesBefore = MolToSmiles(*mol);

  RWMOL_SPTR parent{new RWMol(*mol)};

  // overall charge status
  try {
    // The Uncharger implementation wouldn't identify the positively
    // charged sites with adjacent explicit Hs correctly (it's a quite
    // unlikely configuration, but potentially possible considering that
    // the pipeline operates on unsanitized input).
    //
    // If present, these Hs are therefore removed from the molecular graph
    // prior to neutralization.
    removeHsAtProtonatedSites(parent);

    constexpr bool canonicalOrdering = false;
    constexpr bool force = true;
    constexpr bool protonationOnly = true;
    Uncharger uncharger(canonicalOrdering, force, protonationOnly);
    uncharger.unchargeInPlace(*parent);
  } catch (...) {
    result.append(
        CHARGE_STANDARDIZATION_ERROR,
        "An error occurred while normalizing the compound's charge status");
    return {{}, {}};
  }

  // Check if `mol` was submitted in a suitable ionization state
  const auto netCharge = [](const RWMOL_SPTR &structure) {
    int total{};
    for (auto atom : structure->atoms()) {
      total += atom->getFormalCharge();
    }
    return total;
  };
  const int parentCharge = netCharge(parent);
  const int molCharge = netCharge(mol);

  // If mol is neutral or in a protonation state that partially or fully
  // balances the non-neutralizable charged sites in the parent structure,
  // then mol is accepted. Otherwise, it is replaced by its parent.
  const bool excessPositive = molCharge > 0 && molCharge > parentCharge;
  const bool excessNegative = molCharge < 0 && molCharge < parentCharge;
  if (excessPositive || excessNegative) {
    mol = parent;
  }

  if (MolToSmiles(*mol) != smilesBefore) {
    result.append(PROTONATION_CHANGED, "The protonation state was adjusted.");
  }

  // normalize the dative bonds
  replaceDativeBonds(parent);

  return {mol, parent};
}
|
||||
} // namespace Operations
|
||||
|
||||
} // namespace MolStandardize
|
||||
} // namespace RDKit
|
||||
234
Code/GraphMol/MolStandardize/Pipeline.h
Normal file
234
Code/GraphMol/MolStandardize/Pipeline.h
Normal file
@@ -0,0 +1,234 @@
|
||||
//
|
||||
// Copyright (C) 2023 Novartis Biomedical Research
|
||||
//
|
||||
// @@ All Rights Reserved @@
|
||||
// This file is part of the RDKit.
|
||||
// The contents are covered by the terms of the BSD license
|
||||
// which is included in the file license.txt, found at the root
|
||||
// of the RDKit source tree.
|
||||
//
|
||||
#ifndef RD_MOLSTANDARDIZE_PIPELINE_H
|
||||
#define RD_MOLSTANDARDIZE_PIPELINE_H
|
||||
#include <RDGeneral/export.h>
|
||||
#include <GraphMol/RWMol.h>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
namespace RDKit {
|
||||
|
||||
namespace MolStandardize {
|
||||
|
||||
// Configuration values consumed by the Pipeline class and by the functions
// declared in the Operations namespace. Every field carries its default
// value inline, so a default-constructed instance is ready to use.
struct RDKIT_MOLSTANDARDIZE_EXPORT PipelineOptions {
  // ---- parsing ----
  bool strictParsing = false;

  // ---- validation ----
  bool reportAllFailures = true;
  bool allowEmptyMolecules = false;
  bool allowEnhancedStereo = false;
  bool allowAromaticBondType = false;
  bool allowDativeBondType = false;
  double is2DZeroThreshold = 1e-3;
  double atomClashLimit = 0.03;
  double minMedianBondLength = 1e-3;
  double bondLengthLimit = 100.0;
  bool allowLongBondsInRings = true;
  bool allowAtomBondClashExemption = true;

  // ---- cleanup/standardization ----

  // metal disconnector options (SMARTS patterns)
  std::string metalNof = "[Li,Na,K,Rb,Cs,Fr]~[#7,#8,F]";
  std::string metalNon;

  // normalizer options: one transformation per line, formatted as
  // "<name>\t<SMIRKS>\n"
  std::string normalizerData =
      "// Name\tSMIRKS\n"
      "Nitro to N+(O-)=O\t[N,P,As,Sb;X3:1](=[O,S,Se,Te:2])=[O,S,Se,Te:3]>>[*+1:1]([*-1:2])=[*:3]\n"
      "Sulfone to S(=O)(=O)\t[S+2:1]([O-:2])([O-:3])>>[S+0:1](=[O-0:2])(=[O-0:3])\n"
      "Pyridine oxide to n+O-\t[nH0+0:1]=[OH0+0:2]>>[n+:1][O-:2]\n"
      "Azide to N=N+=N-\t[*:1][N:2]=[N:3]#[N:4]>>[*:1][N:2]=[N+:3]=[N-:4]\n"
      "Diazo/azo to =N+=N-\t[*:1]=[N:2]#[N:3]>>[*:1]=[N+:2]=[N-:3]\n"
      // Note: the sulfoxide transformation that is included by default in the
      // Normalizer configuration was removed.
      // Note: the transformation below was ported from STRUCHK and it is not
      // part of the default Normalizer configuration.
      "[SH](=O)(=O) to S(=O)O\t[c,C,N,O,F,Cl,Br,I:1][SH+0:2](=[O:3])=[O:4]>>[*:1][*:2]([*:3])=[*:4]\n"
      // Note: the two transformations below replace the default Phosphate
      // normalization in order to ensure that, if an O is available, the
      // double bond is placed between P and O.
      "Phosphate to P(O-)=O\t[O-:1][P+;D4:2][O,S,Se,Te;-1:3]>>[O+0:1]=[P+0;D5:2][*-1:3]\n"
      "Generalized phosphate to P(X-)=Y\t[S,Se,Te;-1:1][P+;D4:2][S,Se,Te;-1:3]>>[*+0:1]=[P+0;D5:2][*-1:3]\n"
      "C/S+N to C/S=N+\t[C,S&!$([S+]-[O-]);X3+1:1]([NX3:2])[NX3!H0:3]>>[*+0:1]([N:2])=[N+:3]\n"
      "P+N to P=N+\t[P;X4+1:1]([NX3:2])[NX3!H0:3]>>[*+0:1]([N:2])=[N+:3]\n"
      "Recombine 1,3-separated charges\t[N,P,As,Sb,O,S,Se,Te;-1:1]-[A+0:2]=[N,P,As,Sb,O,S,Se,Te;+1:3]>>[*-0:1]=[*:2]-[*+0:3]\n"
      "Recombine 1,3-separated charges\t[n,o,p,s;-1:1]:[a:2]=[N,O,P,S;+1:3]>>[*-0:1]:[*:2]-[*+0:3]\n"
      "Recombine 1,3-separated charges\t[N,O,P,S;-1:1]-[a+0:2]:[n,o,p,s;+1:3]>>[*-0:1]=[*:2]:[*+0:3]\n"
      "Recombine 1,5-separated charges\t[N,P,As,Sb,O,S,Se,Te;-1:1]-[A+0:2]=[A:3]-[A:4]=[N,P,As,Sb,O,S,Se,Te;+1:5]>>[*-0:1]=[*:2]-[*:3]=[*:4]-[*+0:5]\n"
      "Recombine 1,5-separated charges\t[n,o,p,s;-1:1]:[a:2]:[a:3]:[c:4]=[N,O,P,S;+1:5]>>[*-0:1]:[*:2]:[*:3]:[c:4]-[*+0:5]\n"
      "Recombine 1,5-separated charges\t[N,O,P,S;-1:1]-[c:2]:[a:3]:[a:4]:[n,o,p,s;+1:5]>>[*-0:1]=[c:2]:[*:3]:[*:4]:[*+0:5]\n"
      // Note: four transformations were added to the normalization of
      // aliphatic conjugated cations in order to favor the positioning of new
      // double bonds within rings.
      "Normalize 1,3 conjugated cation\t[N;+0!H0:1]@-[A:2]=[N!$(*~[N,O,P,S;-1]),O;+1H0:3]>>[*+1:1]=[*:2]-[*+0:3]\n"
      "Normalize 1,5 conjugated cation\t[N;+0!H0:1]@-[A:2]=[A:3]@-[A:4]=[N!$(*~[N,O,P,S;-1]),O;+1H0:5]>>[*+1:1]=[*:2]-[*:3]=[*:4]-[*+0:5]\n"
      "Normalize 1,3 conjugated cation\t[N,O!$(*N);+0!H0:1]-[A:2]=[N!$(*~[N,O,P,S;-1]),O;+1H0:3]>>[*+1:1]=[*:2]-[*+0:3]\n"
      "Normalize 1,3 conjugated cation\t[n;+0!H0:1]:[c:2]=[N!$(*~[N,O,P,S;-1]),O;+1H0:3]>>[*+1:1]:[*:2]-[*+0:3]\n"
      "Normalize 1,5 conjugated cation\t[N;+0!H0:1]@-[A:2]=[A:3]-[A:4]=[N!$(*~[N,O,P,S;-1]),O;+1H0:5]>>[*+1:1]=[*:2]-[*:3]=[*:4]-[*+0:5]\n"
      "Normalize 1,5 conjugated cation\t[N,O!$(*N);+0!H0:1]-[A:2]=[A:3]@-[A:4]=[N!$(*~[N,O,P,S;-1]),O;+1H0:5]>>[*+1:1]=[*:2]-[*:3]=[*:4]-[*+0:5]\n"
      "Normalize 1,5 conjugated cation\t[N,O!$(*N);+0!H0:1]-[A:2]=[A:3]-[A:4]=[N!$(*~[N,O,P,S;-1]),O;+1H0:5]>>[*+1:1]=[*:2]-[*:3]=[*:4]-[*+0:5]\n"
      "Normalize 1,5 conjugated cation\t[n;+0!H0:1]:[a:2]:[a:3]:[c:4]=[N!$(*~[N,O,P,S;-1]),O;+1H0:5]>>[n+1:1]:[*:2]:[*:3]:[*:4]-[*+0:5]\n"
      "Charge normalization\t[F,Cl,Br,I,At;-1:1]=[O:2]>>[*-0:1][O-:2]\n"
      "Charge recombination\t[N,P,As,Sb;-1:1]=[C+;v3:2]>>[*+0:1]#[C+0:2]\n";
  unsigned int normalizerMaxRestarts = 200;
  double scaledMedianBondLength = 1.0;

  // ---- serialization ----
  bool outputV2000 = false;
};
|
||||
|
||||
enum RDKIT_MOLSTANDARDIZE_EXPORT PipelineStatus {
|
||||
NO_EVENT = 0,
|
||||
INPUT_ERROR = (1 << 0),
|
||||
PREPARE_FOR_VALIDATION_ERROR = (1 << 1),
|
||||
FEATURES_VALIDATION_ERROR = (1 << 2),
|
||||
BASIC_VALIDATION_ERROR = (1 << 3),
|
||||
IS2D_VALIDATION_ERROR = (1 << 4),
|
||||
LAYOUT2D_VALIDATION_ERROR = (1 << 5),
|
||||
STEREO_VALIDATION_ERROR = (1 << 6),
|
||||
VALIDATION_ERROR = (FEATURES_VALIDATION_ERROR | BASIC_VALIDATION_ERROR |
|
||||
IS2D_VALIDATION_ERROR | LAYOUT2D_VALIDATION_ERROR |
|
||||
STEREO_VALIDATION_ERROR),
|
||||
PREPARE_FOR_STANDARDIZATION_ERROR = (1 << 7),
|
||||
METAL_STANDARDIZATION_ERROR = (1 << 8),
|
||||
NORMALIZER_STANDARDIZATION_ERROR = (1 << 9),
|
||||
FRAGMENT_STANDARDIZATION_ERROR = (1 << 10),
|
||||
CHARGE_STANDARDIZATION_ERROR = (1 << 11),
|
||||
STANDARDIZATION_ERROR =
|
||||
(METAL_STANDARDIZATION_ERROR | NORMALIZER_STANDARDIZATION_ERROR |
|
||||
FRAGMENT_STANDARDIZATION_ERROR | CHARGE_STANDARDIZATION_ERROR),
|
||||
OUTPUT_ERROR = (1 << 12),
|
||||
PIPELINE_ERROR = (INPUT_ERROR | PREPARE_FOR_VALIDATION_ERROR |
|
||||
VALIDATION_ERROR | PREPARE_FOR_STANDARDIZATION_ERROR |
|
||||
STANDARDIZATION_ERROR | OUTPUT_ERROR),
|
||||
METALS_DISCONNECTED = (1 << 23),
|
||||
NORMALIZATION_APPLIED = (1 << 24),
|
||||
FRAGMENTS_REMOVED = (1 << 25),
|
||||
PROTONATION_CHANGED = (1 << 26),
|
||||
STRUCTURE_MODIFICATION = (METALS_DISCONNECTED | NORMALIZATION_APPLIED |
|
||||
FRAGMENTS_REMOVED | PROTONATION_CHANGED)
|
||||
};
|
||||
|
||||
// Identifiers of the pipeline stages. The numeric values are written out
// explicitly because stages are also handled as plain std::uint32_t values
// (see PipelineResult::stage and Operations::PipelineVector).
enum class RDKIT_MOLSTANDARDIZE_EXPORT PipelineStage : std::uint32_t {
  NOT_STARTED = 0,
  PARSING_INPUT = 1,
  PREPARE_FOR_VALIDATION = 2,
  VALIDATION = 3,
  PREPARE_FOR_STANDARDIZATION = 4,
  STANDARDIZATION = 5,
  REAPPLY_WEDGING = 6,
  CLEANUP_2D = 7,
  MAKE_PARENT = 8,
  SERIALIZING_OUTPUT = 9,
  COMPLETED = 10
};
|
||||
|
||||
// One entry of the pipeline log: a status flag plus a free-text detail.
struct RDKIT_MOLSTANDARDIZE_EXPORT PipelineLogEntry {
  // Initialized so that a default-constructed entry never carries an
  // indeterminate enum value.
  PipelineStatus status{NO_EVENT};
  std::string detail;
};
|
||||
|
||||
using PipelineLog = std::vector<PipelineLogEntry>;
|
||||
|
||||
struct RDKIT_MOLSTANDARDIZE_EXPORT PipelineResult {
|
||||
PipelineStatus status;
|
||||
std::uint32_t stage;
|
||||
PipelineLog log;
|
||||
std::string inputMolData;
|
||||
std::string outputMolData;
|
||||
std::string parentMolData;
|
||||
|
||||
void append(PipelineStatus newStatus, const std::string &info);
|
||||
};
|
||||
|
||||
using RWMOL_SPTR_PAIR = std::pair<RWMOL_SPTR, RWMOL_SPTR>;
|
||||
|
||||
namespace Operations {
|
||||
RDKIT_MOLSTANDARDIZE_EXPORT RWMOL_SPTR prepareForValidation(
|
||||
RWMOL_SPTR mol, PipelineResult &result, const PipelineOptions &options);
|
||||
RDKIT_MOLSTANDARDIZE_EXPORT RWMOL_SPTR validate(RWMOL_SPTR mol,
|
||||
PipelineResult &result,
|
||||
const PipelineOptions &options);
|
||||
RDKIT_MOLSTANDARDIZE_EXPORT RWMOL_SPTR prepareForStandardization(
|
||||
RWMOL_SPTR mol, PipelineResult &result, const PipelineOptions &options);
|
||||
RDKIT_MOLSTANDARDIZE_EXPORT RWMOL_SPTR standardize(
|
||||
RWMOL_SPTR mol, PipelineResult &result, const PipelineOptions &options);
|
||||
RDKIT_MOLSTANDARDIZE_EXPORT RWMOL_SPTR reapplyWedging(
|
||||
RWMOL_SPTR mol, PipelineResult &result, const PipelineOptions &options);
|
||||
RDKIT_MOLSTANDARDIZE_EXPORT RWMOL_SPTR cleanup2D(
|
||||
RWMOL_SPTR mol, PipelineResult &result, const PipelineOptions &options);
|
||||
RDKIT_MOLSTANDARDIZE_EXPORT RWMOL_SPTR_PAIR makeParent(
|
||||
RWMOL_SPTR mol, PipelineResult &result, const PipelineOptions &options);
|
||||
|
||||
RDKIT_MOLSTANDARDIZE_EXPORT RWMOL_SPTR parse(const std::string &molblock,
|
||||
PipelineResult &result,
|
||||
const PipelineOptions &options);
|
||||
RDKIT_MOLSTANDARDIZE_EXPORT void serialize(RWMOL_SPTR_PAIR output,
|
||||
PipelineResult &result,
|
||||
const PipelineOptions &options);
|
||||
|
||||
using ParseOperation = decltype(&parse);
|
||||
using SerializeOperation = decltype(&serialize);
|
||||
using Operation = decltype(&prepareForValidation);
|
||||
using ParentOperation = decltype(&makeParent);
|
||||
using PipelineVector = std::vector<std::pair<std::uint32_t, Operation>>;
|
||||
|
||||
const PipelineVector validationSteps{
|
||||
// input sanitization and cleanup
|
||||
{static_cast<uint32_t>(PipelineStage::PREPARE_FOR_VALIDATION),
|
||||
&prepareForValidation},
|
||||
// validate the structure
|
||||
{static_cast<uint32_t>(PipelineStage::VALIDATION), &validate}};
|
||||
|
||||
const PipelineVector standardizationSteps{
|
||||
{static_cast<uint32_t>(PipelineStage::PREPARE_FOR_STANDARDIZATION),
|
||||
&prepareForStandardization},
|
||||
{static_cast<uint32_t>(PipelineStage::STANDARDIZATION), &standardize},
|
||||
{static_cast<uint32_t>(PipelineStage::REAPPLY_WEDGING), &reapplyWedging},
|
||||
{static_cast<uint32_t>(PipelineStage::CLEANUP_2D), &cleanup2D}};
|
||||
} // namespace Operations
|
||||
|
||||
class RDKIT_MOLSTANDARDIZE_EXPORT Pipeline {
|
||||
private:
|
||||
PipelineOptions options;
|
||||
Operations::ParseOperation parse = Operations::parse;
|
||||
Operations::SerializeOperation serialize = Operations::serialize;
|
||||
Operations::PipelineVector validationSteps = Operations::validationSteps;
|
||||
Operations::PipelineVector standardizationSteps =
|
||||
Operations::standardizationSteps;
|
||||
Operations::ParentOperation makeParent = Operations::makeParent;
|
||||
|
||||
public:
|
||||
Pipeline() = default;
|
||||
explicit Pipeline(const PipelineOptions &o) : options(o){};
|
||||
~Pipeline() = default;
|
||||
|
||||
PipelineResult run(const std::string &molblock) const;
|
||||
|
||||
void setValidationSteps(const Operations::PipelineVector &steps) {
|
||||
validationSteps = steps;
|
||||
}
|
||||
void setStandardizationSteps(const Operations::PipelineVector &steps) {
|
||||
standardizationSteps = steps;
|
||||
}
|
||||
void setMakeParent(Operations::ParentOperation op) { makeParent = op; }
|
||||
void setParse(Operations::ParseOperation op) { parse = op; }
|
||||
void setSerialize(Operations::SerializeOperation op) { serialize = op; }
|
||||
|
||||
private:
|
||||
};
|
||||
|
||||
} // namespace MolStandardize
|
||||
} // namespace RDKit
|
||||
|
||||
#endif
|
||||
@@ -11,18 +11,20 @@
|
||||
#include "Fragment.h"
|
||||
#include <GraphMol/RDKitBase.h>
|
||||
#include <GraphMol/ROMol.h>
|
||||
#include <GraphMol/QueryOps.h>
|
||||
#include <GraphMol/MolStandardize/FragmentCatalog/FragmentCatalogParams.h>
|
||||
#include <GraphMol/Substruct/SubstructMatch.h>
|
||||
#include <GraphMol/PeriodicTable.h>
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include <GraphMol/SmilesParse/SmilesParse.h>
|
||||
#include <GraphMol/SmilesParse/SmilesWrite.h>
|
||||
|
||||
using namespace std;
|
||||
using namespace RDKit;
|
||||
|
||||
namespace RDKit {
|
||||
class RWMol;
|
||||
class ROMol;
|
||||
@@ -30,10 +32,9 @@ class ROMol;
|
||||
namespace MolStandardize {
|
||||
|
||||
std::vector<ValidationErrorInfo> CompositeValidation::validate(
|
||||
const ROMol &mol, bool reportAllFailures) const
|
||||
{
|
||||
const ROMol &mol, bool reportAllFailures) const {
|
||||
std::vector<ValidationErrorInfo> errors;
|
||||
for (const auto & method : validations) {
|
||||
for (const auto &method : validations) {
|
||||
auto partial = method->validate(mol, reportAllFailures);
|
||||
if (!partial.empty()) {
|
||||
std::copy(partial.begin(), partial.end(), std::back_inserter(errors));
|
||||
@@ -73,8 +74,8 @@ std::vector<ValidationErrorInfo> RDKitValidation::validate(
|
||||
return errors;
|
||||
}
|
||||
|
||||
std::vector<ValidationErrorInfo>
|
||||
NoAtomValidation::validate(const ROMol &mol, bool /*reportAllFailures*/) const {
|
||||
std::vector<ValidationErrorInfo> NoAtomValidation::validate(
|
||||
const ROMol &mol, bool /*reportAllFailures*/) const {
|
||||
std::vector<ValidationErrorInfo> errors;
|
||||
unsigned int na = mol.getNumAtoms();
|
||||
if (!na) {
|
||||
@@ -83,8 +84,8 @@ NoAtomValidation::validate(const ROMol &mol, bool /*reportAllFailures*/) const {
|
||||
return errors;
|
||||
}
|
||||
|
||||
std::vector<ValidationErrorInfo>
|
||||
FragmentValidation::validate(const ROMol &mol, bool reportAllFailures) const {
|
||||
std::vector<ValidationErrorInfo> FragmentValidation::validate(
|
||||
const ROMol &mol, bool reportAllFailures) const {
|
||||
std::vector<ValidationErrorInfo> errors;
|
||||
// REVIEW: reportAllFailures is not being used here. is that correct?
|
||||
RDUNUSED_PARAM(reportAllFailures);
|
||||
@@ -145,8 +146,8 @@ FragmentValidation::validate(const ROMol &mol, bool reportAllFailures) const {
|
||||
return errors;
|
||||
}
|
||||
|
||||
std::vector<ValidationErrorInfo>
|
||||
NeutralValidation::validate(const ROMol &mol, bool /*reportAllFailures*/) const {
|
||||
std::vector<ValidationErrorInfo> NeutralValidation::validate(
|
||||
const ROMol &mol, bool /*reportAllFailures*/) const {
|
||||
std::vector<ValidationErrorInfo> errors;
|
||||
int charge = RDKit::MolOps::getFormalCharge(mol);
|
||||
if (charge != 0) {
|
||||
@@ -162,51 +163,48 @@ NeutralValidation::validate(const ROMol &mol, bool /*reportAllFailures*/) const
|
||||
return errors;
|
||||
}
|
||||
|
||||
std::vector<ValidationErrorInfo>
|
||||
IsotopeValidation::validate(const ROMol &mol, bool reportAllFailures) const {
|
||||
std::vector<ValidationErrorInfo> IsotopeValidation::validate(
|
||||
const ROMol &mol, bool reportAllFailures) const {
|
||||
std::vector<ValidationErrorInfo> errors;
|
||||
unsigned int na = mol.getNumAtoms();
|
||||
std::set<string> isotopes;
|
||||
|
||||
// loop over atoms
|
||||
for (size_t i = 0; i < na; ++i) {
|
||||
if (!reportAllFailures) {
|
||||
if (errors.size() >= 1) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
const Atom *atom = mol.getAtomWithIdx(i);
|
||||
for (auto atom : mol.atoms()) {
|
||||
unsigned int isotope = atom->getIsotope();
|
||||
if (isotope != 0) {
|
||||
std::string symbol = atom->getSymbol();
|
||||
isotopes.insert(std::to_string(isotope) + symbol);
|
||||
if (isotope == 0) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
for (auto &isotope : isotopes) {
|
||||
errors.push_back("INFO: [IsotopeValidation] Molecule contains isotope " +
|
||||
isotope);
|
||||
std::string symbol = atom->getSymbol();
|
||||
unsigned int atomicNum = atom->getAtomicNum();
|
||||
if (atomicNum && strict) {
|
||||
PeriodicTable *periodicTable = PeriodicTable::getTable();
|
||||
double mass = periodicTable->getMassForIsotope(atomicNum, isotope);
|
||||
if (mass == 0.0) {
|
||||
errors.push_back(
|
||||
"ERROR: [IsotopeValidation] The molecule contains an unknown isotope: " +
|
||||
std::to_string(isotope) + symbol);
|
||||
}
|
||||
} else {
|
||||
errors.push_back("INFO: [IsotopeValidation] Molecule contains isotope " +
|
||||
std::to_string(isotope) + symbol);
|
||||
}
|
||||
|
||||
if (!errors.empty() && !reportAllFailures) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
return errors;
|
||||
}
|
||||
|
||||
// constructor
|
||||
MolVSValidation::MolVSValidation()
|
||||
: CompositeValidation({
|
||||
std::make_shared<NoAtomValidation>(),
|
||||
std::make_shared<FragmentValidation>(),
|
||||
std::make_shared<NeutralValidation>(),
|
||||
std::make_shared<IsotopeValidation>()
|
||||
})
|
||||
{
|
||||
}
|
||||
: CompositeValidation({std::make_shared<NoAtomValidation>(),
|
||||
std::make_shared<FragmentValidation>(),
|
||||
std::make_shared<NeutralValidation>(),
|
||||
std::make_shared<IsotopeValidation>()}) {}
|
||||
|
||||
// overloaded constructor
|
||||
MolVSValidation::MolVSValidation(
|
||||
const std::vector<std::shared_ptr<ValidationMethod>> & validations)
|
||||
: CompositeValidation(validations)
|
||||
{
|
||||
}
|
||||
const std::vector<std::shared_ptr<ValidationMethod>> &validations)
|
||||
: CompositeValidation(validations) {}
|
||||
|
||||
std::vector<ValidationErrorInfo> AllowedAtomsValidation::validate(
|
||||
const ROMol &mol, bool reportAllFailures) const {
|
||||
@@ -267,6 +265,735 @@ std::vector<ValidationErrorInfo> DisallowedAtomsValidation::validate(
|
||||
return errors;
|
||||
}
|
||||
|
||||
std::vector<ValidationErrorInfo> DisallowedRadicalValidation::validate(
|
||||
const ROMol &mol, bool reportAllFailures) const {
|
||||
std::vector<ValidationErrorInfo> errors;
|
||||
|
||||
for (auto atom : mol.atoms()) {
|
||||
unsigned int numRadicalElectrons = atom->getNumRadicalElectrons();
|
||||
if (numRadicalElectrons == 0) {
|
||||
continue;
|
||||
}
|
||||
unsigned int atomicNum = atom->getAtomicNum();
|
||||
unsigned int degree = atom->getDegree();
|
||||
if ((atomicNum == 7 || atomicNum == 8) && numRadicalElectrons == 1 &&
|
||||
degree == 1) {
|
||||
unsigned int neighborAtomicNum = 0;
|
||||
Bond::BondType bondType = Bond::BondType::UNSPECIFIED;
|
||||
for (auto neighbor : mol.atomNeighbors(atom)) {
|
||||
// only one iteration is performed, because degree == 1
|
||||
neighborAtomicNum = neighbor->getAtomicNum();
|
||||
bondType = mol.getBondBetweenAtoms(atom->getIdx(), neighbor->getIdx())
|
||||
->getBondType();
|
||||
}
|
||||
if (atomicNum == 7 && neighborAtomicNum == 8 &&
|
||||
bondType == Bond::BondType::DOUBLE) {
|
||||
// nitric oxide
|
||||
continue;
|
||||
}
|
||||
if (atomicNum == 8 && neighborAtomicNum == 7 &&
|
||||
bondType == Bond::BondType::SINGLE) {
|
||||
// aminoxyl
|
||||
continue;
|
||||
}
|
||||
}
|
||||
errors.push_back(
|
||||
"ERROR: [DisallowedRadicalValidation] The radical at atom " +
|
||||
std::to_string(atom->getIdx()) + " is not allowed");
|
||||
if (!reportAllFailures) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
return errors;
|
||||
}
|
||||
|
||||
std::vector<ValidationErrorInfo> FeaturesValidation::validate(
|
||||
const ROMol &mol, bool reportAllFailures) const {
|
||||
std::vector<ValidationErrorInfo> errors;
|
||||
|
||||
// Optionally disallow query and dummy atoms, and aliases
|
||||
for (auto atom : mol.atoms()) {
|
||||
if (!allowQueries && atom->hasQuery()) {
|
||||
errors.push_back("ERROR: [FeaturesValidation] Query atom " +
|
||||
std::to_string(atom->getIdx()) + " is not allowed");
|
||||
if (!reportAllFailures) {
|
||||
return errors;
|
||||
}
|
||||
} else if (!allowDummies && isAtomDummy(atom)) {
|
||||
errors.push_back("ERROR: [FeaturesValidation] Dummy atom " +
|
||||
std::to_string(atom->getIdx()) + " is not allowed");
|
||||
if (!reportAllFailures) {
|
||||
return errors;
|
||||
}
|
||||
}
|
||||
|
||||
if (!allowAtomAliases && atom->hasProp(common_properties::molFileAlias)) {
|
||||
errors.push_back(
|
||||
"ERROR: [FeaturesValidation] Atom " + std::to_string(atom->getIdx()) +
|
||||
" with alias '" +
|
||||
atom->getProp<std::string>(common_properties::molFileAlias) +
|
||||
"' is not allowed");
|
||||
if (!reportAllFailures) {
|
||||
return errors;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Optionally disallow query, aromatic or dative bonds
|
||||
for (auto bond : mol.bonds()) {
|
||||
if (!allowQueries && bond->hasQuery()) {
|
||||
errors.push_back("ERROR: [FeaturesValidation] Query bond " +
|
||||
std::to_string(bond->getIdx()) + " is not allowed");
|
||||
if (!reportAllFailures) {
|
||||
return errors;
|
||||
}
|
||||
}
|
||||
if (!allowAromaticBondType &&
|
||||
bond->getBondType() == Bond::BondType::AROMATIC) {
|
||||
errors.push_back("ERROR: [FeaturesValidation] Bond " +
|
||||
std::to_string(bond->getIdx()) +
|
||||
" of aromatic type is not allowed");
|
||||
if (!reportAllFailures) {
|
||||
return errors;
|
||||
}
|
||||
}
|
||||
if (!allowDativeBondType && bond->getBondType() == Bond::BondType::DATIVE) {
|
||||
errors.push_back("ERROR: [FeaturesValidation] Bond " +
|
||||
std::to_string(bond->getIdx()) +
|
||||
" of dative type is not allowed");
|
||||
if (!reportAllFailures) {
|
||||
return errors;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Optionally disallow using the enahanced stereochemistry
|
||||
if (!allowEnhancedStereo && mol.getStereoGroups().size()) {
|
||||
errors.emplace_back(
|
||||
"ERROR: [FeaturesValidation] Enhanced stereochemistry features are not allowed");
|
||||
}
|
||||
|
||||
return errors;
|
||||
}
|
||||
|
||||
std::vector<ValidationErrorInfo> Is2DValidation::validate(
|
||||
const ROMol &mol, bool reportAllFailures) const {
|
||||
std::vector<ValidationErrorInfo> errors;
|
||||
|
||||
if (!mol.getNumConformers()) {
|
||||
errors.emplace_back(
|
||||
"ERROR: [Is2DValidation] The molecule has no coordinates");
|
||||
return errors;
|
||||
}
|
||||
|
||||
const auto &conf = mol.getConformer();
|
||||
|
||||
if (conf.is3D()) {
|
||||
errors.emplace_back(
|
||||
"ERROR: [Is2DValidation] The molecule includes non-null Z coordinates");
|
||||
return errors;
|
||||
}
|
||||
|
||||
// conf.is3D() is assigned by the mol format parser based on the input
|
||||
// mol block designation, but also taking into account the presence of
|
||||
// non-null Z coordinates or stereobonds.
|
||||
//
|
||||
// the following test is in this sense probably redundant, but it's still
|
||||
// implemented in case molecules are built by other means.
|
||||
|
||||
double max_absz{};
|
||||
for (const auto &p : conf.getPositions()) {
|
||||
max_absz = std::max(std::abs(p.z), max_absz);
|
||||
}
|
||||
|
||||
if (max_absz > threshold) {
|
||||
errors.emplace_back(
|
||||
"ERROR: [Is2DValidation] The molecule includes non-null Z coordinates");
|
||||
if (!reportAllFailures) {
|
||||
return errors;
|
||||
}
|
||||
}
|
||||
|
||||
if (conf.getNumAtoms() < 2) {
|
||||
// there is nothing else to check here, if there is at most one atom.
|
||||
return errors;
|
||||
}
|
||||
|
||||
// verify that the atoms are not all in the same position (this often happens
|
||||
// because no coordinates were assigned and all atoms appear to be placed in
|
||||
// the origin)
|
||||
|
||||
double min_x = std::numeric_limits<double>::max();
|
||||
double max_x = std::numeric_limits<double>::min();
|
||||
double min_y = std::numeric_limits<double>::max();
|
||||
double max_y = std::numeric_limits<double>::min();
|
||||
for (const auto &p : conf.getPositions()) {
|
||||
min_x = std::min(p.x, min_x);
|
||||
max_x = std::max(p.x, max_x);
|
||||
min_y = std::min(p.y, min_y);
|
||||
max_y = std::max(p.y, max_y);
|
||||
}
|
||||
auto delta_x = max_x - min_x;
|
||||
auto delta_y = max_y - min_y;
|
||||
auto max_delta = std::max(delta_x, delta_y);
|
||||
|
||||
if (max_delta < threshold) {
|
||||
errors.emplace_back(
|
||||
"ERROR: [Is2DValidation] All atoms have the same (x,y) coordinates");
|
||||
if (!reportAllFailures) {
|
||||
return errors;
|
||||
}
|
||||
}
|
||||
|
||||
return errors;
|
||||
}
|
||||
|
||||
double Layout2DValidation::squaredMedianBondLength(const ROMol &mol,
|
||||
const Conformer &conf) {
|
||||
// Compute the squared value of the median bond length, but exclude the bonds
|
||||
// of null length.
|
||||
double median = 0.0;
|
||||
unsigned int numBonds = mol.getNumBonds();
|
||||
if (numBonds) {
|
||||
std::vector<double> values;
|
||||
values.reserve(numBonds);
|
||||
for (const auto &bond : mol.bonds()) {
|
||||
const auto &p1 = conf.getAtomPos(bond->getBeginAtomIdx());
|
||||
const auto &p2 = conf.getAtomPos(bond->getEndAtomIdx());
|
||||
auto value = (p1 - p2).lengthSq();
|
||||
if (value > 0.) {
|
||||
values.push_back(value);
|
||||
}
|
||||
}
|
||||
if (!values.empty()) {
|
||||
std::sort(values.begin(), values.end());
|
||||
numBonds = values.size();
|
||||
if (numBonds % 2) {
|
||||
median = values[numBonds / 2];
|
||||
} else {
|
||||
median = 0.5 * (values[numBonds / 2 - 1] + values[numBonds / 2]);
|
||||
}
|
||||
}
|
||||
}
|
||||
return median;
|
||||
}
|
||||
|
||||
std::vector<ValidationErrorInfo> Layout2DValidation::validate(
|
||||
const ROMol &mol, bool reportAllFailures) const {
|
||||
std::vector<ValidationErrorInfo> errors;
|
||||
|
||||
if (!mol.getNumConformers()) {
|
||||
errors.emplace_back(
|
||||
"ERROR: [Layout2DValidation] The molecule has no coordinates");
|
||||
return errors;
|
||||
}
|
||||
|
||||
const auto &conf = mol.getConformer();
|
||||
unsigned int natoms = conf.getNumAtoms();
|
||||
|
||||
if (natoms < 2) {
|
||||
// there is nothing to check here, if there is only one atom.
|
||||
return errors;
|
||||
}
|
||||
|
||||
// compute threshold values for the squared atom-atom or atom-bond
|
||||
// distance and for the maximum bond length using the median squared
|
||||
// bond length as reference.
|
||||
auto reference = squaredMedianBondLength(mol, conf);
|
||||
if (reference < minMedianBondLength * minMedianBondLength) {
|
||||
errors.emplace_back(
|
||||
"ERROR: [Layout2DValidation] The median bond length is smaller than the configured limit");
|
||||
if (!reportAllFailures) {
|
||||
return errors;
|
||||
}
|
||||
}
|
||||
|
||||
// check for atoms clashing w/ other atoms
|
||||
auto atomClashThreshold = clashLimit * clashLimit * reference;
|
||||
for (unsigned int i = 0; i < natoms - 1; ++i) {
|
||||
const auto &pi = conf.getAtomPos(i);
|
||||
for (unsigned int j = i + 1; j < natoms; ++j) {
|
||||
const auto &pj = conf.getAtomPos(j);
|
||||
auto d2 = (pi - pj).lengthSq();
|
||||
if (d2 < atomClashThreshold) {
|
||||
errors.push_back("ERROR: [Layout2DValidation] Atom " +
|
||||
std::to_string(i) + " is too close to atom " +
|
||||
std::to_string(j));
|
||||
if (!reportAllFailures) {
|
||||
return errors;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// make sure we have the required rings info available
|
||||
if (allowLongBondsInRings || allowAtomBondClashExemption) {
|
||||
if (!mol.getRingInfo()->isInitialized()) {
|
||||
RDKit::MolOps::fastFindRings(mol);
|
||||
}
|
||||
}
|
||||
|
||||
for (auto bond : mol.bonds()) {
|
||||
unsigned int i = bond->getBeginAtomIdx();
|
||||
const auto &pi = conf.getAtomPos(i);
|
||||
unsigned int j = bond->getEndAtomIdx();
|
||||
const auto &pj = conf.getAtomPos(j);
|
||||
|
||||
auto ll = (pi - pj).lengthSq();
|
||||
|
||||
// check for exceedingly long bonds
|
||||
auto bondLengthThreshold = bondLengthLimit * bondLengthLimit * reference;
|
||||
if (!allowLongBondsInRings ||
|
||||
mol.getRingInfo()->numBondRings(bond->getIdx()) == 0) {
|
||||
if (ll > bondLengthThreshold) {
|
||||
errors.push_back("ERROR: [Layout2DValidation] The length of bond " +
|
||||
std::to_string(bond->getIdx()) + " between atoms " +
|
||||
std::to_string(i) + " and " + std::to_string(j) +
|
||||
" exceeds a configured limit");
|
||||
if (!reportAllFailures) {
|
||||
return errors;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (allowAtomBondClashExemption) {
|
||||
// is this bond exempted from atom-bond collision detection?
|
||||
if ((ll > 5. * 5. * reference) &&
|
||||
mol.getRingInfo()->numBondRings(bond->getIdx()) != 0) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
// check for atoms clashing with this bond
|
||||
for (unsigned int k = 0; k < natoms; ++k) {
|
||||
if (k == i || k == j) {
|
||||
continue;
|
||||
}
|
||||
const auto &pk = conf.getAtomPos(k);
|
||||
/*
|
||||
k
|
||||
/
|
||||
r/
|
||||
/
|
||||
/
|
||||
i---------------j
|
||||
b
|
||||
*/
|
||||
auto vik = pk - pi;
|
||||
auto vij = pj - pi;
|
||||
auto rr = vik.lengthSq();
|
||||
auto bb = vij.lengthSq();
|
||||
auto rb = vik.dotProduct(vij);
|
||||
static constexpr double EPS{
|
||||
1.e-7}; // prevent dividing by zero in extreme cases
|
||||
auto kb = (rr * bb - rb * rb) / (bb + EPS);
|
||||
if (rb >= 0. && /* cos alpha > 0 */
|
||||
rb <= bb && /* projection of r onto b does not exceed b */
|
||||
kb < atomClashThreshold /* distance from bond < limit */
|
||||
) {
|
||||
errors.push_back("ERROR: [Layout2DValidation] Atom " +
|
||||
std::to_string(k) + " too close to bond " +
|
||||
std::to_string(bond->getIdx()));
|
||||
if (!reportAllFailures) {
|
||||
return errors;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return errors;
|
||||
}
|
||||
|
||||
namespace {
|
||||
bool hasStereoBond(const ROMol &mol, const Atom *atom) {
|
||||
for (auto bond : mol.atomBonds(atom)) {
|
||||
if (atom != bond->getBeginAtom()) {
|
||||
continue;
|
||||
}
|
||||
auto bondDir = bond->getBondDir();
|
||||
if (bondDir == Bond::BondDir::BEGINDASH ||
|
||||
bondDir == Bond::BondDir::BEGINWEDGE ||
|
||||
bondDir == Bond::BondDir::UNKNOWN) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
struct BondInfo {
|
||||
const Bond *bond = nullptr;
|
||||
Bond::BondDir bondDir = Bond::BondDir::NONE;
|
||||
double angle = 0.;
|
||||
};
|
||||
|
||||
// Counters of the bond directions found around one atom.
struct BondDirCount {
  unsigned int wedge{0};
  unsigned int dash{0};
  unsigned int unknown{0};
  unsigned int other{0};
};
|
||||
|
||||
struct NeighborsInfo {
|
||||
NeighborsInfo(const ROMol &mol, const Atom *atom);
|
||||
std::vector<BondInfo> bonds;
|
||||
BondDirCount dirCount;
|
||||
};
|
||||
|
||||
// Collects the bonds around `atom`, counts their directions, and sorts the
// neighbors by the signed angle they form with the first one (which stays
// in first position with angle 0).
//
// \param mol the molecule owning `atom`; its default conformer provides the
//            coordinates used to compute the angles
// \param atom the central atom
NeighborsInfo::NeighborsInfo(const ROMol &mol, const Atom *atom) {
  for (auto bond : mol.atomBonds(atom)) {
    BondInfo info;
    info.bond = bond;
    if (bond->getBeginAtom() == atom) {
      // do not consider the bond direction
      // settings of bonds that begin from
      // neighboring atoms
      info.bondDir = bond->getBondDir();
    }

    switch (info.bondDir) {
      case Bond::BondDir::BEGINDASH:
        ++dirCount.dash;
        break;
      case Bond::BondDir::BEGINWEDGE:
        ++dirCount.wedge;
        break;
      case Bond::BondDir::UNKNOWN:
        ++dirCount.unknown;
        break;
      case Bond::BondDir::NONE:
        // ok, bonds with unspecified direction
        // are fine to ignore
      case Bond::BondDir::ENDUPRIGHT:
      case Bond::BondDir::ENDDOWNRIGHT:
        // also ignore direction settings that
        // may describe the configuration of an
        // adjacent double bond
        break;
      default:
        ++dirCount.other;
    }

    bonds.push_back(info);
  }

  if (bonds.empty()) {
    // an atom without bonds has no reference neighbor: there is nothing to
    // sort, and accessing bonds[0] below would be undefined behavior.
    return;
  }

  const auto &conf = mol.getConformer();
  const auto &p = conf.getAtomPos(atom->getIdx());
  const auto atom0 = bonds[0].bond->getOtherAtom(atom);
  const auto v0 = conf.getAtomPos(atom0->getIdx()) - p;

  // sort the neighbors based on the angle they form
  // with the first one
  for (auto it = bonds.begin() + 1; it != bonds.end(); ++it) {
    const auto atomn = it->bond->getOtherAtom(atom);
    const auto vn = conf.getAtomPos(atomn->getIdx()) - p;
    it->angle = v0.signedAngleTo(vn);
  }

  std::sort(
      bonds.begin() + 1, bonds.end(),
      [](const BondInfo &a, const BondInfo &b) { return a.angle < b.angle; });
}
|
||||
|
||||
void check3CoordinatedStereo(const ROMol &mol, const Atom *atom,
|
||||
const NeighborsInfo &neighborsInfo,
|
||||
bool /*reportAllFailures*/,
|
||||
std::vector<ValidationErrorInfo> &errors) {
|
||||
auto numStereoBonds =
|
||||
neighborsInfo.dirCount.dash + neighborsInfo.dirCount.wedge;
|
||||
|
||||
if (numStereoBonds == 1) {
|
||||
// identify the stereo bond
|
||||
unsigned int i;
|
||||
for (i = 0; i < 3; ++i) {
|
||||
Bond::BondDir bondDir = neighborsInfo.bonds[i].bondDir;
|
||||
if (bondDir == Bond::BondDir::BEGINDASH ||
|
||||
bondDir == Bond::BondDir::BEGINWEDGE) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
// check for the colinearity of the stereocenter and the other two ligands.
|
||||
const auto &conf = mol.getConformer();
|
||||
const auto &p = conf.getAtomPos(atom->getIdx());
|
||||
const auto atoma =
|
||||
neighborsInfo.bonds[(i + 1) % 3].bond->getOtherAtom(atom);
|
||||
const auto va = conf.getAtomPos(atoma->getIdx()) - p;
|
||||
const auto atomb =
|
||||
neighborsInfo.bonds[(i + 2) % 3].bond->getOtherAtom(atom);
|
||||
const auto vb = conf.getAtomPos(atomb->getIdx()) - p;
|
||||
|
||||
auto angle = va.angleTo(vb);
|
||||
|
||||
static constexpr auto ANGLE_EPSILON = (M_PI * 5. / 180.); // 5 degrees
|
||||
if (angle < ANGLE_EPSILON || (M_PI - angle) < ANGLE_EPSILON) {
|
||||
errors.push_back(
|
||||
"ERROR: [StereoValidation] Colinearity of non-stereo bonds at atom " +
|
||||
std::to_string(atom->getIdx()));
|
||||
}
|
||||
} else {
|
||||
// configurations with multiple stereo bonds may be formally ambiguous or
|
||||
// unambiguos depending on their wedged/dashed direction and relative
|
||||
// orientation on the plane. those cases that are formally unambiguous are
|
||||
// still most often discouraged or also classified as not acceptable by
|
||||
// IUPAC guidelines due to lack of clarity.
|
||||
|
||||
// The AvalonTools' struchk implementation simply doesn't allow multiple
|
||||
// stereo bonds on stereo centers with 3 explicit ligands. The validations
|
||||
// criteria for this sub-case could be in principle refined, but for now the
|
||||
// same policy is implemented.
|
||||
errors.push_back("ERROR: [StereoValidation] Atom " +
|
||||
std::to_string(atom->getIdx()) +
|
||||
" has 3 explicit substituents and multiple stereo bonds");
|
||||
}
|
||||
}
|
||||
|
||||
void check4CoordinatedStereo(const ROMol &mol, const Atom *atom,
|
||||
const NeighborsInfo &neighborsInfo,
|
||||
bool reportAllFailures,
|
||||
std::vector<ValidationErrorInfo> &errors) {
|
||||
if (neighborsInfo.dirCount.dash > 2 || neighborsInfo.dirCount.wedge > 2) {
|
||||
// this condition would anyway trigger an "adjacent bonds with like
|
||||
// orientation" alert, but this test could be clearer / more explicit.
|
||||
errors.push_back("ERROR: [StereoValidation] Atom " +
|
||||
std::to_string(atom->getIdx()) +
|
||||
" has too many stereo bonds with like orientation");
|
||||
if (!reportAllFailures) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
for (unsigned int i = 0; i < 2; ++i) {
|
||||
if ((neighborsInfo.bonds[i].bondDir == Bond::BondDir::BEGINDASH &&
|
||||
neighborsInfo.bonds[i + 2].bondDir == Bond::BondDir::BEGINWEDGE) ||
|
||||
(neighborsInfo.bonds[i].bondDir == Bond::BondDir::BEGINWEDGE &&
|
||||
neighborsInfo.bonds[i + 2].bondDir == Bond::BondDir::BEGINDASH)) {
|
||||
errors.push_back(
|
||||
"ERROR: [StereoValidation] Atom " + std::to_string(atom->getIdx()) +
|
||||
" has opposing stereo bonds with different up/down orientation");
|
||||
if (!reportAllFailures) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (unsigned int i = 0; i < 4; ++i) {
|
||||
if ((neighborsInfo.bonds[i].bondDir == Bond::BondDir::BEGINDASH &&
|
||||
neighborsInfo.bonds[(i + 1) % 4].bondDir ==
|
||||
Bond::BondDir::BEGINDASH) ||
|
||||
(neighborsInfo.bonds[i].bondDir == Bond::BondDir::BEGINWEDGE &&
|
||||
neighborsInfo.bonds[(i + 1) % 4].bondDir ==
|
||||
Bond::BondDir::BEGINWEDGE)) {
|
||||
errors.push_back("ERROR: [StereoValidation] Atom " +
|
||||
std::to_string(atom->getIdx()) +
|
||||
" has adjacent stereo bonds with like orientation");
|
||||
if (!reportAllFailures) {
|
||||
return;
|
||||
}
|
||||
// it doesn't make sense to output this alert multiple times for the same
|
||||
// atom we therefore exit the loop also when reportAllFailures is not set.
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (neighborsInfo.dirCount.dash + neighborsInfo.dirCount.wedge == 1) {
|
||||
// there is only one wedged/dashed bond. check for 'umbrellas' and
|
||||
// other geometric violations. we need the conformation here.
|
||||
const auto &conf = mol.getConformer();
|
||||
|
||||
// identify the bond index for the stereo bond with specified direction.
|
||||
for (unsigned int i = 0; i < 4; ++i) {
|
||||
Bond::BondDir bondDir = neighborsInfo.bonds[i].bondDir;
|
||||
if (bondDir == Bond::BondDir::BEGINDASH ||
|
||||
bondDir == Bond::BondDir::BEGINWEDGE) {
|
||||
// count how many of the other bonds lie on the opposite half-plane,
|
||||
// i.e. form an angle > pi/4 with the stereo bond.
|
||||
unsigned int opposed = 0;
|
||||
const auto &p = conf.getAtomPos(atom->getIdx());
|
||||
const auto bondi = neighborsInfo.bonds[i].bond;
|
||||
const auto atomi = bondi->getOtherAtom(atom);
|
||||
const auto vi = conf.getAtomPos(atomi->getIdx()) - p;
|
||||
for (unsigned int j = 0; j < 4; ++j) {
|
||||
if (j == i) {
|
||||
continue;
|
||||
}
|
||||
const auto bondj = neighborsInfo.bonds[j].bond;
|
||||
const auto atomj = bondj->getOtherAtom(atom);
|
||||
const auto vj = conf.getAtomPos(atomj->getIdx()) - p;
|
||||
if (vi.angleTo(vj) > 95. * M_PI / 180.) {
|
||||
++opposed;
|
||||
}
|
||||
}
|
||||
if (opposed == 3) {
|
||||
errors.push_back(
|
||||
"ERROR: [StereoValidation] Atom " +
|
||||
std::to_string(atom->getIdx()) +
|
||||
" has a potentially ambiguous representation: all non-stereo bonds" +
|
||||
" opposite to the only stereo bond");
|
||||
}
|
||||
if (!reportAllFailures) {
|
||||
return;
|
||||
}
|
||||
// there is only one stereo bond, which means we can exit the
|
||||
// outer loop on the first execution of this block.
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// check for collinearity violations and/or cases where the
|
||||
// the middle non-stereo bond is badly positioned (i.e., too short
|
||||
// compared to the other two on its sides).
|
||||
for (unsigned int i = 0; i < 4; i++) {
|
||||
Bond::BondDir bondDir = neighborsInfo.bonds[i].bondDir;
|
||||
if (bondDir == Bond::BondDir::BEGINDASH ||
|
||||
bondDir == Bond::BondDir::BEGINWEDGE) {
|
||||
auto j = (i + 1) % 4;
|
||||
auto k = (i + 2) % 4;
|
||||
auto l = (i + 3) % 4;
|
||||
const auto atomj = neighborsInfo.bonds[j].bond->getOtherAtom(atom);
|
||||
const auto atomk = neighborsInfo.bonds[k].bond->getOtherAtom(atom);
|
||||
const auto atoml = neighborsInfo.bonds[l].bond->getOtherAtom(atom);
|
||||
const auto &pj = conf.getAtomPos(atomj->getIdx());
|
||||
const auto &pk = conf.getAtomPos(atomk->getIdx());
|
||||
const auto &pl = conf.getAtomPos(atoml->getIdx());
|
||||
const auto v1 = pj - pk;
|
||||
const auto v2 = pl - pk;
|
||||
auto angle = v1.signedAngleTo(v2);
|
||||
if (angle < 185. * M_PI / 180.) {
|
||||
errors.push_back(
|
||||
"ERROR: [StereoValidation] Colinearity or triangle rule violation of "
|
||||
"non-stereo bonds at atom " +
|
||||
std::to_string(atom->getIdx()) /* +
|
||||
" due to angle formed by (" +
|
||||
std::to_string(atomj->getIdx()+1) + "," +
|
||||
std::to_string(atomk->getIdx()+1) + "," +
|
||||
std::to_string(atoml->getIdx()+1) + ")" */
|
||||
);
|
||||
if (!reportAllFailures) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
// there is only one stereo bond, which means we can exit the
|
||||
// outer loop on the first execution of this block.
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void checkStereo(const ROMol &mol, const Atom *atom, bool reportAllFailures,
|
||||
std::vector<ValidationErrorInfo> &errors) {
|
||||
NeighborsInfo neighborsInfo(mol, atom);
|
||||
|
||||
if (neighborsInfo.dirCount.other) {
|
||||
errors.push_back(
|
||||
"ERROR: [StereoValidation] one or more bonds incident to atom " +
|
||||
std::to_string(atom->getIdx()) + " have unexpected direction settings");
|
||||
// this is an unlikely condition and it would make little sense to
|
||||
// continue the analysis also when reportAllFailures were set.
|
||||
return;
|
||||
}
|
||||
|
||||
if (neighborsInfo.dirCount.unknown) {
|
||||
if (neighborsInfo.dirCount.dash || neighborsInfo.dirCount.wedge) {
|
||||
errors.push_back("ERROR: [StereoValidation] Atom " +
|
||||
std::to_string(atom->getIdx()) +
|
||||
" has both unknown and wedged/dashed stereo bonds.");
|
||||
}
|
||||
// else: if the only stereo bonds have either/unknown direction,
|
||||
// we can return here.
|
||||
return;
|
||||
}
|
||||
|
||||
for (const auto &bondInfo : neighborsInfo.bonds) {
|
||||
bool isStereo = bondInfo.bondDir == Bond::BondDir::BEGINDASH ||
|
||||
bondInfo.bondDir == Bond::BondDir::BEGINWEDGE ||
|
||||
bondInfo.bondDir == Bond::BondDir::UNKNOWN;
|
||||
if (isStereo && !canHaveDirection(*bondInfo.bond)) {
|
||||
errors.push_back("ERROR: [StereoValidation] Bond " +
|
||||
std::to_string(bondInfo.bond->getIdx()) +
|
||||
" has assigned stereo type, but unexpected bond order.");
|
||||
if (!reportAllFailures) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// The validation is currently limited to some specific categories of
|
||||
// stereocenters
|
||||
bool multipleBondFound{}, possibleAllene{};
|
||||
for (auto bond : mol.atomBonds(atom)) {
|
||||
auto bondType = bond->getBondType();
|
||||
if (bondType != Bond::BondType::SINGLE) {
|
||||
multipleBondFound = true;
|
||||
const Atom *otherAtom = bond->getOtherAtom(atom);
|
||||
if (otherAtom->getDegree() == 2) {
|
||||
int doubleBondCount{};
|
||||
for (auto otherBond : mol.atomBonds(otherAtom)) {
|
||||
if (otherBond->getBondType() == Bond::BondType::DOUBLE) {
|
||||
++doubleBondCount;
|
||||
}
|
||||
}
|
||||
if (doubleBondCount == 2) {
|
||||
possibleAllene = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
auto atomicNum = atom->getAtomicNum();
|
||||
if (possibleAllene || (multipleBondFound && atomicNum == 15)) {
|
||||
// Allenes and P compounds are not validated at this time.
|
||||
return;
|
||||
}
|
||||
if (multipleBondFound && atomicNum != 16) {
|
||||
// A stereo bond was found at an unsaturated atom. This condition used to
|
||||
// trigger as error in STRUCHK, but there are valid use cases for it (e.g.,
|
||||
// wavy bonds incident to double bonds of undefined/unknown configuration,
|
||||
// and atropisomers).
|
||||
//
|
||||
// Validation of these use cases is not currently implemented.
|
||||
return;
|
||||
}
|
||||
|
||||
switch (atom->getDegree()) {
|
||||
case 1:
|
||||
case 2:
|
||||
errors.push_back(
|
||||
"ERROR: [StereoValidation] Atom " + std::to_string(atom->getIdx()) +
|
||||
" has stereo bonds, but less than 3 explicit substituents.");
|
||||
break;
|
||||
case 3:
|
||||
check3CoordinatedStereo(mol, atom, neighborsInfo, reportAllFailures,
|
||||
errors);
|
||||
break;
|
||||
case 4:
|
||||
check4CoordinatedStereo(mol, atom, neighborsInfo, reportAllFailures,
|
||||
errors);
|
||||
break;
|
||||
default:;
|
||||
}
|
||||
}
|
||||
} // namespace
|
||||
|
||||
std::vector<ValidationErrorInfo> StereoValidation::validate(
|
||||
const ROMol &mol, bool reportAllFailures) const {
|
||||
std::vector<ValidationErrorInfo> errors;
|
||||
|
||||
for (auto atom : mol.atoms()) {
|
||||
if (hasStereoBond(mol, atom)) {
|
||||
checkStereo(mol, atom, reportAllFailures, errors);
|
||||
}
|
||||
if (!errors.empty() && !reportAllFailures) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return errors;
|
||||
}
|
||||
|
||||
std::vector<ValidationErrorInfo> validateSmiles(const std::string &smiles) {
|
||||
RWMOL_SPTR mol(SmilesToMol(smiles));
|
||||
if (!mol) {
|
||||
|
||||
@@ -30,6 +30,7 @@
|
||||
namespace RDKit {
|
||||
class RWMol;
|
||||
class ROMol;
|
||||
class Conformer;
|
||||
|
||||
namespace MolStandardize {
|
||||
|
||||
@@ -51,11 +52,12 @@ class RDKIT_MOLSTANDARDIZE_EXPORT ValidationMethod {
|
||||
|
||||
//! The CompositeValidation class provides a simple way to apply a collection of
|
||||
// ValidationMethod instances in sequence
|
||||
class RDKIT_MOLSTANDARDIZE_EXPORT CompositeValidation : public ValidationMethod {
|
||||
class RDKIT_MOLSTANDARDIZE_EXPORT CompositeValidation
|
||||
: public ValidationMethod {
|
||||
public:
|
||||
CompositeValidation(
|
||||
const std::vector<std::shared_ptr<ValidationMethod>> & validations)
|
||||
: validations(validations) {};
|
||||
const std::vector<std::shared_ptr<ValidationMethod>> &validations)
|
||||
: validations(validations){};
|
||||
|
||||
std::vector<ValidationErrorInfo> validate(
|
||||
const ROMol &mol, bool reportAllFailures) const override;
|
||||
@@ -65,7 +67,7 @@ class RDKIT_MOLSTANDARDIZE_EXPORT CompositeValidation : public ValidationMethod
|
||||
}
|
||||
|
||||
private:
|
||||
std::vector<std::shared_ptr<ValidationMethod>> validations;
|
||||
std::vector<std::shared_ptr<ValidationMethod>> validations;
|
||||
};
|
||||
|
||||
//! The RDKitValidation class throws an error when there are no atoms in the
|
||||
@@ -95,7 +97,7 @@ class RDKIT_MOLSTANDARDIZE_EXPORT RDKitValidation : public ValidationMethod {
|
||||
class RDKIT_MOLSTANDARDIZE_EXPORT NoAtomValidation : public ValidationMethod {
|
||||
public:
|
||||
std::vector<ValidationErrorInfo> validate(
|
||||
const ROMol &mol, bool reportAllFailures) const override;
|
||||
const ROMol &mol, bool reportAllFailures) const override;
|
||||
|
||||
std::shared_ptr<ValidationMethod> copy() const override {
|
||||
return std::make_shared<NoAtomValidation>(*this);
|
||||
@@ -106,7 +108,7 @@ class RDKIT_MOLSTANDARDIZE_EXPORT NoAtomValidation : public ValidationMethod {
|
||||
class RDKIT_MOLSTANDARDIZE_EXPORT FragmentValidation : public ValidationMethod {
|
||||
public:
|
||||
std::vector<ValidationErrorInfo> validate(
|
||||
const ROMol &mol, bool reportAllFailures) const override;
|
||||
const ROMol &mol, bool reportAllFailures) const override;
|
||||
|
||||
std::shared_ptr<ValidationMethod> copy() const override {
|
||||
return std::make_shared<FragmentValidation>(*this);
|
||||
@@ -117,7 +119,7 @@ class RDKIT_MOLSTANDARDIZE_EXPORT FragmentValidation : public ValidationMethod {
|
||||
class RDKIT_MOLSTANDARDIZE_EXPORT NeutralValidation : public ValidationMethod {
|
||||
public:
|
||||
std::vector<ValidationErrorInfo> validate(
|
||||
const ROMol &mol, bool reportAllFailures) const override;
|
||||
const ROMol &mol, bool reportAllFailures) const override;
|
||||
|
||||
std::shared_ptr<ValidationMethod> copy() const override {
|
||||
return std::make_shared<NeutralValidation>(*this);
|
||||
@@ -125,14 +127,24 @@ class RDKIT_MOLSTANDARDIZE_EXPORT NeutralValidation : public ValidationMethod {
|
||||
};
|
||||
|
||||
//! The IsotopeValidation class logs if molecule contains isotopes.
|
||||
/*!
|
||||
<b>Notes:</b>
|
||||
- By default, this class will return an error every time an isotopic
|
||||
number is specified. When the `strict` constructor parameter is passed a
|
||||
`true` argument, an error is returned only if the specified isotopic number
|
||||
is not found in the RDKit periodic table.
|
||||
*/
|
||||
class RDKIT_MOLSTANDARDIZE_EXPORT IsotopeValidation : public ValidationMethod {
|
||||
public:
|
||||
IsotopeValidation(bool strict = false) : strict(strict){};
|
||||
std::vector<ValidationErrorInfo> validate(
|
||||
const ROMol &mol, bool reportAllFailures) const override;
|
||||
const ROMol &mol, bool reportAllFailures) const override;
|
||||
|
||||
std::shared_ptr<ValidationMethod> copy() const override {
|
||||
return std::make_shared<IsotopeValidation>(*this);
|
||||
}
|
||||
|
||||
bool strict;
|
||||
};
|
||||
|
||||
////////////////////////////////
|
||||
@@ -146,7 +158,7 @@ class RDKIT_MOLSTANDARDIZE_EXPORT MolVSValidation : public CompositeValidation {
|
||||
MolVSValidation();
|
||||
//! overloaded constructor to take in a user-defined list of ValidationMethod
|
||||
MolVSValidation(
|
||||
const std::vector<std::shared_ptr<ValidationMethod>> & validations);
|
||||
const std::vector<std::shared_ptr<ValidationMethod>> &validations);
|
||||
|
||||
std::shared_ptr<ValidationMethod> copy() const override {
|
||||
return std::make_shared<MolVSValidation>(*this);
|
||||
@@ -154,8 +166,7 @@ class RDKIT_MOLSTANDARDIZE_EXPORT MolVSValidation : public CompositeValidation {
|
||||
};
|
||||
|
||||
//! The AllowedAtomsValidation class lets the user input a list of atoms,
|
||||
//! anything not on
|
||||
/// the list throws an error.
|
||||
//! anything not on the list throws an error.
|
||||
class RDKIT_MOLSTANDARDIZE_EXPORT AllowedAtomsValidation
|
||||
: public ValidationMethod {
|
||||
public:
|
||||
@@ -173,8 +184,7 @@ class RDKIT_MOLSTANDARDIZE_EXPORT AllowedAtomsValidation
|
||||
};
|
||||
|
||||
//! The DisallowedAtomsValidation class lets the user input a list of atoms and
|
||||
//! as long
|
||||
/// as there are no atoms from the list it is deemed acceptable.
|
||||
//! as long as there are no atoms from the list it is deemed acceptable.
|
||||
class RDKIT_MOLSTANDARDIZE_EXPORT DisallowedAtomsValidation
|
||||
: public ValidationMethod {
|
||||
public:
|
||||
@@ -191,6 +201,108 @@ class RDKIT_MOLSTANDARDIZE_EXPORT DisallowedAtomsValidation
|
||||
std::vector<std::shared_ptr<Atom>> d_disallowedList;
|
||||
};
|
||||
|
||||
//! The DisallowedRadicalValidation class reports an error if any
|
||||
/// unstable radical atoms are found.
|
||||
/// The only radicals that are allowed are [N]=O and [O]-N.
|
||||
class RDKIT_MOLSTANDARDIZE_EXPORT DisallowedRadicalValidation
|
||||
: public ValidationMethod {
|
||||
public:
|
||||
//! Returns the list of validation errors found for \c mol.
std::vector<ValidationErrorInfo> validate(
|
||||
const ROMol &mol, bool reportAllFailures) const override;
|
||||
|
||||
//! Returns a shared pointer to a copy of this validation object.
std::shared_ptr<ValidationMethod> copy() const override {
|
||||
return std::make_shared<DisallowedRadicalValidation>(*this);
|
||||
}
|
||||
};
|
||||
|
||||
//! The FeaturesValidation class reports an error if the input
|
||||
/// molecule representation includes any undesired features.
|
||||
class RDKIT_MOLSTANDARDIZE_EXPORT FeaturesValidation : public ValidationMethod {
|
||||
public:
|
||||
FeaturesValidation(bool allowEnhancedStereo = false,
|
||||
bool allowAromaticBondType = false,
|
||||
bool allowDativeBondType = false,
|
||||
bool allowQueries = false, bool allowDummies = false,
|
||||
bool allowAtomAliases = false)
|
||||
: allowEnhancedStereo(allowEnhancedStereo),
|
||||
allowAromaticBondType(allowAromaticBondType),
|
||||
allowDativeBondType(allowDativeBondType),
|
||||
allowQueries(allowQueries),
|
||||
allowDummies(allowDummies),
|
||||
allowAtomAliases(allowAtomAliases){};
|
||||
std::vector<ValidationErrorInfo> validate(
|
||||
const ROMol &mol, bool reportAllFailures) const override;
|
||||
std::shared_ptr<ValidationMethod> copy() const override {
|
||||
return std::make_shared<FeaturesValidation>(*this);
|
||||
}
|
||||
bool allowEnhancedStereo;
|
||||
bool allowAromaticBondType;
|
||||
bool allowDativeBondType;
|
||||
bool allowQueries;
|
||||
bool allowDummies;
|
||||
bool allowAtomAliases;
|
||||
};
|
||||
|
||||
//! The Is2DValidation class reports an error if the input
|
||||
/// molecule representation is designated as 3D or if it includes
|
||||
/// non-null Z coordinates, and in case all atoms are assigned the
|
||||
/// same coordinates.
|
||||
class RDKIT_MOLSTANDARDIZE_EXPORT Is2DValidation : public ValidationMethod {
|
||||
public:
|
||||
Is2DValidation(double threshold = 1.e-3) : threshold(threshold){};
|
||||
std::vector<ValidationErrorInfo> validate(
|
||||
const ROMol &mol, bool reportAllFailures) const override;
|
||||
std::shared_ptr<ValidationMethod> copy() const override {
|
||||
return std::make_shared<Is2DValidation>(*this);
|
||||
}
|
||||
|
||||
double threshold;
|
||||
};
|
||||
|
||||
//! The Layout2DValidation class reports an error if any atoms are
|
||||
/// too close to any other atoms or bonds, and in case any bonds are
|
||||
/// too long.
|
||||
class RDKIT_MOLSTANDARDIZE_EXPORT Layout2DValidation : public ValidationMethod {
|
||||
public:
|
||||
Layout2DValidation(double clashLimit = 0.15, double bondLengthLimit = 25.,
|
||||
bool allowLongBondsInRings = true,
|
||||
bool allowAtomBondClashExemption = true,
|
||||
double minMedianBondLength = 1e-3)
|
||||
: clashLimit(clashLimit),
|
||||
bondLengthLimit(bondLengthLimit),
|
||||
allowLongBondsInRings(allowLongBondsInRings),
|
||||
allowAtomBondClashExemption(allowAtomBondClashExemption),
|
||||
minMedianBondLength(minMedianBondLength){};
|
||||
std::vector<ValidationErrorInfo> validate(
|
||||
const ROMol &mol, bool reportAllFailures) const override;
|
||||
std::shared_ptr<ValidationMethod> copy() const override {
|
||||
return std::make_shared<Layout2DValidation>(*this);
|
||||
}
|
||||
|
||||
static double squaredMedianBondLength(const ROMol &mol,
|
||||
const Conformer &conf);
|
||||
|
||||
double clashLimit;
|
||||
double bondLengthLimit;
|
||||
bool allowLongBondsInRings;
|
||||
bool allowAtomBondClashExemption;
|
||||
double minMedianBondLength;
|
||||
};
|
||||
|
||||
//! The StereoValidation class checks various "syntactic" constraints
|
||||
/// related to the usage of stereo bonds on centers with 4 or 3 substituents,
|
||||
/// in an attempt to ensure that the associated stereochemical configuration
|
||||
/// can be interpreted unambiguously.
|
||||
/// These validation criteria were ported from the AvalonTools STRUCHK software.
|
||||
class RDKIT_MOLSTANDARDIZE_EXPORT StereoValidation : public ValidationMethod {
|
||||
public:
|
||||
//! Returns the list of validation errors found for \c mol.
std::vector<ValidationErrorInfo> validate(
|
||||
const ROMol &mol, bool reportAllFailures) const override;
|
||||
//! Returns a shared pointer to a copy of this validation object.
std::shared_ptr<ValidationMethod> copy() const override {
|
||||
return std::make_shared<StereoValidation>(*this);
|
||||
}
|
||||
};
|
||||
|
||||
//! A convenience function for quickly validating a single SMILES string.
|
||||
RDKIT_MOLSTANDARDIZE_EXPORT std::vector<ValidationErrorInfo> validateSmiles(
|
||||
const std::string &smiles);
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
remove_definitions(-DRDKIT_MOLSTANDARDIZE_BUILD)
|
||||
rdkit_python_extension(rdMolStandardize rdMolStandardize.cpp Validate.cpp
|
||||
Charge.cpp Fragment.cpp Normalize.cpp Metal.cpp Tautomer.cpp
|
||||
Charge.cpp Fragment.cpp Normalize.cpp Metal.cpp Tautomer.cpp Pipeline.cpp
|
||||
DEST Chem/MolStandardize
|
||||
LINK_LIBRARIES
|
||||
LINK_LIBRARIES MolStandardize )
|
||||
|
||||
143
Code/GraphMol/MolStandardize/Wrap/Pipeline.cpp
Normal file
143
Code/GraphMol/MolStandardize/Wrap/Pipeline.cpp
Normal file
@@ -0,0 +1,143 @@
|
||||
//
|
||||
// Copyright (C) 2023 Novartis Biomedical Research
|
||||
//
|
||||
// @@ All Rights Reserved @@
|
||||
// This file is part of the RDKit.
|
||||
// The contents are covered by the terms of the BSD license
|
||||
// which is included in the file license.txt, found at the root
|
||||
// of the RDKit source tree.
|
||||
//
|
||||
#include <RDBoost/Wrap.h>
|
||||
|
||||
#include <GraphMol/RDKitBase.h>
|
||||
#include <GraphMol/MolStandardize/Pipeline.h>
|
||||
|
||||
namespace RDKit {
|
||||
namespace MolStandardize {
|
||||
|
||||
// Two log entries are equal when both their status and their detail match.
// NOTE(review): presumably provided because the vector_indexing_suite used
// to expose PipelineLog needs an equality comparison for its elements -
// confirm.
bool operator==(const PipelineLogEntry &lhs, const PipelineLogEntry &rhs) {
  if (lhs.status != rhs.status) {
    return false;
  }
  return lhs.detail == rhs.detail;
}
|
||||
|
||||
} // namespace MolStandardize
|
||||
} // namespace RDKit
|
||||
|
||||
namespace python = boost::python;
|
||||
using namespace RDKit;
|
||||
|
||||
void wrap_pipeline() {
|
||||
python::class_<MolStandardize::PipelineOptions>("PipelineOptions")
|
||||
.def_readwrite("strictParsing",
|
||||
&MolStandardize::PipelineOptions::strictParsing)
|
||||
.def_readwrite("reportAllFailures",
|
||||
&MolStandardize::PipelineOptions::reportAllFailures)
|
||||
.def_readwrite("allowEmptyMolecules",
|
||||
&MolStandardize::PipelineOptions::allowEmptyMolecules)
|
||||
.def_readwrite("allowEnhancedStereo",
|
||||
&MolStandardize::PipelineOptions::allowEnhancedStereo)
|
||||
.def_readwrite("allowAromaticBondType",
|
||||
&MolStandardize::PipelineOptions::allowAromaticBondType)
|
||||
.def_readwrite("allowDativeBondType",
|
||||
&MolStandardize::PipelineOptions::allowDativeBondType)
|
||||
.def_readwrite("is2DZeroThreshold",
|
||||
&MolStandardize::PipelineOptions::is2DZeroThreshold)
|
||||
.def_readwrite("atomClashLimit",
|
||||
&MolStandardize::PipelineOptions::atomClashLimit)
|
||||
.def_readwrite("minMedianBondLength",
|
||||
&MolStandardize::PipelineOptions::minMedianBondLength)
|
||||
.def_readwrite("bondLengthLimit",
|
||||
&MolStandardize::PipelineOptions::bondLengthLimit)
|
||||
.def_readwrite("allowLongBondsInRings",
|
||||
&MolStandardize::PipelineOptions::allowLongBondsInRings)
|
||||
.def_readwrite(
|
||||
"allowAtomBondClashExemption",
|
||||
&MolStandardize::PipelineOptions::allowAtomBondClashExemption)
|
||||
.def_readwrite("metalNof", &MolStandardize::PipelineOptions::metalNof)
|
||||
.def_readwrite("metalNon", &MolStandardize::PipelineOptions::metalNon)
|
||||
.def_readwrite("normalizerData",
|
||||
&MolStandardize::PipelineOptions::normalizerData)
|
||||
.def_readwrite("normalizerMaxRestarts",
|
||||
&MolStandardize::PipelineOptions::normalizerMaxRestarts)
|
||||
.def_readwrite("scaledMedianBondLength",
|
||||
&MolStandardize::PipelineOptions::scaledMedianBondLength)
|
||||
.def_readwrite("outputV2000",
|
||||
&MolStandardize::PipelineOptions::outputV2000);
|
||||
|
||||
python::enum_<MolStandardize::PipelineStatus>("PipelineStatus")
|
||||
.value("NO_EVENT", MolStandardize::PipelineStatus::NO_EVENT)
|
||||
.value("INPUT_ERROR", MolStandardize::PipelineStatus::INPUT_ERROR)
|
||||
.value("PREPARE_FOR_VALIDATION_ERROR",
|
||||
MolStandardize::PipelineStatus::PREPARE_FOR_VALIDATION_ERROR)
|
||||
.value("FEATURES_VALIDATION_ERROR",
|
||||
MolStandardize::PipelineStatus::FEATURES_VALIDATION_ERROR)
|
||||
.value("BASIC_VALIDATION_ERROR",
|
||||
MolStandardize::PipelineStatus::BASIC_VALIDATION_ERROR)
|
||||
.value("IS2D_VALIDATION_ERROR",
|
||||
MolStandardize::PipelineStatus::IS2D_VALIDATION_ERROR)
|
||||
.value("LAYOUT2D_VALIDATION_ERROR",
|
||||
MolStandardize::PipelineStatus::LAYOUT2D_VALIDATION_ERROR)
|
||||
.value("STEREO_VALIDATION_ERROR",
|
||||
MolStandardize::PipelineStatus::STEREO_VALIDATION_ERROR)
|
||||
.value("VALIDATION_ERROR",
|
||||
MolStandardize::PipelineStatus::VALIDATION_ERROR)
|
||||
.value("PREPARE_FOR_STANDARDIZATION_ERROR",
|
||||
MolStandardize::PipelineStatus::PREPARE_FOR_STANDARDIZATION_ERROR)
|
||||
.value("METAL_STANDARDIZATION_ERROR",
|
||||
MolStandardize::PipelineStatus::METAL_STANDARDIZATION_ERROR)
|
||||
.value("NORMALIZER_STANDARDIZATION_ERROR",
|
||||
MolStandardize::PipelineStatus::NORMALIZER_STANDARDIZATION_ERROR)
|
||||
.value("FRAGMENT_STANDARDIZATION_ERROR",
|
||||
MolStandardize::PipelineStatus::FRAGMENT_STANDARDIZATION_ERROR)
|
||||
.value("CHARGE_STANDARDIZATION_ERROR",
|
||||
MolStandardize::PipelineStatus::CHARGE_STANDARDIZATION_ERROR)
|
||||
.value("STANDARDIZATION_ERROR",
|
||||
MolStandardize::PipelineStatus::STANDARDIZATION_ERROR)
|
||||
.value("OUTPUT_ERROR", MolStandardize::PipelineStatus::OUTPUT_ERROR)
|
||||
.value("PIPELINE_ERROR", MolStandardize::PipelineStatus::PIPELINE_ERROR)
|
||||
.value("METALS_DISCONNECTED",
|
||||
MolStandardize::PipelineStatus::METALS_DISCONNECTED)
|
||||
.value("NORMALIZATION_APPLIED",
|
||||
MolStandardize::PipelineStatus::NORMALIZATION_APPLIED)
|
||||
.value("FRAGMENTS_REMOVED",
|
||||
MolStandardize::PipelineStatus::FRAGMENTS_REMOVED)
|
||||
.value("PROTONATION_CHANGED",
|
||||
MolStandardize::PipelineStatus::PROTONATION_CHANGED)
|
||||
.value("STRUCTURE_MODIFICATION",
|
||||
MolStandardize::PipelineStatus::STRUCTURE_MODIFICATION);
|
||||
|
||||
python::enum_<MolStandardize::PipelineStage>("PipelineStage")
|
||||
.value("PARSING_INPUT", MolStandardize::PipelineStage::PARSING_INPUT)
|
||||
.value("PREPARE_FOR_VALIDATION",
|
||||
MolStandardize::PipelineStage::PREPARE_FOR_VALIDATION)
|
||||
.value("VALIDATION", MolStandardize::PipelineStage::VALIDATION)
|
||||
.value("PREPARE_FOR_STANDARDIZATION",
|
||||
MolStandardize::PipelineStage::PREPARE_FOR_STANDARDIZATION)
|
||||
.value("STANDARDIZATION", MolStandardize::PipelineStage::STANDARDIZATION)
|
||||
.value("SERIALIZING_OUTPUT",
|
||||
MolStandardize::PipelineStage::SERIALIZING_OUTPUT)
|
||||
.value("COMPLETED", MolStandardize::PipelineStage::COMPLETED);
|
||||
|
||||
python::class_<MolStandardize::PipelineLogEntry>("PipelineLogEntry",
|
||||
python::no_init)
|
||||
.def_readonly("status", &MolStandardize::PipelineLogEntry::status)
|
||||
.def_readonly("detail", &MolStandardize::PipelineLogEntry::detail);
|
||||
|
||||
python::class_<MolStandardize::PipelineLog>("PipelineLog", python::no_init)
|
||||
.def(python::vector_indexing_suite<MolStandardize::PipelineLog>());
|
||||
|
||||
python::class_<MolStandardize::PipelineResult>("PipelineResult",
|
||||
python::no_init)
|
||||
.def_readonly("status", &MolStandardize::PipelineResult::status)
|
||||
.def_readonly("stage", &MolStandardize::PipelineResult::stage)
|
||||
.def_readonly("log", &MolStandardize::PipelineResult::log)
|
||||
.def_readonly("inputMolData",
|
||||
&MolStandardize::PipelineResult::inputMolData)
|
||||
.def_readonly("outputMolData",
|
||||
&MolStandardize::PipelineResult::outputMolData)
|
||||
.def_readonly("parentMolData",
|
||||
&MolStandardize::PipelineResult::parentMolData);
|
||||
|
||||
python::class_<MolStandardize::Pipeline>("Pipeline")
|
||||
.def(python::init<const MolStandardize::PipelineOptions &>())
|
||||
.def("run", &MolStandardize::Pipeline::run);
|
||||
}
|
||||
@@ -17,25 +17,23 @@ using namespace RDKit;
|
||||
|
||||
namespace {
|
||||
|
||||
struct ValidationMethodWrap : MolStandardize::ValidationMethod, python::wrapper<MolStandardize::ValidationMethod>
|
||||
{
|
||||
std::vector<MolStandardize::ValidationErrorInfo> validate(
|
||||
const ROMol &mol, bool reportAllFailures) const override
|
||||
{
|
||||
return this->get_override("validate")(mol, reportAllFailures);
|
||||
}
|
||||
struct ValidationMethodWrap
|
||||
: MolStandardize::ValidationMethod,
|
||||
python::wrapper<MolStandardize::ValidationMethod> {
|
||||
std::vector<MolStandardize::ValidationErrorInfo> validate(
|
||||
const ROMol &mol, bool reportAllFailures) const override {
|
||||
return this->get_override("validate")(mol, reportAllFailures);
|
||||
}
|
||||
|
||||
std::shared_ptr<MolStandardize::ValidationMethod> copy() const override
|
||||
{
|
||||
return this->get_override("copy")();
|
||||
}
|
||||
std::shared_ptr<MolStandardize::ValidationMethod> copy() const override {
|
||||
return this->get_override("copy")();
|
||||
}
|
||||
};
|
||||
|
||||
// Wrap ValidationMethod::validate and convert the returned
|
||||
// vector into a python list of strings
|
||||
python::list pythonValidateMethod(
|
||||
const MolStandardize::ValidationMethod & self, const ROMol &mol,
|
||||
bool reportAllFailures) {
|
||||
python::list pythonValidateMethod(const MolStandardize::ValidationMethod &self,
|
||||
const ROMol &mol, bool reportAllFailures) {
|
||||
python::list res;
|
||||
std::vector<MolStandardize::ValidationErrorInfo> errout =
|
||||
self.validate(mol, reportAllFailures);
|
||||
@@ -104,62 +102,111 @@ struct validate_wrapper {
|
||||
std::string docString = "";
|
||||
|
||||
python::class_<ValidationMethodWrap, boost::noncopyable>("ValidationMethod")
|
||||
.def("validate", pythonValidateMethod,
|
||||
(python::arg("self"), python::arg("mol"),
|
||||
python::arg("reportAllFailures") = false),
|
||||
"")
|
||||
;
|
||||
.def("validate", pythonValidateMethod,
|
||||
(python::arg("self"), python::arg("mol"),
|
||||
python::arg("reportAllFailures") = false),
|
||||
"");
|
||||
|
||||
python::class_<
|
||||
MolStandardize::RDKitValidation,
|
||||
python::bases<MolStandardize::ValidationMethod>,
|
||||
boost::noncopyable>("RDKitValidation")
|
||||
;
|
||||
python::class_<MolStandardize::RDKitValidation,
|
||||
python::bases<MolStandardize::ValidationMethod>,
|
||||
boost::noncopyable>("RDKitValidation");
|
||||
|
||||
python::class_<
|
||||
MolStandardize::NoAtomValidation,
|
||||
python::bases<MolStandardize::ValidationMethod>,
|
||||
boost::noncopyable>("NoAtomValidation")
|
||||
;
|
||||
python::class_<MolStandardize::NoAtomValidation,
|
||||
python::bases<MolStandardize::ValidationMethod>,
|
||||
boost::noncopyable>("NoAtomValidation");
|
||||
|
||||
python::class_<
|
||||
MolStandardize::FragmentValidation,
|
||||
python::bases<MolStandardize::ValidationMethod>,
|
||||
boost::noncopyable>("FragmentValidation")
|
||||
;
|
||||
python::class_<MolStandardize::FragmentValidation,
|
||||
python::bases<MolStandardize::ValidationMethod>,
|
||||
boost::noncopyable>("FragmentValidation");
|
||||
|
||||
python::class_<
|
||||
MolStandardize::NeutralValidation,
|
||||
python::bases<MolStandardize::ValidationMethod>,
|
||||
boost::noncopyable>("NeutralValidation")
|
||||
;
|
||||
python::class_<MolStandardize::NeutralValidation,
|
||||
python::bases<MolStandardize::ValidationMethod>,
|
||||
boost::noncopyable>("NeutralValidation");
|
||||
|
||||
python::class_<
|
||||
MolStandardize::IsotopeValidation,
|
||||
python::bases<MolStandardize::ValidationMethod>,
|
||||
boost::noncopyable>("IsotopeValidation")
|
||||
;
|
||||
python::class_<MolStandardize::IsotopeValidation,
|
||||
python::bases<MolStandardize::ValidationMethod>,
|
||||
boost::noncopyable>("IsotopeValidation")
|
||||
.def(python::init<bool>(python::arg("strict") = false))
|
||||
.def_readwrite("strict", &MolStandardize::IsotopeValidation::strict);
|
||||
|
||||
python::class_<
|
||||
MolStandardize::MolVSValidation,
|
||||
python::bases<MolStandardize::ValidationMethod>,
|
||||
boost::noncopyable>("MolVSValidation")
|
||||
.def("__init__", python::make_constructor(&getMolVSValidation))
|
||||
;
|
||||
python::class_<MolStandardize::MolVSValidation,
|
||||
python::bases<MolStandardize::ValidationMethod>,
|
||||
boost::noncopyable>("MolVSValidation")
|
||||
.def("__init__", python::make_constructor(&getMolVSValidation));
|
||||
|
||||
python::class_<
|
||||
MolStandardize::AllowedAtomsValidation,
|
||||
python::bases<MolStandardize::ValidationMethod>,
|
||||
boost::noncopyable>("AllowedAtomsValidation", python::no_init)
|
||||
.def("__init__", python::make_constructor(&getAllowedAtomsValidation))
|
||||
;
|
||||
python::class_<MolStandardize::AllowedAtomsValidation,
|
||||
python::bases<MolStandardize::ValidationMethod>,
|
||||
boost::noncopyable>("AllowedAtomsValidation",
|
||||
python::no_init)
|
||||
.def("__init__", python::make_constructor(&getAllowedAtomsValidation));
|
||||
|
||||
python::class_<
|
||||
MolStandardize::DisallowedAtomsValidation,
|
||||
python::bases<MolStandardize::ValidationMethod>,
|
||||
boost::noncopyable>("DisallowedAtomsValidation", python::no_init)
|
||||
.def("__init__", python::make_constructor(&getDisallowedAtomsValidation))
|
||||
;
|
||||
python::class_<MolStandardize::DisallowedAtomsValidation,
|
||||
python::bases<MolStandardize::ValidationMethod>,
|
||||
boost::noncopyable>("DisallowedAtomsValidation",
|
||||
python::no_init)
|
||||
.def("__init__",
|
||||
python::make_constructor(&getDisallowedAtomsValidation));
|
||||
|
||||
// Python binding for MolStandardize::FeaturesValidation.
//
// The constructor takes six boolean switches, all defaulting to false, and
// each switch is also exposed as a read/write attribute of the same name so
// that a validator can be reconfigured after construction.
python::class_<MolStandardize::FeaturesValidation,
               python::bases<MolStandardize::ValidationMethod>>(
    "FeaturesValidation")
    .def(python::init<bool, bool, bool, bool, bool, bool>(
        (python::arg("allowEnhancedStereo") = false,
         python::arg("allowAromaticBondType") = false,
         python::arg("allowDativeBondType") = false,
         python::arg("allowQueries") = false,
         // Keyword name must match the "allowDummies" attribute registered
         // below (it was previously misspelled "allowDummmies").
         python::arg("allowDummies") = false,
         python::arg("allowAtomAliases") = false)))
    .def_readwrite("allowEnhancedStereo",
                   &MolStandardize::FeaturesValidation::allowEnhancedStereo)
    .def_readwrite("allowAromaticBondType",
                   &MolStandardize::FeaturesValidation::allowAromaticBondType)
    .def_readwrite("allowDativeBondType",
                   &MolStandardize::FeaturesValidation::allowDativeBondType)
    .def_readwrite("allowQueries",
                   &MolStandardize::FeaturesValidation::allowQueries)
    .def_readwrite("allowDummies",
                   &MolStandardize::FeaturesValidation::allowDummies)
    .def_readwrite("allowAtomAliases",
                   &MolStandardize::FeaturesValidation::allowAtomAliases);
|
||||
|
||||
python::class_<MolStandardize::DisallowedRadicalValidation,
|
||||
python::bases<MolStandardize::ValidationMethod>,
|
||||
boost::noncopyable>("DisallowedRadicalValidation");
|
||||
|
||||
python::class_<MolStandardize::Is2DValidation,
|
||||
python::bases<MolStandardize::ValidationMethod>,
|
||||
boost::noncopyable>("Is2DValidation")
|
||||
.def(python::init<double>(python::arg("threshold") = 1e-3))
|
||||
.def_readwrite("threshold", &MolStandardize::Is2DValidation::threshold);
|
||||
|
||||
// Python binding for MolStandardize::Layout2DValidation.
//
// The constructor arguments are (double, double, bool, bool, double); each
// one is also exposed as a read/write attribute of the same name.
python::class_<MolStandardize::Layout2DValidation,
               python::bases<MolStandardize::ValidationMethod>,
               boost::noncopyable>("Layout2DValidation")
    .def(python::init<double, double, bool, bool, double>(
        (python::arg("clashLimit") = 0.15,
         python::arg("bondLengthLimit") = 25.,
         python::arg("allowLongBondsInRings") = true,
         python::arg("allowAtomBondClashExemption") = true,
         // This argument is a double; the previous default of `false` was a
         // bool that silently converted to 0.0.
         // NOTE(review): 1e-3 is intended to mirror the C++ constructor
         // default - confirm against Validate.h.
         python::arg("minMedianBondLength") = 1e-3)))
    .def_readwrite("clashLimit",
                   &MolStandardize::Layout2DValidation::clashLimit)
    .def_readwrite("bondLengthLimit",
                   &MolStandardize::Layout2DValidation::bondLengthLimit)
    .def_readwrite("allowLongBondsInRings",
                   &MolStandardize::Layout2DValidation::allowLongBondsInRings)
    .def_readwrite(
        "allowAtomBondClashExemption",
        &MolStandardize::Layout2DValidation::allowAtomBondClashExemption)
    .def_readwrite("minMedianBondLength",
                   &MolStandardize::Layout2DValidation::minMedianBondLength);
|
||||
|
||||
python::class_<MolStandardize::StereoValidation,
|
||||
python::bases<MolStandardize::ValidationMethod>,
|
||||
boost::noncopyable>("StereoValidation");
|
||||
|
||||
python::def("ValidateSmiles", standardizeSmilesHelper, (python::arg("mol")),
|
||||
docString.c_str());
|
||||
|
||||
@@ -195,8 +195,7 @@ template <typename FUNCTYPE>
|
||||
void mtinPlaceHelper2(python::object pymols, int numThreads,
|
||||
python::object params, bool skip_standardize,
|
||||
FUNCTYPE func) {
|
||||
const auto *ps =
|
||||
&RDKit::MolStandardize::defaultCleanupParameters;
|
||||
const auto *ps = &RDKit::MolStandardize::defaultCleanupParameters;
|
||||
if (params) {
|
||||
ps = python::extract<RDKit::MolStandardize::CleanupParameters *>(params);
|
||||
}
|
||||
@@ -387,6 +386,7 @@ void wrap_metal();
|
||||
void wrap_fragment();
|
||||
void wrap_normalize();
|
||||
void wrap_tautomer();
|
||||
void wrap_pipeline();
|
||||
|
||||
BOOST_PYTHON_MODULE(rdMolStandardize) {
|
||||
python::scope().attr("__doc__") =
|
||||
@@ -668,4 +668,5 @@ BOOST_PYTHON_MODULE(rdMolStandardize) {
|
||||
wrap_fragment();
|
||||
wrap_normalize();
|
||||
wrap_tautomer();
|
||||
wrap_pipeline();
|
||||
}
|
||||
|
||||
@@ -269,9 +269,9 @@ class TestCase(unittest.TestCase):
|
||||
mol = Chem.MolFromSmiles("CO(C)C", sanitize=False)
|
||||
msg = vm.validate(mol)
|
||||
self.assertEqual(len(msg), 1)
|
||||
self.assertEqual
|
||||
("""INFO: [ValenceValidation] Explicit valence for atom # 1 O, 3, is greater than permitted""",
|
||||
msg[0])
|
||||
self.assertEqual(
|
||||
"""INFO: [ValenceValidation] Explicit valence for atom # 1 O, 3, is greater than permitted""",
|
||||
msg[0])
|
||||
|
||||
vm2 = rdMolStandardize.MolVSValidation([rdMolStandardize.FragmentValidation()])
|
||||
# with no argument it also works
|
||||
@@ -279,17 +279,14 @@ class TestCase(unittest.TestCase):
|
||||
mol2 = Chem.MolFromSmiles("COc1cccc(C=N[N-]C(N)=O)c1[O-].O.O.O.O=[U+2]=O")
|
||||
msg2 = vm2.validate(mol2)
|
||||
self.assertEqual(len(msg2), 1)
|
||||
self.assertEqual
|
||||
("""INFO: [FragmentValidation] water/hydroxide is present""", msg2[0])
|
||||
self.assertEqual("""INFO: [FragmentValidation] water/hydroxide is present""", msg2[0])
|
||||
|
||||
vm3 = rdMolStandardize.MolVSValidation()
|
||||
mol3 = Chem.MolFromSmiles("C1COCCO1.O=C(NO)NO")
|
||||
msg3 = vm3.validate(mol3)
|
||||
self.assertEqual(len(msg3), 2)
|
||||
self.assertEqual
|
||||
("""INFO: [FragmentValidation] 1,2-dimethoxyethane is present""", msg3[0])
|
||||
self.assertEqual
|
||||
("""INFO: [FragmentValidation] 1,4-dioxane is present""", msg3[1])
|
||||
self.assertEqual("""INFO: [FragmentValidation] 1,2-dimethoxyethane is present""", msg3[0])
|
||||
self.assertEqual("""INFO: [FragmentValidation] 1,4-dioxane is present""", msg3[1])
|
||||
|
||||
atomic_no = [6, 7, 8]
|
||||
allowed_atoms = [Atom(i) for i in atomic_no]
|
||||
@@ -297,22 +294,32 @@ class TestCase(unittest.TestCase):
|
||||
mol4 = Chem.MolFromSmiles("CC(=O)CF")
|
||||
msg4 = vm4.validate(mol4)
|
||||
self.assertEqual(len(msg4), 1)
|
||||
self.assertEqual
|
||||
("""INFO: [AllowedAtomsValidation] Atom F is not in allowedAtoms list""", msg4[0])
|
||||
self.assertEqual("""INFO: [AllowedAtomsValidation] Atom F is not in allowedAtoms list""",
|
||||
msg4[0])
|
||||
|
||||
atomic_no = [9, 17, 35]
|
||||
disallowed_atoms = [Atom(i) for i in atomic_no]
|
||||
vm5 = rdMolStandardize.DisallowedAtomsValidation(disallowed_atoms)
|
||||
mol5 = Chem.MolFromSmiles("CC(=O)CF")
|
||||
msg5 = vm4.validate(mol5)
|
||||
msg5 = vm5.validate(mol5)
|
||||
self.assertEqual(len(msg5), 1)
|
||||
self.assertEqual
|
||||
("""INFO: [DisallowedAtomsValidation] Atom F is in disallowedAtoms list""", msg5[0])
|
||||
self.assertEqual("""INFO: [DisallowedAtomsValidation] Atom F is in disallowedAtoms list""",
|
||||
msg5[0])
|
||||
|
||||
msg6 = rdMolStandardize.ValidateSmiles("ClCCCl.c1ccccc1O")
|
||||
self.assertEqual(len(msg6), 1)
|
||||
self.assertEqual
|
||||
("""INFO: [FragmentValidation] 1,2-dichloroethane is present""", msg6[0])
|
||||
mol6 = Chem.MolFromSmiles("[3CH4]")
|
||||
vm6a = rdMolStandardize.IsotopeValidation()
|
||||
msg6a = vm6a.validate(mol6)
|
||||
self.assertEqual(len(msg6a), 1)
|
||||
self.assertEqual("INFO: [IsotopeValidation] Molecule contains isotope 3C", msg6a[0])
|
||||
vm6b = rdMolStandardize.IsotopeValidation(True)
|
||||
msg6b = vm6b.validate(mol6)
|
||||
self.assertEqual(len(msg6b), 1)
|
||||
self.assertEqual("ERROR: [IsotopeValidation] The molecule contains an unknown isotope: 3C",
|
||||
msg6b[0])
|
||||
|
||||
msg999 = rdMolStandardize.ValidateSmiles("ClCCCl.c1ccccc1O")
|
||||
self.assertEqual(len(msg999), 1)
|
||||
self.assertEqual("""INFO: [FragmentValidation] 1,2-dichloroethane is present""", msg999[0])
|
||||
|
||||
def test10NormalizeFromData(self):
|
||||
data = """// Name SMIRKS
|
||||
@@ -1147,6 +1154,624 @@ chlorine [Cl]
|
||||
rdMolStandardize.SuperParentInPlace(ms, 4)
|
||||
self.assertEqual([Chem.MolToSmiles(m) for m in ms], [y for x, y in ind])
|
||||
|
||||
def test33MolBlockValidation(self):
|
||||
# featuresValidation
|
||||
mol = Chem.MolFromMolBlock(
|
||||
'''
|
||||
Mrv2311 01162413552D
|
||||
|
||||
0 0 0 0 0 999 V3000
|
||||
M V30 BEGIN CTAB
|
||||
M V30 COUNTS 2 1 0 0 0
|
||||
M V30 BEGIN ATOM
|
||||
M V30 1 R# -17.3747 6.9367 0 0 RGROUPS=(1 0)
|
||||
M V30 2 C -18.7083 6.1667 0 0
|
||||
M V30 END ATOM
|
||||
M V30 BEGIN BOND
|
||||
M V30 1 1 2 1
|
||||
M V30 END BOND
|
||||
M V30 END CTAB
|
||||
M END
|
||||
''', sanitize=False)
|
||||
|
||||
validator = rdMolStandardize.FeaturesValidation()
|
||||
errinfo = validator.validate(mol)
|
||||
self.assertEqual(len(errinfo), 1)
|
||||
self.assertEqual(errinfo[0], "ERROR: [FeaturesValidation] Query atom 0 is not allowed")
|
||||
validator.allowDummies = True
|
||||
validator.allowQueries = True
|
||||
errinfo = validator.validate(mol)
|
||||
self.assertEqual(len(errinfo), 0)
|
||||
|
||||
mol = Chem.MolFromMolBlock('''
|
||||
Mrv2311 01162411552D
|
||||
|
||||
0 0 0 0 0 999 V3000
|
||||
M V30 BEGIN CTAB
|
||||
M V30 COUNTS 4 3 0 0 0
|
||||
M V30 BEGIN ATOM
|
||||
M V30 1 C -18.208 8.52 0 0 CFG=2
|
||||
M V30 2 F -19.5417 7.75 0 0
|
||||
M V30 3 C -16.8743 7.75 0 0
|
||||
M V30 4 Cl -18.208 10.06 0 0
|
||||
M V30 END ATOM
|
||||
M V30 BEGIN BOND
|
||||
M V30 1 1 1 3 CFG=1
|
||||
M V30 2 1 2 1
|
||||
M V30 3 1 1 4
|
||||
M V30 END BOND
|
||||
M V30 BEGIN COLLECTION
|
||||
M V30 MDLV30/STERAC1 ATOMS=(1 1)
|
||||
M V30 END COLLECTION
|
||||
M V30 END CTAB
|
||||
M END
|
||||
''')
|
||||
|
||||
# enhanced stereo features are by default disallowed
|
||||
validator = rdMolStandardize.FeaturesValidation()
|
||||
errinfo = validator.validate(mol, True)
|
||||
self.assertEqual(len(errinfo), 1)
|
||||
self.assertEqual(
|
||||
errinfo[0], "ERROR: [FeaturesValidation] Enhanced stereochemistry features are not allowed")
|
||||
|
||||
# allow enhanced stereo
|
||||
validator = rdMolStandardize.FeaturesValidation(True)
|
||||
errinfo = validator.validate(mol, True)
|
||||
self.assertEqual(len(errinfo), 0)
|
||||
validator.allowEnhancedStereo = True
|
||||
errinfo = validator.validate(mol)
|
||||
self.assertEqual(len(errinfo), 0)
|
||||
|
||||
mol = Chem.MolFromMolBlock(
|
||||
'''
|
||||
Mrv2311 02272411562D
|
||||
|
||||
0 0 0 0 0 999 V3000
|
||||
M V30 BEGIN CTAB
|
||||
M V30 COUNTS 7 7 0 0 0
|
||||
M V30 BEGIN ATOM
|
||||
M V30 1 C -10.3542 4.29 0 0
|
||||
M V30 2 C -11.6879 3.52 0 0
|
||||
M V30 3 C -11.6879 1.9798 0 0
|
||||
M V30 4 N -10.3542 1.21 0 0
|
||||
M V30 5 C -9.0204 1.9798 0 0
|
||||
M V30 6 C -9.0204 3.52 0 0
|
||||
M V30 7 C -10.3542 5.83 0 0
|
||||
M V30 END ATOM
|
||||
M V30 BEGIN BOND
|
||||
M V30 1 4 1 2
|
||||
M V30 2 4 1 6
|
||||
M V30 3 4 2 3
|
||||
M V30 4 4 5 6
|
||||
M V30 5 1 1 7
|
||||
M V30 6 4 3 4
|
||||
M V30 7 4 4 5
|
||||
M V30 END BOND
|
||||
M V30 END CTAB
|
||||
M END
|
||||
''', sanitize=False)
|
||||
|
||||
# aromatic bonds are by default disallowed
|
||||
validator = rdMolStandardize.FeaturesValidation()
|
||||
errinfo = validator.validate(mol, True)
|
||||
self.assertEqual(len(errinfo), 6)
|
||||
self.assertEqual(errinfo[0],
|
||||
"ERROR: [FeaturesValidation] Bond 0 of aromatic type is not allowed")
|
||||
validator.allowAromaticBondType = True
|
||||
errinfo = validator.validate(mol)
|
||||
self.assertEqual(len(errinfo), 0)
|
||||
|
||||
# allow aromatic bonds
|
||||
validator = rdMolStandardize.FeaturesValidation(False, True)
|
||||
errinfo = validator.validate(mol, True)
|
||||
self.assertEqual(len(errinfo), 0)
|
||||
|
||||
# disallowedRadicalValidation
|
||||
mol = Chem.MolFromMolBlock(
|
||||
'''
|
||||
Mrv2311 02082417212D
|
||||
|
||||
0 0 0 0 0 999 V3000
|
||||
M V30 BEGIN CTAB
|
||||
M V30 COUNTS 2 1 0 0 0
|
||||
M V30 BEGIN ATOM
|
||||
M V30 1 C -20.9372 7.145 0 0 RAD=2
|
||||
M V30 2 C -22.2708 6.375 0 0
|
||||
M V30 END ATOM
|
||||
M V30 BEGIN BOND
|
||||
M V30 1 1 2 1
|
||||
M V30 END BOND
|
||||
M V30 END CTAB
|
||||
M END
|
||||
''', sanitize=False)
|
||||
|
||||
validator = rdMolStandardize.DisallowedRadicalValidation()
|
||||
errinfo = validator.validate(mol)
|
||||
self.assertEqual(len(errinfo), 1)
|
||||
self.assertEqual(errinfo[0],
|
||||
"ERROR: [DisallowedRadicalValidation] The radical at atom 0 is not allowed")
|
||||
|
||||
# is2DValidation
|
||||
mol = Chem.MolFromMolBlock(
|
||||
'''
|
||||
2D
|
||||
|
||||
0 0 0 0 0 999 V3000
|
||||
M V30 BEGIN CTAB
|
||||
M V30 COUNTS 2 1 0 0 0
|
||||
M V30 BEGIN ATOM
|
||||
M V30 1 C 0.8753 4.9367 0 0
|
||||
M V30 2 C -0.4583 4.1667 0 0
|
||||
M V30 END ATOM
|
||||
M V30 BEGIN BOND
|
||||
M V30 1 1 2 1
|
||||
M V30 END BOND
|
||||
M V30 END CTAB
|
||||
M END
|
||||
''', sanitize=False)
|
||||
|
||||
validator = rdMolStandardize.Is2DValidation()
|
||||
errinfo = validator.validate(mol)
|
||||
self.assertEqual(len(errinfo), 0)
|
||||
|
||||
conf = mol.GetConformer()
|
||||
pos = conf.GetAtomPosition(1)
|
||||
self.assertEqual(pos.z, 0.0)
|
||||
pos.z = 0.1
|
||||
conf.SetAtomPosition(1, pos)
|
||||
|
||||
validator = rdMolStandardize.Is2DValidation()
|
||||
errinfo = validator.validate(mol)
|
||||
self.assertEqual(len(errinfo), 1)
|
||||
self.assertEqual(errinfo[0],
|
||||
"ERROR: [Is2DValidation] The molecule includes non-null Z coordinates")
|
||||
|
||||
validator = rdMolStandardize.Is2DValidation(0.2)
|
||||
errinfo = validator.validate(mol)
|
||||
self.assertEqual(len(errinfo), 0)
|
||||
|
||||
mol = Chem.MolFromMolBlock(
|
||||
'''
|
||||
2D
|
||||
|
||||
0 0 0 0 0 999 V3000
|
||||
M V30 BEGIN CTAB
|
||||
M V30 COUNTS 2 1 0 0 0
|
||||
M V30 BEGIN ATOM
|
||||
M V30 1 C 0.8753 4.9367 0 0
|
||||
M V30 2 C -0.4583 4.1667 0.2 0
|
||||
M V30 END ATOM
|
||||
M V30 BEGIN BOND
|
||||
M V30 1 1 2 1
|
||||
M V30 END BOND
|
||||
M V30 END CTAB
|
||||
M END
|
||||
''', sanitize=False)
|
||||
validator = rdMolStandardize.Is2DValidation()
|
||||
errinfo = validator.validate(mol)
|
||||
self.assertEqual(len(errinfo), 1)
|
||||
self.assertEqual(errinfo[0],
|
||||
"ERROR: [Is2DValidation] The molecule includes non-null Z coordinates")
|
||||
|
||||
# AtomClashValidation
|
||||
mol = Chem.MolFromMolBlock(
|
||||
'''
|
||||
2D
|
||||
|
||||
0 0 0 0 0 999 V3000
|
||||
M V30 BEGIN CTAB
|
||||
M V30 COUNTS 6 5 0 0 0
|
||||
M V30 BEGIN ATOM
|
||||
M V30 1 C -1.6667 6.2067 0 0
|
||||
M V30 2 C -3.0004 5.4367 0 0
|
||||
M V30 3 C -3.0004 3.8965 0 0
|
||||
M V30 4 C -1.6667 3.1267 0 0
|
||||
M V30 5 C -0.3329 4.6000 0 0
|
||||
M V30 6 C -0.3329 4.7000 0 0
|
||||
M V30 END ATOM
|
||||
M V30 BEGIN BOND
|
||||
M V30 1 1 1 2
|
||||
M V30 2 1 1 6
|
||||
M V30 3 1 2 3
|
||||
M V30 4 1 3 4
|
||||
M V30 5 1 4 5
|
||||
M V30 END BOND
|
||||
M V30 END CTAB
|
||||
M END
|
||||
''', sanitize=False)
|
||||
|
||||
validator = rdMolStandardize.Layout2DValidation()
|
||||
errinfo = validator.validate(mol)
|
||||
self.assertEqual(len(errinfo), 1)
|
||||
self.assertEqual(errinfo[0], "ERROR: [Layout2DValidation] Atom 4 is too close to atom 5")
|
||||
|
||||
validator = rdMolStandardize.Layout2DValidation(1e-3)
|
||||
errinfo = validator.validate(mol)
|
||||
self.assertEqual(len(errinfo), 0)
|
||||
|
||||
mol = Chem.MolFromMolBlock(
|
||||
'''
|
||||
10052311582D
|
||||
|
||||
0 0 0 0 0 999 V3000
|
||||
M V30 BEGIN CTAB
|
||||
M V30 COUNTS 5 4 0 0 0
|
||||
M V30 BEGIN ATOM
|
||||
M V30 1 Br 0.0003 7.27 0 0
|
||||
M V30 2 C -1.3333 6.5 0 0
|
||||
M V30 3 F -2.667 7.27 0 0
|
||||
M V30 4 O -1.3333 4.96 0 0
|
||||
M V30 5 C 0.0003 5.73 0 0
|
||||
M V30 END ATOM
|
||||
M V30 BEGIN BOND
|
||||
M V30 1 1 2 5 CFG=1
|
||||
M V30 2 1 2 3 CFG=3
|
||||
M V30 3 1 2 1
|
||||
M V30 4 1 2 4
|
||||
M V30 END BOND
|
||||
M V30 END CTAB
|
||||
M END
|
||||
''', sanitize=False)
|
||||
|
||||
Chem.ReapplyMolBlockWedging(mol)
|
||||
|
||||
validator = rdMolStandardize.StereoValidation()
|
||||
errinfo = validator.validate(mol)
|
||||
self.assertEqual(len(errinfo), 1)
|
||||
self.assertEqual(
|
||||
errinfo[0],
|
||||
"ERROR: [StereoValidation] Atom 1 has opposing stereo bonds with different up/down orientation"
|
||||
)
|
||||
|
||||
def test24Pipeline(self):
|
||||
pipeline = rdMolStandardize.Pipeline()
|
||||
|
||||
# invalid input molblock
|
||||
molblock = '''
|
||||
sldfj;ldskfj sldkjfsd;lkf
|
||||
M V30 BEGIN CTAB
|
||||
'''
|
||||
result = pipeline.run(molblock)
|
||||
self.assertEqual(result.stage, rdMolStandardize.PipelineStage.PARSING_INPUT)
|
||||
self.assertNotEqual(result.status, rdMolStandardize.PipelineStatus.NO_EVENT)
|
||||
self.assertTrue(result.status & rdMolStandardize.PipelineStatus.INPUT_ERROR)
|
||||
|
||||
# R group
|
||||
molblock = '''
|
||||
Mrv2311 01162413552D
|
||||
|
||||
0 0 0 0 0 999 V3000
|
||||
M V30 BEGIN CTAB
|
||||
M V30 COUNTS 2 1 0 0 0
|
||||
M V30 BEGIN ATOM
|
||||
M V30 1 R# -17.3747 6.9367 0 0 RGROUPS=(1 0)
|
||||
M V30 2 C -18.7083 6.1667 0 0
|
||||
M V30 END ATOM
|
||||
M V30 BEGIN BOND
|
||||
M V30 1 1 2 1
|
||||
M V30 END BOND
|
||||
M V30 END CTAB
|
||||
M END
|
||||
'''
|
||||
result = pipeline.run(molblock)
|
||||
self.assertEqual(result.stage, rdMolStandardize.PipelineStage.COMPLETED)
|
||||
self.assertNotEqual(result.status, rdMolStandardize.PipelineStatus.NO_EVENT)
|
||||
self.assertTrue(result.status & rdMolStandardize.PipelineStatus.VALIDATION_ERROR)
|
||||
self.assertTrue(result.status & rdMolStandardize.PipelineStatus.FEATURES_VALIDATION_ERROR)
|
||||
|
||||
# no atoms
|
||||
molblock = '''
|
||||
10052313452D
|
||||
|
||||
0 0 0 0 0 999 V3000
|
||||
M V30 BEGIN CTAB
|
||||
M V30 COUNTS 0 0 0 0 0
|
||||
M V30 END CTAB
|
||||
M END
|
||||
'''
|
||||
result = pipeline.run(molblock)
|
||||
self.assertEqual(result.stage, rdMolStandardize.PipelineStage.COMPLETED)
|
||||
self.assertNotEqual(result.status, rdMolStandardize.PipelineStatus.NO_EVENT)
|
||||
self.assertTrue(result.status & rdMolStandardize.PipelineStatus.VALIDATION_ERROR)
|
||||
self.assertTrue(result.status & rdMolStandardize.PipelineStatus.BASIC_VALIDATION_ERROR)
|
||||
|
||||
# neutral quaternary N
|
||||
molblock = '''
|
||||
10242314442D
|
||||
|
||||
0 0 0 0 0 999 V3000
|
||||
M V30 BEGIN CTAB
|
||||
M V30 COUNTS 5 4 0 0 0
|
||||
M V30 BEGIN ATOM
|
||||
M V30 1 C -1.6247 7.5825 0 0
|
||||
M V30 2 N -2.9583 6.8125 0 0
|
||||
M V30 3 C -4.292 7.5825 0 0
|
||||
M V30 4 C -2.9583 5.2725 0 0
|
||||
M V30 5 C -1.6247 6.0425 0 0
|
||||
M V30 END ATOM
|
||||
M V30 BEGIN BOND
|
||||
M V30 1 1 2 1
|
||||
M V30 2 1 2 3
|
||||
M V30 3 1 2 4
|
||||
M V30 4 1 2 5
|
||||
M V30 END BOND
|
||||
M V30 END CTAB
|
||||
M END
|
||||
'''
|
||||
result = pipeline.run(molblock)
|
||||
self.assertEqual(result.stage, rdMolStandardize.PipelineStage.COMPLETED)
|
||||
self.assertNotEqual(result.status, rdMolStandardize.PipelineStatus.NO_EVENT)
|
||||
self.assertTrue(result.status & rdMolStandardize.PipelineStatus.VALIDATION_ERROR)
|
||||
#self.assertTrue(result.status & rdMolStandardize.PipelineStatus.STANDARDIZATION_ERROR)
|
||||
self.assertEqual(
|
||||
result.status,
|
||||
(
|
||||
rdMolStandardize.PipelineStatus.BASIC_VALIDATION_ERROR
|
||||
| rdMolStandardize.PipelineStatus.PREPARE_FOR_STANDARDIZATION_ERROR #|
|
||||
#rdMolStandardize.PipelineStatus.NORMALIZER_STANDARDIZATION_ERROR
|
||||
))
|
||||
|
||||
molblock = '''
|
||||
2D
|
||||
|
||||
0 0 0 0 0 999 V3000
|
||||
M V30 BEGIN CTAB
|
||||
M V30 COUNTS 2 1 0 0 0
|
||||
M V30 BEGIN ATOM
|
||||
M V30 1 C 0.8753 4.9367 0 0
|
||||
M V30 2 C -0.4583 4.1667 0.2 0
|
||||
M V30 END ATOM
|
||||
M V30 BEGIN BOND
|
||||
M V30 1 1 2 1
|
||||
M V30 END BOND
|
||||
M V30 END CTAB
|
||||
M END
|
||||
'''
|
||||
|
||||
result = pipeline.run(molblock)
|
||||
self.assertEqual(result.stage, rdMolStandardize.PipelineStage.COMPLETED)
|
||||
self.assertNotEqual(result.status, rdMolStandardize.PipelineStatus.NO_EVENT)
|
||||
self.assertTrue(result.status & rdMolStandardize.PipelineStatus.VALIDATION_ERROR)
|
||||
self.assertTrue(result.status & rdMolStandardize.PipelineStatus.IS2D_VALIDATION_ERROR)
|
||||
|
||||
molblock = '''
|
||||
2D
|
||||
|
||||
0 0 0 0 0 999 V3000
|
||||
M V30 BEGIN CTAB
|
||||
M V30 COUNTS 4 3 0 0 0
|
||||
M V30 BEGIN ATOM
|
||||
M V30 1 C -3.05 5.48 0 0
|
||||
M V30 2 C -4.4167 4.6875 0 0
|
||||
M V30 3 C -4.3289 6.3627 0 0
|
||||
M V30 4 C -3.0 5.5 0 0
|
||||
M V30 END ATOM
|
||||
M V30 BEGIN BOND
|
||||
M V30 1 1 2 1
|
||||
M V30 2 1 1 3
|
||||
M V30 3 1 3 4
|
||||
M V30 END BOND
|
||||
M V30 END CTAB
|
||||
M END
|
||||
'''
|
||||
|
||||
result = pipeline.run(molblock)
|
||||
self.assertEqual(result.stage, rdMolStandardize.PipelineStage.COMPLETED)
|
||||
self.assertNotEqual(result.status, rdMolStandardize.PipelineStatus.NO_EVENT)
|
||||
self.assertTrue(result.status & rdMolStandardize.PipelineStatus.VALIDATION_ERROR)
|
||||
self.assertTrue(result.status & rdMolStandardize.PipelineStatus.LAYOUT2D_VALIDATION_ERROR)
|
||||
|
||||
molblock = '''
|
||||
2D
|
||||
|
||||
0 0 0 0 0 999 V3000
|
||||
M V30 BEGIN CTAB
|
||||
M V30 COUNTS 5 4 0 0 0
|
||||
M V30 BEGIN ATOM
|
||||
M V30 1 C -1.583 5.7075 0 0
|
||||
M V30 2 C -2.9167 4.9375 0 0
|
||||
M V30 3 C -1.583 7.2475 0 0
|
||||
M V30 4 C -0.2493 4.9375 0.5 0
|
||||
M V30 5 C -1.583 4.1675 0 0
|
||||
M V30 END ATOM
|
||||
M V30 BEGIN BOND
|
||||
M V30 1 1 1 2 CFG=1
|
||||
M V30 2 1 1 3 CFG=1
|
||||
M V30 3 1 1 4
|
||||
M V30 4 1 1 5
|
||||
M V30 END BOND
|
||||
M V30 END CTAB
|
||||
M END
|
||||
'''
|
||||
|
||||
result = pipeline.run(molblock)
|
||||
self.assertEqual(result.stage, rdMolStandardize.PipelineStage.COMPLETED)
|
||||
self.assertNotEqual(result.status, rdMolStandardize.PipelineStatus.NO_EVENT)
|
||||
self.assertTrue(result.status & rdMolStandardize.PipelineStatus.VALIDATION_ERROR)
|
||||
self.assertEqual(
|
||||
result.status, rdMolStandardize.PipelineStatus.IS2D_VALIDATION_ERROR
|
||||
| rdMolStandardize.PipelineStatus.STEREO_VALIDATION_ERROR)
|
||||
|
||||
molblock = '''
|
||||
10282320572D
|
||||
|
||||
0 0 0 0 0 999 V3000
|
||||
M V30 BEGIN CTAB
|
||||
M V30 COUNTS 5 4 0 0 0
|
||||
M V30 BEGIN ATOM
|
||||
M V30 1 C -1.0413 5.4992 0 0
|
||||
M V30 2 C -2.375 4.7292 0 0
|
||||
M V30 3 O -1.0413 7.0392 0 0
|
||||
M V30 4 O 0.2924 4.7292 0 0
|
||||
M V30 5 Na 0.2924 3.1892 0 0
|
||||
M V30 END ATOM
|
||||
M V30 BEGIN BOND
|
||||
M V30 1 1 2 1
|
||||
M V30 2 1 1 4
|
||||
M V30 3 2 1 3
|
||||
M V30 4 1 4 5
|
||||
M V30 END BOND
|
||||
M V30 END CTAB
|
||||
M END
|
||||
'''
|
||||
|
||||
result = pipeline.run(molblock)
|
||||
self.assertEqual(result.stage, rdMolStandardize.PipelineStage.COMPLETED)
|
||||
self.assertEqual((result.status & rdMolStandardize.PipelineStatus.PIPELINE_ERROR),
|
||||
rdMolStandardize.PipelineStatus.NO_EVENT)
|
||||
self.assertNotEqual((result.status & rdMolStandardize.PipelineStatus.STRUCTURE_MODIFICATION),
|
||||
rdMolStandardize.PipelineStatus.STRUCTURE_MODIFICATION)
|
||||
self.assertEqual((result.status & rdMolStandardize.PipelineStatus.STRUCTURE_MODIFICATION),
|
||||
(rdMolStandardize.PipelineStatus.METALS_DISCONNECTED
|
||||
| rdMolStandardize.PipelineStatus.FRAGMENTS_REMOVED
|
||||
| rdMolStandardize.PipelineStatus.PROTONATION_CHANGED))
|
||||
|
||||
parentMol = Chem.MolFromMolBlock(result.parentMolData, sanitize=False)
|
||||
parentSmiles = Chem.MolToSmiles(parentMol)
|
||||
self.assertEqual(parentSmiles, "CC(=O)O")
|
||||
|
||||
outputMol = Chem.MolFromMolBlock(result.outputMolData, sanitize=False)
|
||||
outputSmiles = Chem.MolToSmiles(outputMol)
|
||||
self.assertEqual(outputSmiles, "CC(=O)O")
|
||||
|
||||
molblock = '''
|
||||
10282320572D
|
||||
|
||||
0 0 0 0 0 999 V3000
|
||||
M V30 BEGIN CTAB
|
||||
M V30 COUNTS 4 3 0 0 0
|
||||
M V30 BEGIN ATOM
|
||||
M V30 1 N -1.0413 5.4992 0 0
|
||||
M V30 2 C -2.375 4.7292 0 0
|
||||
M V30 3 O -1.0413 7.0392 0 0
|
||||
M V30 4 O 0.2924 4.7292 0 0
|
||||
M V30 END ATOM
|
||||
M V30 BEGIN BOND
|
||||
M V30 1 1 2 1
|
||||
M V30 2 2 1 4
|
||||
M V30 3 2 1 3
|
||||
M V30 END BOND
|
||||
M V30 END CTAB
|
||||
M END
|
||||
'''
|
||||
|
||||
result = pipeline.run(molblock)
|
||||
self.assertEqual(result.stage, rdMolStandardize.PipelineStage.COMPLETED)
|
||||
# nitro groups are cleaned-up in a pre-standardization step
|
||||
self.assertEqual((result.status & rdMolStandardize.PipelineStatus.PIPELINE_ERROR),
|
||||
rdMolStandardize.PipelineStatus.NO_EVENT)
|
||||
self.assertEqual((result.status & rdMolStandardize.PipelineStatus.STRUCTURE_MODIFICATION),
|
||||
rdMolStandardize.PipelineStatus.NO_EVENT)
|
||||
|
||||
parentMol = Chem.MolFromMolBlock(result.parentMolData, sanitize=False)
|
||||
parentSmiles = Chem.MolToSmiles(parentMol)
|
||||
self.assertEqual(parentSmiles, "C[N+](=O)[O-]")
|
||||
|
||||
outputMol = Chem.MolFromMolBlock(result.outputMolData, sanitize=False)
|
||||
outputSmiles = Chem.MolToSmiles(outputMol)
|
||||
self.assertEqual(outputSmiles, "C[N+](=O)[O-]")
|
||||
|
||||
molblock = '''
|
||||
10282320572D
|
||||
|
||||
0 0 0 0 0 999 V3000
|
||||
M V30 BEGIN CTAB
|
||||
M V30 COUNTS 6 5 0 0 0
|
||||
M V30 BEGIN ATOM
|
||||
M V30 1 C -1.0413 5.4992 0 0
|
||||
M V30 2 C -2.375 4.7292 0 0
|
||||
M V30 3 O -1.0413 7.0392 0 0
|
||||
M V30 4 O 0.2924 4.7292 0 0
|
||||
M V30 5 N -3.7087 5.4992 0 0 CHG=1
|
||||
M V30 6 Na 0.2924 3.1892 0 0
|
||||
M V30 END ATOM
|
||||
M V30 BEGIN BOND
|
||||
M V30 1 1 2 1
|
||||
M V30 2 1 1 4
|
||||
M V30 3 2 1 3
|
||||
M V30 4 1 2 5
|
||||
M V30 5 1 4 6
|
||||
M V30 END BOND
|
||||
M V30 END CTAB
|
||||
M END
|
||||
'''
|
||||
|
||||
result = pipeline.run(molblock)
|
||||
self.assertEqual(result.stage, rdMolStandardize.PipelineStage.COMPLETED)
|
||||
self.assertEqual((result.status & rdMolStandardize.PipelineStatus.PIPELINE_ERROR),
|
||||
rdMolStandardize.PipelineStatus.NO_EVENT)
|
||||
self.assertNotEqual((result.status & rdMolStandardize.PipelineStatus.STRUCTURE_MODIFICATION),
|
||||
rdMolStandardize.PipelineStatus.STRUCTURE_MODIFICATION)
|
||||
self.assertEqual((result.status & rdMolStandardize.PipelineStatus.STRUCTURE_MODIFICATION),
|
||||
(rdMolStandardize.PipelineStatus.METALS_DISCONNECTED
|
||||
| rdMolStandardize.PipelineStatus.FRAGMENTS_REMOVED))
|
||||
|
||||
parentMol = Chem.MolFromMolBlock(result.parentMolData, sanitize=False)
|
||||
parentSmiles = Chem.MolToSmiles(parentMol)
|
||||
self.assertEqual(parentSmiles, "NCC(=O)O")
|
||||
|
||||
outputMol = Chem.MolFromMolBlock(result.outputMolData, sanitize=False)
|
||||
outputSmiles = Chem.MolToSmiles(outputMol)
|
||||
self.assertEqual(outputSmiles, "[NH3+]CC(=O)[O-]")
|
||||
|
||||
def test25PipelineNormalizerOptions(self):
|
||||
options = rdMolStandardize.PipelineOptions()
|
||||
# run the pipeline w/ the RDKit default normalizer transforms
|
||||
options.normalizerData = ''
|
||||
pipeline = rdMolStandardize.Pipeline(options)
|
||||
|
||||
molblock = '''
|
||||
Mrv2311 02072415362D
|
||||
|
||||
0 0 0 0 0 999 V3000
|
||||
M V30 BEGIN CTAB
|
||||
M V30 COUNTS 4 3 0 0 0
|
||||
M V30 BEGIN ATOM
|
||||
M V30 1 S -10.3538 4.27 0 0
|
||||
M V30 2 C -11.6875 3.5 0 0
|
||||
M V30 3 O -10.3538 5.81 0 0
|
||||
M V30 4 C -9.0201 3.5 0 0
|
||||
M V30 END ATOM
|
||||
M V30 BEGIN BOND
|
||||
M V30 1 1 2 1
|
||||
M V30 2 1 1 4
|
||||
M V30 3 2 1 3
|
||||
M V30 END BOND
|
||||
M V30 END CTAB
|
||||
M END
|
||||
'''
|
||||
result = pipeline.run(molblock)
|
||||
self.assertEqual(result.stage, rdMolStandardize.PipelineStage.COMPLETED)
|
||||
self.assertEqual((result.status & rdMolStandardize.PipelineStatus.PIPELINE_ERROR),
|
||||
rdMolStandardize.PipelineStatus.NO_EVENT)
|
||||
self.assertNotEqual((result.status & rdMolStandardize.PipelineStatus.STRUCTURE_MODIFICATION),
|
||||
rdMolStandardize.PipelineStatus.STRUCTURE_MODIFICATION)
|
||||
self.assertEqual((result.status & rdMolStandardize.PipelineStatus.STRUCTURE_MODIFICATION),
|
||||
rdMolStandardize.PipelineStatus.NORMALIZATION_APPLIED)
|
||||
|
||||
outputMol = Chem.MolFromMolBlock(result.outputMolData, sanitize=False)
|
||||
outputSmiles = Chem.MolToSmiles(outputMol)
|
||||
self.assertEqual(outputSmiles, "C[S+](C)[O-]")
|
||||
|
||||
def test26PipelineAllowEmptyMoleculesOption(self):
|
||||
options = rdMolStandardize.PipelineOptions()
|
||||
options.allowEmptyMolecules = True
|
||||
pipeline = rdMolStandardize.Pipeline(options)
|
||||
|
||||
# no atoms
|
||||
molblock = '''
|
||||
10052313452D
|
||||
|
||||
0 0 0 0 0 999 V3000
|
||||
M V30 BEGIN CTAB
|
||||
M V30 COUNTS 0 0 0 0 0
|
||||
M V30 END CTAB
|
||||
M END
|
||||
'''
|
||||
result = pipeline.run(molblock)
|
||||
self.assertEqual(result.stage, rdMolStandardize.PipelineStage.COMPLETED)
|
||||
self.assertEqual(result.status, rdMolStandardize.PipelineStatus.NO_EVENT)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
|
||||
2419
Code/GraphMol/MolStandardize/testPipeline.cpp
Normal file
2419
Code/GraphMol/MolStandardize/testPipeline.cpp
Normal file
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -1,4 +1,4 @@
|
||||
# Require SWIG and load the helper module it provides via SWIG_USE_FILE.
# NOTE(review): the two find_package lines below look like the removed (4.1)
# and added (4.2) sides of a diff hunk rather than two live calls; presumably
# only the 4.2 requirement exists in the real CMakeLists -- confirm against
# the repository before relying on this text.
find_package(SWIG 4.1 REQUIRED)
|
||||
find_package(SWIG 4.2 REQUIRED)
|
||||
include(${SWIG_USE_FILE})
|
||||
|
||||
# Add this directory to the header search path.
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
|
||||
|
||||
@@ -1,5 +1,27 @@
|
||||
%{
|
||||
#include <GraphMol/MolStandardize/MolStandardize.h>
|
||||
#include <GraphMol/MolStandardize/Pipeline.h>
|
||||
|
||||
namespace RDKit {
|
||||
namespace MolStandardize {
|
||||
bool operator==(const PipelineLogEntry & rhs, const PipelineLogEntry & lhs) {
|
||||
return (rhs.status == lhs.status) && (rhs.detail == lhs.detail);
|
||||
}
|
||||
bool operator!=(const PipelineLogEntry & rhs, const PipelineLogEntry & lhs) {
|
||||
return !(rhs == lhs);
|
||||
}
|
||||
}
|
||||
}
|
||||
%}
|
||||
|
||||
// Pull in SWIG's std::vector support, then instantiate the vector wrapper for
// PipelineLogEntry and expose it to the target language as "PipelineLog".
%include <std_vector.i>
|
||||
namespace std {
|
||||
%template(PipelineLog) std::vector<RDKit::MolStandardize::PipelineLogEntry>;
|
||||
}
|
||||
|
||||
// Wrap the declarations of the existing MolStandardize header.
%include <GraphMol/MolStandardize/MolStandardize.h>
|
||||
|
||||
// Java builds only: switch to SWIG's "enumtypeunsafe" enum wrapping before
// Pipeline.h is processed.
// NOTE(review): presumably so that pipeline status/stage values can be
// combined with bitwise operators and compared as plain integers from Java
// -- confirm against the SWIG Java documentation.
#if defined SWIGJAVA
|
||||
%include "enumtypeunsafe.swg"
|
||||
#endif
|
||||
// Wrap the declarations of the Pipeline header.
%include <GraphMol/MolStandardize/Pipeline.h>
|
||||
|
||||
@@ -14,6 +14,53 @@ import org.junit.*;
|
||||
public void testStandardize1() {
|
||||
assertEquals("fail", RDKFuncs.standardizeSmiles("[Na]OC(=O)c1ccccc1"),"O=C([O-])c1ccccc1.[Na+]");
|
||||
}
|
||||
@Test
|
||||
public void testPipelineBadInput() {
|
||||
Pipeline pipeline = new Pipeline();
|
||||
|
||||
PipelineResult result = pipeline.run(
|
||||
"\n" +
|
||||
" sldfj;ldskfj sldkjfsd;lkf\n" +
|
||||
"M V30 BEGIN CTAB"
|
||||
);
|
||||
|
||||
assertEquals(result.getStage(), PipelineStage.PARSING_INPUT);
|
||||
assertFalse(result.getStatus() == PipelineStatus.NO_EVENT);
|
||||
assertTrue((result.getStatus() & PipelineStatus.INPUT_ERROR) != PipelineStatus.NO_EVENT);
|
||||
|
||||
result.delete();
|
||||
pipeline.delete();
|
||||
}
|
||||
@Test
|
||||
public void testPipelineUnsupportedFeatures() {
|
||||
Pipeline pipeline = new Pipeline();
|
||||
|
||||
PipelineResult result = pipeline.run(
|
||||
"\n" +
|
||||
" Mrv2311 01162413552D \n" +
|
||||
"\n" +
|
||||
" 0 0 0 0 0 999 V3000\n" +
|
||||
"M V30 BEGIN CTAB\n" +
|
||||
"M V30 COUNTS 2 1 0 0 0\n" +
|
||||
"M V30 BEGIN ATOM\n" +
|
||||
"M V30 1 R# -17.3747 6.9367 0 0 RGROUPS=(1 0)\n" +
|
||||
"M V30 2 C -18.7083 6.1667 0 0\n" +
|
||||
"M V30 END ATOM\n" +
|
||||
"M V30 BEGIN BOND\n" +
|
||||
"M V30 1 1 2 1\n" +
|
||||
"M V30 END BOND\n" +
|
||||
"M V30 END CTAB\n" +
|
||||
"M END"
|
||||
);
|
||||
|
||||
assertEquals(result.getStage(), PipelineStage.COMPLETED);
|
||||
assertFalse(result.getStatus() == PipelineStatus.NO_EVENT);
|
||||
assertTrue((result.getStatus() & PipelineStatus.VALIDATION_ERROR) != PipelineStatus.NO_EVENT);
|
||||
assertTrue((result.getStatus() & PipelineStatus.FEATURES_VALIDATION_ERROR) != PipelineStatus.NO_EVENT);
|
||||
|
||||
result.delete();
|
||||
pipeline.delete();
|
||||
}
|
||||
public static void main(String args[]) {
|
||||
org.junit.runner.JUnitCore.main("org.RDKit.MolStandardizeTest");
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user