mirror of
https://github.com/rdkit/rdkit.git
synced 2026-06-07 22:44:25 +08:00
* StructChecker changes. Initial commit. First implementation. Added some tests. * StructChecker: add GoodAtoms and AcidicAtoms. new updates * StructChecker: add new tests * StructChecker: added TransformAugmentedAtoms() * StructCheck: add structCheck to GraphMol. Fix compilation errors. * StructChecker: add stereo verification and some utilities. * StructChecker: function FixDubious3DMolecule was added * StructChecker: checkStereo added. done with stereo. * StructChecker: add StripSmallFragments() * StructChecker: add AtomClash() function. Some cosmetic + tests * StructChecker: checkAtoms() was started * StructChecker: checkAtoms is ready * StructChecker: user RingInfo from RDkit. Start regarge * StructChecker: ReCharge molecule method prototype * StructChecker: updates for ReCharge. Almost finished * StructChecker: all ReCharge is done except external data tables loading * StructChecker: add path tables into API. ReCharge completed * Adds augmented atom data Signed-off-by: Brian Kelley <brian.kelley@novartis.com> * Removes extra files Signed-off-by: Brian Kelley <brian.kelley@novartis.com> * Adds path to test data via RDBASE environment Signed-off-by: Brian Kelley <brian.kelley@novartis.com> * Revert "Struct checker apr15" * StructChecker: add missing tautomer tests * Updates test to use RDBASE * Adds initialization of data from data section * Adds Python API and tests * Fixes namespace for enum * StructChecker: update/imporve strip small fragments * StructChecker: fix acidic atoms (but logic does not work) * StructChecker: fix match issue for CheckAtoms * Adds macro guards * Adds loading API and proper constructor * Fixes tests, adds stereo test * Fixes crash bug, matches[0] was being accessed from an empty match vector * Reverts crash fix - conflicts with previous * Adds the rest of the structure checker options * StructChecker: fix atom matching for aromatic rings * StructChecker: add tautomers checks. Update some tests * StructChecker: stereo fixes. 
Add some tests * StructChecker: fix check atoms. Start ligand symbol list * StructChecker: fix some check atoms validation. Add Tranform to query lists. Start correct loading augmented atoms * update * another set of fixes * StructChecker: fix loadDefaultAugmentedAtoms. Some changes in CheckAtom + tests + debug conditional breakpoints (TEMP operators) * StructChecker: rewrited RecMatch() to sequential. Changed bond matching algorithm. small bug fixes * Adds better logging of mismatched atoms * Removes duplicated negative charge * Fixes charges * Adds nitro group test * StructChecker: add better logging * remove double logging * Reformats code using RDKit's clang-format style * StructChecker: Fix charge reformat using RDKit format. * StructChecker: compilation restore after merge * restore bond matching * Removes the same fragments that strucheck does in case of ties * Don't resanitize - this adds aromaticity which mucks things up * Adds empty molecule checks * Fixes atom clashes. * Removes debug printing * Removes debug logging info * First pass at stereo fixes * Fixes off by one error for dubious stereo fix * Fixes more off by one errors * Fixes more off by one errors * More off by one fixes. * Another off by one * Fixes chiral flag set in molfile check * Copies chiral flag over to largest fragment if necessary * Poor man’s parity check. * Find unspecified chiral centers ala Avalon. * StructChecker: fix recursive match. Fix transformations * StructChecker: fix transformation for atom list (using query atoms) * Fixes checks && to & * StructChecker: fix carboxylic acids tranform issue. Atom list is changed only if different * StructChecker: documentation was updated * Fixes snprintf and silences some warnings * Adds Get/Set StructCheckerOptions * Adds default AugmentedAtomTransforms
281 lines
8.9 KiB
C++
281 lines
8.9 KiB
C++
//
|
|
// Copyright (C) 2016 Novartis Institutes for BioMedical Research
|
|
//
|
|
// @@ All Rights Reserved @@
|
|
// This file is part of the RDKit.
|
|
// The contents are covered by the terms of the BSD license
|
|
// which is included in the file license.txt, found at the root
|
|
// of the RDKit source tree.
|
|
//
|
|
#pragma once
|
|
#ifndef _RD_STRUCTCHECKER_H__
|
|
#define _RD_STRUCTCHECKER_H__
|
|
|
|
#include <map>
#include <string>
#include <utility>
#include <vector>
#include "../RDKitBase.h"
|
|
|
|
/* Example of Usage
|
|
1) StructChecker chk;
|
|
unsigned flags = chk.checkMolStructure( mol ); // use defaults
|
|
or
|
|
2)
|
|
StructureCheck::StructCheckerOptions options; // use defaults
|
|
// To use external data
|
|
StructureCheck::loadOptionsFromFiles(options, file1, file2, …);
|
|
StructChecker chk(options);
|
|
|
|
for( mol in mols ) {
|
|
unsigned flags = chk.checkMolStructure( mol );
|
|
if (0!=(flags & StructureCheck::StructChecker::BAD_SET)) {
|
|
// write to error file
|
|
} else if (0!=(flags & StructureCheck::StructChecker::TRANSFORMED_SET))
|
|
{
|
|
// input molecule was transformed
|
|
} else { // flags == NO_CHANGE
|
|
// no change
|
|
}
|
|
}
|
|
*/
|
|
|
|
namespace RDKit {
|
|
namespace StructureCheck {
|
|
|
|
// Flags for the return values of the StructureChecker are declared in
// StructChecker::StructureFlags (see the StructChecker class below)
|
|
|
|
// TypeDefs for translating augmented atom pairs
|
|
// Default value of the Charge field of Ligand and AugmentedAtom.
// NOTE(review): judging by the name this is a wildcard meaning "any formal
// charge" — confirm against the matching code in the implementation.
static const int ANY_CHARGE = 8;
|
|
// Radical state stored in the Radical field of Ligand and AugmentedAtom.
// Unscoped on purpose: the enumerators are used unqualified elsewhere in this
// header (e.g. ANY_RADICAL in the default constructors).
enum RadicalType {
  RT_NONE = 0,
  SINGLET = 1,
  DOUBLET = 2,
  TRIPLET = 3,
  // Default Radical of Ligand and AugmentedAtom; presumably matches any
  // radical state (name-based assumption — confirm in the matcher).
  ANY_RADICAL = 0xFF
};
|
|
|
|
// Bond type of a ligand bond in an augmented-atom description:
// MDL CTFile bond types plus extensions.
enum AABondType {
  BT_NONE = 0,  // means REMOVE Bond
  SINGLE = 1,
  DOUBLE = 2,
  TRIPLE = 3,
  AROMATIC = 4,
  SINGLE_DOUBLE = 5,
  SINGLE_AROMATIC = 6,
  DOUBLE_AROMATIC = 7,
  ANY_BOND = 8,  // default BondType of a Ligand
  ALL_BOND_TYPES = 0xF
};
|
|
|
|
// Topology constraint stored in AugmentedAtom::Topology.
enum AATopology {
  TP_NONE = 0,  // Don't care
  RING = 1,     // Ring
  CHAIN = 2     // Chain
};
|
|
|
|
struct Ligand {
|
|
std::string AtomSymbol;
|
|
int Charge;
|
|
RadicalType Radical;
|
|
unsigned SubstitutionCount; // substitution count 0 = don't care
|
|
AABondType BondType;
|
|
Ligand()
|
|
: Charge(ANY_CHARGE),
|
|
Radical(ANY_RADICAL),
|
|
SubstitutionCount(0),
|
|
BondType(ANY_BOND) {}
|
|
};
|
|
|
|
struct AugmentedAtom {
|
|
std::string AtomSymbol;
|
|
std::string ShortName;
|
|
int Charge;
|
|
RadicalType Radical;
|
|
AATopology Topology;
|
|
std::vector<Ligand> Ligands;
|
|
|
|
AugmentedAtom()
|
|
: Charge(ANY_CHARGE), Radical(ANY_RADICAL), Topology(TP_NONE) {}
|
|
|
|
AugmentedAtom(const std::string &symbol, const std::string &name, int charge,
|
|
RadicalType radical, AATopology topology)
|
|
: AtomSymbol(symbol),
|
|
ShortName(name),
|
|
Charge(charge),
|
|
Radical(radical),
|
|
Topology(topology) {}
|
|
};
|
|
|
|
// One row of an increment table keyed by atom symbol (element type of
// StructCheckerOptions::AtomAcidity and ::ChargeIncTable).
// Plain aggregate: no constructor, so members are NOT initialised unless the
// caller uses aggregate/value initialisation. Member order is part of the
// interface for brace-initialisation and must not change.
// NOTE(review): the exact meaning of the four increments is defined by the
// recharge code in the implementation — confirm there.
struct IncEntry {
  std::string AtomSymbol;
  double LocalInc;
  double AlphaInc;
  double BetaInc;
  double MultInc;

  // Used for logging
  int local_inc_used;
  int alpha_inc_used;
  int beta_inc_used;
  int mult_inc_used;
};
|
|
|
|
struct PathEntry {
|
|
AugmentedAtom Path;
|
|
double Cond;
|
|
// Used for logging
|
|
int cond_used;
|
|
};
|
|
//-------------
|
|
|
|
////////////////////////////////////////////////////////////////////////////
|
|
// Structure Check Options
|
|
// Holds all the user options for the StructureChecking.
|
|
// Can be initialized from factory functions, perhaps serialized
|
|
|
|
struct StructCheckerOptions {
|
|
double AcidityLimit;
|
|
bool RemoveMinorFragments;
|
|
int DesiredCharge;
|
|
bool CheckCollisions;
|
|
int CollisionLimitPercent;
|
|
unsigned MaxMolSize;
|
|
bool ConvertSText;
|
|
bool SqueezeIdentifiers;
|
|
bool StripZeros;
|
|
bool CheckStereo;
|
|
bool ConvertAtomTexts;
|
|
bool GroupsToSGroups;
|
|
bool Verbose;
|
|
|
|
// Internal data for struchk
|
|
std::vector<std::pair<AugmentedAtom, AugmentedAtom> > AugmentedAtomPairs;
|
|
std::vector<AugmentedAtom> AcidicAtoms;
|
|
std::vector<AugmentedAtom> GoodAtoms;
|
|
std::vector<ROMOL_SPTR> Patterns;
|
|
std::vector<ROMOL_SPTR> RotatePatterns;
|
|
std::vector<ROMOL_SPTR> StereoPatterns;
|
|
std::vector<ROMOL_SPTR> FromTautomer;
|
|
std::vector<ROMOL_SPTR> ToTautomer;
|
|
|
|
double Elneg0; // elneg_table[0].value;
|
|
std::map<unsigned, double> ElnegTable; // AtomicNumber -> eleng
|
|
std::vector<IncEntry> AtomAcidity; // atom_acidity_table[]
|
|
std::vector<IncEntry> ChargeIncTable;
|
|
// std::map AtomSymbol(or AtomicNumber) -> IncEntry
|
|
/* [ReadTransformation() ]
|
|
* The alpha, beta coefficients of the transfomation function used
|
|
* to stretch the preliminary pKa values to the actual predictions.
|
|
* The function is pKa = 7 + (pKa'-7)*beta + ((pKa'-7)*alpha)^3.
|
|
*/
|
|
|
|
double Alpha, Beta;
|
|
std::vector<PathEntry> AlphaPathTable, BetaPathTable;
|
|
|
|
public:
|
|
StructCheckerOptions();
|
|
|
|
void clear() { *this = StructCheckerOptions(); }
|
|
|
|
bool loadAugmentedAtomTranslations(const std::string &path);
|
|
void setAugmentedAtomTranslations(
|
|
const std::vector<std::pair<AugmentedAtom, AugmentedAtom> > &aaPairs);
|
|
|
|
bool loadAcidicAugmentedAtoms(const std::string &path);
|
|
void setAcidicAugmentedAtoms(const std::vector<AugmentedAtom> &acidicAtoms);
|
|
|
|
bool loadGoodAugmentedAtoms(const std::string &path);
|
|
void setGoodAugmentedAtoms(const std::vector<AugmentedAtom> &acidicAtoms);
|
|
|
|
bool loadPatterns(const std::string &path); // file with clean patterns
|
|
void parsePatterns(
|
|
const std::vector<std::string> &smarts); // can throw RDKit exeptions
|
|
void setPatterns(const std::vector<ROMOL_SPTR> &p);
|
|
|
|
bool loadRotatePatterns(const std::string &path); // file with rotate patterns
|
|
void parseRotatePatterns(
|
|
const std::vector<std::string> &smarts); // can throw RDKit exeptions
|
|
void setRotatePatterns(const std::vector<ROMOL_SPTR> &p);
|
|
|
|
bool loadStereoPatterns(
|
|
const std::string &path); // file with stereo patterns
|
|
void parseStereoPatterns(
|
|
const std::vector<std::string> &smarts); // can throw RDKit exeptions
|
|
void setStereoPatterns(const std::vector<ROMOL_SPTR> &p);
|
|
|
|
bool loadTautomerData(const std::string &path); // file path
|
|
void parseTautomerData(const std::vector<std::string> &smartsFrom,
|
|
const std::vector<std::string> &smartsTo);
|
|
void setTautomerData(const std::vector<ROMOL_SPTR> &from,
|
|
const std::vector<ROMOL_SPTR> &to);
|
|
bool loadChargeDataTables(const std::string &path); // file path
|
|
};
|
|
|
|
// Takes a JSON string and an options object to update.
// NOTE(review): presumably parses `json` into `op` and returns true on
// success; the recognised keys and the exact meaning of the result are
// defined in the implementation — confirm there.
bool parseOptionsJSON(const std::string &json, StructCheckerOptions &op);
|
|
|
|
// Takes an options object and up to five data-file paths, each defaulting to
// an empty string.
// NOTE(review): presumably an empty path means "skip this table" and the
// result is true on success — confirm in the implementation.
// There are no parameters for the AcidicAtoms / GoodAtoms tables yet (see the
// "??" markers below).
bool loadOptionsFromFiles(
    StructCheckerOptions &op,
    const std::string &augmentedAtomTranslationsFile = "",
    // ?? AcidicAtoms;
    // ?? GoodAtoms;
    const std::string &patternFile = "",        // file with clean patterns
    const std::string &rotatePatternFile = "",  // file with rotate patterns
    const std::string &stereoPatternFile = "",  // file with stereo patterns
    const std::string &tautomerFile = "");
|
|
|
|
class StructChecker {
|
|
public:
|
|
typedef enum StructureFlags {
|
|
NO_CHANGE = 0,
|
|
BAD_MOLECULE = 0x0001,
|
|
ALIAS_CONVERSION_FAILED = 0x0002,
|
|
STEREO_ERROR = 0x0004,
|
|
STEREO_FORCED_BAD = 0x0008,
|
|
ATOM_CLASH = 0x0010,
|
|
ATOM_CHECK_FAILED = 0x0020,
|
|
SIZE_CHECK_FAILED = 0x0040,
|
|
// reserved error = 0x0080,
|
|
TRANSFORMED = 0x0100,
|
|
FRAGMENTS_FOUND = 0x0200,
|
|
EITHER_WARNING = 0x0400,
|
|
DUBIOUS_STEREO_REMOVED = 0x0800,
|
|
RECHARGED = 0x1000,
|
|
STEREO_TRANSFORMED = 0x2000,
|
|
TEMPLATE_TRANSFORMED = 0x4000,
|
|
TAUTOMER_TRANSFORMED = 0x8000,
|
|
// mask:
|
|
BAD_SET = (BAD_MOLECULE | ALIAS_CONVERSION_FAILED | STEREO_ERROR |
|
|
STEREO_FORCED_BAD | ATOM_CLASH | ATOM_CHECK_FAILED |
|
|
SIZE_CHECK_FAILED),
|
|
|
|
TRANSFORMED_SET = (TRANSFORMED | FRAGMENTS_FOUND | EITHER_WARNING |
|
|
DUBIOUS_STEREO_REMOVED | STEREO_TRANSFORMED |
|
|
TEMPLATE_TRANSFORMED | TAUTOMER_TRANSFORMED | RECHARGED),
|
|
} StructureFlags;
|
|
// attributes:
|
|
private:
|
|
StructCheckerOptions Options;
|
|
|
|
public:
|
|
inline StructChecker() {}
|
|
inline StructChecker(const StructCheckerOptions &options)
|
|
: Options(options) {}
|
|
|
|
const StructCheckerOptions & GetOptions() const { return Options; }
|
|
void SetOptions(const StructCheckerOptions &options) { Options = options; }
|
|
|
|
// Check and fix (if need) molecule structure and return a set of StructureFlags
|
|
// that describes what have been done
|
|
unsigned checkMolStructure(RWMol &mol) const;
|
|
|
|
// an instance independed helper methods:
|
|
// Converts structure property flags to a comma seperated string
|
|
static std::string StructureFlagsToString(unsigned flags);
|
|
// Converts a comma seperated string to a StructureFlag unsigned integer
|
|
static unsigned StringToStructureFlags(const std::string &str);
|
|
// internal implementation:
|
|
private:
|
|
};
|
|
}
|
|
}
|
|
#endif
|