Files
rdkit/Code/GraphMol/StructChecker/StructChecker.h
Brian Kelley 8609cd4883 Add StructChecker functionality
* StructChecker changes. Initial commit. First implementation. Added some tests.

* StructChecker: add  GoodAtoms and AcidicAtoms. new updates

* StructChecker: add new tests

* StructChecker: added TransformAugmentedAtoms()

* StructCheck: add structCheck to GraphMol. Fix compilation errors.

* StructChecker: add stereo verification and some utilities.

* StructChecker: function FixDubious3DMolecule was added

* StructChecker: checkStereo added. done with stereo.

* StructChecker: add StripSmallFragments()

* StructChecker: add AtomClash() function. Some cosmetic + tests

* StructChecker: checkAtoms() was started

* StructChecker: checkAtoms is ready

* StructChecker: use RingInfo from RDKit. Start recharge

* StructChecker: ReCharge molecule method prototype

* StructChecker: updates for ReCharge. Almost finished

* StructChecker: all ReCharge is done except external data tables loading

* StructChecker: add path tables into API. ReCharge completed

* Adds augmented atom data

Signed-off-by: Brian Kelley <brian.kelley@novartis.com>

* Removes extra files

Signed-off-by: Brian Kelley <brian.kelley@novartis.com>

* Adds path to test data via RDBASE environment

Signed-off-by: Brian Kelley <brian.kelley@novartis.com>

* Revert "Struct checker apr15"

* StructChecker: add missing tautomer tests

* Updates test to use RDBASE

* Adds initialization of data from data section

* Adds Python API and tests

* Fixes namespace for enum

* StructChecker: update/improve strip small fragments

* StructChecker: fix acidic atoms (but logic does not work)

* StructChecker: fix match issue for CheckAtoms

* Adds macro guards

* Adds loading API and proper constructor

* Fixes tests, adds stereo test

* Fixes crash bug, matches[0] was being accessed from an empty match vector

* Reverts crash fix - conflicts with previous

* Adds the rest of the structure checker options

* StructChecker: fix atom matching for aromatic rings

* StructChecker: add tautomers checks. Update some tests

* StructChecker: stereo fixes. Add some tests

* StructChecker: fix check atoms. Start ligand symbol list

* StructChecker: fix some check atoms validation. Add Transform to query lists. Start correct loading augmented atoms

* update

* another set of fixes

* StructChecker: fix loadDefaultAugmentedAtoms. Some changes in CheckAtom + tests + debug conditional breakpoints (TEMP operators)

* StructChecker: rewrote RecMatch() to be sequential. Changed bond matching algorithm. small bug fixes

* Adds better logging of mismatched atoms

* Removes duplicated negative charge

* Fixes charges

* Adds nitro group test

* StructChecker: add better logging

* remove double logging

* Reformats code using RDKit's clang-format style

* StructChecker: Fix charge reformat using RDKit format.

* StructChecker: compilation restore after merge

* restore bond matching

* Removes the same fragments that strucheck does in case of ties

* Don't resanitize - this adds aromaticity which mucks things up

* Adds empty molecule checks

* Fixes atom clashes.

* Removes debug printing

* Removes debug logging info

* First pass at stereo fixes

* Fixes off by one error for dubious stereo fix

* Fixes more off by one errors

* Fixes more off by one errors

* More off by one fixes.

* Another off by one

* Fixes chiral flag set in molfile check

* Copies chiral flag over to largest fragment if necessary

* Poor man’s parity check.

* Find unspecified chiral centers ala Avalon.

* StructChecker: fix recursive match. Fix transformations

* StructChecker: fix transformation for atom list (using query atoms)

* Fixes checks && to &

* StructChecker: fix carboxylic acids tranform issue. Atom list is changed only if different

* StructChecker: documentation was updated

* Fixes snprintf and silences some warnings

* Adds Get/Set StructCheckerOptions

* Adds default AugmentedAtomTransforms
2016-10-24 08:00:07 +02:00

281 lines
8.9 KiB
C++

//
// Copyright (C) 2016 Novartis Institutes for BioMedical Research
//
// @@ All Rights Reserved @@
// This file is part of the RDKit.
// The contents are covered by the terms of the BSD license
// which is included in the file license.txt, found at the root
// of the RDKit source tree.
//
#pragma once
#ifndef _RD_STRUCTCHECKER_H__
#define _RD_STRUCTCHECKER_H__
#include <map>
#include <string>
#include <utility>
#include <vector>
#include "../RDKitBase.h"
/* Example of Usage
1) StructureCheck::StructChecker chk;  // use defaults
   unsigned flags = chk.checkMolStructure(mol);
or
2)
   StructureCheck::StructCheckerOptions options;  // use defaults
   // To use external data
   StructureCheck::loadOptionsFromFiles(options, file1, file2, ...);
   StructureCheck::StructChecker chk(options);
   for( mol in mols ) {
     unsigned flags = chk.checkMolStructure(mol);
     if (0 != (flags & StructureCheck::StructChecker::BAD_SET)) {
       // write to error file
     } else if (0 != (flags & StructureCheck::StructChecker::TRANSFORMED_SET))
     {
       // input molecule was transformed
     } else {  // flags == NO_CHANGE
       // no change
     }
   }
*/
namespace RDKit {
namespace StructureCheck {
// Types and constants used to describe augmented atoms and the tables built
// from them. (The return flags of the checker itself are declared further
// down, in StructChecker::StructureFlags.)
// ANY_CHARGE is the default Charge of both Ligand and AugmentedAtom.
// NOTE(review): presumably a wildcard meaning "any charge" - confirm against
// the matching code.
static const int ANY_CHARGE = 8;
// Values for the Radical member of Ligand and AugmentedAtom.
// ANY_RADICAL is the value both default constructors assign.
// NOTE(review): presumably ANY_RADICAL acts as a "don't care" wildcard when
// matching - confirm against the matching code.
enum RadicalType {
RT_NONE = 0,
SINGLET = 1,
DOUBLET = 2,
TRIPLET = 3,
ANY_RADICAL = 0xFF
};
// Values for Ligand::BondType. ANY_BOND is the value a default-constructed
// Ligand receives.
// NOTE(review): ALL_BOND_TYPES (0xF) looks like one of the "extensions"
// mentioned below rather than a CTFile value - confirm.
enum AABondType { // MDL CTFile bond types plus extensions
BT_NONE = 0, // means REMOVE Bond
SINGLE = 1,
DOUBLE = 2,
TRIPLE = 3,
AROMATIC = 4,
SINGLE_DOUBLE = 5,
SINGLE_AROMATIC = 6,
DOUBLE_AROMATIC = 7,
ANY_BOND = 8,
ALL_BOND_TYPES = 0xF
};
// Values for AugmentedAtom::Topology. TP_NONE is the value a
// default-constructed AugmentedAtom receives.
enum AATopology {
TP_NONE = 0, // Don't care
RING = 1, // Ring
CHAIN = 2 // Chain
};
// Element type of AugmentedAtom::Ligands.
// A default-constructed Ligand has an empty AtomSymbol, Charge == ANY_CHARGE,
// Radical == ANY_RADICAL, BondType == ANY_BOND and a SubstitutionCount of 0
// (0 = don't care).
struct Ligand {
  std::string AtomSymbol;
  int Charge;
  RadicalType Radical;
  unsigned SubstitutionCount;  // substitution count 0 = don't care
  AABondType BondType;

  // Every member is set explicitly, in declaration order.
  Ligand()
      : AtomSymbol(),
        Charge(ANY_CHARGE),
        Radical(ANY_RADICAL),
        SubstitutionCount(0),
        BondType(ANY_BOND) {}
};
// An atom description (symbol, short name, charge, radical, topology)
// together with a list of Ligand entries.
struct AugmentedAtom {
  std::string AtomSymbol;
  std::string ShortName;
  int Charge;
  RadicalType Radical;
  AATopology Topology;
  std::vector<Ligand> Ligands;

  // Default: empty strings, no ligands, ANY_CHARGE / ANY_RADICAL / TP_NONE.
  AugmentedAtom()
      : AtomSymbol(),
        ShortName(),
        Charge(ANY_CHARGE),
        Radical(ANY_RADICAL),
        Topology(TP_NONE),
        Ligands() {}

  // Sets every scalar member from the arguments; Ligands starts out empty
  // and is filled in by the caller.
  AugmentedAtom(const std::string &symbol, const std::string &name, int charge,
                RadicalType radical, AATopology topology)
      : AtomSymbol(symbol),
        ShortName(name),
        Charge(charge),
        Radical(radical),
        Topology(topology),
        Ligands() {}
};
// Row type of StructCheckerOptions::AtomAcidity and
// StructCheckerOptions::ChargeIncTable: an atom symbol plus four increments.
// The struct declares no constructor, so a default-initialized IncEntry
// (e.g. `IncEntry e;`) leaves the numeric members with indeterminate values;
// callers must fill in every field.
struct IncEntry {
std::string AtomSymbol;
double LocalInc;
double AlphaInc;
double BetaInc;
double MultInc;
// Used for logging
// NOTE(review): presumably usage counters for the matching *Inc value -
// confirm where they are incremented.
int local_inc_used;
int alpha_inc_used;
int beta_inc_used;
int mult_inc_used;
};
// Row type of StructCheckerOptions::AlphaPathTable and BetaPathTable: an
// AugmentedAtom paired with a numeric value.
// Path is default-constructed by AugmentedAtom's constructor, but Cond and
// cond_used have no initializer here and must be set by the caller.
struct PathEntry {
AugmentedAtom Path;
double Cond;
// Used for logging
int cond_used;
};
//-------------
////////////////////////////////////////////////////////////////////////////
// Structure Check Options
// Holds all the user options for the StructureChecking.
// Can be initialized from factory functions, perhaps serialized
// Option block consumed by StructChecker: scalar settings followed by the
// data tables the checker works with. The tables can be filled from files
// (load*), from SMARTS strings (parse*) or from ready-made objects (set*).
// All members are public; the constructor is defined out of line.
struct StructCheckerOptions {
  // --- scalar settings ---
  // NOTE(review): initial values are presumably assigned by the out-of-line
  // constructor - confirm in the implementation file.
  double AcidityLimit;
  bool RemoveMinorFragments;
  int DesiredCharge;
  bool CheckCollisions;
  int CollisionLimitPercent;
  unsigned MaxMolSize;
  bool ConvertSText;
  bool SqueezeIdentifiers;
  bool StripZeros;
  bool CheckStereo;
  bool ConvertAtomTexts;
  bool GroupsToSGroups;
  bool Verbose;

  // --- internal data for struchk ---
  std::vector<std::pair<AugmentedAtom, AugmentedAtom> > AugmentedAtomPairs;
  std::vector<AugmentedAtom> AcidicAtoms;
  std::vector<AugmentedAtom> GoodAtoms;
  std::vector<ROMOL_SPTR> Patterns;
  std::vector<ROMOL_SPTR> RotatePatterns;
  std::vector<ROMOL_SPTR> StereoPatterns;
  std::vector<ROMOL_SPTR> FromTautomer;
  std::vector<ROMOL_SPTR> ToTautomer;

  double Elneg0;                          // elneg_table[0].value;
  std::map<unsigned, double> ElnegTable;  // AtomicNumber -> elneg
  std::vector<IncEntry> AtomAcidity;      // atom_acidity_table[]
  std::vector<IncEntry> ChargeIncTable;
  // std::map AtomSymbol(or AtomicNumber) -> IncEntry

  /* [ReadTransformation() ]
   * The alpha, beta coefficients of the transformation function used
   * to stretch the preliminary pKa values to the actual predictions.
   * The function is pKa = 7 + (pKa'-7)*beta + ((pKa'-7)*alpha)^3.
   */
  double Alpha, Beta;
  std::vector<PathEntry> AlphaPathTable, BetaPathTable;

  StructCheckerOptions();

  // Resets every member by assigning a freshly constructed instance.
  void clear() { *this = StructCheckerOptions(); }

  // For all load*() members below: `path` names the input file.
  // NOTE(review): the bool result presumably reports success - confirm in
  // the implementation file.

  // Augmented-atom translation pairs (AugmentedAtomPairs).
  bool loadAugmentedAtomTranslations(const std::string &path);
  void setAugmentedAtomTranslations(
      const std::vector<std::pair<AugmentedAtom, AugmentedAtom> > &aaPairs);

  // Acidic augmented atoms (AcidicAtoms).
  bool loadAcidicAugmentedAtoms(const std::string &path);
  void setAcidicAugmentedAtoms(const std::vector<AugmentedAtom> &acidicAtoms);

  // Good augmented atoms (GoodAtoms).
  bool loadGoodAugmentedAtoms(const std::string &path);
  void setGoodAugmentedAtoms(const std::vector<AugmentedAtom> &goodAtoms);

  // Clean patterns (Patterns).
  bool loadPatterns(const std::string &path);  // file with clean patterns
  void parsePatterns(
      const std::vector<std::string> &smarts);  // can throw RDKit exceptions
  void setPatterns(const std::vector<ROMOL_SPTR> &p);

  // Rotate patterns (RotatePatterns).
  bool loadRotatePatterns(
      const std::string &path);  // file with rotate patterns
  void parseRotatePatterns(
      const std::vector<std::string> &smarts);  // can throw RDKit exceptions
  void setRotatePatterns(const std::vector<ROMOL_SPTR> &p);

  // Stereo patterns (StereoPatterns).
  bool loadStereoPatterns(
      const std::string &path);  // file with stereo patterns
  void parseStereoPatterns(
      const std::vector<std::string> &smarts);  // can throw RDKit exceptions
  void setStereoPatterns(const std::vector<ROMOL_SPTR> &p);

  // Tautomer data, given as parallel "from" / "to" lists
  // (FromTautomer / ToTautomer).
  bool loadTautomerData(const std::string &path);  // file path
  void parseTautomerData(const std::vector<std::string> &smartsFrom,
                         const std::vector<std::string> &smartsTo);
  void setTautomerData(const std::vector<ROMOL_SPTR> &from,
                       const std::vector<ROMOL_SPTR> &to);

  // Charge data tables.
  bool loadChargeDataTables(const std::string &path);  // file path
};
// Declaration only; the definition is not part of this header.
// NOTE(review): presumably fills `op` from the JSON text in `json` and
// returns true on success - confirm against the implementation.
bool parseOptionsJSON(const std::string &json, StructCheckerOptions &op);
// Loads external data into `op` from the given files (declaration only; the
// definition is not part of this header). Every path argument defaults to "".
// NOTE(review): presumably an empty path means "skip this file" and the
// return value reports overall success - confirm against the implementation.
// The "??" placeholders below mark the AcidicAtoms / GoodAtoms tables, for
// which this signature takes no file argument.
bool loadOptionsFromFiles(
StructCheckerOptions &op,
const std::string &augmentedAtomTranslationsFile = "",
// ?? AcidicAtoms;
// ?? GoodAtoms;
const std::string &patternFile = "", // file with clean patterns
const std::string &rotatePatternFile = "", // file with rotate patterns
const std::string &stereoPatternFile = "", // file with stereo patterns
const std::string &tautomerFile = "");
// Checks a molecule's structure and fixes it where needed, driven by a
// private copy of StructCheckerOptions.
class StructChecker {
 public:
  // Bit flags combined into the unsigned result of checkMolStructure().
  // Declared as a plain C++ enum (no C-style typedef) and without a trailing
  // comma, which is ill-formed before C++11.
  enum StructureFlags {
    NO_CHANGE = 0,
    BAD_MOLECULE = 0x0001,
    ALIAS_CONVERSION_FAILED = 0x0002,
    STEREO_ERROR = 0x0004,
    STEREO_FORCED_BAD = 0x0008,
    ATOM_CLASH = 0x0010,
    ATOM_CHECK_FAILED = 0x0020,
    SIZE_CHECK_FAILED = 0x0040,
    // reserved error = 0x0080,
    TRANSFORMED = 0x0100,
    FRAGMENTS_FOUND = 0x0200,
    EITHER_WARNING = 0x0400,
    DUBIOUS_STEREO_REMOVED = 0x0800,
    RECHARGED = 0x1000,
    STEREO_TRANSFORMED = 0x2000,
    TEMPLATE_TRANSFORMED = 0x4000,
    TAUTOMER_TRANSFORMED = 0x8000,
    // masks:
    // every flag in the 0x0001..0x0040 range
    BAD_SET = (BAD_MOLECULE | ALIAS_CONVERSION_FAILED | STEREO_ERROR |
               STEREO_FORCED_BAD | ATOM_CLASH | ATOM_CHECK_FAILED |
               SIZE_CHECK_FAILED),
    // every flag in the 0x0100..0x8000 range
    TRANSFORMED_SET = (TRANSFORMED | FRAGMENTS_FOUND | EITHER_WARNING |
                       DUBIOUS_STEREO_REMOVED | STEREO_TRANSFORMED |
                       TEMPLATE_TRANSFORMED | TAUTOMER_TRANSFORMED | RECHARGED)
  };

  // Uses a default-constructed StructCheckerOptions.
  StructChecker() {}
  // Stores a copy of `options`. Left non-explicit so that existing callers
  // relying on the implicit conversion keep compiling.
  StructChecker(const StructCheckerOptions &options) : Options(options) {}

  const StructCheckerOptions &GetOptions() const { return Options; }
  void SetOptions(const StructCheckerOptions &options) { Options = options; }

  // Check and fix (if needed) the molecule structure and return a set of
  // StructureFlags that describes what has been done. `mol` is taken by
  // non-const reference because it may be modified.
  unsigned checkMolStructure(RWMol &mol) const;

  // Instance-independent helper methods:
  // Converts structure property flags to a comma separated string
  static std::string StructureFlagsToString(unsigned flags);
  // Converts a comma separated string to a StructureFlags unsigned integer
  static unsigned StringToStructureFlags(const std::string &str);

 private:
  StructCheckerOptions Options;
};
}
}
#endif