mirror of
https://github.com/rdkit/rdkit.git
synced 2026-06-04 21:54:27 +08:00
* StructChecker changes. Initial commit. First implementation. Added some tests. * StructChecker: add GoodAtoms and AcidicAtoms. new updates * StructChecker: add new tests * StructChecker: added TransformAugmentedAtoms() * StructCheck: add structCheck to GraphMol. Fix compilation errors. * StructChecker: add stereo verification and some utilities. * StructChecker: function FixDubious3DMolecule was added * StructChecker: checkStereo added. done with stereo. * StructChecker: add StripSmallFragments() * StructChecker: add AtomClash() function. Some cosmetic + tests * StructChecker: checkAtoms() was started * StructChecker: checkAtoms is ready * StructChecker: user RingInfo from RDkit. Start regarge * StructChecker: ReCharge molecule method prototype * StructChecker: updates for ReCharge. Almost finished * StructChecker: all ReCharge is done except external data tables loading * StructChecker: add path tables into API. ReCharge completed * Adds augmented atom data Signed-off-by: Brian Kelley <brian.kelley@novartis.com> * Removes extra files Signed-off-by: Brian Kelley <brian.kelley@novartis.com> * Adds path to test data via RDBASE environment Signed-off-by: Brian Kelley <brian.kelley@novartis.com> * Revert "Struct checker apr15" * StructChecker: add missing tautomer tests * Updates test to use RDBASE * Adds initialization of data from data section * Adds Python API and tests * Fixes namespace for enum * StructChecker: update/imporve strip small fragments * StructChecker: fix acidic atoms (but logic does not work) * StructChecker: fix match issue for CheckAtoms * Adds macro guards * Adds loading API and proper constructor * Fixes tests, adds stereo test * Fixes crash bug, matches[0] was being accessed from an empty match vector * Reverts crash fix - conflicts with previous * Adds the rest of the structure checker options * StructChecker: fix atom matching for aromatic rings * StructChecker: add tautomers checks. Update some tests * StructChecker: stereo fixes. 
Add some tests * StructChecker: fix check atoms. Start ligand symbol list * StructChecker: fix some check atoms validation. Add Tranform to query lists. Start correct loading augmented atoms * update * another set of fixes * StructChecker: fix loadDefaultAugmentedAtoms. Some changes in CheckAtom + tests + debug conditional breakpoints (TEMP operators) * StructChecker: rewrited RecMatch() to sequential. Changed bond matching algorithm. small bug fixes * Adds better logging of mismatched atoms * Removes duplicated negative charge * Fixes charges * Adds nitro group test * StructChecker: add better logging * remove double logging * Reformats code using RDKit's clang-format style * StructChecker: Fix charge reformat using RDKit format. * StructChecker: compilation restore after merge * restore bond matching * Removes the same fragments that strucheck does in case of ties * Don't resanitize - this adds aromaticity which mucks things up * Adds empty molecule checks * Fixes atom clashes. * Removes debug printing * Removes debug logging info * First pass at stereo fixes * Fixes off by one error for dubious stereo fix * Fixes more off by one errors * Fixes more off by one errors * More off by one fixes. * Another off by one * Fixes chiral flag set in molfile check * Copies chiral flag over to largest fragment if necessary * Poor man’s parity check. * Find unspecified chiral centers ala Avalon. * StructChecker: fix recursive match. Fix transformations * StructChecker: fix transformation for atom list (using query atoms) * Fixes checks && to & * StructChecker: fix carboxylic acids tranform issue. Atom list is changed only if different * StructChecker: documentation was updated * Fixes snprintf and silences some warnings * Adds Get/Set StructCheckerOptions * Adds default AugmentedAtomTransforms
305 lines
9.5 KiB
C++
305 lines
9.5 KiB
C++
//
|
|
// Copyright (C) 2016 Novartis Institutes for BioMedical Research
|
|
//
|
|
// @@ All Rights Reserved @@
|
|
// This file is part of the RDKit.
|
|
// The contents are covered by the terms of the BSD license
|
|
// which is included in the file license.txt, found at the root
|
|
// of the RDKit source tree.
|
|
//
|
|
#include <string.h>
|
|
#include <ctype.h>
|
|
#include "Pattern.h"
|
|
|
|
namespace RDKit {
|
|
namespace StructureCheck {
|
|
|
|
// Element symbols indexed by atomic number. Index 0 holds the "*" wildcard
// (ANY_ELEMENT); the last entry is index 109.
// NOTE(review): index 108 is spelled "Hn" here while the current IUPAC symbol
// for element 108 is "Hs" - confirm which spelling callers are expected to use.
static const char *AtomSymbol[] = {
    "*",  // 0: ANY_ELEMENT

    // 1 - 2
    "H", "He",

    // 3 - 10
    "Li", "Be", "B", "C", "N", "O", "F", "Ne",

    // 11 - 18
    "Na", "Mg", "Al", "Si", "P", "S", "Cl", "Ar",

    // 19 - 36
    "K", "Ca", "Sc", "Ti", "V", "Cr", "Mn", "Fe", "Co", "Ni", "Cu", "Zn",
    "Ga", "Ge", "As", "Se", "Br", "Kr",

    // 37 - 54
    "Rb", "Sr", "Y", "Zr", "Nb", "Mo", "Tc", "Ru", "Rh", "Pd", "Ag", "Cd",
    "In", "Sn", "Sb", "Te", "I", "Xe",

    // 55 - 56
    "Cs", "Ba",

    // 57 - 71
    "La", "Ce", "Pr", "Nd", "Pm", "Sm", "Eu", "Gd", "Tb", "Dy", "Ho", "Er",
    "Tm", "Yb", "Lu",

    // 72 - 86
    "Hf", "Ta", "W", "Re", "Os", "Ir", "Pt", "Au", "Hg", "Tl", "Pb", "Bi",
    "Po", "At", "Rn",

    // 87 - 88
    "Fr", "Ra",

    // 89 - 103
    "Ac", "Th", "Pa", "U", "Np", "Pu", "Am", "Cm", "Bk", "Cf", "Es", "Fm",
    "Md", "No", "Lr",

    // 104 - 109
    "Rf", "Db", "Sg", "Bh", "Hn", "Mt"};
|
|
|
|
class AtomSymbolMapper {
|
|
std::map<std::string, unsigned> SymbolMap;
|
|
|
|
public:
|
|
AtomSymbolMapper() {
|
|
for (unsigned n = 0; n < 110; n++) SymbolMap[AtomSymbol[n]] = n;
|
|
}
|
|
inline unsigned getAtomicNumber(const std::string symbol) const {
|
|
return SymbolMap.at(symbol);
|
|
}
|
|
};
|
|
static const AtomSymbolMapper smap;
|
|
|
|
unsigned getAtomicNumber(const std::string symbol) {
|
|
return smap.getAtomicNumber(symbol);
|
|
}
|
|
|
|
// predefined generic atom type sets for use in STRUCHK
|
|
// Members of pseudosymbol "G": hydrogen and carbon.
static const char *HC_table[] = {
    "H",
    "C",
    nullptr,  // end-of-table sentinel
};
|
|
|
|
// Members of pseudosymbol "Q".
// "Rn" is intentionally not listed: the original table carries the note that
// this element "must be removed because of a trick in utils.c".
static const char *non_metal_hetero_elements[] = {
    "He",
    "B",  "N",  "O",  "F",  "Ne",
    "Si", "P",  "S",  "Cl", "Ar",
    "As", "Se", "Br", "Kr",
    "Sb", "Te", "I",  "Xe",
    "At",
    nullptr,  // end-of-table sentinel
};
|
|
|
|
// Members of pseudosymbol "M".
static const char *metals[] = {
    "Li", "Be", "Na", "Mg", "Al", "K",  "Ca", "Sc", "Ti", "V",
    "Cr", "Mn", "Fe", "Co", "Ni", "Cu", "Zn", "Ga", "Rb", "Sr",
    "Y",  "Zr", "Nb", "Mo", "Tc", "Ru", "Rh", "Pd", "Ag", "Cd",
    "In", "Sn", "Cs", "Ba", "La", "Ce", "Pr", "Nd", "Pm", "Sm",
    "Eu", "Gd", "Tb", "Dy", "Ho", "Er", "Tm", "Yb", "Lu", "Hf",
    "Ta", "W",  "Re", "Os", "Ir", "Pt", "Au", "Hg", "Tl", "Pb",
    "Bi", "Po", "Fr", "Ra", "Ac", "Th", "Pa", "U",  "Np", "Pu",
    nullptr,  // end-of-table sentinel
};
|
|
|
|
// Members of pseudosymbol "Qs".
static const char *non_metal_small_solution[] = {
    "H",
    "B",  "C", "N", "O",  "F",
    "Si", "P", "S", "Cl",
    "Se", "Br",
    "I",
    nullptr,  // end-of-table sentinel
};
|
|
|
|
// Members of pseudosymbol "alk".
static const char *alkali_metals[] = {"Li", "Na", "K", "Rb", "Cs", "Fr",
                                      nullptr};
|
|
|
|
// Members of pseudosymbol "gr2".
static const char *gr2[] = {"Be", "Mg", "Ca", "Sr", "Ba", "Ra", nullptr};
|
|
|
|
// Members of pseudosymbol "gr3".
static const char *gr3[] = {"B", "Al", "Ga", "In", "Tl", nullptr};
|
|
|
|
// Members of pseudosymbol "gr4".
static const char *gr4[] = {"C", "Si", "Ge", "Sn", "Pb", nullptr};
|
|
|
|
// Members of pseudosymbol "ONS" (also spelled "ons").
static const char *ONS_table[] = {
    "O",
    "N",
    "S",
    nullptr,  // end-of-table sentinel
};
|
|
|
|
// Members of pseudosymbol "on2".
static const char *on2[] = {"O", "N", "S", "P", "Se", "Te", "Po", nullptr};
|
|
|
|
// Members of pseudosymbol "X" (also spelled "hal").
static const char *halogenes[] = {
    "F", "Cl", "Br", "I", "At",
    nullptr,  // end-of-table sentinel
};
|
|
|
|
// Members of pseudosymbol "ha2".
static const char *ha2[] = {
    "Cl", "Br", "I", "At",
    nullptr,  // end-of-table sentinel
};
|
|
|
|
// Members of pseudosymbol "trn".
static const char *transition_metals[] = {
    "Sc", "Ti", "V",  "Cr", "Mn", "Fe", "Co", "Ni", "Cu", "Zn",
    "Y",  "Zr", "Nb", "Mo", "Tc", "Ru", "Rh", "Pd", "Ag", "Cd",
    "La", "Hf", "Ta", "W",  "Re", "Os", "Ir", "Pt", "Au", "Hg",
    nullptr,  // end-of-table sentinel
};
|
|
|
|
// Members of pseudosymbol "tra".
static const char *tra[] = {
    "Ti", "V",  "Cr", "Mn", "Fe", "Co", "Ni",
    "Zr", "Nb", "Mo", "Tc", "Ru", "Rh", "Pd",
    "Hf", "Ta", "W",  "Re", "Os", "Ir", "Pt",
    nullptr,  // end-of-table sentinel
};
|
|
|
|
// Members of pseudosymbol "trb".
static const char *trb[] = {"Cu", "Zn", "Ag", "Cd", "Au", "Hg", nullptr};
|
|
|
|
// Members of pseudosymbol "tm1".
static const char *tm1[] = {"Cu", "Ag", "Au", nullptr};
|
|
|
|
// Members of pseudosymbol "tm2".
static const char *tm2[] = {"Zn", "Cd", "Hg", nullptr};
|
|
|
|
// Members of pseudosymbol "tm3".
static const char *tm3[] = {"Sc", "Y", "La", nullptr};
|
|
|
|
// Members of pseudosymbol "tm4".
static const char *tm4[] = {"Ti", "Zr", "Hf", nullptr};
|
|
|
|
// Members of pseudosymbol "tm5".
static const char *tm5[] = {"V", "Nb", "Ta", nullptr};
|
|
|
|
// Members of pseudosymbol "tm6".
static const char *tm6[] = {"Cr", "Mo", "W", nullptr};
|
|
|
|
// Members of pseudosymbol "tm7".
static const char *tm7[] = {"Mn", "Tc", "Re", nullptr};
|
|
|
|
// Members of pseudosymbol "tm8".
static const char *tm8[] = {
    "Fe", "Co", "Ni",
    "Ru", "Rh", "Pd",
    "Os", "Ir", "Pt",
    nullptr,  // end-of-table sentinel
};
|
|
|
|
// Members of pseudosymbol "lan".
static const char *lanthanoids[] = {
    "Ce", "Pr", "Nd", "Pm", "Sm", "Eu", "Gd",
    "Tb", "Dy", "Ho", "Er", "Tm", "Yb", "Lu",
    nullptr,  // end-of-table sentinel
};
|
|
|
|
// Members of pseudosymbol "Ami" (also spelled "ami"): three-letter residue
// codes.
static const char *amino_acids[] = {
    "Ala", "Arg", "Asn", "Asp", "Cys",
    "Gln", "Glu", "Gly", "His", "Ile",
    "Leu", "Lys", "Met", "Phe", "Pro",
    "Ser", "Thr", "Trp", "Tyr", "Val",
    nullptr,  // end-of-table sentinel
};
|
|
|
|
// Reports whether `symbol` equals (exact, case-sensitive strcmp match) one of
// the entries of `table`. The table must end with a null pointer; scanning
// stops at that sentinel.
static bool IsInStringTable(const char *symbol, const char *table[]) {
  for (size_t idx = 0; table[idx] != NULL; ++idx) {
    const bool same = (strcmp(table[idx], symbol) == 0);
    if (same) {
      return true;
    }
  }
  return false;
}
|
|
|
|
bool AtomSymbolMatch(const std::string symbol, const std::string pattern) {
|
|
/*
|
|
* Returns TRUE if symbol is in the comma separated list of atom symbols
|
|
* stored in pattern and FALSE otherwise.
|
|
* There are also a number of standard atom type lists like "alk" for alkali
|
|
* metals or
|
|
* "Q" for non-C/non-H defined above as arrays of strings.
|
|
*/
|
|
char *context;
|
|
#ifdef WIN32
|
|
#define strtok_r strtok_s // thread safe strtok()
|
|
#endif
|
|
const char *atsym = symbol.c_str();
|
|
char pat_buf[512];
|
|
char *tokp;
|
|
|
|
strcpy(pat_buf, pattern.c_str());
|
|
for (tokp = strtok_r(pat_buf, ",", &context); tokp;
|
|
tokp = strtok_r((char *)NULL, ",", &context)) {
|
|
if (islower(*tokp)) {
|
|
if (0 == strcmp("alk", tokp)) {
|
|
if (IsInStringTable(atsym, alkali_metals)) return true;
|
|
} else if (0 == strcmp("gr2", tokp)) {
|
|
if (IsInStringTable(atsym, gr2)) return true;
|
|
} else if (0 == strcmp("gr3", tokp)) {
|
|
if (IsInStringTable(atsym, gr3)) return true;
|
|
} else if (0 == strcmp("gr4", tokp)) {
|
|
if (IsInStringTable(atsym, gr4)) return true;
|
|
} else if (0 == strcmp("ons", tokp)) {
|
|
if (IsInStringTable(atsym, ONS_table)) return true;
|
|
} else if (0 == strcmp("on2", tokp)) {
|
|
if (IsInStringTable(atsym, on2)) return true;
|
|
} else if (0 == strcmp("hal", tokp)) {
|
|
if (IsInStringTable(atsym, halogenes)) return true;
|
|
} else if (0 == strcmp("ha2", tokp)) {
|
|
if (IsInStringTable(atsym, ha2)) return true;
|
|
} else if (0 == strcmp("trn", tokp)) {
|
|
if (IsInStringTable(atsym, transition_metals)) return true;
|
|
} else if (0 == strcmp("tra", tokp)) {
|
|
if (IsInStringTable(atsym, tra)) return true;
|
|
} else if (0 == strcmp("trb", tokp)) {
|
|
if (IsInStringTable(atsym, trb)) return true;
|
|
} else if (0 == strcmp("tm1", tokp)) {
|
|
if (IsInStringTable(atsym, tm1)) return true;
|
|
} else if (0 == strcmp("tm2", tokp)) {
|
|
if (IsInStringTable(atsym, tm2)) return true;
|
|
} else if (0 == strcmp("tm3", tokp)) {
|
|
if (IsInStringTable(atsym, tm3)) return true;
|
|
} else if (0 == strcmp("tm4", tokp)) {
|
|
if (IsInStringTable(atsym, tm4)) return true;
|
|
} else if (0 == strcmp("tm5", tokp)) {
|
|
if (IsInStringTable(atsym, tm5)) return true;
|
|
} else if (0 == strcmp("tm6", tokp)) {
|
|
if (IsInStringTable(atsym, tm6)) return true;
|
|
} else if (0 == strcmp("tm7", tokp)) {
|
|
if (IsInStringTable(atsym, tm7)) return true;
|
|
} else if (0 == strcmp("tm8", tokp)) {
|
|
if (IsInStringTable(atsym, tm8)) return true;
|
|
} else if (0 == strcmp("lan", tokp)) {
|
|
if (IsInStringTable(atsym, lanthanoids)) return true;
|
|
} else if (0 == strcmp("ami", tokp)) {
|
|
if (IsInStringTable(atsym, amino_acids)) return true;
|
|
}
|
|
}
|
|
if (0 == strcmp(atsym, tokp)) return true;
|
|
}
|
|
|
|
if (0 == strcmp("A", pattern.c_str()))
|
|
return (0 != strcmp("H", atsym));
|
|
else if (0 == strcmp("Qs", pattern.c_str()))
|
|
return (IsInStringTable(atsym, non_metal_small_solution));
|
|
else if (0 == strcmp("G", pattern.c_str()))
|
|
return (IsInStringTable(atsym, HC_table));
|
|
else if (0 == strcmp("ONS", pattern.c_str()))
|
|
return (IsInStringTable(atsym, ONS_table));
|
|
else if (0 == strcmp("X", pattern.c_str()))
|
|
return (IsInStringTable(atsym, halogenes));
|
|
else if (0 == strcmp("Q", pattern.c_str()))
|
|
return (IsInStringTable(atsym, non_metal_hetero_elements));
|
|
else if (0 == strcmp("M", pattern.c_str()))
|
|
return (IsInStringTable(atsym, metals));
|
|
else if (0 == strcmp("Ami", pattern.c_str()))
|
|
return (IsInStringTable(atsym, amino_acids));
|
|
return false;
|
|
}
|
|
|
|
} // namespace StructureCheck
|
|
} // namespace RDKit
|