Files
rdkit/Code/GraphMol/StructChecker/AtomSymbolMatch.cpp
Brian Kelley 8609cd4883 Add StructChecker functionality
* StructChecker changes. Initial commit. First implementation. Added some tests.

* StructChecker: add  GoodAtoms and AcidicAtoms. new updates

* StructChecker: add new tests

* StructChecker: added TransformAugmentedAtoms()

* StructCheck: add structCheck to GraphMol. Fix compilation errors.

* StructChecker: add stereo verification and some utilities.

* StructChecker: function FixDubious3DMolecule was added

* StructChecker: checkStereo added. done with stereo.

* StructChecker: add StripSmallFragments()

* StructChecker: add AtomClash() function. Some cosmetic + tests

* StructChecker: checkAtoms() was started

* StructChecker: checkAtoms is ready

* StructChecker: use RingInfo from RDKit. Start recharge

* StructChecker: ReCharge molecule method prototype

* StructChecker: updates for ReCharge. Almost finished

* StructChecker: all ReCharge is done except external data tables loading

* StructChecker: add path tables into API. ReCharge completed

* Adds augmented atom data

Signed-off-by: Brian Kelley <brian.kelley@novartis.com>

* Removes extra files

Signed-off-by: Brian Kelley <brian.kelley@novartis.com>

* Adds path to test data via RDBASE environment

Signed-off-by: Brian Kelley <brian.kelley@novartis.com>

* Revert "Struct checker apr15"

* StructChecker: add missing tautomer tests

* Updates test to use RDBASE

* Adds initialization of data from data section

* Adds Python API and tests

* Fixes namespace for enum

* StructChecker: update/improve strip small fragments

* StructChecker: fix acidic atoms (but logic does not work)

* StructChecker: fix match issue for CheckAtoms

* Adds macro guards

* Adds loading API and proper constructor

* Fixes tests, adds stereo test

* Fixes crash bug, matches[0] was being accessed from an empty match vector

* Reverts crash fix - conflicts with previous

* Adds the rest of the structure checker options

* StructChecker: fix atom matching for aromatic rings

* StructChecker: add tautomers checks. Update some tests

* StructChecker: stereo fixes. Add some tests

* StructChecker: fix check atoms. Start ligand symbol list

* StructChecker: fix some check atoms validation. Add Transform to query lists. Start loading augmented atoms correctly

* update

* another set of fixes

* StructChecker: fix loadDefaultAugmentedAtoms. Some changes in CheckAtom + tests + debug conditional breakpoints (TEMP operators)

* StructChecker: rewrote RecMatch() to be sequential. Changed bond matching algorithm. Small bug fixes

* Adds better logging of mismatched atoms

* Removes duplicated negative charge

* Fixes charges

* Adds nitro group test

* StructChecker: add better logging

* remove double logging

* Reformats code using RDKit's clang-format style

* StructChecker: Fix charge reformat using RDKit format.

* StructChecker: compilation restore after merge

* restore bond matching

* Removes the same fragments that strucheck does in case of ties

* Don't resanitize - this adds aromaticity which mucks things up

* Adds empty molecule checks

* Fixes atom clashes.

* Removes debug printing

* Removes debug logging info

* First pass at stereo fixes

* Fixes off by one error for dubious stereo fix

* Fixes more off by one errors

* Fixes more off by one errors

* More off by one fixes.

* Another off by one

* Fixes chiral flag set in molfile check

* Copies chiral flag over to largest fragment if necessary

* Poor man’s parity check.

* Find unspecified chiral centers ala Avalon.

* StructChecker: fix recursive match. Fix transformations

* StructChecker: fix transformation for atom list (using query atoms)

* Fixes checks && to &

* StructChecker: fix carboxylic acids transform issue. Atom list is changed only if different

* StructChecker: documentation was updated

* Fixes snprintf and silences some warnings

* Adds Get/Set StructCheckerOptions

* Adds default AugmentedAtomTransforms
2016-10-24 08:00:07 +02:00

305 lines
9.5 KiB
C++

//
// Copyright (C) 2016 Novartis Institutes for BioMedical Research
//
// @@ All Rights Reserved @@
// This file is part of the RDKit.
// The contents are covered by the terms of the BSD license
// which is included in the file license.txt, found at the root
// of the RDKit source tree.
//
#include <string.h>
#include <ctype.h>
#include "Pattern.h"
namespace RDKit {
namespace StructureCheck {
// Element symbols indexed by atomic number. Slot 0 holds the "*" wildcard
// (ANY_ELEMENT); slots 1..109 hold the elements H..Mt, one period per row.
// NOTE(review): slot 108 is spelled "Hn" here, whereas the current IUPAC
// symbol for that element is "Hs" - confirm whether that is intentional.
static const char *AtomSymbol[] = {
    // 0: ANY_ELEMENT
    "*",
    // 1-2
    "H", "He",
    // 3-10
    "Li", "Be", "B", "C", "N", "O", "F", "Ne",
    // 11-18
    "Na", "Mg", "Al", "Si", "P", "S", "Cl", "Ar",
    // 19-36
    "K", "Ca", "Sc", "Ti", "V", "Cr", "Mn", "Fe", "Co", "Ni", "Cu", "Zn",
    "Ga", "Ge", "As", "Se", "Br", "Kr",
    // 37-54
    "Rb", "Sr", "Y", "Zr", "Nb", "Mo", "Tc", "Ru", "Rh", "Pd", "Ag", "Cd",
    "In", "Sn", "Sb", "Te", "I", "Xe",
    // 55-56
    "Cs", "Ba",
    // 57-71
    "La", "Ce", "Pr", "Nd", "Pm", "Sm", "Eu", "Gd", "Tb", "Dy", "Ho", "Er",
    "Tm", "Yb", "Lu",
    // 72-86
    "Hf", "Ta", "W", "Re", "Os", "Ir", "Pt", "Au", "Hg", "Tl", "Pb", "Bi",
    "Po", "At", "Rn",
    // 87-88
    "Fr", "Ra",
    // 89-103
    "Ac", "Th", "Pa", "U", "Np", "Pu", "Am", "Cm", "Bk", "Cf", "Es", "Fm",
    "Md", "No", "Lr",
    // 104-109
    "Rf", "Db", "Sg", "Bh", "Hn", "Mt"};
// Maps an element symbol to its atomic number.
//
// The map is filled once, at construction, from AtomSymbol[]: the index of a
// symbol in that table is the number stored for it (so "*" maps to 0).
class AtomSymbolMapper {
  std::map<std::string, unsigned> SymbolMap;

 public:
  AtomSymbolMapper() {
    // Take the entry count from the table itself instead of repeating it as a
    // literal, so the loop can never run past (or stop short of) the table if
    // symbols are added or removed.
    const unsigned symbolCount = sizeof(AtomSymbol) / sizeof(AtomSymbol[0]);
    for (unsigned n = 0; n < symbolCount; ++n) SymbolMap[AtomSymbol[n]] = n;
  }

  // Returns the number registered for `symbol`. The lookup is an exact,
  // case-sensitive key match; std::map::at throws std::out_of_range when the
  // symbol is not in the table.
  unsigned getAtomicNumber(const std::string &symbol) const {
    return SymbolMap.at(symbol);
  }
};
// Returns the atomic number for the element symbol `symbol` ("*" yields 0).
// Throws std::out_of_range (from std::map::at) when the symbol is unknown.
//
// NOTE(review): the parameter is deliberately still taken by value; the
// declaration of this function lives outside this file (presumably in
// Pattern.h), so the definition has to keep the exact same signature.
unsigned getAtomicNumber(const std::string symbol) {
  // Function-local static: the mapper is built on first use, so the lookup is
  // also safe when called while other translation units are still running
  // their static initialisers (a namespace-scope object gives no such
  // ordering guarantee).
  static const AtomSymbolMapper smap;
  return smap.getAtomicNumber(symbol);
}
// Generic atom-type sets used by STRUCHK patterns. Each table lists element
// symbols and is closed by a null pointer, which is what table scans stop on.

// Pseudosymbol "G": H or C.
static const char *HC_table[] = {"H", "C", nullptr};

// Pseudosymbol "Q": non-metal hetero elements.
// "Rn" is intentionally not listed: the original sources state that it must
// be removed "because of a trick in utils.c".
static const char *non_metal_hetero_elements[] = {
    "He", "B", "N", "O", "F", "Ne", "Si", "P", "S", "Cl",
    "Ar", "As", "Se", "Br", "Kr", "Sb", "Te", "I", "Xe", "At",
    nullptr,
};

// Pseudosymbol "M": metals, laid out one period per group of rows.
static const char *metals[] = {
    "Li", "Be",
    "Na", "Mg", "Al",
    "K", "Ca", "Sc", "Ti", "V", "Cr", "Mn", "Fe", "Co", "Ni", "Cu", "Zn",
    "Ga",
    "Rb", "Sr", "Y", "Zr", "Nb", "Mo", "Tc", "Ru", "Rh", "Pd", "Ag", "Cd",
    "In", "Sn",
    "Cs", "Ba", "La", "Ce", "Pr", "Nd", "Pm", "Sm", "Eu", "Gd", "Tb", "Dy",
    "Ho", "Er", "Tm", "Yb", "Lu", "Hf", "Ta", "W", "Re", "Os", "Ir", "Pt",
    "Au", "Hg", "Tl", "Pb", "Bi", "Po",
    "Fr", "Ra", "Ac", "Th", "Pa", "U", "Np", "Pu",
    nullptr,
};

// Pseudosymbol "Qs".
static const char *non_metal_small_solution[] = {
    "H", "B", "C", "N", "O", "F", "Si", "P", "S", "Cl", "Se", "Br", "I",
    nullptr,
};
// Main-group element sets; every list ends with a null pointer.

// Pseudosymbol "alk".
static const char *alkali_metals[] = {"Li", "Na", "K", "Rb", "Cs", "Fr",
                                      nullptr};

// Pseudosymbol "gr2".
static const char *gr2[] = {"Be", "Mg", "Ca", "Sr", "Ba", "Ra", nullptr};

// Pseudosymbol "gr3".
static const char *gr3[] = {"B", "Al", "Ga", "In", "Tl", nullptr};

// Pseudosymbol "gr4".
static const char *gr4[] = {"C", "Si", "Ge", "Sn", "Pb", nullptr};

// Pseudosymbol "ONS" or "ons".
static const char *ONS_table[] = {"O", "N", "S", nullptr};

// Pseudosymbol "on2".
static const char *on2[] = {"O", "N", "S", "P", "Se", "Te", "Po", nullptr};

// Pseudosymbol "X" or "hal".
static const char *halogenes[] = {"F", "Cl", "Br", "I", "At", nullptr};

// Pseudosymbol "ha2".
static const char *ha2[] = {"Cl", "Br", "I", "At", nullptr};
// Transition-metal and lanthanoid sets; every list ends with a null pointer.

// Pseudosymbol "trn".
static const char *transition_metals[] = {
    "Sc", "Ti", "V", "Cr", "Mn", "Fe", "Co", "Ni", "Cu", "Zn",
    "Y", "Zr", "Nb", "Mo", "Tc", "Ru", "Rh", "Pd", "Ag", "Cd",
    "La", "Hf", "Ta", "W", "Re", "Os", "Ir", "Pt", "Au", "Hg",
    nullptr,
};

// Pseudosymbol "tra".
static const char *tra[] = {
    "Ti", "V", "Cr", "Mn", "Fe", "Co", "Ni",
    "Zr", "Nb", "Mo", "Tc", "Ru", "Rh", "Pd",
    "Hf", "Ta", "W", "Re", "Os", "Ir", "Pt",
    nullptr,
};

// Pseudosymbol "trb".
static const char *trb[] = {"Cu", "Zn", "Ag", "Cd", "Au", "Hg", nullptr};

// Pseudosymbol "tm1".
static const char *tm1[] = {"Cu", "Ag", "Au", nullptr};

// Pseudosymbol "tm2".
static const char *tm2[] = {"Zn", "Cd", "Hg", nullptr};

// Pseudosymbol "tm3".
static const char *tm3[] = {"Sc", "Y", "La", nullptr};

// Pseudosymbol "tm4".
static const char *tm4[] = {"Ti", "Zr", "Hf", nullptr};

// Pseudosymbol "tm5".
static const char *tm5[] = {"V", "Nb", "Ta", nullptr};

// Pseudosymbol "tm6".
static const char *tm6[] = {"Cr", "Mo", "W", nullptr};

// Pseudosymbol "tm7".
static const char *tm7[] = {"Mn", "Tc", "Re", nullptr};

// Pseudosymbol "tm8".
static const char *tm8[] = {
    "Fe", "Co", "Ni", "Ru", "Rh", "Pd", "Os", "Ir", "Pt", nullptr,
};

// Pseudosymbol "lan".
static const char *lanthanoids[] = {
    "Ce", "Pr", "Nd", "Pm", "Sm", "Eu", "Gd",
    "Tb", "Dy", "Ho", "Er", "Tm", "Yb", "Lu",
    nullptr,
};
// Pseudosymbol "Ami" or "ami": three-letter amino acid codes, closed by a
// null pointer.
static const char *amino_acids[] = {
    "Ala", "Arg", "Asn", "Asp", "Cys",
    "Gln", "Glu", "Gly", "His", "Ile",
    "Leu", "Lys", "Met", "Phe", "Pro",
    "Ser", "Thr", "Trp", "Tyr", "Val",
    nullptr,
};
// Reports whether `symbol` is byte-for-byte equal (strcmp) to one of the
// entries of `table`. The table is scanned from the front and must be closed
// by a null pointer, which ends the scan.
static bool IsInStringTable(const char *symbol, const char *table[]) {
  unsigned idx = 0;
  while (table[idx] != nullptr) {
    if (strcmp(table[idx], symbol) == 0) {
      return true;
    }
    ++idx;
  }
  return false;
}
bool AtomSymbolMatch(const std::string symbol, const std::string pattern) {
/*
* Returns TRUE if symbol is in the comma separated list of atom symbols
* stored in pattern and FALSE otherwise.
* There are also a number of standard atom type lists like "alk" for alkali
* metals or
* "Q" for non-C/non-H defined above as arrays of strings.
*/
char *context;
#ifdef WIN32
#define strtok_r strtok_s // thread safe strtok()
#endif
const char *atsym = symbol.c_str();
char pat_buf[512];
char *tokp;
strcpy(pat_buf, pattern.c_str());
for (tokp = strtok_r(pat_buf, ",", &context); tokp;
tokp = strtok_r((char *)NULL, ",", &context)) {
if (islower(*tokp)) {
if (0 == strcmp("alk", tokp)) {
if (IsInStringTable(atsym, alkali_metals)) return true;
} else if (0 == strcmp("gr2", tokp)) {
if (IsInStringTable(atsym, gr2)) return true;
} else if (0 == strcmp("gr3", tokp)) {
if (IsInStringTable(atsym, gr3)) return true;
} else if (0 == strcmp("gr4", tokp)) {
if (IsInStringTable(atsym, gr4)) return true;
} else if (0 == strcmp("ons", tokp)) {
if (IsInStringTable(atsym, ONS_table)) return true;
} else if (0 == strcmp("on2", tokp)) {
if (IsInStringTable(atsym, on2)) return true;
} else if (0 == strcmp("hal", tokp)) {
if (IsInStringTable(atsym, halogenes)) return true;
} else if (0 == strcmp("ha2", tokp)) {
if (IsInStringTable(atsym, ha2)) return true;
} else if (0 == strcmp("trn", tokp)) {
if (IsInStringTable(atsym, transition_metals)) return true;
} else if (0 == strcmp("tra", tokp)) {
if (IsInStringTable(atsym, tra)) return true;
} else if (0 == strcmp("trb", tokp)) {
if (IsInStringTable(atsym, trb)) return true;
} else if (0 == strcmp("tm1", tokp)) {
if (IsInStringTable(atsym, tm1)) return true;
} else if (0 == strcmp("tm2", tokp)) {
if (IsInStringTable(atsym, tm2)) return true;
} else if (0 == strcmp("tm3", tokp)) {
if (IsInStringTable(atsym, tm3)) return true;
} else if (0 == strcmp("tm4", tokp)) {
if (IsInStringTable(atsym, tm4)) return true;
} else if (0 == strcmp("tm5", tokp)) {
if (IsInStringTable(atsym, tm5)) return true;
} else if (0 == strcmp("tm6", tokp)) {
if (IsInStringTable(atsym, tm6)) return true;
} else if (0 == strcmp("tm7", tokp)) {
if (IsInStringTable(atsym, tm7)) return true;
} else if (0 == strcmp("tm8", tokp)) {
if (IsInStringTable(atsym, tm8)) return true;
} else if (0 == strcmp("lan", tokp)) {
if (IsInStringTable(atsym, lanthanoids)) return true;
} else if (0 == strcmp("ami", tokp)) {
if (IsInStringTable(atsym, amino_acids)) return true;
}
}
if (0 == strcmp(atsym, tokp)) return true;
}
if (0 == strcmp("A", pattern.c_str()))
return (0 != strcmp("H", atsym));
else if (0 == strcmp("Qs", pattern.c_str()))
return (IsInStringTable(atsym, non_metal_small_solution));
else if (0 == strcmp("G", pattern.c_str()))
return (IsInStringTable(atsym, HC_table));
else if (0 == strcmp("ONS", pattern.c_str()))
return (IsInStringTable(atsym, ONS_table));
else if (0 == strcmp("X", pattern.c_str()))
return (IsInStringTable(atsym, halogenes));
else if (0 == strcmp("Q", pattern.c_str()))
return (IsInStringTable(atsym, non_metal_hetero_elements));
else if (0 == strcmp("M", pattern.c_str()))
return (IsInStringTable(atsym, metals));
else if (0 == strcmp("Ami", pattern.c_str()))
return (IsInStringTable(atsym, amino_acids));
return false;
}
} // namespace StructureCheck
} // namespace RDKit