Files
rdkit/Code/GraphMol/MolStandardize/Validate.cpp
Susan Leung 956fdf268c Dev/GSOC2018_MolVS_Integration (#2002)
* short test file for MolVS standardize_sm

* short test file for MolVS fragment

* short test file for MolVS metals

* short test file for MolVS normalize

* short test file for MolVS reionize

* short test file for MolVS tautomer

* short test file for MolVS validate

* long test file for MolVS standardize smiles

* long test file for MolVS fragment

* long test file for MolVS metals

* long test file for MolVS normalize

* long test file for MolVS reionize

* long test file for MolVS tautomer

* long test file for MolVS validate

* Unit tests for MolVS steps

* dropping support for Python2

* molvs/__init__.py

* molvs/charge.py

* molvs/errors.py

* molvs/fragment.py

* molvs/metal.py

* molvs/normalize.py

* molvs/resonance.py

* molvs/standardize.py

* molvs/tautomer.py

* molvs/utils.py

* molvs/validate.py

* molvs/validations.py

* molvs/cli.py

* adapted and renamed molvs/cli.py to work within $RDBASE/Contrib/MolVS/

* setup MolStandardize directories, source with empty cleanup function, header, CMake files

* corrections to empty source, header and test1.cpp

* adding empty functions and initializers to MolStandardize

* empty Metal source, header and added test

* added most of Metal.cpp functionality and made some more tests

* empty functions and initializers to Normalize

* empty functions and initializers to Validate

* added most code for RDKitDefault mode, along with some tests

* restructure for abstract base class ValidateMethod

* written in isNoneValidation for MolVSValidation

* took out isNoneValidation, put in noAtomValidation, neutralValidation, isotopeValidation for MolVSValidation

* added in AllowedAtoms

* added in disallowedAtoms

* corrections to Validate

* added code for FragmentRemover

* extended fragment functionality to include choose largest fragment, added in tests for fragment catalog, fragment remover. Also added fragmentValidation method in MolStandardize

* added another test to testValidate test_fragment

* corrections to fragment

* corrections to Metal

* added code for Normalize

* added normalize member function to MolStandardize and added tests

* added multi fragment functionality to Normalize.cpp and additional tests

* TransformCatalog

* tests for Normalize.cpp

* first bit of cleanup

* added most of Charge functionality and some tests

* some corrections to Charge.cpp and some more tests to testCharge.cpp

* corrections to Charge.cpp

* start of Tautomer Enumerate with some tests

* added BondType option to Tautomer Enumeration

* correcting for some memory leakage

* a few alterations to formatting

* sorting out some memory leaks

* sorting out some memory leaks

* some corrections for PCS test set

* redo tests with updated RDKit

* fixing memory leak

* more fixes after 100kPCS set testing

* using tab as delimiter in CSVs rather than comma

* tutorial for MolStandardize

* still working on Tautomer enumeration

* deleted some empty tests

* starting writing tautomer canonicalize

* rename test_data -> data (the source still needs to be updated)

* automatic source reformatting

* adjust to directory rename

* move the fragment catalog test into the MolStandardize directory
do not create separate library for FragmentCatalog

* stop building separate libraries for the catalogs

* move the CleanupParameters into the MolStandardize namespace

* first pass at python wrapper

* move the py module to the correct dir;
add some python tests;
add standardizeSmiles to python wrapper

* disabling the compareMolVSTest since that requires command line arguments to run

* get this building on windows

* put the python lib in the right place

* further work on python wrapper for rdMolStandardize

* added get and set functions to Metal and wrapped them

* added get and set functions to Metal and wrapped them

* changed constructor of Reionizer class and input args for reionize, wrapped this default

* overload Reionizer constructor so user can input own AcidBaseFile from python

* added Uncharger class to Charge and added test for Uncharger

* wrapped Fragment, fixed some memory leakage, changed some args and return types, added some tests

* wrapped Normalized and changed how Normalizer class is initiated

* changing MolVSValidation structure so user can choose which MolVS submethod they want

* starting to write Wrap for Validate

* now it compiles with Wrap/Validate.cpp

* a couple refactorings around validate

* move the validate code into the rdMolStandardize module

* make sure a valid pointer is returned for standardizeSmiles

* rdMolStandardize.MolVSValidation done and tests added

* half way through AllowedAtomsValidation

* finished AllowedAtomsValidation and DisallowedAtomsValidation

* moved charge, fragment, metal, normalize into the rdMolStandardize module

* changed tutorial to use wrapped code

* added copyrights

* added copyrights

* move the data files

* modify source files to adjust to the move

* added validateSmiles functionality

* removed std::cout

* redid some of the 100k PCS tests

* working on the tutorial

* adding some documentation

* deleting some comment lines

* some changes after pull review

* More changes after pull review

* start of trying to make java wrap

* remove some warnings, add some questions

* additional warning removals, a bit more reporting

* some test cleanups

* enable testing of the java code
2018-09-28 11:24:25 +02:00

289 lines
8.7 KiB
C++

//
// Copyright (C) 2018 Susan H. Leung
//
// @@ All Rights Reserved @@
// This file is part of the RDKit.
// The contents are covered by the terms of the BSD license
// which is included in the file license.txt, found at the root
// of the RDKit source tree.
//
#include "Validate.h"
#include "Fragment.h"
#include <GraphMol/RDKitBase.h>
#include <GraphMol/ROMol.h>
#include <GraphMol/MolStandardize/FragmentCatalog/FragmentCatalogParams.h>
#include <GraphMol/Substruct/SubstructMatch.h>
#include <iostream>
#include <vector>
#include <string>
#include <GraphMol/SmilesParse/SmilesParse.h>
#include <GraphMol/SmilesParse/SmilesWrite.h>
using namespace std;
using namespace RDKit;
namespace RDKit {
class RWMol;
class ROMol;
namespace MolStandardize {
std::vector<ValidationErrorInfo> RDKitValidation::validate(
const ROMol &mol, bool reportAllFailures) const {
ROMol molCopy = mol;
std::vector<ValidationErrorInfo> errors;
unsigned int na = mol.getNumAtoms();
if (!na) {
errors.push_back(
ValidationErrorInfo("ERROR: [NoAtomValidation] Molecule has no atoms"));
}
// loop over atoms
for (size_t i = 0; i < na; ++i) {
if (!reportAllFailures) {
if (errors.size() >= 1) {
break;
}
}
Atom *atom = molCopy.getAtomWithIdx(i);
try {
atom->calcExplicitValence();
} catch (const MolSanitizeException &e) {
errors.push_back(ValidationErrorInfo("INFO: [ValenceValidation] " +
std::string(e.message())));
}
}
return errors;
}
// Appends an ERROR entry to `errors` when the molecule contains no atoms.
//
// mol               - molecule to inspect
// reportAllFailures - ignored; this check produces at most one entry
// errors            - output list the entry is appended to
void NoAtomValidation::run(const ROMol &mol, bool reportAllFailures,
                           std::vector<ValidationErrorInfo> &errors) const {
  RDUNUSED_PARAM(reportAllFailures);
  if (mol.getNumAtoms() != 0) {
    return;
  }
  errors.push_back(
      ValidationErrorInfo("ERROR: [NoAtomValidation] Molecule has no atoms"));
}
void FragmentValidation::run(const ROMol &mol, bool reportAllFailures,
std::vector<ValidationErrorInfo> &errors) const {
// REVIEW: reportAllFailures is not being used here. is that correct?
std::string rdbase = getenv("RDBASE");
std::string fgrpFile = rdbase + "/Data/MolStandardize/fragmentPatterns.txt";
std::shared_ptr<FragmentCatalogParams> fparams(
new FragmentCatalogParams(fgrpFile));
FragmentCatalog fcat(fparams.get());
const std::vector<std::shared_ptr<ROMol>> &fgrps = fparams->getFuncGroups();
INT_VECT mapping;
VECT_INT_VECT atom_mapping;
std::vector<ROMOL_SPTR> frags =
MolOps::getMolFrags(mol, true, &mapping, &atom_mapping);
for (auto &fgrp : fgrps) {
std::string fname;
fgrp->getProp(common_properties::_Name, fname);
std::vector<RDKit::MatchVectType> res;
unsigned int matches = SubstructMatch(mol, *fgrp, res);
// std::cout << fname << " matches " << matches << std::endl;
if (matches != 0 && frags.size() != 0) {
VECT_INT_VECT substructmap; // store idxs of frag from substructmatch
for (const auto &match : res) {
std::vector<int> vec;
for (const auto &pair : match) {
vec.push_back(pair.second);
}
substructmap.push_back(vec);
}
// to stop the same fragment being reported many times if present
// multiple times in molecule
bool fpresent = false;
for (auto &molfragidx : atom_mapping) {
std::sort(molfragidx.begin(), molfragidx.end());
for (auto &substructidx : substructmap) {
std::sort(substructidx.begin(), substructidx.end());
// // help to debug...
// std::cout << "molfragidx: " <<
// std::endl; for (const auto
// &i : molfragidx)
// {
// std::cout << i; }; std::cout
// << std::endl; std::cout <<
//"substructidx: " << std::endl;
// for (const auto &i : substructidx) { std::cout << i; };
// std::cout <<
// std::endl;
// //
if ((molfragidx == substructidx) && !fpresent) {
std::string msg = fname + " is present";
errors.push_back(
ValidationErrorInfo("INFO: [FragmentValidation] " + msg));
fpresent = true;
}
}
}
}
}
}
// Appends an INFO entry to `errors` when the molecule's overall formal charge
// is non-zero. The charge is printed with an explicit sign, e.g. "(+1)" or
// "(-2)".
//
// mol               - molecule to inspect
// reportAllFailures - ignored; this check produces at most one entry
// errors            - output list the entry is appended to
void NeutralValidation::run(const ROMol &mol, bool reportAllFailures,
                            std::vector<ValidationErrorInfo> &errors) const {
  RDUNUSED_PARAM(reportAllFailures);
  const int netCharge = RDKit::MolOps::getFormalCharge(mol);
  if (netCharge == 0) {
    return;
  }
  // std::to_string already emits '-' for negative values; only positive
  // charges need the sign added by hand.
  const std::string sign = netCharge > 0 ? "+" : "";
  const std::string chargeText = sign + std::to_string(netCharge);
  const std::string msg =
      "Not an overall neutral system (" + chargeText + ')';
  errors.push_back(ValidationErrorInfo("INFO: [NeutralValidation] " + msg));
}
void IsotopeValidation::run(const ROMol &mol, bool reportAllFailures,
std::vector<ValidationErrorInfo> &errors) const {
unsigned int na = mol.getNumAtoms();
std::set<string> isotopes;
// loop over atoms
for (size_t i = 0; i < na; ++i) {
if (!reportAllFailures) {
if (errors.size() >= 1) {
break;
}
}
const Atom *atom = mol.getAtomWithIdx(i);
unsigned int isotope = atom->getIsotope();
if (isotope != 0) {
std::string symbol = atom->getSymbol();
isotopes.insert(std::to_string(isotope) + symbol);
}
}
for (auto &isotope : isotopes) {
errors.push_back(ValidationErrorInfo(
"INFO: [IsotopeValidation] Molecule contains isotope " + isotope));
}
}
// constructor
MolVSValidation::MolVSValidation() {
std::vector<boost::shared_ptr<MolVSValidations>> validations = {
boost::make_shared<NoAtomValidation>(),
boost::make_shared<FragmentValidation>(),
boost::make_shared<NeutralValidation>(),
boost::make_shared<IsotopeValidation>()};
this->d_validations = validations;
}
// Constructor taking a caller-supplied list of checks; validate() runs them
// in the order given.
MolVSValidation::MolVSValidation(
    const std::vector<boost::shared_ptr<MolVSValidations>> validations)
    : d_validations(validations) {}
// Copy constructor: copies the list of check pointers, so the new object
// refers to the same check instances as `other`.
MolVSValidation::MolVSValidation(const MolVSValidation &other)
    : d_validations(other.d_validations) {}
// Destructor: nothing to release by hand; the members clean up after
// themselves.
MolVSValidation::~MolVSValidation() = default;
// Runs every configured check against `mol`, in the order the checks are
// stored, and returns everything they reported.
//
// mol               - molecule to inspect
// reportAllFailures - forwarded unchanged to each check
// Returns the accumulated entries; all checks append to the same list.
std::vector<ValidationErrorInfo> MolVSValidation::validate(
    const ROMol &mol, bool reportAllFailures) const {
  std::vector<ValidationErrorInfo> errors;
  // Bind by reference: copying the shared pointer on every iteration would
  // only add needless reference-count updates.
  for (const auto &method : this->d_validations) {
    method->run(mol, reportAllFailures, errors);
  }
  return errors;
}
std::vector<ValidationErrorInfo> AllowedAtomsValidation::validate(
const ROMol &mol, bool reportAllFailures) const {
std::vector<ValidationErrorInfo> errors;
unsigned int na = mol.getNumAtoms();
for (size_t i = 0; i < na; ++i) {
if (!reportAllFailures) {
if (errors.size() >= 1) {
break;
}
}
const Atom *qatom = mol.getAtomWithIdx(i);
bool match = false;
// checks to see qatom matches one of list of allowedAtoms
for (const auto &allowedAtom : this->d_allowedList) {
if (allowedAtom->Match(qatom)) {
match = true;
break;
}
}
// if no match, append to list of errors.
if (!match) {
std::string symbol = qatom->getSymbol();
errors.push_back(
ValidationErrorInfo("INFO: [AllowedAtomsValidation] Atom " + symbol +
" is not in allowedAtoms list"));
}
}
return errors;
}
std::vector<ValidationErrorInfo> DisallowedAtomsValidation::validate(
const ROMol &mol, bool reportAllFailures) const {
std::vector<ValidationErrorInfo> errors;
unsigned int na = mol.getNumAtoms();
for (size_t i = 0; i < na; ++i) {
if (!reportAllFailures) {
if (errors.size() >= 1) {
break;
}
}
const Atom *qatom = mol.getAtomWithIdx(i);
bool match = false;
// checks to see qatom matches one of list of allowedAtoms
for (const auto &disallowedAtom : this->d_disallowedList) {
if (disallowedAtom->Match(qatom)) {
match = true;
}
}
// if no match, append to list of errors.
if (match) {
std::string symbol = qatom->getSymbol();
errors.push_back(
ValidationErrorInfo("INFO: [DisallowedAtomsValidation] Atom " +
symbol + " is in disallowedAtoms list"));
}
}
return errors;
}
std::vector<ValidationErrorInfo> validateSmiles(const std::string &smiles) {
RWMOL_SPTR mol(SmilesToMol(smiles));
if (!mol) {
std::string message =
"SMILES Parse Error: syntax error for input: " + smiles;
throw ValueErrorException(message);
}
MolVSValidation vm;
std::vector<ValidationErrorInfo> errors = vm.validate(*mol, true);
return errors;
}
} // namespace MolStandardize
} // namespace RDKit