Files
rdkit/Code/GraphMol/MolStandardize/testPCS.cpp
Susan Leung 956fdf268c Dev/GSOC2018_MolVS_Integration (#2002)
* short test file for MolVS standardize_sm

* short test file for MolVS fragment

* short test file for MolVS metals

* short test file for MolVS normalize

* short test file for MolVS reionize

* short test file for MolVS tautomer

* short test file for MolVS validate

* long test file for MolVS standardize smiles

* long test file for MolVS fragment

* long test file for MolVS metals

* long test file for MolVS normalize

* long test file for MolVS reionize

* long test file for MolVS tautomer

* long test file for MolVS validate

* Unit tests for MolVS steps

* dropping support for Python2

* molvs/__init__.py

* molvs/charge.py

* molvs/errors.py

* molvs/fragment.py

* molvs/metal.py

* molvs/normalize.py

* molvs/resonance.py

* molvs/standardize.py

* molvs/tautomer.py

* molvs/utils.py

* molvs/validate.py

* molvs/validations.py

* molvs/cli.py

* adapted and renamed molvs/cli.py to work within $RDBASE/Contrib/MolVS/

* setup MolStandardize directories, source with empty cleanup function, header, CMake files

* corrections to empty source, header and test1.cpp

* adding empty functions and initializers to MolStandardize

* empty Metal source, header and added test

* added most of Metal.cpp functionality and made some more tests

* empty functions and initializers to Normalize

* empty functions and initializers to Validate

* added most code for RDKitDefault mode, along with some tests

* restructure for abstract base class ValidateMethod

* written in isNoneValidation for MolVSValidation

* took out isNoneValidation, put in noAtomValidation, neutralValidation, isotopeValidation for MolVSValidation

* added in AllowedAtoms

* added in disallowedAtoms

* corrections to Validate

* added code for FragmentRemover

* extended fragment functionality to include choose largest fragment, added in tests for fragment catalog, fragment remover. Also added fragmentValidation method in MolStandardize

* added another test to testValidate test_fragment

* corrections to fragment

* corrections to Metal

* added code for Normalize

* added normalize member function to MolStandardize and added tests

* added multi fragment functionality to Normalize.cpp and additional tests

* TransformCatalog

* tests for Normalize.cpp

* first bit of cleanup

* added most of Charge functionality and some tests

* some corrections to Charge.cpp and some more tests to testCharge.cpp

* corrections to Charge.cpp

* start of Tautomer Enumerate with some tests

* added BondType option to Tautomer Enumeration

* correcting for some memory leakage

* a few alterations to formatting

* sorting out some memory leaks

* sorting out some memory leaks

* some corrections for PCS test set

* redo tests with updated RDKit

* fixing memory leak

* more fixes after 100kPCS set testing

* using tab as delimiter in CSVs rather than comma

* tutorial for MolStandardize

* still working on Tautomer enumeration

* deleted some empty tests

* starting writing tautomer canonicalize

* rename test_data -> data (the source still needs to be updated)

* automatic source reformatting

* adjust to directory rename

* move the fragment catalog test into the MolStandardize directory
do not create separate library for FragmentCatalog

* stop building separate libraries for the catalogs

* move the CleanupParameters into the MolStandardize namespace

* first pass at python wrapper

* move the py module to the correct dir;
add some python tests;
add standardizeSmiles to python wrapper

* disabling the compareMolVSTest since that requires command line arguments to run

* get this building on windows

* put the python lib in the right place

* further work on python wrapper for rdMolStandardize

* added get and set functions to Metal and wrapped them

* added get and set functions to Metal and wrapped them

* changed constructor of Reionizer class and input args for reionize, wrapped this default

* overload Reionizer constructor so user can input own AcidBaseFile from python

* added Uncharger class to Charge and added test for Uncharger

* wrapped Fragment, fixed some memory leakage, changed some args and return types, added some tests

* wrapped Normalized and changed how Normalizer class is initiated

* changing MolVSValidation structure so user can choose which MolVS submethod they want

* starting to write Wrap for Validate

* now it compiles with Wrap/Validate.cpp

* a couple refactorings around validate

* move the validate code into the rdMolStandardize module

* make sure a valid pointer is returned for standardizeSmiles

* rdMolStandardize.MolVSValidation done and tests added

* half way through AllowedAtomsValidation

* finished AllowedAtomsValidation and DisallowedAtomsValidation

* moved charge, fragment, metal, normalize into the rdMolStandardize module

* changed tutorial to use wrapped code

* added copyrights

* added copyrights

* move the data files

* modify source files to adjust to the move

* added validateSmiles functionality

* removed std::cout

* redid some of the 100k PCS tests

* working on the tutorial

* adding some documentation

* deleting some comment lines

* some changes after pull review

* More changes after pull review

* start of trying to make java wrap

* remove some warnings, add some questions

* additional warning removals, a bit more reporting

* some test cleanups

* enable testing of the java code
2018-09-28 11:24:25 +02:00

350 lines
11 KiB
C++

//
// Copyright (C) 2018 Susan H. Leung
//
// @@ All Rights Reserved @@
// This file is part of the RDKit.
// The contents are covered by the terms of the BSD license
// which is included in the file license.txt, found at the root
// of the RDKit source tree.
//
#include <string>
#include <GraphMol/RDKitBase.h>
#include <fstream>
#include <iostream>
#include <RDGeneral/BadFileException.h>
#include <boost/iostreams/filtering_streambuf.hpp>
#include <boost/iostreams/copy.hpp>
#include <boost/iostreams/filter/gzip.hpp>
#include <boost/algorithm/string.hpp>
#include <boost/tokenizer.hpp>
#include "Metal.h"
#include "Validate.h"
#include "MolStandardize.h"
#include <GraphMol/RDKitBase.h>
#include <GraphMol/SmilesParse/SmilesParse.h>
#include <GraphMol/SmilesParse/SmilesWrite.h>
#include <GraphMol/ROMol.h>
#include <RDGeneral/Invariant.h>
// Shorthand for a boost tokenizer driven by a char_separator; readLine uses it
// to split a line on tab characters.
typedef boost::tokenizer<boost::char_separator<char>> tokenizer;
// The rest of this file refers to RDKit names (RWMol, SmilesToMol,
// MolStandardize::...) without the RDKit:: qualifier.
using namespace RDKit;
// One entry per (standardization step, data set size) combination handled by
// this driver. "Short" and "Long" select the 1kPCS_* and 100kPCS_* reference
// files in readCSV. Enumerators are numbered 0..13 in declaration order, and
// that integer is what setMode prints.
enum RDKitStandardizeMode {
  StandardizeSmShort = 0,
  StandardizeSmLong,
  FragmentShort,
  FragmentLong,
  TautomerShort,
  TautomerLong,
  ValidateShort,
  ValidateLong,
  MetalShort,
  MetalLong,
  NormalizeShort,
  NormalizeLong,
  ReionizeShort,
  ReionizeLong
};
// Translates the two command-line words into an RDKitStandardizeMode.
//
// argv1 names the standardization step ("Metal", "Fragment", "StandardizeSm",
// "Validate", "Normalize" or "Reionize"); argv2 selects the data set size
// ("short" or "long"). The selected mode is echoed to stdout as its integer
// value.
//
// Throws ValueErrorException for any other combination. The Tautomer
// enumerators are not selectable here.
RDKitStandardizeMode setMode(const std::string &argv1,
                             const std::string &argv2) {
  const bool isShort = (argv2 == "short");
  const bool isLong = (argv2 == "long");
  if (!isShort && !isLong) {
    throw ValueErrorException("Invalid RDKit standardize mode");
  }

  // The mode is only ever assigned from a recognised step name; anything else
  // throws, so an indeterminate value can never be read or returned.
  RDKitStandardizeMode standardize_mode;
  if (argv1 == "Metal") {
    standardize_mode = isShort ? RDKitStandardizeMode::MetalShort
                               : RDKitStandardizeMode::MetalLong;
  } else if (argv1 == "Fragment") {
    standardize_mode = isShort ? RDKitStandardizeMode::FragmentShort
                               : RDKitStandardizeMode::FragmentLong;
  } else if (argv1 == "StandardizeSm") {
    standardize_mode = isShort ? RDKitStandardizeMode::StandardizeSmShort
                               : RDKitStandardizeMode::StandardizeSmLong;
  } else if (argv1 == "Validate") {
    standardize_mode = isShort ? RDKitStandardizeMode::ValidateShort
                               : RDKitStandardizeMode::ValidateLong;
  } else if (argv1 == "Normalize") {
    standardize_mode = isShort ? RDKitStandardizeMode::NormalizeShort
                               : RDKitStandardizeMode::NormalizeLong;
  } else if (argv1 == "Reionize") {
    standardize_mode = isShort ? RDKitStandardizeMode::ReionizeShort
                               : RDKitStandardizeMode::ReionizeLong;
  } else {
    throw ValueErrorException("Invalid RDKit standardize mode");
  }

  std::cout << "Mode: " << standardize_mode << std::endl;
  return standardize_mode;
}
std::pair<std::string, std::string> readLine(const std::string &line) {
std::pair<std::string, std::string> smiles =
std::pair<std::string, std::string>("", "");
// empty line
if (line.length() == 0) {
return smiles;
}
if (line.substr(0, 2) == "//") {
// comment line
return smiles;
}
boost::char_separator<char> tabSep("\t");
tokenizer tokens(line, tabSep);
std::vector<std::string> result(tokens.begin(), tokens.end());
// line must have at least two tab separated values
if (result.size() < 2) {
std::cout << "Invalid line." << std::endl;
return smiles;
}
std::string smi = result[0];
boost::erase_all(smi, " ");
result.erase(result.begin()); // delete first element
std::string molvsSmi;
// dealing with multiple outputs from molvs
unsigned int counter = 0;
for (const auto &r : result) {
if (counter == 0) {
molvsSmi = r;
} else {
molvsSmi = molvsSmi + " " + r;
}
++counter;
}
// tokenizer::iterator token = tokens.begin();
//
// // smiles from PCS
// std::string smi = *token;
// boost::erase_all(smi, " ");
// ++token;
//
// // smiles after MolVS
// std::string molvsSmi = *token;
//// boost::erase_all(molvsSmi, " ");
// ++token;
return std::pair<std::string, std::string>(smi, molvsSmi);
}
// Loads the gzip-compressed, tab-separated MolVS reference file for `func`
// and returns its (input SMILES, expected MolVS output) pairs.
//
// The file is looked up under $RDBASE/rdkit/Chem/MolStandardize/test_data/.
// Lines for which readLine returns two empty strings (blank, comment or
// malformed lines) are skipped.
//
// Terminates the process with exit status 1 when RDBASE is not set, when the
// mode has no reference file (the Tautomer modes), or when the file cannot be
// opened.
std::vector<std::pair<std::string, std::string>> readCSV(
    const RDKitStandardizeMode &func) {
  // std::getenv yields a null pointer for an unset variable, and building a
  // std::string from a null pointer is undefined behaviour, so check first.
  const char *rdbaseEnv = std::getenv("RDBASE");
  if (!rdbaseEnv) {
    std::cerr << "RDBASE environment variable is not set.\n";
    exit(1);
  }
  const std::string dataDir =
      std::string(rdbaseEnv) + "/rdkit/Chem/MolStandardize/test_data/";

  std::string filename;
  switch (func) {
    case RDKitStandardizeMode::MetalShort:
      filename = dataDir + "1kPCS_metals.csv.gz";
      break;
    case RDKitStandardizeMode::MetalLong:
      filename = dataDir + "100kPCS_metals.csv.gz";
      break;
    case RDKitStandardizeMode::StandardizeSmShort:
      filename = dataDir + "1kPCS_standardize_sm.csv.gz";
      break;
    case RDKitStandardizeMode::StandardizeSmLong:
      filename = dataDir + "100kPCS_standardize_sm.csv.gz";
      break;
    case RDKitStandardizeMode::ValidateShort:
      filename = dataDir + "1kPCS_validate.csv.gz";
      break;
    case RDKitStandardizeMode::ValidateLong:
      filename = dataDir + "100kPCS_validate.csv.gz";
      break;
    case RDKitStandardizeMode::FragmentShort:
      filename = dataDir + "1kPCS_fragment.csv.gz";
      break;
    case RDKitStandardizeMode::FragmentLong:
      filename = dataDir + "100kPCS_fragment.csv.gz";
      break;
    case RDKitStandardizeMode::ReionizeShort:
      filename = dataDir + "1kPCS_reionize.csv.gz";
      break;
    case RDKitStandardizeMode::ReionizeLong:
      filename = dataDir + "100kPCS_reionize.csv.gz";
      break;
    case RDKitStandardizeMode::NormalizeShort:
      filename = dataDir + "1kPCS_normalize.csv.gz";
      break;
    case RDKitStandardizeMode::NormalizeLong:
      filename = dataDir + "100kPCS_normalize.csv.gz";
      break;
    case RDKitStandardizeMode::TautomerShort:
    case RDKitStandardizeMode::TautomerLong:
      // No reference file is wired up for these modes.
      break;
  }
  if (filename.empty()) {
    std::cerr << "No reference file for this mode.\n";
    exit(1);
  }

  std::cout << "Reading: " << filename << std::endl;
  std::ifstream file(filename, std::ios_base::in | std::ios_base::binary);
  if (!file) {
    std::cerr << "Unable to open file.\n";
    exit(1);  // call system to stop
  }

  // Decompress on the fly: gzip filter in front of the raw file stream.
  boost::iostreams::filtering_streambuf<boost::iostreams::input> inbuf;
  inbuf.push(boost::iostreams::gzip_decompressor());
  inbuf.push(file);
  // Convert streambuf to istream
  std::istream instream(&inbuf);

  std::string line;
  std::vector<std::pair<std::string, std::string>> res;
  while (std::getline(instream, line)) {
    std::pair<std::string, std::string> smiles = readLine(line);
    // Both halves are empty for lines readLine rejected.
    if (!(smiles.first.empty() && smiles.second.empty())) {
      res.push_back(smiles);
    }
  }
  // The filter chain and the file are released when they go out of scope.
  return res;
}
std::string rdkitMolStandardizeMetal(const std::string &smi) {
MolStandardize::MetalDisconnector md;
std::unique_ptr<RWMol> m(SmilesToMol(smi));
md.disconnect(*m);
// std::cout << "Rdkit standardize: " << MolToSmiles(*m) << std::endl;
return MolToSmiles(*m);
}
std::string rdkitMolStandardizeReionize(const std::string &smi) {
MolStandardize::CleanupParameters params;
std::unique_ptr<RWMol> m(SmilesToMol(smi));
RWMOL_SPTR reionized(MolStandardize::reionize(m.get(), params));
return MolToSmiles(*reionized);
}
std::string rdkitMolStandardizeNormalize(const std::string &smi) {
MolStandardize::CleanupParameters params;
std::unique_ptr<RWMol> m(SmilesToMol(smi));
RWMOL_SPTR normalized(MolStandardize::normalize(m.get(), params));
return MolToSmiles(*normalized);
}
std::string rdkitMolStandardizeFragment(const std::string &smi) {
MolStandardize::CleanupParameters params;
std::unique_ptr<RWMol> m(SmilesToMol(smi));
RWMOL_SPTR fragmentParent(MolStandardize::fragmentParent(*m, params));
return MolToSmiles(*fragmentParent);
}
// Thin adapter so the SMILES-to-SMILES standardizer has the same
// string-in / string-out shape as the other rdkitMolStandardize* helpers.
std::string rdkitMolStandardizeStandardizeSm(const std::string &smi) {
  std::string standardized = MolStandardize::standardizeSmiles(smi);
  return standardized;
}
std::string rdkitMolStandardizeValidate(const std::string &smi) {
MolStandardize::MolVSValidation vm;
std::unique_ptr<RWMol> m(SmilesToMol(smi));
std::vector<MolStandardize::ValidationErrorInfo> errout =
vm.validate(*m, true);
std::string res;
if (errout.size() != 0) {
unsigned int counter = 0;
for (const auto &err : errout) {
if (counter == 0) {
res = err.message();
} else {
res = res + " " + err.message();
}
++counter;
}
} else {
res = "[]";
}
return res;
}
// Runs the RDKit implementation selected by `func` on every input SMILES in
// `molvs_res` and compares the output with the stored MolVS result.
//
// On the first mismatch the input, the MolVS value and the RDKit value are
// printed to stdout and TEST_ASSERT fails. Short and Long variants of a mode
// use the same RDKit helper; modes with no helper leave the RDKit value empty.
void testfunc(const std::vector<std::pair<std::string, std::string>> &molvs_res,
              const RDKitStandardizeMode &func) {
  for (const auto &pair : molvs_res) {
    const std::string &input = pair.first;
    std::string rdkit_smi;

    if (func == RDKitStandardizeMode::MetalShort ||
        func == RDKitStandardizeMode::MetalLong) {
      rdkit_smi = rdkitMolStandardizeMetal(input);
    } else if (func == RDKitStandardizeMode::ReionizeShort ||
               func == RDKitStandardizeMode::ReionizeLong) {
      rdkit_smi = rdkitMolStandardizeReionize(input);
    } else if (func == RDKitStandardizeMode::NormalizeShort ||
               func == RDKitStandardizeMode::NormalizeLong) {
      rdkit_smi = rdkitMolStandardizeNormalize(input);
    } else if (func == RDKitStandardizeMode::FragmentShort ||
               func == RDKitStandardizeMode::FragmentLong) {
      rdkit_smi = rdkitMolStandardizeFragment(input);
    } else if (func == RDKitStandardizeMode::StandardizeSmShort ||
               func == RDKitStandardizeMode::StandardizeSmLong) {
      rdkit_smi = rdkitMolStandardizeStandardizeSm(input);
    } else if (func == RDKitStandardizeMode::ValidateShort ||
               func == RDKitStandardizeMode::ValidateLong) {
      rdkit_smi = rdkitMolStandardizeValidate(input);
    }

    // Report the offending triple before the assertion below fires.
    if (!(rdkit_smi == pair.second)) {
      std::cout << "RDKIT DOES NOT MATCH MOLVS" << std::endl
                << "smi, molvs standardize, rdkit standardize" << std::endl
                << pair.first << std::endl
                << pair.second << std::endl
                << rdkit_smi << std::endl;
    }
    TEST_ASSERT(rdkit_smi == pair.second);
  }
}
// Entry point: testPCS <step> <short|long>
//
// Resolves the mode from the two arguments, loads the matching MolVS
// reference file and checks the RDKit output against every entry in it.
// Returns 1 with a usage message when fewer than two arguments are given.
int main(int argc, char *argv[]) {
  // Validate the argument count before touching argv[1] / argv[2].
  if (argc < 3) {
    std::cerr << "Usage: " << (argc > 0 ? argv[0] : "testPCS")
              << " <Metal|Fragment|StandardizeSm|Validate|Normalize|Reionize>"
              << " <short|long>" << std::endl;
    return 1;
  }
  RDKitStandardizeMode standardize_mode = setMode(argv[1], argv[2]);
  std::vector<std::pair<std::string, std::string>> res =
      readCSV(standardize_mode);
  testfunc(res, standardize_mode);
  return 0;
}