mirror of
https://github.com/rdkit/rdkit.git
synced 2026-06-04 21:54:27 +08:00
* short test file for MolVS standardize_sm * short test file for MolVS fragment * short test file for MolVS metals * short test file for MolVS normalize * short test file for MolVS reionize * short test file for MolVS tautomer * short test file for MolVS validate * long test file for MolVS standardize smiles * long test file for MolVS fragment * long test file for MolVS metals * long test file for MolVS normalize * long test file for MolVS reionize * long test file for MolVS tautomer * long test file for MolVS validate * Unit tests for MolVS steps * dropping support for Python2 * molvs/__init__.py * molvs/charge.py * molvs/errors.py * molvs/fragment.py * molvs/metal.py * molvs/normalize.py * molvs/resonance.py * molvs/standardize.py * molvs/tautomer.py * molvs/utils.py * molvs/validate.py * molvs/validations.py * molvs/cli.py * adapted and renamed molvs/cli.py to work within $RDBASE/Contrib/MolVS/ * setup MolStandardize directories, source with empty cleanup function, header, CMake files * corrections to empty source, header and test1.cpp * adding empty functions and initializers to MolStandardize * empty Metal source, header and added test * added most of Metal.cpp functionality and made some more tests * empty functions and initializers to Normalize * empty functions and initializers to Validate * added most code for RDKitDefault mode, along with some tests * restructure for abstract base class ValidateMethod * written in isNoneValidation for MolVSValidation * took out isNoneValidation, put in noAtomValidation, neutralValidation, isotopeValidation for MolVSValidation * added in AllowedAtoms * added in disallowedAtoms * corrections to Validate * added code for FragmentRemover * extended fragment functionality to include choose largest fragment, added in tests for fragment catalog, fragment remover. 
Also added fragmentValidation method in MolStandardize * added another test to testValidate test_fragment * corrections to fragment * corrections to Metal * added code for Normalize * added normalize member function to MolStandardize and added tests * added multi fragment functionality to Normalize.cpp and additional tests * TransformCatalog * tests for Normalize.cpp * first bit of cleanup * added most of Charge functionality and some tests * some corrections to Charge.cpp and some more tests to testCharge.cpp * corrections to Charge.cpp * start of Tautomer Enumerate with some tests * added BondType option to Tautomer Enumeration * correcting for some memory leakage * a few alterations to formatting * sorting out some memory leaks * sorting out some memory leaks * some corrections for PCS test set * redo tests with updated RDKit * fixing memory leak * more fixes after 100kPCS set testing * using tab as delimiter in CSVs rather than comma * tutorial for MolStandardize * still working on Tautomer enumeration * deleted some empty tests * starting writing tautomer canonicalize * rename test_data -> data (the source still needs to be updated) * automatic source reformatting * adjust to directory rename * move the fragment catalog test into the MolStandardize directory do not create separate library for FragmentCatalog * stop building separate libraries for the catalogs * move the CleanupParameters into the MolStandardize namespace * first pass at python wrapper * move the py module to the correct dir; add some python tests; add standardizeSmiles to python wrapper * disabling the compareMolVSTest since that requires command line arguments to run * get this building on windows * put the python lib in the right place * further work on python wrapper for rdMolStandardize * added get and set functions to Metal and wrapped them * added get and set functions to Metal and wrapped them * changed construstor of Reionizer class and input args for reionize, wrapped this default * 
overload Reionizer constructor so user can input own AcidBaseFile from python * added Uncharger class to Charge and added test for Uncharger * wrapped Fragment, fixed some memory leakage, changed some args and return types, added some tests * wrapped Normalized and changed how Normalizer class is initiated * changing MolVSValidation structure so user can choose which MolVS submethod they want * starting to write Wrap for Validate * now it compiles with Wrap/Validate.cpp * a couple refactorings around validate * move the validate code into the rdMolStandardize module * make sure a valid pointer is returned for standardizeSmiles * rdMolStandardize.MolVSValidation done and tests added * half way through AllowedAtomsValidation * finished AllowedAtomsValidation and DisallowedAtomsValidation * moved charge, fragment, metal, normalize into the rdMolStandardize module * changed tutorial to use wrapped code * added copyrights * added copyrights * move the data files * modify source files to adjust to the move * added validateSmiles functionality * removed std::cout * redid some of the 100k PCS tests * working on the tutorial * adding some documentation * deleting some comment lines * some changes after pull review * More changes after pull review * start of trying to make java wrap * remove some warnings, add some questions * additional warning removals, a bit more reporting * some test cleanups * enable testing of the java code
350 lines
11 KiB
C++
350 lines
11 KiB
C++
//
|
|
// Copyright (C) 2018 Susan H. Leung
|
|
//
|
|
// @@ All Rights Reserved @@
|
|
// This file is part of the RDKit.
|
|
// The contents are covered by the terms of the BSD license
|
|
// which is included in the file license.txt, found at the root
|
|
// of the RDKit source tree.
|
|
//
|
|
#include <string>
|
|
#include <GraphMol/RDKitBase.h>
|
|
#include <fstream>
|
|
#include <iostream>
|
|
#include <RDGeneral/BadFileException.h>
|
|
#include <boost/iostreams/filtering_streambuf.hpp>
|
|
#include <boost/iostreams/copy.hpp>
|
|
#include <boost/iostreams/filter/gzip.hpp>
|
|
#include <boost/algorithm/string.hpp>
|
|
#include <boost/tokenizer.hpp>
|
|
#include "Metal.h"
|
|
#include "Validate.h"
|
|
#include "MolStandardize.h"
|
|
#include <GraphMol/RDKitBase.h>
|
|
#include <GraphMol/SmilesParse/SmilesParse.h>
|
|
#include <GraphMol/SmilesParse/SmilesWrite.h>
|
|
#include <GraphMol/ROMol.h>
|
|
#include <RDGeneral/Invariant.h>
|
|
// Tokenizer over a std::string that splits on a configurable set of
// separator characters.
using tokenizer = boost::tokenizer<boost::char_separator<char>>;
|
|
|
|
using namespace RDKit;
|
|
|
|
// Identifies which standardization step is being compared and whether the
// short or the long reference data set is used. Kept as an unscoped enum
// because the values are streamed as plain integers elsewhere in this file.
enum RDKitStandardizeMode {
  StandardizeSmShort,
  StandardizeSmLong,
  FragmentShort,
  FragmentLong,
  TautomerShort,
  TautomerLong,
  ValidateShort,
  ValidateLong,
  MetalShort,
  MetalLong,
  NormalizeShort,
  NormalizeLong,
  ReionizeShort,
  ReionizeLong
};
|
|
|
|
// Translates the two command-line words into an RDKitStandardizeMode.
//
// argv1: name of the step to test. Recognised values are "Metal",
//        "Fragment", "StandardizeSm", "Validate", "Normalize" and "Reionize".
// argv2: size of the reference data set, either "short" or "long".
//
// Returns the matching mode and prints its numeric value to stdout.
// Throws ValueErrorException when the combination is not recognised.
RDKitStandardizeMode setMode(const std::string &argv1,
                             const std::string &argv2) {
  const bool useShort = (argv2 == "short");
  if (!useShort && argv2 != "long") {
    throw ValueErrorException("Invalid RDKit standardize mode");
  }

  // Every path below either assigns the mode or throws, so the variable is
  // never read while uninitialised.
  RDKitStandardizeMode standardize_mode;
  if (argv1 == "Metal") {
    standardize_mode = useShort ? RDKitStandardizeMode::MetalShort
                                : RDKitStandardizeMode::MetalLong;
  } else if (argv1 == "Fragment") {
    standardize_mode = useShort ? RDKitStandardizeMode::FragmentShort
                                : RDKitStandardizeMode::FragmentLong;
  } else if (argv1 == "StandardizeSm") {
    standardize_mode = useShort ? RDKitStandardizeMode::StandardizeSmShort
                                : RDKitStandardizeMode::StandardizeSmLong;
  } else if (argv1 == "Validate") {
    standardize_mode = useShort ? RDKitStandardizeMode::ValidateShort
                                : RDKitStandardizeMode::ValidateLong;
  } else if (argv1 == "Normalize") {
    standardize_mode = useShort ? RDKitStandardizeMode::NormalizeShort
                                : RDKitStandardizeMode::NormalizeLong;
  } else if (argv1 == "Reionize") {
    standardize_mode = useShort ? RDKitStandardizeMode::ReionizeShort
                                : RDKitStandardizeMode::ReionizeLong;
  } else {
    throw ValueErrorException("Invalid RDKit standardize mode");
  }

  std::cout << "Mode: " << standardize_mode << std::endl;

  return standardize_mode;
}
|
|
|
|
std::pair<std::string, std::string> readLine(const std::string &line) {
|
|
std::pair<std::string, std::string> smiles =
|
|
std::pair<std::string, std::string>("", "");
|
|
// empty line
|
|
if (line.length() == 0) {
|
|
return smiles;
|
|
}
|
|
if (line.substr(0, 2) == "//") {
|
|
// comment line
|
|
return smiles;
|
|
}
|
|
|
|
boost::char_separator<char> tabSep("\t");
|
|
tokenizer tokens(line, tabSep);
|
|
std::vector<std::string> result(tokens.begin(), tokens.end());
|
|
|
|
// line must have at least two tab separated values
|
|
if (result.size() < 2) {
|
|
std::cout << "Invalid line." << std::endl;
|
|
return smiles;
|
|
}
|
|
|
|
std::string smi = result[0];
|
|
boost::erase_all(smi, " ");
|
|
result.erase(result.begin()); // delete first element
|
|
|
|
std::string molvsSmi;
|
|
// dealing with multiple outputs from molvs
|
|
unsigned int counter = 0;
|
|
for (const auto &r : result) {
|
|
if (counter == 0) {
|
|
molvsSmi = r;
|
|
} else {
|
|
molvsSmi = molvsSmi + " " + r;
|
|
}
|
|
++counter;
|
|
}
|
|
|
|
// tokenizer::iterator token = tokens.begin();
|
|
//
|
|
// // smiles from PCS
|
|
// std::string smi = *token;
|
|
// boost::erase_all(smi, " ");
|
|
// ++token;
|
|
//
|
|
// // smiles after MolVS
|
|
// std::string molvsSmi = *token;
|
|
//// boost::erase_all(molvsSmi, " ");
|
|
// ++token;
|
|
|
|
return std::pair<std::string, std::string>(smi, molvsSmi);
|
|
}
|
|
|
|
std::vector<std::pair<std::string, std::string>> readCSV(
|
|
const RDKitStandardizeMode &func) {
|
|
std::string rdbase = std::getenv("RDBASE");
|
|
std::string filename;
|
|
switch (func) {
|
|
case RDKitStandardizeMode::MetalShort:
|
|
filename =
|
|
rdbase + "/rdkit/Chem/MolStandardize/test_data/1kPCS_metals.csv.gz";
|
|
break;
|
|
case RDKitStandardizeMode::MetalLong:
|
|
filename =
|
|
rdbase + "/rdkit/Chem/MolStandardize/test_data/100kPCS_metals.csv.gz";
|
|
break;
|
|
case RDKitStandardizeMode::StandardizeSmShort:
|
|
filename =
|
|
rdbase +
|
|
"/rdkit/Chem/MolStandardize/test_data/1kPCS_standardize_sm.csv.gz";
|
|
break;
|
|
case RDKitStandardizeMode::StandardizeSmLong:
|
|
filename =
|
|
rdbase +
|
|
"/rdkit/Chem/MolStandardize/test_data/100kPCS_standardize_sm.csv.gz";
|
|
break;
|
|
case RDKitStandardizeMode::ValidateShort:
|
|
filename =
|
|
rdbase + "/rdkit/Chem/MolStandardize/test_data/1kPCS_validate.csv.gz";
|
|
// filename = "/data/dipper/leung/gsoc/downloads/1kPCS_validate.csv.gz";
|
|
break;
|
|
case RDKitStandardizeMode::ValidateLong:
|
|
filename = rdbase +
|
|
"/rdkit/Chem/MolStandardize/test_data/100kPCS_validate.csv.gz";
|
|
// filename = "/data/dipper/leung/gsoc/downloads/100kPCS_validate.csv.gz";
|
|
break;
|
|
case RDKitStandardizeMode::FragmentShort:
|
|
filename =
|
|
rdbase + "/rdkit/Chem/MolStandardize/test_data/1kPCS_fragment.csv.gz";
|
|
break;
|
|
case RDKitStandardizeMode::FragmentLong:
|
|
filename = rdbase +
|
|
"/rdkit/Chem/MolStandardize/test_data/100kPCS_fragment.csv.gz";
|
|
break;
|
|
case RDKitStandardizeMode::ReionizeShort:
|
|
filename =
|
|
rdbase + "/rdkit/Chem/MolStandardize/test_data/1kPCS_reionize.csv.gz";
|
|
break;
|
|
case RDKitStandardizeMode::ReionizeLong:
|
|
filename = rdbase +
|
|
"/rdkit/Chem/MolStandardize/test_data/100kPCS_reionize.csv.gz";
|
|
break;
|
|
case RDKitStandardizeMode::NormalizeShort:
|
|
filename = rdbase +
|
|
"/rdkit/Chem/MolStandardize/test_data/1kPCS_normalize.csv.gz";
|
|
break;
|
|
case RDKitStandardizeMode::NormalizeLong:
|
|
filename =
|
|
rdbase +
|
|
"/rdkit/Chem/MolStandardize/test_data/100kPCS_normalize.csv.gz";
|
|
break;
|
|
}
|
|
|
|
std::cout << "Reading: " << filename << std::endl;
|
|
|
|
std::ifstream file(filename, std::ios_base::in | std::ios_base::binary);
|
|
if (!file) {
|
|
std::cerr << "Unable to open file.\n";
|
|
exit(1); // call system to stop
|
|
}
|
|
boost::iostreams::filtering_streambuf<boost::iostreams::input> inbuf;
|
|
inbuf.push(boost::iostreams::gzip_decompressor());
|
|
inbuf.push(file);
|
|
// Convert streambuf to istream
|
|
std::istream instream(&inbuf);
|
|
// Iterate lines
|
|
std::string line;
|
|
std::vector<std::pair<std::string, std::string>> res;
|
|
while (std::getline(instream, line)) {
|
|
std::pair<std::string, std::string> smiles = readLine(line);
|
|
// smiles.first and smiles.second will be empty for comment lines
|
|
if (!(smiles.first == "" & smiles.second == "")) {
|
|
res.push_back(smiles);
|
|
}
|
|
}
|
|
// Cleanup
|
|
file.close();
|
|
return res;
|
|
}
|
|
|
|
std::string rdkitMolStandardizeMetal(const std::string &smi) {
|
|
MolStandardize::MetalDisconnector md;
|
|
std::unique_ptr<RWMol> m(SmilesToMol(smi));
|
|
md.disconnect(*m);
|
|
// std::cout << "Rdkit standardize: " << MolToSmiles(*m) << std::endl;
|
|
return MolToSmiles(*m);
|
|
}
|
|
|
|
std::string rdkitMolStandardizeReionize(const std::string &smi) {
|
|
MolStandardize::CleanupParameters params;
|
|
std::unique_ptr<RWMol> m(SmilesToMol(smi));
|
|
RWMOL_SPTR reionized(MolStandardize::reionize(m.get(), params));
|
|
return MolToSmiles(*reionized);
|
|
}
|
|
|
|
std::string rdkitMolStandardizeNormalize(const std::string &smi) {
|
|
MolStandardize::CleanupParameters params;
|
|
std::unique_ptr<RWMol> m(SmilesToMol(smi));
|
|
RWMOL_SPTR normalized(MolStandardize::normalize(m.get(), params));
|
|
return MolToSmiles(*normalized);
|
|
}
|
|
|
|
std::string rdkitMolStandardizeFragment(const std::string &smi) {
|
|
MolStandardize::CleanupParameters params;
|
|
std::unique_ptr<RWMol> m(SmilesToMol(smi));
|
|
RWMOL_SPTR fragmentParent(MolStandardize::fragmentParent(*m, params));
|
|
return MolToSmiles(*fragmentParent);
|
|
}
|
|
|
|
std::string rdkitMolStandardizeStandardizeSm(const std::string &smi) {
|
|
return MolStandardize::standardizeSmiles(smi);
|
|
}
|
|
|
|
std::string rdkitMolStandardizeValidate(const std::string &smi) {
|
|
MolStandardize::MolVSValidation vm;
|
|
std::unique_ptr<RWMol> m(SmilesToMol(smi));
|
|
std::vector<MolStandardize::ValidationErrorInfo> errout =
|
|
vm.validate(*m, true);
|
|
std::string res;
|
|
if (errout.size() != 0) {
|
|
unsigned int counter = 0;
|
|
for (const auto &err : errout) {
|
|
if (counter == 0) {
|
|
res = err.message();
|
|
} else {
|
|
res = res + " " + err.message();
|
|
}
|
|
++counter;
|
|
}
|
|
} else {
|
|
res = "[]";
|
|
}
|
|
return res;
|
|
}
|
|
|
|
void testfunc(const std::vector<std::pair<std::string, std::string>> &molvs_res,
|
|
const RDKitStandardizeMode &func) {
|
|
for (const auto &pair : molvs_res) {
|
|
std::string smi = pair.first;
|
|
std::string rdkit_smi;
|
|
|
|
switch (func) {
|
|
case RDKitStandardizeMode::MetalShort:
|
|
case RDKitStandardizeMode::MetalLong:
|
|
rdkit_smi = rdkitMolStandardizeMetal(smi);
|
|
break;
|
|
case RDKitStandardizeMode::ReionizeShort:
|
|
case RDKitStandardizeMode::ReionizeLong:
|
|
rdkit_smi = rdkitMolStandardizeReionize(smi);
|
|
break;
|
|
case RDKitStandardizeMode::NormalizeShort:
|
|
case RDKitStandardizeMode::NormalizeLong:
|
|
rdkit_smi = rdkitMolStandardizeNormalize(smi);
|
|
break;
|
|
case RDKitStandardizeMode::FragmentShort:
|
|
case RDKitStandardizeMode::FragmentLong:
|
|
rdkit_smi = rdkitMolStandardizeFragment(smi);
|
|
break;
|
|
case RDKitStandardizeMode::StandardizeSmShort:
|
|
case RDKitStandardizeMode::StandardizeSmLong:
|
|
rdkit_smi = rdkitMolStandardizeStandardizeSm(smi);
|
|
break;
|
|
case RDKitStandardizeMode::ValidateShort:
|
|
case RDKitStandardizeMode::ValidateLong:
|
|
rdkit_smi = rdkitMolStandardizeValidate(smi);
|
|
break;
|
|
}
|
|
|
|
if (rdkit_smi != pair.second) {
|
|
std::cout << "RDKIT DOES NOT MATCH MOLVS" << std::endl;
|
|
std::cout << "smi, molvs standardize, rdkit standardize" << std::endl;
|
|
std::cout << pair.first << std::endl
|
|
<< pair.second << std::endl
|
|
<< rdkit_smi << std::endl;
|
|
}
|
|
|
|
TEST_ASSERT(rdkit_smi == pair.second);
|
|
}
|
|
}
|
|
|
|
int main(int argc, char *argv[]) {
|
|
RDKitStandardizeMode standardize_mode = setMode(argv[1], argv[2]);
|
|
std::vector<std::pair<std::string, std::string>> res =
|
|
readCSV(standardize_mode);
|
|
if (argc < 3) {
|
|
std::cerr << "Usage: " << argv[0] << "--func --short/long" << std::endl;
|
|
return 1;
|
|
}
|
|
|
|
testfunc(res, standardize_mode);
|
|
return 0;
|
|
}
|