Files
rdkit/Code/GraphMol/MolStandardize/testCharge.cpp
Greg Landrum f5a54af475 A collection of MolStandardize improvements (#4118)
* Swap to using a data structure for default normalization parameters

* bring the default fragment data into the code too

* cleanup

* add reionizer parameters via data

change fragment parse failures to ValueErrorExceptions

* tautomer parameters in the code

* got a little over-enthusiastic in that last cleanup

* use boost::flyweight to cache normalization and charge data params

* a bit more cleanup

* support reading params from JSON

* fragments from JSON
single-call for fragment removal

* add a one-liner for the canonical tautomer

* quick refactor

* Fixes #4115

* complete the parents

* docs

* move the definitions to a namespace and make them const

* see if switching to c++14 fixes the CI compile problems with g++ 5.5

* somewhat uglier way of solving the initializer list problem
2021-05-19 09:11:23 +02:00

391 lines
14 KiB
C++

//
// Copyright (C) 2018 Susan H. Leung
//
// @@ All Rights Reserved @@
// This file is part of the RDKit.
// The contents are covered by the terms of the BSD license
// which is included in the file license.txt, found at the root
// of the RDKit source tree.
//
#include <RDGeneral/test.h>
#include "MolStandardize.h"
#include <GraphMol/MolStandardize/AcidBaseCatalog/AcidBaseCatalogParams.h>
#include <GraphMol/MolStandardize/AcidBaseCatalog/AcidBaseCatalogUtils.h>
#include "Charge.h"
#include <GraphMol/SmilesParse/SmilesParse.h>
#include <GraphMol/SmilesParse/SmilesWrite.h>
#include <GraphMol/FileParsers/FileParsers.h>
using namespace RDKit;
using namespace MolStandardize;
void testReionizer() {
BOOST_LOG(rdDebugLog) << "-----------------------\n test reionizer"
<< std::endl;
std::string smi1, smi2, smi3, smi4, smi5, smi6, smi7;
Reionizer reionizer;
// Test table salt.
smi1 = "[Na].[Cl]";
std::shared_ptr<ROMol> m1(SmilesToMol(smi1));
ROMOL_SPTR reionized(reionizer.reionize(*m1));
TEST_ASSERT(MolToSmiles(*reionized) == "[Cl-].[Na+]");
// Test forced charge correction maintaining overall neutral charge.
smi2 = "[Na].O=C(O)c1ccccc1";
std::shared_ptr<ROMol> m2(SmilesToMol(smi2));
ROMOL_SPTR reionized2(reionizer.reionize(*m2));
TEST_ASSERT(MolToSmiles(*reionized2) == "O=C([O-])c1ccccc1.[Na+]");
// Test reionizer moves proton to weaker acid.
smi3 = "C1=C(C=CC(=C1)[S]([O-])=O)[S](O)(=O)=O";
std::shared_ptr<ROMol> m3(SmilesToMol(smi3));
ROMOL_SPTR reionized3(reionizer.reionize(*m3));
TEST_ASSERT(MolToSmiles(*reionized3) == "O=S(O)c1ccc(S(=O)(=O)[O-])cc1");
// Test reionizer moves proton to weaker acid.
smi5 = "C1=C(C=CC(=C1)[S]([O-])=O)[S](O)(=O)=O";
std::shared_ptr<ROMol> m5(SmilesToMol(smi5));
ROMOL_SPTR reionized5(reionizer.reionize(*m5));
TEST_ASSERT(MolToSmiles(*reionized3) == "O=S(O)c1ccc(S(=O)(=O)[O-])cc1");
// Test charged carbon doesn't get recognised as alpha-carbon-hydrogen-keto.
smi6 = "CCOC(=O)C(=O)[CH-]C#N";
std::shared_ptr<ROMol> m6(SmilesToMol(smi6));
ROMOL_SPTR reionized6(reionizer.reionize(*m6));
TEST_ASSERT(MolToSmiles(*reionized6) == "CCOC(=O)C(=O)[CH-]C#N");
// TODO... can't make this work. Python SanitizeMol looks to correct...
// what is different with MolOps::sanitizeMol?
smi7 = "C[N+]1=C[CH-]N(C(=N)N)/C1=C/[N+](=O)[O-]";
std::shared_ptr<ROMol> m7(SmilesToMol(smi7));
ROMOL_SPTR reionized7(reionizer.reionize(*m7));
TEST_ASSERT(MolToSmiles(*reionized7) ==
"C[N+]1=CCN(C(=N)N)/C1=[C-]/[N+](=O)[O-]");
BOOST_LOG(rdDebugLog) << "Finished" << std::endl;
}
void testChargeParent() {
BOOST_LOG(rdDebugLog) << "-----------------------\n test charge parent"
<< std::endl;
MolStandardize::CleanupParameters params;
// initialize CleanupParameters with preferOrganic=true
MolStandardize::CleanupParameters params_preferorg;
params_preferorg.preferOrganic = true;
// Test neutralization of ionized acids and bases.
auto m1 = "C(C(=O)[O-])(Cc1n[n-]nn1)(C[NH3+])(C[N+](=O)[O-])"_smiles;
std::unique_ptr<RWMol> res1(MolStandardize::chargeParent(*m1, params));
TEST_ASSERT(MolToSmiles(*res1) == "NCC(Cc1nn[nH]n1)(C[N+](=O)[O-])C(=O)O");
// Test preservation of zwitterion.
auto m2 = "n(C)1cc[n+]2cccc([O-])c12"_smiles;
std::unique_ptr<RWMol> res2(MolStandardize::chargeParent(*m2, params));
TEST_ASSERT(MolToSmiles(*res2) == "Cn1cc[n+]2cccc([O-])c12");
// Choline should be left with a positive charge.
auto m3 = "C[N+](C)(C)CCO"_smiles;
std::unique_ptr<RWMol> res3(MolStandardize::chargeParent(*m3, params));
TEST_ASSERT(MolToSmiles(*res3) == "C[N+](C)(C)CCO");
// Hydrogen should be removed to give deanol as a charge parent.
auto m4 = "C[NH+](C)CCO"_smiles;
std::unique_ptr<RWMol> res4(MolStandardize::chargeParent(*m4, params));
TEST_ASSERT(MolToSmiles(*res4) == "CN(C)CCO");
// Sodium benzoate to benzoic acid.
auto m5 = "[Na+].O=C([O-])c1ccccc1"_smiles;
std::unique_ptr<RWMol> res5(MolStandardize::chargeParent(*m5, params));
TEST_ASSERT(MolToSmiles(*res5) == "O=C(O)c1ccccc1");
// Benzoate ion to benzoic acid.
auto m6 = "O=C([O-])c1ccccc1"_smiles;
std::unique_ptr<RWMol> res6(MolStandardize::chargeParent(*m6, params));
TEST_ASSERT(MolToSmiles(*res6) == "O=C(O)c1ccccc1");
// Charges in histidine should be neutralized.
auto m7 = "[NH3+]C(Cc1cnc[nH]1)C(=O)[O-]"_smiles;
std::unique_ptr<RWMol> res7(MolStandardize::chargeParent(*m7, params));
TEST_ASSERT(MolToSmiles(*res7) == "NC(Cc1cnc[nH]1)C(=O)O");
//
auto m8 = "C[NH+](C)(C).[Cl-]"_smiles;
std::unique_ptr<RWMol> res8(MolStandardize::chargeParent(*m8, params));
TEST_ASSERT(MolToSmiles(*res8) == "CN(C)C");
// No organic fragments.
auto m9 = "[N+](=O)([O-])[O-]"_smiles;
std::unique_ptr<RWMol> res9(MolStandardize::chargeParent(*m9, params));
TEST_ASSERT(MolToSmiles(*res9) == "O=[N+]([O-])O");
// TODO switch prefer_organic=true
// No organic fragments.
auto m10 = "[N+](=O)([O-])[O-]"_smiles;
std::unique_ptr<RWMol> res10(
MolStandardize::chargeParent(*m10, params_preferorg));
TEST_ASSERT(MolToSmiles(*res10) == "O=[N+]([O-])O");
// Larger inorganic fragment should be chosen.
auto m11 = "[N+](=O)([O-])[O-].[CH2]"_smiles;
std::unique_ptr<RWMol> res11(MolStandardize::chargeParent(*m11, params));
TEST_ASSERT(MolToSmiles(*res11) == "O=[N+]([O-])O");
// TODO prefer_organic=true
// Smaller organic fragment should be chosen over larger inorganic fragment.
auto m12 = "[N+](=O)([O-])[O-].[CH2]"_smiles;
std::unique_ptr<RWMol> res12(
MolStandardize::chargeParent(*m12, params_preferorg));
TEST_ASSERT(MolToSmiles(*res12) == "[CH2]");
// do not completely neutralize zwitterions
auto m13 = "C[S+](=O)([O-])NC"_smiles;
std::unique_ptr<RWMol> res13(MolStandardize::chargeParent(*m13, params));
TEST_ASSERT(MolToSmiles(*res13) == "CN[S+](C)(=O)[O-]");
// standalone metal ion
auto m14 = "[Cu+2]"_smiles;
std::unique_ptr<RWMol> res14(MolStandardize::chargeParent(*m14));
TEST_ASSERT(MolToSmiles(*res14) == "[Cu+2]");
BOOST_LOG(rdDebugLog) << "Finished" << std::endl;
}
// Regression test for github #2144: chargeParent() is applied to a molecule
// and then again to its own output; both results must give the same SMILES
// as the input molecule.
void testGithub2144() {
  BOOST_LOG(rdDebugLog) << "-----------------------\n Testing github #2144: "
                           "Error when calling ChargeParent twice"
                        << std::endl;
  {
    auto pyridine = "c1ccccn1"_smiles;
    TEST_ASSERT(pyridine);

    // First application.
    std::unique_ptr<RWMol> firstPass(MolStandardize::chargeParent(*pyridine));
    TEST_ASSERT(firstPass);
    const std::string inputSmiles = MolToSmiles(*pyridine);
    TEST_ASSERT(MolToSmiles(*firstPass) == inputSmiles);

    // Second application, fed with the output of the first one.
    std::unique_ptr<RWMol> secondPass(
        MolStandardize::chargeParent(*firstPass));
    TEST_ASSERT(secondPass);
    TEST_ASSERT(MolToSmiles(*secondPass) == inputSmiles);
  }
  BOOST_LOG(rdDebugLog) << "Finished" << std::endl;
}
void testGithub2346() {
BOOST_LOG(rdDebugLog) << "-----------------------\n Testing github #2346: "
"uncharger behaves differently on molecules "
"constructed from mol blocks and SMILES"
<< std::endl;
{
auto m1 = "[NH3+]CC[O-]"_smiles;
TEST_ASSERT(m1);
MolStandardize::Uncharger uncharger;
std::unique_ptr<ROMol> res1(uncharger.uncharge(*m1));
TEST_ASSERT(res1);
TEST_ASSERT(res1->getAtomWithIdx(0)->getFormalCharge() == 0);
TEST_ASSERT(res1->getAtomWithIdx(1)->getFormalCharge() == 0);
std::unique_ptr<ROMol> m2(MolBlockToMol(MolToMolBlock(*m1)));
TEST_ASSERT(m2);
std::unique_ptr<ROMol> res2(uncharger.uncharge(*m2));
TEST_ASSERT(res2);
TEST_ASSERT(res2->getAtomWithIdx(0)->getFormalCharge() == 0);
TEST_ASSERT(res2->getAtomWithIdx(1)->getFormalCharge() == 0);
}
{
auto m1 = "[O-]C(=O)C([O-])C(=O)[O-]"_smiles;
TEST_ASSERT(m1);
MolStandardize::Uncharger uncharger;
std::unique_ptr<ROMol> res1(uncharger.uncharge(*m1));
TEST_ASSERT(res1);
for (auto &atom : res1->atoms()) {
TEST_ASSERT(atom->getFormalCharge() == 0);
}
std::unique_ptr<ROMol> m2(MolBlockToMol(MolToMolBlock(*m1)));
TEST_ASSERT(m2);
std::unique_ptr<ROMol> res2(uncharger.uncharge(*m2));
TEST_ASSERT(res2);
for (auto &atom : res2->atoms()) {
TEST_ASSERT(atom->getFormalCharge() == 0);
}
}
BOOST_LOG(rdDebugLog) << "Finished" << std::endl;
}
void testChargedAromatics() {
BOOST_LOG(rdDebugLog)
<< "-----------------------\n Testing charged aromatics: "
"need to sanitize after using uncharger"
<< std::endl;
{
auto cyclopentadienyl = "[cH-]1cccc1"_smiles;
TEST_ASSERT(cyclopentadienyl);
MolStandardize::Uncharger uncharger;
std::unique_ptr<ROMol> res(uncharger.uncharge(*cyclopentadienyl));
TEST_ASSERT(res.get());
TEST_ASSERT(MolToSmiles(*res) == "c1cccc1");
MolOps::sanitizeMol(*static_cast<RWMol *>(res.get()));
TEST_ASSERT(MolToSmiles(*res) == "C1=CCC=C1");
}
{
auto tropylium = "[cH+]1cccccc1"_smiles;
TEST_ASSERT(tropylium);
MolStandardize::Uncharger uncharger;
std::unique_ptr<ROMol> res(uncharger.uncharge(*tropylium));
TEST_ASSERT(res.get());
TEST_ASSERT(MolToSmiles(*res) == "c1cccccc1");
MolOps::sanitizeMol(*static_cast<RWMol *>(res.get()));
TEST_ASSERT(MolToSmiles(*res) == "C1=CC=CCC=C1");
}
{
auto azolium = "[NH2+]1C=CC=C1"_smiles;
TEST_ASSERT(azolium);
MolStandardize::Uncharger uncharger;
std::unique_ptr<ROMol> res(uncharger.uncharge(*azolium));
TEST_ASSERT(res.get());
TEST_ASSERT(MolToSmiles(*res) == "C1=CNC=C1");
MolOps::sanitizeMol(*static_cast<RWMol *>(res.get()));
TEST_ASSERT(MolToSmiles(*res) == "c1cc[nH]c1");
}
BOOST_LOG(rdDebugLog) << "Finished" << std::endl;
}
void testInorganicAcids() {
BOOST_LOG(rdDebugLog) << "-----------------------\n Testing inorganic acids"
<< std::endl;
MolStandardize::Uncharger uncharger;
std::vector<std::string> halogens{"Cl", "Br", "I"};
std::unique_ptr<ROMol> res;
for (const auto &halogen : halogens) {
std::unique_ptr<ROMol> hypohalite(SmilesToMol("[" + halogen + "][O-]"));
TEST_ASSERT(hypohalite);
res.reset(uncharger.uncharge(*hypohalite));
TEST_ASSERT(MolToSmiles(*res) == "O" + halogen);
std::unique_ptr<ROMol> halite(SmilesToMol("[" + halogen + "](=O)[O-]"));
TEST_ASSERT(halite);
res.reset(uncharger.uncharge(*halite));
TEST_ASSERT(MolToSmiles(*res) == "[O-][" + halogen + "+]O");
std::unique_ptr<ROMol> halate(SmilesToMol("[" + halogen + "](=O)(=O)[O-]"));
TEST_ASSERT(halate);
res.reset(uncharger.uncharge(*halate));
TEST_ASSERT(MolToSmiles(*res) == "[O-][" + halogen + "+2]([O-])O");
std::unique_ptr<ROMol> perhalate(
SmilesToMol("[" + halogen + "](=O)(=O)(=O)[O-]"));
TEST_ASSERT(perhalate);
res.reset(uncharger.uncharge(*perhalate));
TEST_ASSERT(MolToSmiles(*res) == "[O-][" + halogen + "+3]([O-])([O-])O");
}
{
auto hyponitrite = "[O-]N=N[O-]"_smiles;
TEST_ASSERT(hyponitrite);
res.reset(uncharger.uncharge(*hyponitrite));
TEST_ASSERT(MolToSmiles(*res) == "ON=NO");
}
{
auto nitrite = "N(=O)[O-]"_smiles;
TEST_ASSERT(nitrite);
res.reset(uncharger.uncharge(*nitrite));
TEST_ASSERT(MolToSmiles(*res) == "O=NO");
}
{
auto nitrate = "N(=O)(=O)[O-]"_smiles;
TEST_ASSERT(nitrate);
res.reset(uncharger.uncharge(*nitrate));
TEST_ASSERT(MolToSmiles(*res) == "O=[N+]([O-])O");
}
{
auto hyposulfite = "S([O-])[O-]"_smiles;
TEST_ASSERT(hyposulfite);
res.reset(uncharger.uncharge(*hyposulfite));
TEST_ASSERT(MolToSmiles(*res) == "OSO");
}
{
auto sulfite = "S(=O)([O-])[O-]"_smiles;
TEST_ASSERT(sulfite);
res.reset(uncharger.uncharge(*sulfite));
TEST_ASSERT(MolToSmiles(*res) == "O=S(O)O");
}
{
auto sulfate = "S(=O)(=O)([O-])[O-]"_smiles;
TEST_ASSERT(sulfate);
res.reset(uncharger.uncharge(*sulfate));
TEST_ASSERT(MolToSmiles(*res) == "O=S(=O)(O)O");
}
{
auto persulfate = "S(=O)(=O)([O-])OOS(=O)(=O)[O-]"_smiles;
TEST_ASSERT(persulfate);
res.reset(uncharger.uncharge(*persulfate));
TEST_ASSERT(MolToSmiles(*res) == "O=S(=O)(O)OOS(=O)(=O)O");
}
{
auto hypophosphite = "P(=O)[O-]"_smiles;
TEST_ASSERT(hypophosphite);
res.reset(uncharger.uncharge(*hypophosphite));
TEST_ASSERT(MolToSmiles(*res) == "O=PO");
}
{
auto phosphite = "P(=O)([O-])[O-]"_smiles;
TEST_ASSERT(phosphite);
res.reset(uncharger.uncharge(*phosphite));
TEST_ASSERT(MolToSmiles(*res) == "O=[PH](O)O");
}
{
auto phosphate = "P(=O)([O-])([O-])[O-]"_smiles;
TEST_ASSERT(phosphate);
res.reset(uncharger.uncharge(*phosphate));
TEST_ASSERT(MolToSmiles(*res) == "O=P(O)(O)O");
}
BOOST_LOG(rdDebugLog) << "Finished" << std::endl;
}
void testReionizerParams() {
BOOST_LOG(rdDebugLog)
<< "-----------------------\n Testing reionizer parameters" << std::endl;
{ // defaults
Reionizer reionizer;
auto m1 = "c1cc([O-])cc(C(=O)O)c1"_smiles;
std::unique_ptr<ROMol> reionized1{reionizer.reionize(*m1)};
TEST_ASSERT(MolToSmiles(*reionized1) == "O=C([O-])c1cccc(O)c1");
auto m2 = "C1=C(C=CC(=C1)[S]([O-])=O)[S](O)(=O)=O"_smiles;
std::unique_ptr<ROMol> reionized2{reionizer.reionize(*m2)};
TEST_ASSERT(MolToSmiles(*reionized2) == "O=S(O)c1ccc(S(=O)(=O)[O-])cc1");
}
{ // parameters via tuple
std::vector<std::tuple<std::string, std::string, std::string>> params{
{"-CO2H", "C(=O)[OH]", "C(=O)[O-]"}, {"phenol", "c[OH]", "c[O-]"}};
Reionizer reionizer(params);
auto m1 = "c1cc([O-])cc(C(=O)O)c1"_smiles;
std::unique_ptr<ROMol> reionized1{reionizer.reionize(*m1)};
TEST_ASSERT(MolToSmiles(*reionized1) == "O=C([O-])c1cccc(O)c1");
auto m2 = "C1=C(C=CC(=C1)[S]([O-])=O)[S](O)(=O)=O"_smiles;
std::unique_ptr<ROMol> reionized2{reionizer.reionize(*m2)};
TEST_ASSERT(MolToSmiles(*reionized2) == "O=S([O-])c1ccc(S(=O)(=O)O)cc1");
}
BOOST_LOG(rdDebugLog) << "Finished" << std::endl;
}
// Test driver: initializes the RDKit logs, calls
// boost::logging::disable_logs() for "rdApp.info" and then runs every test
// function of this file in a fixed order.
int main() {
  RDLog::InitLogs();
  boost::logging::disable_logs("rdApp.info");

  using TestFn = void (*)();
  const TestFn tests[] = {
      testReionizer,        testChargeParent,   testGithub2144,
      testGithub2346,       testChargedAromatics, testInorganicAcids,
      testReionizerParams,
  };
  for (const TestFn runTest : tests) {
    runTest();
  }
  return 0;
}