Allow components of the MolStandardize code to be initialized from streams (#2385)

* Fixes #2383 (tests coming in the next commit)
Minor typo fix
Fixes a "bug" in one of the default transforms

* Adds support for directly providing normalization parameter data
instead of requiring the use of a text file.

* allow fragment removers to be initialized with string data

* remove unicode

* allow the reionizer to be initialized from a stream
This commit is contained in:
Greg Landrum
2019-04-03 04:48:05 +02:00
committed by GitHub
parent 67fcc14226
commit 9f103a9913
21 changed files with 178 additions and 10 deletions

View File

@@ -216,7 +216,7 @@ RDKIT_GRAPHMOL_EXPORT void addHs(RWMol &mol, bool explicitOnly = false,
- Hs that are part of the definition of double bond Stereochemistry
will not be removed
- Hs that are not connected to anything else will not be removed
- Hs that have a query defined (i.e. hasQuery() returns true) will not
- Hs that have a query defined (i.e. hasQuery() returns true) will not
be removed
- the caller is responsible for <tt>delete</tt>ing the pointer this

View File

@@ -20,6 +20,11 @@ AcidBaseCatalogParams::AcidBaseCatalogParams(const std::string &acidBaseFile) {
d_pairs = readPairs(acidBaseFile);
}
// Stream-based constructor: the acid/base pairs are parsed from the supplied
// input stream (the parameter keeps the name acidBaseFile for symmetry with
// the filename-based overload).
AcidBaseCatalogParams::AcidBaseCatalogParams(std::istream &acidBaseFile) {
  // the assignment replaces whatever d_pairs held, so no separate clear() is
  // required beforehand
  d_pairs = readPairs(acidBaseFile);
}
AcidBaseCatalogParams::AcidBaseCatalogParams(
const AcidBaseCatalogParams &other) {
d_typeStr = other.d_typeStr;

View File

@@ -31,6 +31,7 @@ class RDKIT_MOLSTANDARDIZE_EXPORT AcidBaseCatalogParams
}
AcidBaseCatalogParams(const std::string &acidBaseFile);
//! construct from an input stream; the acid/base pairs are read from the
//! stream with readPairs()
AcidBaseCatalogParams(std::istream &acidBaseFile);
// copy constructor
AcidBaseCatalogParams(const AcidBaseCatalogParams &other);

View File

@@ -45,6 +45,13 @@ Reionizer::Reionizer(const std::string acidbaseFile,
this->d_ccs = ccs;
}
// Stream-based constructor: builds the acid/base catalog from the pair
// definitions read from acidbaseStream and stores the charge corrections.
Reionizer::Reionizer(std::istream &acidbaseStream,
                     const std::vector<ChargeCorrection> ccs) {
  // NOTE(review): the catalog receives a pointer to this stack-local params
  // object; presumably it keeps its own copy - confirm against AcidBaseCatalog
  AcidBaseCatalogParams catalogParams(acidbaseStream);
  d_abcat = new AcidBaseCatalog(&catalogParams);  // freed in ~Reionizer()
  d_ccs = ccs;
}
// destructor: releases the acid/base catalog that the constructor allocated
// with new
Reionizer::~Reionizer() { delete d_abcat; }
// Reionizer::Reionizer(const AcidBaseCatalog *abcat, const

View File

@@ -64,6 +64,10 @@ class RDKIT_MOLSTANDARDIZE_EXPORT Reionizer {
// corrections
Reionizer(const std::string acidbaseFile,
const std::vector<ChargeCorrection> ccs);
//! construct a Reionizer from a stream containing the acid/base pair
//! definitions and a list of charge corrections
Reionizer(std::istream &acidbaseStream,
const std::vector<ChargeCorrection> ccs);
//! making Reionizer objects non-copyable
Reionizer(const Reionizer &other) = delete;
Reionizer &operator=(Reionizer const &) = delete;

View File

@@ -47,6 +47,18 @@ FragmentRemover::FragmentRemover(const std::string fragmentFile,
this->SKIP_IF_ALL_MATCH = skip_if_all_match;
}
// Stream-based constructor: builds the fragment catalog from the fragment
// definitions read from fragmentStream and records the two behaviour flags.
// Throws ValueErrorException if no catalog could be created.
FragmentRemover::FragmentRemover(std::istream &fragmentStream, bool leave_last,
                                 bool skip_if_all_match) {
  FragmentCatalogParams fparams(fragmentStream);
  this->d_fcat = new FragmentCatalog(&fparams);
  // defensive check only: a plain new-expression reports allocation failure
  // by throwing rather than by returning a null pointer
  if (!this->d_fcat) {
    throw ValueErrorException("could not construct fragment catalog");
  }
  this->LEAVE_LAST = leave_last;
  this->SKIP_IF_ALL_MATCH = skip_if_all_match;
}
// Destructor: releases the fragment catalog allocated by the constructor
FragmentRemover::~FragmentRemover() { delete d_fcat; }

View File

@@ -33,6 +33,8 @@ class RDKIT_MOLSTANDARDIZE_EXPORT FragmentRemover {
FragmentRemover();
FragmentRemover(const std::string fragmentFile, bool leave_last,
bool skip_if_all_match = false);
//! construct a FragmentRemover from a stream of fragment definitions;
//! leave_last and skip_if_all_match are stored as the remover's options
FragmentRemover(std::istream &fragmentStream, bool leave_last,
bool skip_if_all_match = false);
~FragmentRemover();
//! making FragmentRemover objects non-copyable

View File

@@ -20,6 +20,11 @@ FragmentCatalogParams::FragmentCatalogParams(const std::string &fgroupFile) {
d_funcGroups = readFuncGroups(fgroupFile);
}
// Stream-based constructor: the functional-group definitions are parsed
// directly from fgroupStream instead of from a named file.
FragmentCatalogParams::FragmentCatalogParams(std::istream &fgroupStream) {
  // the assignment replaces whatever d_funcGroups held, so no separate
  // clear() is required beforehand
  d_funcGroups = readFuncGroups(fgroupStream);
}
FragmentCatalogParams::FragmentCatalogParams(
const FragmentCatalogParams &other) {
d_typeStr = other.d_typeStr;

View File

@@ -31,6 +31,7 @@ class RDKIT_MOLSTANDARDIZE_EXPORT FragmentCatalogParams
}
FragmentCatalogParams(const std::string &fgroupFile);
//! construct from an input stream; the functional groups are read from the
//! stream with readFuncGroups()
FragmentCatalogParams(std::istream &fgroupStream);
// copy constructor
FragmentCatalogParams(const FragmentCatalogParams &other);

View File

@@ -104,7 +104,7 @@ RWMol *chargeParent(const RWMol &mol, const CleanupParameters &params,
void superParent(RWMol &mol, const CleanupParameters &params) {
RDUNUSED_PARAM(mol);
RDUNUSED_PARAM(params);
UNDER_CONSTRUCTION("Not yet implmented");
UNDER_CONSTRUCTION("Not yet implemented");
}
RWMol *normalize(const RWMol *mol, const CleanupParameters &params) {

View File

@@ -46,6 +46,15 @@ Normalizer::Normalizer(const std::string normalizeFile,
this->MAX_RESTARTS = maxRestarts;
}
// Stream-based constructor: builds the transform catalog from the
// transformation definitions read from normalizeStream and stores the
// restart limit.
Normalizer::Normalizer(std::istream &normalizeStream,
                       const unsigned int maxRestarts) {
  BOOST_LOG(rdInfoLog) << "Initializing Normalizer\n";
  // NOTE(review): the catalog receives a pointer to this stack-local params
  // object; presumably it keeps its own copy - confirm against TransformCatalog
  TransformCatalogParams catalogParams(normalizeStream);
  d_tcat = new TransformCatalog(&catalogParams);  // freed in ~Normalizer()
  MAX_RESTARTS = maxRestarts;
}
// destructor: releases the transform catalog that the constructor allocated
// with new
Normalizer::~Normalizer() { delete d_tcat; }
@@ -57,10 +66,12 @@ ROMol *Normalizer::normalize(const ROMol &mol) {
PRECONDITION(tparams, "");
const std::vector<std::shared_ptr<ChemicalReaction>> &transforms =
tparams->getTransformations();
std::vector<boost::shared_ptr<ROMol>> frags = MolOps::getMolFrags(mol);
bool sanitizeFrags = false;
std::vector<boost::shared_ptr<ROMol>> frags =
MolOps::getMolFrags(mol, sanitizeFrags);
std::vector<ROMOL_SPTR> nfrags; //( frags.size() );
for (const auto &frag : frags) {
frag->updatePropertyCache(false);
ROMOL_SPTR nfrag(this->normalizeFragment(*frag, transforms));
nfrags.push_back(nfrag);
}
@@ -134,8 +145,10 @@ boost::shared_ptr<ROMol> Normalizer::applyTransform(
// std::endl;
unsigned int failed;
try {
MolOps::sanitizeMol(*static_cast<RWMol *>(pdt[0].get()), failed);
Normalizer::Product np(MolToSmiles(*pdt[0]), pdt[0]);
RWMol tmol(*static_cast<RWMol *>(pdt[0].get()));
MolOps::sanitizeMol(tmol, failed);
pdt[0]->updatePropertyCache(false);
Normalizer::Product np(MolToSmiles(tmol), pdt[0]);
pdts.push_back(np);
} catch (MolSanitizeException &) {
BOOST_LOG(rdInfoLog) << "FAILED sanitizeMol.\n";

View File

@@ -48,6 +48,9 @@ class RDKIT_MOLSTANDARDIZE_EXPORT Normalizer {
Normalizer();
//! Construct a Normalizer with a particular normalizeFile and maxRestarts
Normalizer(const std::string normalizeFile, const unsigned int maxRestarts);
//! Construct a Normalizer with a particular stream (with parameters) and
//! maxRestarts
Normalizer(std::istream &normalizeStream, const unsigned int maxRestarts);
//! making Normalizer objects non-copyable
Normalizer(const Normalizer &other) = delete;
Normalizer &operator=(Normalizer const &) = delete;

View File

@@ -21,6 +21,11 @@ TransformCatalogParams::TransformCatalogParams(
d_transformations = readTransformations(transformFile);
}
// Stream-based constructor: the transformations are parsed directly from
// transformStream instead of from a named file.
TransformCatalogParams::TransformCatalogParams(std::istream &transformStream) {
  // the assignment replaces whatever d_transformations held, so no separate
  // clear() is required beforehand
  d_transformations = readTransformations(transformStream);
}
TransformCatalogParams::TransformCatalogParams(
const TransformCatalogParams &other) {
d_typeStr = other.d_typeStr;

View File

@@ -32,6 +32,7 @@ class RDKIT_MOLSTANDARDIZE_EXPORT TransformCatalogParams
}
TransformCatalogParams(const std::string &transformFile);
//! construct from an input stream; the transformations are read from the
//! stream with readTransformations()
TransformCatalogParams(std::istream &transformStream);
// copy constructor
TransformCatalogParams(const TransformCatalogParams &other);

View File

@@ -25,6 +25,21 @@ ROMol *reionizeHelper(MolStandardize::Reionizer &self, const ROMol &mol) {
return self.reionize(mol);
}
// Python-facing factory: wraps the parameter text in a string stream and
// builds a Reionizer from it. The returned pointer is heap-allocated; the
// caller takes ownership.
MolStandardize::Reionizer *reionizerFromData(const std::string &data,
                                             python::object chargeCorrections) {
  std::istringstream paramStream(data);
  auto parsedCorrections =
      pythonObjectToVect<MolStandardize::ChargeCorrection>(chargeCorrections);
  if (!parsedCorrections) {
    // no corrections could be extracted: fall back to an empty list
    return new MolStandardize::Reionizer(
        paramStream, std::vector<MolStandardize::ChargeCorrection>());
  }
  return new MolStandardize::Reionizer(paramStream, *parsedCorrections);
}
} // namespace
struct charge_wrapper {
@@ -51,6 +66,12 @@ struct charge_wrapper {
(python::arg("self"), python::arg("mol")), "",
python::return_value_policy<python::manage_new_object>());
python::def("ReionizerFromData", &reionizerFromData,
(python::arg("paramData"),
python::arg("chargeCorrections") = python::list()),
"creates a reionizer from a string containing parameter data "
"and a list of charge corrections",
python::return_value_policy<python::manage_new_object>());
python::class_<MolStandardize::Uncharger, boost::noncopyable>(
"Uncharger", python::init<bool>((python::arg("self"),
python::arg("canonicalOrder") = true)))

View File

@@ -25,6 +25,13 @@ ROMol *chooseHelper(MolStandardize::LargestFragmentChooser &self,
const ROMol &mol) {
return self.choose(mol);
}
// Python-facing factory: wraps the fragment definition text in a string
// stream and builds a FragmentRemover from it. The returned pointer is
// heap-allocated; the caller takes ownership.
MolStandardize::FragmentRemover *removerFromParams(const std::string &data,
                                                   bool leave_last,
                                                   bool skip_if_all_match) {
  std::istringstream fragmentStream(data);
  auto *remover = new MolStandardize::FragmentRemover(
      fragmentStream, leave_last, skip_if_all_match);
  return remover;
}
} // namespace
@@ -40,11 +47,18 @@ struct fragment_wrapper {
"FragmentRemover", python::init<>())
.def(python::init<std::string, bool, bool>(
(python::arg("fragmentFilename") = "",
python::arg("leave_last") = true,
python::arg("skip_if_all_match") = false)))
python::arg("leave_last") = true,
python::arg("skip_if_all_match") = false)))
.def("remove", &removeHelper, (python::arg("self"), python::arg("mol")),
"", python::return_value_policy<python::manage_new_object>());
python::def(
"FragmentRemoverFromData", &removerFromParams,
(python::arg("fragmentData"), python::arg("leave_last") = true,
python::arg("skip_if_all_match") = false),
"creates a FragmentRemover from a string containing parameter data",
python::return_value_policy<python::manage_new_object>());
python::class_<MolStandardize::LargestFragmentChooser, boost::noncopyable>(
"LargestFragmentChooser",
python::init<bool>((python::arg("preferOrganic") = false)))

View File

@@ -11,6 +11,7 @@
#include <GraphMol/RDKitBase.h>
#include <GraphMol/MolStandardize/Normalize.h>
#include <sstream>
namespace python = boost::python;
using namespace RDKit;
@@ -21,6 +22,11 @@ ROMol *normalizeHelper(MolStandardize::Normalizer &self, const ROMol &mol) {
return self.normalize(mol);
}
// Python-facing factory: wraps the transformation text in a string stream and
// builds a Normalizer from it. Only the maxRestarts field of params is used.
// The returned pointer is heap-allocated; the caller takes ownership.
MolStandardize::Normalizer *normalizerFromParams(
    const std::string &data, const MolStandardize::CleanupParameters &params) {
  std::istringstream paramStream(data);
  auto *normalizer =
      new MolStandardize::Normalizer(paramStream, params.maxRestarts);
  return normalizer;
}
} // namespace
struct normalize_wrapper {
@@ -37,6 +43,10 @@ struct normalize_wrapper {
.def("normalize", &normalizeHelper,
(python::arg("self"), python::arg("mol")), "",
python::return_value_policy<python::manage_new_object>());
// Name every C++ parameter: when fewer keywords than arguments are supplied,
// Boost.Python attaches the names to the trailing arguments, so a lone
// "paramData" keyword would label the CleanupParameters argument instead of
// the data string and break keyword-style calls.
python::def("NormalizerFromData", &normalizerFromParams,
            (python::arg("paramData"), python::arg("params")),
            "creates a normalizer from a string containing parameter data",
            python::return_value_policy<python::manage_new_object>());
}
};

View File

@@ -173,6 +173,50 @@ class TestCase(unittest.TestCase):
self.assertEqual
("""INFO: [FragmentValidation] 1,2-dichloroethane is present""", msg6[0])
def test10NormalizeParams(self):
data = """// Name SMIRKS
Nitro to N+(O-)=O [N,P,As,Sb;X3:1](=[O,S,Se,Te:2])=[O,S,Se,Te:3]>>[*+1:1]([*-1:2])=[*:3]
Sulfone to S(=O)(=O) [S+2:1]([O-:2])([O-:3])>>[S+0:1](=[O-0:2])(=[O-0:3])
Pyridine oxide to n+O- [n:1]=[O:2]>>[n+:1][O-:2]
// Azide to N=N+=N- [*,H:1][N:2]=[N:3]#[N:4]>>[*,H:1][N:2]=[N+:3]=[N-:4]
"""
normalizer1 = rdMolStandardize.Normalizer()
params = rdMolStandardize.CleanupParameters()
normalizer2 = rdMolStandardize.NormalizerFromData(data, params)
imol = Chem.MolFromSmiles("O=N(=O)CCN=N#N", sanitize=False)
mol1 = normalizer1.normalize(imol)
mol2 = normalizer2.normalize(imol)
self.assertEqual(Chem.MolToSmiles(imol), "N#N=NCCN(=O)=O")
self.assertEqual(Chem.MolToSmiles(mol1), "[N-]=[N+]=NCC[N+](=O)[O-]")
self.assertEqual(Chem.MolToSmiles(mol2), "N#N=NCC[N+](=O)[O-]")
def test11FragmentParams(self):
data = """// Name SMARTS
fluorine [F]
chlorine [Cl]
"""
fragremover = rdMolStandardize.FragmentRemoverFromData(data)
mol = Chem.MolFromSmiles("CN(C)C.Cl.Cl.Br")
nm = fragremover.remove(mol)
self.assertEqual(Chem.MolToSmiles(nm), "Br.CN(C)C")
def test12ChargeParams(self):
params = """// The default list of AcidBasePairs, sorted from strongest to weakest.
// This list is derived from the Food and Drug: Administration Substance
// Registration System Standard Operating Procedure guide.
//
// Name Acid Base
-SO2H [!O][SD3](=O)[OH] [!O][SD3](=O)[O-]
-SO3H [!O]S(=O)(=O)[OH] [!O]S(=O)(=O)[O-]
"""
mol = Chem.MolFromSmiles("C1=C(C=CC(=C1)[S]([O-])=O)[S](O)(=O)=O")
# instantiate from the acid/base pair data defined above (not the default library)
reionizer = rdMolStandardize.ReionizerFromData(params, [])
print("done")
nm = reionizer.reionize(mol)
self.assertEqual(Chem.MolToSmiles(nm), "O=S([O-])c1ccc(S(=O)(=O)O)cc1")
if __name__ == "__main__":
unittest.main()

View File

@@ -85,7 +85,27 @@ void test1() {
BOOST_LOG(rdInfoLog) << "Finished" << std::endl;
}
void test2() {
BOOST_LOG(rdInfoLog) << "-----------------------\n test2" << std::endl;
std::string tfdata = R"DATA(// Name SMIRKS
Nitro to N+(O-)=O [N,P,As,Sb;X3:1](=[O,S,Se,Te:2])=[O,S,Se,Te:3]>>[*+1:1]([*-1:2])=[*:3]
Sulfone to S(=O)(=O) [S+2:1]([O-:2])([O-:3])>>[S+0:1](=[O-0:2])(=[O-0:3])
Pyridine oxide to n+O- [n:1]=[O:2]>>[n+:1][O-:2]
)DATA";
std::stringstream sstr(tfdata);
Normalizer nn(sstr, 10);
bool debugParse = false;
bool sanitize = false;
std::unique_ptr<ROMol> imol(
SmilesToMol("O=N(=O)CCN=N#N", debugParse, sanitize));
std::unique_ptr<ROMol> m2(nn.normalize(*imol));
TEST_ASSERT(MolToSmiles(*m2) == "N#N=NCC[N+](=O)[O-]");
BOOST_LOG(rdInfoLog) << "Finished" << std::endl;
}
// test driver: calls RDLog::InitLogs() and then runs test1() and test2() in
// order; returns 0 once both have completed
int main() {
RDLog::InitLogs();
test1();
test2();
return 0;
}

View File

@@ -4,7 +4,7 @@
//
// Name Acid Base
-OSO3H OS(=O)(=O)[OH] OS(=O)(=O)[O-]
SO3H [!O]S(=O)(=O)[OH] [!O]S(=O)(=O)[O-]
-SO3H [!O]S(=O)(=O)[OH] [!O]S(=O)(=O)[O-]
-OSO2H O[SD3](=O)[OH] O[SD3](=O)[O-]
-SO2H [!O][SD3](=O)[OH] [!O][SD3](=O)[O-]
-OPO3H2 OP(=O)([OH])[OH] OP(=O)([OH])[O-]

View File

@@ -3,7 +3,7 @@
Nitro to N+(O-)=O [N,P,As,Sb;X3:1](=[O,S,Se,Te:2])=[O,S,Se,Te:3]>>[*+1:1]([*-1:2])=[*:3]
Sulfone to S(=O)(=O) [S+2:1]([O-:2])([O-:3])>>[S+0:1](=[O-0:2])(=[O-0:3])
Pyridine oxide to n+O- [n:1]=[O:2]>>[n+:1][O-:2]
Azide to N=N+=N- [*,H:1][N:2]=[N:3]#[N:4]>>[*,H:1][N:2]=[N+:3]=[N-:4]
Azide to N=N+=N- [*:1][N:2]=[N:3]#[N:4]>>[*:1][N:2]=[N+:3]=[N-:4]
Diazo/azo to =N+=N- [*:1]=[N:2]#[N:3]>>[*:1]=[N+:2]=[N-:3]
Sulfoxide to -S+(O-)- [!O:1][S+0;X3:2](=[O:3])[!O:4]>>[*:1][S+1:2]([O-:3])[*:4]
// Equivalent to #1.5 in InChI technical manual