mirror of
https://github.com/rdkit/rdkit.git
synced 2026-06-03 21:44:30 +08:00
Allow components of the MolStandardize code to be initialized from streams (#2385)
* Fixes #2383 (tests coming in the next commit) Minor typo fix Fixes a "bug" in one of the default transforms * Adds support for directly providing normalization parameter data instead of requiring the use of a text file. * allow fragment removers to be initialized with string data * remove unicode * allow the reionizer to be initialized from a stream
This commit is contained in:
@@ -216,7 +216,7 @@ RDKIT_GRAPHMOL_EXPORT void addHs(RWMol &mol, bool explicitOnly = false,
|
||||
- Hs that are part of the definition of double bond Stereochemistry
|
||||
will not be removed
|
||||
- Hs that are not connected to anything else will not be removed
|
||||
- Hs that have a query defined (i.e. hasQuery() returns true) will not
|
||||
- Hs that have a query defined (i.e. hasQuery() returns true) will not
|
||||
be removed
|
||||
|
||||
- the caller is responsible for <tt>delete</tt>ing the pointer this
|
||||
|
||||
@@ -20,6 +20,11 @@ AcidBaseCatalogParams::AcidBaseCatalogParams(const std::string &acidBaseFile) {
|
||||
d_pairs = readPairs(acidBaseFile);
|
||||
}
|
||||
|
||||
AcidBaseCatalogParams::AcidBaseCatalogParams(std::istream &acidBaseFile) {
|
||||
d_pairs.clear();
|
||||
d_pairs = readPairs(acidBaseFile);
|
||||
}
|
||||
|
||||
AcidBaseCatalogParams::AcidBaseCatalogParams(
|
||||
const AcidBaseCatalogParams &other) {
|
||||
d_typeStr = other.d_typeStr;
|
||||
|
||||
@@ -31,6 +31,7 @@ class RDKIT_MOLSTANDARDIZE_EXPORT AcidBaseCatalogParams
|
||||
}
|
||||
|
||||
AcidBaseCatalogParams(const std::string &acidBaseFile);
|
||||
AcidBaseCatalogParams(std::istream &acidBaseFile);
|
||||
// copy constructor
|
||||
AcidBaseCatalogParams(const AcidBaseCatalogParams &other);
|
||||
|
||||
|
||||
@@ -45,6 +45,13 @@ Reionizer::Reionizer(const std::string acidbaseFile,
|
||||
this->d_ccs = ccs;
|
||||
}
|
||||
|
||||
Reionizer::Reionizer(std::istream &acidbaseStream,
|
||||
const std::vector<ChargeCorrection> ccs) {
|
||||
AcidBaseCatalogParams abparams(acidbaseStream);
|
||||
this->d_abcat = new AcidBaseCatalog(&abparams);
|
||||
this->d_ccs = ccs;
|
||||
}
|
||||
|
||||
Reionizer::~Reionizer() { delete d_abcat; }
|
||||
|
||||
// Reionizer::Reionizer(const AcidBaseCatalog *abcat, const
|
||||
|
||||
@@ -64,6 +64,10 @@ class RDKIT_MOLSTANDARDIZE_EXPORT Reionizer {
|
||||
// corrections
|
||||
Reionizer(const std::string acidbaseFile,
|
||||
const std::vector<ChargeCorrection> ccs);
|
||||
//! construct a Reionizer from a stream of acid/base pair data and charge
|
||||
// corrections
|
||||
Reionizer(std::istream &acidbaseStream,
|
||||
const std::vector<ChargeCorrection> ccs);
|
||||
//! making Reionizer objects non-copyable
|
||||
Reionizer(const Reionizer &other) = delete;
|
||||
Reionizer &operator=(Reionizer const &) = delete;
|
||||
|
||||
@@ -47,6 +47,18 @@ FragmentRemover::FragmentRemover(const std::string fragmentFile,
|
||||
this->SKIP_IF_ALL_MATCH = skip_if_all_match;
|
||||
}
|
||||
|
||||
// overloaded constructor
|
||||
FragmentRemover::FragmentRemover(std::istream &fragmentStream, bool leave_last,
|
||||
bool skip_if_all_match) {
|
||||
FragmentCatalogParams fparams(fragmentStream);
|
||||
this->d_fcat = new FragmentCatalog(&fparams);
|
||||
if (!this->d_fcat) {
|
||||
throw ValueErrorException("could not constract fragment catalog");
|
||||
}
|
||||
this->LEAVE_LAST = leave_last;
|
||||
this->SKIP_IF_ALL_MATCH = skip_if_all_match;
|
||||
}
|
||||
|
||||
// Destructor
|
||||
FragmentRemover::~FragmentRemover() { delete d_fcat; };
|
||||
|
||||
|
||||
@@ -33,6 +33,8 @@ class RDKIT_MOLSTANDARDIZE_EXPORT FragmentRemover {
|
||||
FragmentRemover();
|
||||
FragmentRemover(const std::string fragmentFile, bool leave_last,
|
||||
bool skip_if_all_match = false);
|
||||
FragmentRemover(std::istream &fragmentStream, bool leave_last,
|
||||
bool skip_if_all_match = false);
|
||||
~FragmentRemover();
|
||||
|
||||
//! making FragmentRemover objects non-copyable
|
||||
|
||||
@@ -20,6 +20,11 @@ FragmentCatalogParams::FragmentCatalogParams(const std::string &fgroupFile) {
|
||||
d_funcGroups = readFuncGroups(fgroupFile);
|
||||
}
|
||||
|
||||
FragmentCatalogParams::FragmentCatalogParams(std::istream &fgroupStream) {
|
||||
d_funcGroups.clear();
|
||||
d_funcGroups = readFuncGroups(fgroupStream);
|
||||
}
|
||||
|
||||
FragmentCatalogParams::FragmentCatalogParams(
|
||||
const FragmentCatalogParams &other) {
|
||||
d_typeStr = other.d_typeStr;
|
||||
|
||||
@@ -31,6 +31,7 @@ class RDKIT_MOLSTANDARDIZE_EXPORT FragmentCatalogParams
|
||||
}
|
||||
|
||||
FragmentCatalogParams(const std::string &fgroupFile);
|
||||
FragmentCatalogParams(std::istream &fgroupStream);
|
||||
// copy constructor
|
||||
FragmentCatalogParams(const FragmentCatalogParams &other);
|
||||
|
||||
|
||||
@@ -104,7 +104,7 @@ RWMol *chargeParent(const RWMol &mol, const CleanupParameters ¶ms,
|
||||
void superParent(RWMol &mol, const CleanupParameters ¶ms) {
|
||||
RDUNUSED_PARAM(mol);
|
||||
RDUNUSED_PARAM(params);
|
||||
UNDER_CONSTRUCTION("Not yet implmented");
|
||||
UNDER_CONSTRUCTION("Not yet implemented");
|
||||
}
|
||||
|
||||
RWMol *normalize(const RWMol *mol, const CleanupParameters ¶ms) {
|
||||
|
||||
@@ -46,6 +46,15 @@ Normalizer::Normalizer(const std::string normalizeFile,
|
||||
this->MAX_RESTARTS = maxRestarts;
|
||||
}
|
||||
|
||||
// overloaded constructor
|
||||
Normalizer::Normalizer(std::istream &normalizeStream,
|
||||
const unsigned int maxRestarts) {
|
||||
BOOST_LOG(rdInfoLog) << "Initializing Normalizer\n";
|
||||
TransformCatalogParams tparams(normalizeStream);
|
||||
this->d_tcat = new TransformCatalog(&tparams);
|
||||
this->MAX_RESTARTS = maxRestarts;
|
||||
}
|
||||
|
||||
// destructor
|
||||
Normalizer::~Normalizer() { delete d_tcat; }
|
||||
|
||||
@@ -57,10 +66,12 @@ ROMol *Normalizer::normalize(const ROMol &mol) {
|
||||
PRECONDITION(tparams, "");
|
||||
const std::vector<std::shared_ptr<ChemicalReaction>> &transforms =
|
||||
tparams->getTransformations();
|
||||
|
||||
std::vector<boost::shared_ptr<ROMol>> frags = MolOps::getMolFrags(mol);
|
||||
bool sanitizeFrags = false;
|
||||
std::vector<boost::shared_ptr<ROMol>> frags =
|
||||
MolOps::getMolFrags(mol, sanitizeFrags);
|
||||
std::vector<ROMOL_SPTR> nfrags; //( frags.size() );
|
||||
for (const auto &frag : frags) {
|
||||
frag->updatePropertyCache(false);
|
||||
ROMOL_SPTR nfrag(this->normalizeFragment(*frag, transforms));
|
||||
nfrags.push_back(nfrag);
|
||||
}
|
||||
@@ -134,8 +145,10 @@ boost::shared_ptr<ROMol> Normalizer::applyTransform(
|
||||
// std::endl;
|
||||
unsigned int failed;
|
||||
try {
|
||||
MolOps::sanitizeMol(*static_cast<RWMol *>(pdt[0].get()), failed);
|
||||
Normalizer::Product np(MolToSmiles(*pdt[0]), pdt[0]);
|
||||
RWMol tmol(*static_cast<RWMol *>(pdt[0].get()));
|
||||
MolOps::sanitizeMol(tmol, failed);
|
||||
pdt[0]->updatePropertyCache(false);
|
||||
Normalizer::Product np(MolToSmiles(tmol), pdt[0]);
|
||||
pdts.push_back(np);
|
||||
} catch (MolSanitizeException &) {
|
||||
BOOST_LOG(rdInfoLog) << "FAILED sanitizeMol.\n";
|
||||
|
||||
@@ -48,6 +48,9 @@ class RDKIT_MOLSTANDARDIZE_EXPORT Normalizer {
|
||||
Normalizer();
|
||||
//! Construct a Normalizer with a particular normalizeFile and maxRestarts
|
||||
Normalizer(const std::string normalizeFile, const unsigned int maxRestarts);
|
||||
//! Construct a Normalizer with a particular stream (with parameters) and
|
||||
//! maxRestarts
|
||||
Normalizer(std::istream &normalizeStream, const unsigned int maxRestarts);
|
||||
//! making Normalizer objects non-copyable
|
||||
Normalizer(const Normalizer &other) = delete;
|
||||
Normalizer &operator=(Normalizer const &) = delete;
|
||||
|
||||
@@ -21,6 +21,11 @@ TransformCatalogParams::TransformCatalogParams(
|
||||
d_transformations = readTransformations(transformFile);
|
||||
}
|
||||
|
||||
TransformCatalogParams::TransformCatalogParams(std::istream &transformStream) {
|
||||
d_transformations.clear();
|
||||
d_transformations = readTransformations(transformStream);
|
||||
}
|
||||
|
||||
TransformCatalogParams::TransformCatalogParams(
|
||||
const TransformCatalogParams &other) {
|
||||
d_typeStr = other.d_typeStr;
|
||||
|
||||
@@ -32,6 +32,7 @@ class RDKIT_MOLSTANDARDIZE_EXPORT TransformCatalogParams
|
||||
}
|
||||
|
||||
TransformCatalogParams(const std::string &transformFile);
|
||||
TransformCatalogParams(std::istream &transformStream);
|
||||
// copy constructor
|
||||
TransformCatalogParams(const TransformCatalogParams &other);
|
||||
|
||||
|
||||
@@ -25,6 +25,21 @@ ROMol *reionizeHelper(MolStandardize::Reionizer &self, const ROMol &mol) {
|
||||
return self.reionize(mol);
|
||||
}
|
||||
|
||||
MolStandardize::Reionizer *reionizerFromData(const std::string &data,
|
||||
python::object chargeCorrections) {
|
||||
std::istringstream sstr(data);
|
||||
auto corrections =
|
||||
pythonObjectToVect<MolStandardize::ChargeCorrection>(chargeCorrections);
|
||||
MolStandardize::Reionizer *res;
|
||||
if (corrections) {
|
||||
res = new MolStandardize::Reionizer(sstr, *corrections);
|
||||
} else {
|
||||
res = new MolStandardize::Reionizer(
|
||||
sstr, std::vector<MolStandardize::ChargeCorrection>());
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
struct charge_wrapper {
|
||||
@@ -51,6 +66,12 @@ struct charge_wrapper {
|
||||
(python::arg("self"), python::arg("mol")), "",
|
||||
python::return_value_policy<python::manage_new_object>());
|
||||
|
||||
python::def("ReionizerFromData", &reionizerFromData,
|
||||
(python::arg("paramData"),
|
||||
python::arg("chargeCorrections") = python::list()),
|
||||
"creates a reionizer from a string containing parameter data "
|
||||
"and a list of charge corrections",
|
||||
python::return_value_policy<python::manage_new_object>());
|
||||
python::class_<MolStandardize::Uncharger, boost::noncopyable>(
|
||||
"Uncharger", python::init<bool>((python::arg("self"),
|
||||
python::arg("canonicalOrder") = true)))
|
||||
|
||||
@@ -25,6 +25,13 @@ ROMol *chooseHelper(MolStandardize::LargestFragmentChooser &self,
|
||||
const ROMol &mol) {
|
||||
return self.choose(mol);
|
||||
}
|
||||
MolStandardize::FragmentRemover *removerFromParams(const std::string &data,
|
||||
bool leave_last,
|
||||
bool skip_if_all_match) {
|
||||
std::istringstream sstr(data);
|
||||
return new MolStandardize::FragmentRemover(sstr, leave_last,
|
||||
skip_if_all_match);
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
@@ -40,11 +47,18 @@ struct fragment_wrapper {
|
||||
"FragmentRemover", python::init<>())
|
||||
.def(python::init<std::string, bool, bool>(
|
||||
(python::arg("fragmentFilename") = "",
|
||||
python::arg("leave_last") = true,
|
||||
python::arg("skip_if_all_match") = false)))
|
||||
python::arg("leave_last") = true,
|
||||
python::arg("skip_if_all_match") = false)))
|
||||
.def("remove", &removeHelper, (python::arg("self"), python::arg("mol")),
|
||||
"", python::return_value_policy<python::manage_new_object>());
|
||||
|
||||
python::def(
|
||||
"FragmentRemoverFromData", &removerFromParams,
|
||||
(python::arg("fragmentData"), python::arg("leave_last") = true,
|
||||
python::arg("skip_if_all_match") = false),
|
||||
"creates a FragmentRemover from a string containing parameter data",
|
||||
python::return_value_policy<python::manage_new_object>());
|
||||
|
||||
python::class_<MolStandardize::LargestFragmentChooser, boost::noncopyable>(
|
||||
"LargestFragmentChooser",
|
||||
python::init<bool>((python::arg("preferOrganic") = false)))
|
||||
|
||||
@@ -11,6 +11,7 @@
|
||||
|
||||
#include <GraphMol/RDKitBase.h>
|
||||
#include <GraphMol/MolStandardize/Normalize.h>
|
||||
#include <sstream>
|
||||
|
||||
namespace python = boost::python;
|
||||
using namespace RDKit;
|
||||
@@ -21,6 +22,11 @@ ROMol *normalizeHelper(MolStandardize::Normalizer &self, const ROMol &mol) {
|
||||
return self.normalize(mol);
|
||||
}
|
||||
|
||||
MolStandardize::Normalizer *normalizerFromParams(
|
||||
const std::string &data, const MolStandardize::CleanupParameters ¶ms) {
|
||||
std::istringstream sstr(data);
|
||||
return new MolStandardize::Normalizer(sstr, params.maxRestarts);
|
||||
}
|
||||
} // namespace
|
||||
|
||||
struct normalize_wrapper {
|
||||
@@ -37,6 +43,10 @@ struct normalize_wrapper {
|
||||
.def("normalize", &normalizeHelper,
|
||||
(python::arg("self"), python::arg("mol")), "",
|
||||
python::return_value_policy<python::manage_new_object>());
|
||||
python::def("NormalizerFromData", &normalizerFromParams,
|
||||
(python::arg("paramData")),
|
||||
"creates a normalizer from a string containing parameter data",
|
||||
python::return_value_policy<python::manage_new_object>());
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -173,6 +173,50 @@ class TestCase(unittest.TestCase):
|
||||
self.assertEqual
|
||||
("""INFO: [FragmentValidation] 1,2-dichloroethane is present""", msg6[0])
|
||||
|
||||
def test10NormalizeParams(self):
|
||||
data = """// Name SMIRKS
|
||||
Nitro to N+(O-)=O [N,P,As,Sb;X3:1](=[O,S,Se,Te:2])=[O,S,Se,Te:3]>>[*+1:1]([*-1:2])=[*:3]
|
||||
Sulfone to S(=O)(=O) [S+2:1]([O-:2])([O-:3])>>[S+0:1](=[O-0:2])(=[O-0:3])
|
||||
Pyridine oxide to n+O- [n:1]=[O:2]>>[n+:1][O-:2]
|
||||
// Azide to N=N+=N- [*,H:1][N:2]=[N:3]#[N:4]>>[*,H:1][N:2]=[N+:3]=[N-:4]
|
||||
"""
|
||||
normalizer1 = rdMolStandardize.Normalizer()
|
||||
params = rdMolStandardize.CleanupParameters()
|
||||
normalizer2 = rdMolStandardize.NormalizerFromData(data, params)
|
||||
|
||||
imol = Chem.MolFromSmiles("O=N(=O)CCN=N#N", sanitize=False)
|
||||
mol1 = normalizer1.normalize(imol)
|
||||
mol2 = normalizer2.normalize(imol)
|
||||
self.assertEqual(Chem.MolToSmiles(imol), "N#N=NCCN(=O)=O")
|
||||
self.assertEqual(Chem.MolToSmiles(mol1), "[N-]=[N+]=NCC[N+](=O)[O-]")
|
||||
self.assertEqual(Chem.MolToSmiles(mol2), "N#N=NCC[N+](=O)[O-]")
|
||||
|
||||
def test11FragmentParams(self):
|
||||
data = """// Name SMARTS
|
||||
fluorine [F]
|
||||
chlorine [Cl]
|
||||
"""
|
||||
fragremover = rdMolStandardize.FragmentRemoverFromData(data)
|
||||
mol = Chem.MolFromSmiles("CN(C)C.Cl.Cl.Br")
|
||||
nm = fragremover.remove(mol)
|
||||
self.assertEqual(Chem.MolToSmiles(nm), "Br.CN(C)C")
|
||||
|
||||
def test12ChargeParams(self):
|
||||
params = """// The default list of AcidBasePairs, sorted from strongest to weakest.
|
||||
// This list is derived from the Food and Drug: Administration Substance
|
||||
// Registration System Standard Operating Procedure guide.
|
||||
//
|
||||
// Name Acid Base
|
||||
-SO2H [!O][SD3](=O)[OH] [!O][SD3](=O)[O-]
|
||||
-SO3H [!O]S(=O)(=O)[OH] [!O]S(=O)(=O)[O-]
|
||||
"""
|
||||
mol = Chem.MolFromSmiles("C1=C(C=CC(=C1)[S]([O-])=O)[S](O)(=O)=O")
|
||||
# instantiate from the acid base pair data defined above
|
||||
reionizer = rdMolStandardize.ReionizerFromData(params, [])
|
||||
print("done")
|
||||
nm = reionizer.reionize(mol)
|
||||
self.assertEqual(Chem.MolToSmiles(nm), "O=S([O-])c1ccc(S(=O)(=O)O)cc1")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
|
||||
@@ -85,7 +85,27 @@ void test1() {
|
||||
BOOST_LOG(rdInfoLog) << "Finished" << std::endl;
|
||||
}
|
||||
|
||||
void test2() {
|
||||
BOOST_LOG(rdInfoLog) << "-----------------------\n test2" << std::endl;
|
||||
std::string tfdata = R"DATA(// Name SMIRKS
|
||||
Nitro to N+(O-)=O [N,P,As,Sb;X3:1](=[O,S,Se,Te:2])=[O,S,Se,Te:3]>>[*+1:1]([*-1:2])=[*:3]
|
||||
Sulfone to S(=O)(=O) [S+2:1]([O-:2])([O-:3])>>[S+0:1](=[O-0:2])(=[O-0:3])
|
||||
Pyridine oxide to n+O- [n:1]=[O:2]>>[n+:1][O-:2]
|
||||
)DATA";
|
||||
std::stringstream sstr(tfdata);
|
||||
Normalizer nn(sstr, 10);
|
||||
bool debugParse = false;
|
||||
bool sanitize = false;
|
||||
std::unique_ptr<ROMol> imol(
|
||||
SmilesToMol("O=N(=O)CCN=N#N", debugParse, sanitize));
|
||||
std::unique_ptr<ROMol> m2(nn.normalize(*imol));
|
||||
TEST_ASSERT(MolToSmiles(*m2) == "N#N=NCC[N+](=O)[O-]");
|
||||
BOOST_LOG(rdInfoLog) << "Finished" << std::endl;
|
||||
}
|
||||
|
||||
int main() {
|
||||
RDLog::InitLogs();
|
||||
test1();
|
||||
test2();
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
//
|
||||
// Name Acid Base
|
||||
-OSO3H OS(=O)(=O)[OH] OS(=O)(=O)[O-]
|
||||
–SO3H [!O]S(=O)(=O)[OH] [!O]S(=O)(=O)[O-]
|
||||
-SO3H [!O]S(=O)(=O)[OH] [!O]S(=O)(=O)[O-]
|
||||
-OSO2H O[SD3](=O)[OH] O[SD3](=O)[O-]
|
||||
-SO2H [!O][SD3](=O)[OH] [!O][SD3](=O)[O-]
|
||||
-OPO3H2 OP(=O)([OH])[OH] OP(=O)([OH])[O-]
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
Nitro to N+(O-)=O [N,P,As,Sb;X3:1](=[O,S,Se,Te:2])=[O,S,Se,Te:3]>>[*+1:1]([*-1:2])=[*:3]
|
||||
Sulfone to S(=O)(=O) [S+2:1]([O-:2])([O-:3])>>[S+0:1](=[O-0:2])(=[O-0:3])
|
||||
Pyridine oxide to n+O- [n:1]=[O:2]>>[n+:1][O-:2]
|
||||
Azide to N=N+=N- [*,H:1][N:2]=[N:3]#[N:4]>>[*,H:1][N:2]=[N+:3]=[N-:4]
|
||||
Azide to N=N+=N- [*:1][N:2]=[N:3]#[N:4]>>[*:1][N:2]=[N+:3]=[N-:4]
|
||||
Diazo/azo to =N+=N- [*:1]=[N:2]#[N:3]>>[*:1]=[N+:2]=[N-:3]
|
||||
Sulfoxide to -S+(O-)- [!O:1][S+0;X3:2](=[O:3])[!O:4]>>[*:1][S+1:2]([O-:3])[*:4]
|
||||
// Equivalent to #1.5 in InChI technical manual
|
||||
|
||||
Reference in New Issue
Block a user