Files
rdkit/Code/GraphMol/SmilesParse/SmilesParse.cpp
Ric 39bcee635e Mem checkup (#3083)
* clean up in testDatastructs

* avoid jump on unassigned mem in rxns

* set valgrind error exit code

* update suppressions

* clean up RDValue in testDict

* don't leak mol if parsing Maestro file fails

* don't leak mol if parsing PDB block fails

* cleanup leftover rxns in testReaction

* cleanup mol if CX extensions cannot be parsed

* cleanup leftover mols in smiTest1

* cleanup leftover mols in moldraw2DTest1

* cleanup leftover mols in testSubstructMatch

* make testEnumeration go easier on valgrind

* reduce R counts
2020-04-17 17:48:58 +02:00

467 lines
14 KiB
C++

//
// Copyright (C) 2001-2020 Greg Landrum and Rational Discovery LLC
//
// @@ All Rights Reserved @@
// This file is part of the RDKit.
// The contents are covered by the terms of the BSD license
// which is included in the file license.txt, found at the root
// of the RDKit source tree.
//
// ----------------------------------------------------------------------------------
// Despite the name of this file, both SMILES and SMARTS parsers are exposed
// here
//
// General comments about the parsers:
// - Atom numbering will be preserved, so input order of atoms==internal order
//
// - Bond ordering is not, in general, preserved. Specifically, ring closure
// bonds will generally occur at the end of the bond list, because they are
// not constructed until fragments are closed. This forces some form of
// reordering.
//
//
//
#include "SmilesParse.h"
#include <RDGeneral/BoostStartInclude.h>
#include <boost/algorithm/string.hpp>
#include <boost/foreach.hpp>
#include <boost/lexical_cast.hpp>
#include <RDGeneral/BoostEndInclude.h>
#include <GraphMol/RDKitBase.h>
#include "SmilesParseOps.h"
#include <RDGeneral/RDLog.h>
#include <RDGeneral/Invariant.h>
#include "smiles.tab.hpp"
// NOTE: this is a bit fragile since a lot of the #defines in smiles.tab.hpp
// could prevent the same #defines in smarts.tab.hpp from being read.
// Fortunately if there are actually any problems here, they will inevitably
// show up very quickly in the tests.
#include "smarts.tab.hpp"
#include <list>
// SMILES scanner entry points. They are only declared here; the definitions
// live outside this file (presumably in the generated lexer - confirm).
// The parse helpers below create a scanner with yysmiles_lex_init(), prime it
// with setup_smiles_string() and release it with yysmiles_lex_destroy().
int yysmiles_lex_init(void **);
int yysmiles_lex_destroy(void *);
// The returned value is used by smiles_parse_helper() as an offset into the
// input string before it is handed to the parser.
size_t setup_smiles_string(const std::string &text, void *);
// Global debug switch for the SMILES parser; set by the SmilesTo*() functions.
extern int yysmiles_debug;
// SMARTS counterparts of the declarations above.
int yysmarts_lex_init(void **);
int yysmarts_lex_destroy(void *);
// The returned value is used by smarts_parse_helper() as an offset into the
// input string before it is handed to the parser.
size_t setup_smarts_string(const std::string &text, void *);
// Global debug switch for the SMARTS parser; set by the SmartsTo*() functions.
extern int yysmarts_debug;
namespace RDKit {
namespace {
// Runs the SMARTS parser over `inp` with a freshly created scanner.
//
//  inp       - the text to parse
//  molVect   - handed to yysmarts_parse(); receives any molecules it builds
//  atom/bond - handed to yysmarts_parse() by reference
//  start_tok - token telling the grammar what kind of entity to expect
//
// Returns the parser status, which is always 0 when this function returns
// normally: every non-zero status is converted into a SmilesParseException.
// (The original only treated status 1 as a failure, so a status of 2 - which
// Bison documents as memory exhaustion - was silently reported as success.)
//
// The scanner is destroyed on every exit path, including exceptions raised by
// the scanner setup or the parser.
int smarts_parse_helper(const std::string &inp,
                        std::vector<RDKit::RWMol *> &molVect, Atom *&atom,
                        Bond *&bond, int start_tok) {
  void *scanner = nullptr;
  TEST_ASSERT(!yysmarts_lex_init(&scanner));

  // Scope guard: releases the scanner however this function is left.
  struct SmartsScannerGuard {
    void *handle;
    explicit SmartsScannerGuard(void *h) : handle(h) {}
    ~SmartsScannerGuard() { yysmarts_lex_destroy(handle); }
    SmartsScannerGuard(const SmartsScannerGuard &) = delete;
    SmartsScannerGuard &operator=(const SmartsScannerGuard &) = delete;
  } guard(scanner);

  const size_t ltrim = setup_smarts_string(inp, scanner);
  const int res = yysmarts_parse(inp.c_str() + ltrim, &molVect, atom, bond,
                                 scanner, start_tok);
  if (res != 0) {
    throw SmilesParseException("Failed parsing SMARTS '" + inp + "'");
  }
  return res;
}
int smarts_bond_parse(const std::string &inp, Bond *&bond) {
auto start_tok = static_cast<int>(START_BOND);
std::vector<RWMol *> molVect;
Atom *atom = nullptr;
return smarts_parse_helper(inp, molVect, atom, bond, start_tok);
}
int smarts_atom_parse(const std::string &inp, Atom *&atom) {
auto start_tok = static_cast<int>(START_ATOM);
std::vector<RWMol *> molVect;
Bond *bond = nullptr;
return smarts_parse_helper(inp, molVect, atom, bond, start_tok);
}
int smarts_parse(const std::string &inp, std::vector<RDKit::RWMol *> &molVect) {
auto start_tok = static_cast<int>(START_MOL);
Atom *atom = nullptr;
Bond *bond = nullptr;
return smarts_parse_helper(inp, molVect, atom, bond, start_tok);
}
// Runs the SMILES parser over `inp` with a freshly created scanner.
//
//  inp       - the text to parse
//  molVect   - handed to yysmiles_parse(); receives any molecules it builds
//  atom/bond - handed to yysmiles_parse() by reference
//  start_tok - token telling the grammar what kind of entity to expect
//
// Returns the parser status, which is always 0 when this function returns
// normally: every non-zero status is converted into a SmilesParseException.
// (The original only treated status 1 as a failure, so a status of 2 - which
// Bison documents as memory exhaustion - was silently reported as success.)
// A SmilesParseException is also thrown when the parser leaves entries in the
// branch-point list, i.e. the input has unclosed parentheses.
//
// The scanner is destroyed on every exit path, including exceptions raised by
// the scanner setup or the parser.
int smiles_parse_helper(const std::string &inp,
                        std::vector<RDKit::RWMol *> &molVect, Atom *&atom,
                        Bond *&bond, int start_tok) {
  std::list<unsigned int> branchPoints;
  void *scanner = nullptr;
  TEST_ASSERT(!yysmiles_lex_init(&scanner));

  // Scope guard: releases the scanner however this function is left.
  struct SmilesScannerGuard {
    void *handle;
    explicit SmilesScannerGuard(void *h) : handle(h) {}
    ~SmilesScannerGuard() { yysmiles_lex_destroy(handle); }
    SmilesScannerGuard(const SmilesScannerGuard &) = delete;
    SmilesScannerGuard &operator=(const SmilesScannerGuard &) = delete;
  } guard(scanner);

  const size_t ltrim = setup_smiles_string(inp, scanner);
  const int res = yysmiles_parse(inp.c_str() + ltrim, &molVect, atom, bond,
                                 &branchPoints, scanner, start_tok);
  if (res != 0) {
    throw SmilesParseException("Failed parsing SMILES '" + inp + "'");
  }
  // anything still on the branch stack is a '(' that was never closed
  if (!branchPoints.empty()) {
    throw SmilesParseException("extra open parentheses");
  }
  return res;
}
int smiles_bond_parse(const std::string &inp, Bond *&bond) {
auto start_tok = static_cast<int>(START_BOND);
std::vector<RWMol *> molVect;
Atom *atom = nullptr;
return smiles_parse_helper(inp, molVect, atom, bond, start_tok);
}
int smiles_atom_parse(const std::string &inp, Atom *&atom) {
auto start_tok = static_cast<int>(START_ATOM);
std::vector<RWMol *> molVect;
Bond *bond = nullptr;
return smiles_parse_helper(inp, molVect, atom, bond, start_tok);
}
int smiles_parse(const std::string &inp, std::vector<RDKit::RWMol *> &molVect) {
auto start_tok = static_cast<int>(START_MOL);
Atom *atom = nullptr;
Bond *bond = nullptr;
return smiles_parse_helper(inp, molVect, atom, bond, start_tok);
}
// Kinds of parenthesised context tracked while scanning a SMARTS string:
// BASE is the bottom-of-stack sentinel, BRANCH an ordinary "(...)" group and
// RECURSE a recursive "$(...)" group.
typedef enum { BASE = 0, BRANCH, RECURSE } SmaState;

// Returns a copy of `sma` in which every recursive "$(...)" group that is not
// already followed by '_' gets "_<label>" appended after its closing ')'.
// Labels are decimal numbers starting at 100; groups with identical text
// (compared on the original, unlabelled input) share the same label.
//
// Unbalanced ')' characters are copied through untouched so that the parser
// can report them. The original popped the BASE sentinel on the first
// unmatched ')' and then called back()/pop_back() on an empty container on the
// next one, which is undefined behaviour; the sentinel is now never popped.
//
// When NO_AUTOMATIC_SMARTS_RELABELLING is defined the input is returned as is.
std::string labelRecursivePatterns(const std::string &sma) {
#ifndef NO_AUTOMATIC_SMARTS_RELABELLING
  std::vector<SmaState> state(1, BASE);
  // positions of the '$' that opened each currently active RECURSE group
  std::vector<std::string::size_type> startRecurse;
  // recursive group text -> label already assigned to it
  std::map<std::string, std::string> patterns;
  std::string res;
  res.reserve(sma.size());

  for (std::string::size_type pos = 0; pos < sma.size(); ++pos) {
    res += sma[pos];
    if (sma[pos] == '$' && pos + 1 < sma.size() && sma[pos + 1] == '(') {
      state.push_back(RECURSE);
      startRecurse.push_back(pos);
      // consume the '(' as well so it is not mistaken for a plain branch
      ++pos;
      res += sma[pos];
    } else if (sma[pos] == '(') {
      state.push_back(BRANCH);
    } else if (sma[pos] == ')') {
      if (state.back() == BASE) {
        // unmatched ')': nothing to close, keep the sentinel in place
        continue;
      }
      const SmaState closed = state.back();
      state.pop_back();
      if (closed != RECURSE) {
        // closing an ordinary branch needs no further work
        continue;
      }
      const std::string::size_type dollarPos = startRecurse.back();
      startRecurse.pop_back();
      if (pos + 1 < sma.size() && sma[pos + 1] == '_') {
        // the group already carries a label in the input
        continue;
      }
      const std::string recurs = sma.substr(dollarPos, pos - dollarPos + 1);
      auto found = patterns.find(recurs);
      if (found == patterns.end()) {
        // first occurrence: the label is derived from the number of distinct
        // groups seen so far (evaluated before the insertion happens)
        found =
            patterns.emplace(recurs, std::to_string(patterns.size() + 100))
                .first;
      }
      res += "_" + found->second;
    }
  }
  return res;
#else
  return sma;
#endif
}
} // namespace
// Shared driver that turns a SMILES/SMARTS string into a molecule.
//
//  inp     - the text handed to the parser
//  func    - the parse function to run (it fills a vector of molecules)
//  origInp - the text quoted in the error log if parsing fails
//
// Returns a newly allocated molecule: an empty one for empty input, otherwise
// the first molecule produced by the parser after ring closure and the other
// post-processing steps below. Returns nullptr if nothing was parsed or if a
// SmilesParseException was raised (the error is logged, not rethrown).
//
// Every molecule left in the parser's vector is cleaned up and deleted before
// leaving. The original did this only on the normal and SmilesParseException
// paths, so any other exception leaked all of them; such exceptions are still
// propagated, but now after the molecules have been released.
RWMol *toMol(const std::string &inp,
             int func(const std::string &, std::vector<RDKit::RWMol *> &),
             const std::string &origInp) {
  // empty strings produce empty molecules:
  if (inp.empty()) {
    return new RWMol();
  }
  RWMol *res = nullptr;
  std::vector<RDKit::RWMol *> molVect;

  // Deletes whatever is still owned by molVect. Slots are nulled so calling
  // this more than once is harmless.
  auto disposeLeftovers = [&molVect]() {
    for (auto &molPtr : molVect) {
      if (molPtr) {
        // Clean-up the bond bookmarks when not calling CloseMolRings
        SmilesParseOps::CleanupAfterParseError(molPtr);
        delete molPtr;
        molPtr = nullptr;
      }
    }
  };

  try {
    func(inp, molVect);
    if (!molVect.empty()) {
      res = molVect[0];
      SmilesParseOps::CloseMolRings(res, false);
      SmilesParseOps::SetUnspecifiedBondTypes(res);
      SmilesParseOps::AdjustAtomChiralityFlags(res);
      // No sense leaving this bookmark intact:
      if (res->hasAtomBookmark(ci_RIGHTMOST_ATOM)) {
        res->clearAtomBookmark(ci_RIGHTMOST_ATOM);
      }
      SmilesParseOps::CleanupAfterParsing(res);
      molVect[0] = nullptr;  // NOTE: to avoid leaks on failures, this should
                             // occur last in this if.
    }
  } catch (SmilesParseException &e) {
    std::string nm = "SMILES";
    if (func == smarts_parse) {
      nm = "SMARTS";
    }
    BOOST_LOG(rdErrorLog) << nm << " Parse Error: " << e.what()
                          << " for input: '" << origInp << "'" << std::endl;
    // molVect[0] (if any) is still owned by the vector and is freed below
    res = nullptr;
  } catch (...) {
    // res, if set, still aliases molVect[0]; release everything and let the
    // caller see the original exception
    disposeLeftovers();
    throw;
  }
  disposeLeftovers();
  return res;
}
// Runs `func` on `inp` and returns the atom it delivers through its
// out-parameter.
//
// Returns nullptr for empty input and when `func` raises a
// SmilesParseException; in the latter case the error is logged (labelled
// SMILES if `func` is smiles_atom_parse, SMARTS otherwise) and not rethrown.
// NOTE(review): a partially built atom that `func` stored before throwing is
// not released here - confirm whether the parser cleans it up itself.
Atom *toAtom(const std::string &inp, int func(const std::string &, Atom *&)) {
  if (inp.empty()) {
    return nullptr;
  }
  Atom *parsed = nullptr;
  try {
    func(inp, parsed);
  } catch (SmilesParseException &e) {
    const char *flavor = (func == smiles_atom_parse) ? "SMILES" : "SMARTS";
    BOOST_LOG(rdErrorLog) << flavor << " Parse Error: " << e.what()
                          << " for input: '" << inp << "'" << std::endl;
    return nullptr;
  }
  return parsed;
}
// Runs `func` on `inp` and returns the bond it delivers through its
// out-parameter.
//
// Returns nullptr for empty input and when `func` raises a
// SmilesParseException; in the latter case the error is logged (labelled
// SMILES if `func` is smiles_bond_parse, SMARTS otherwise) and not rethrown.
// NOTE(review): a partially built bond that `func` stored before throwing is
// not released here - confirm whether the parser cleans it up itself.
Bond *toBond(const std::string &inp, int func(const std::string &, Bond *&)) {
  if (inp.empty()) {
    return nullptr;
  }
  Bond *parsed = nullptr;
  try {
    func(inp, parsed);
  } catch (SmilesParseException &e) {
    const char *flavor = (func == smiles_bond_parse) ? "SMILES" : "SMARTS";
    BOOST_LOG(rdErrorLog) << flavor << " Parse Error: " << e.what()
                          << " for input: '" << inp << "'" << std::endl;
    return nullptr;
  }
  return parsed;
}
namespace {
// Splits the raw input into the pieces SmilesToMol() works with.
//
//  smiles  - the raw input string
//  params  - parseName / allowCXSMILES / replacements are consulted
//  lsmiles - out: the text to hand to the SMILES parser
//  name    - out: the molecule name, set only in the name-only mode below
//  cxPart  - out: the trimmed text after the first space/tab, set only when
//            CX extensions are allowed
//
// With allowCXSMILES set, the input is cut at the first space or tab (unless
// there is none or it is the very first character). Otherwise, with parseName
// set, the input is split on runs of spaces/tabs: the first token becomes
// lsmiles and the second, if present, the name. If lsmiles is still empty
// afterwards the whole input is used. Finally, when replacements are
// supplied, every key found in lsmiles is replaced by its value, repeating
// until a full pass finds no key.
// NOTE(review): a replacement whose value contains its own key would keep
// this loop running forever - confirm callers never pass one.
void preprocessSmiles(const std::string &smiles,
                      const SmilesParserParams &params, std::string &lsmiles,
                      std::string &name, std::string &cxPart) {
  if (params.allowCXSMILES) {
    const size_t cut = smiles.find_first_of(" \t");
    if (cut != std::string::npos && cut != 0) {
      lsmiles = smiles.substr(0, cut);
      cxPart = boost::trim_copy(smiles.substr(cut));
    }
  } else if (params.parseName) {
    std::vector<std::string> pieces;
    boost::split(pieces, smiles, boost::is_any_of(" \t"),
                 boost::token_compress_on);
    lsmiles = pieces[0];
    if (pieces.size() > 1) {
      name = pieces[1];
    }
  }

  if (lsmiles.empty()) {
    lsmiles = smiles;
  }

  if (params.replacements) {
    std::string expanded = lsmiles;
    bool changed;
    do {
      changed = false;
      for (const auto &repl : *params.replacements) {
        if (boost::find_first(expanded, repl.first)) {
          changed = true;
          boost::replace_all(expanded, repl.first, repl.second);
        }
      }
    } while (changed);
    lsmiles = expanded;
  }
}
} // namespace
// Parses `smiles` as a single SMILES atom with parser debugging switched off.
// Returns the parsed atom, or nullptr for empty input or on a parse error
// (see toAtom()).
Atom *SmilesToAtom(const std::string &smiles) {
  // Only touch the shared debug flag when it actually has to change. An
  // unconditional store races with the guarded read in SmilesToMol() when
  // several threads parse at once, even if they all want debugging off.
  if (yysmiles_debug) {
    yysmiles_debug = false;
  }
  return toAtom(smiles, smiles_atom_parse);
}
// Parses `smiles` as a single SMILES bond with parser debugging switched off.
// Returns the parsed bond, or nullptr for empty input or on a parse error
// (see toBond()).
Bond *SmilesToBond(const std::string &smiles) {
  // Only touch the shared debug flag when it actually has to change. An
  // unconditional store races with the guarded read in SmilesToMol() when
  // several threads parse at once, even if they all want debugging off.
  if (yysmiles_debug) {
    yysmiles_debug = false;
  }
  return toBond(smiles, smiles_bond_parse);
}
// Builds a molecule from a SMILES string according to `params`.
//
// Steps: split the input into SMILES / name / CX extension text
// (preprocessSmiles), parse the SMILES, apply the CX extensions if allowed,
// optionally remove hydrogens and/or sanitize and then assign
// stereochemistry, and finally store the name as a property.
//
// Returns a newly allocated molecule, or nullptr if the SMILES itself could
// not be parsed (toMol() logs the error).
//
// Exceptions: a SmilesParseException from the CX extension parser is
// swallowed unless params.strictCXSMILES is set; everything else thrown by
// the post-processing steps propagates. In all those cases the molecule is
// deleted first. (The original leaked it when assignStereochemistry(),
// setProp() or a non-SmilesParseException from the CX parser threw.)
RWMol *SmilesToMol(const std::string &smiles,
                   const SmilesParserParams &params) {
  // Calling SmilesToMol in a multithreaded context is generally safe *unless*
  // the value of debugParse is different for different threads. The if
  // statement below avoids a TSAN warning in the case where multiple threads
  // all use the same value for debugParse.
  if (yysmiles_debug != params.debugParse) {
    yysmiles_debug = params.debugParse;
  }

  std::string lsmiles, name, cxPart;
  preprocessSmiles(smiles, params, lsmiles, name, cxPart);

  RWMol *res = toMol(lsmiles, smiles_parse, lsmiles);
  if (!res) {
    return nullptr;
  }

  // From here on this function owns res; the single catch-all below releases
  // it on every exceptional exit.
  try {
    if (params.allowCXSMILES && !cxPart.empty()) {
      std::string::const_iterator pos = cxPart.cbegin();
      try {
        SmilesParseOps::parseCXExtensions(*res, cxPart, pos);
      } catch (const SmilesParseException &) {
        if (params.strictCXSMILES) {
          throw;
        }
        // lenient mode: keep whatever was consumed up to pos
      }
      res->setProp("_CXSMILES_Data", std::string(cxPart.cbegin(), pos));
      // whatever follows the consumed extension text is treated as the name
      if (params.parseName && pos != cxPart.cend()) {
        std::string nmpart(pos, cxPart.cend());
        name = boost::trim_copy(nmpart);
      }
    }

    if (params.sanitize || params.removeHs) {
      if (params.removeHs) {
        bool implicitOnly = false, updateExplicitCount = true;
        MolOps::removeHs(*res, implicitOnly, updateExplicitCount,
                         params.sanitize);
      } else if (params.sanitize) {
        MolOps::sanitizeMol(*res);
      }
      // figure out stereochemistry:
      bool cleanIt = true, force = true, flagPossible = true;
      MolOps::assignStereochemistry(*res, cleanIt, force, flagPossible);
    }

    if (!name.empty()) {
      res->setProp(common_properties::_Name, name);
    }
  } catch (...) {
    delete res;
    throw;
  }
  return res;
}
// Parses `smiles` (which, despite the parameter name, holds SMARTS text) as a
// single SMARTS atom with parser debugging switched off.
// Returns the parsed atom, or nullptr for empty input or on a parse error
// (see toAtom()).
Atom *SmartsToAtom(const std::string &smiles) {
  // Only touch the shared debug flag when it actually has to change. An
  // unconditional store races with the guarded read in SmartsToMol() when
  // several threads parse at once, even if they all want debugging off.
  if (yysmarts_debug) {
    yysmarts_debug = false;
  }
  return toAtom(smiles, smarts_atom_parse);
}
// Parses `smiles` (which, despite the parameter name, holds SMARTS text) as a
// single SMARTS bond with parser debugging switched off.
// Returns the parsed bond, or nullptr for empty input or on a parse error
// (see toBond()).
Bond *SmartsToBond(const std::string &smiles) {
  // Only touch the shared debug flag when it actually has to change. An
  // unconditional store races with the guarded read in SmartsToMol() when
  // several threads parse at once, even if they all want debugging off.
  if (yysmarts_debug) {
    yysmarts_debug = false;
  }
  return toBond(smiles, smarts_bond_parse);
}
// Builds a query molecule from a SMARTS string.
//
//  smarts       - the SMARTS text
//  debugParse   - value stored in the SMARTS parser's global debug flag
//  mergeHs      - if true, MolOps::mergeQueryHs() is applied to the result
//  replacements - optional key -> value substitutions applied to the text
//                 before parsing, repeated until a full pass finds no key
//
// Recursive "$(...)" groups are labelled (labelRecursivePatterns) before the
// text is parsed. Returns a newly allocated molecule, or nullptr if parsing
// failed (toMol() logs the error).
//
// Exceptions thrown by the post-processing steps propagate after the molecule
// has been deleted. (The original guarded only mergeQueryHs(), so an
// exception from setBondStereoFromDirections() leaked the molecule.)
// NOTE(review): a replacement whose value contains its own key would keep
// the substitution loop running forever - confirm callers never pass one.
RWMol *SmartsToMol(const std::string &smarts, int debugParse, bool mergeHs,
                   std::map<std::string, std::string> *replacements) {
  // Calling SmartsToMol in a multithreaded context is generally safe *unless*
  // the value of debugParse is different for different threads. The if
  // statement below avoids a TSAN warning in the case where multiple threads
  // all use the same value for debugParse.
  if (yysmarts_debug != debugParse) {
    yysmarts_debug = debugParse;
  }

  RWMol *res = nullptr;
  if (replacements) {
    std::string sma = smarts;
    bool loopAgain = true;
    while (loopAgain) {
      loopAgain = false;
      for (const auto &repl : *replacements) {
        if (boost::find_first(sma, repl.first)) {
          loopAgain = true;
          boost::replace_all(sma, repl.first, repl.second);
        }
      }
    }
    // errors are reported against the expanded (but unlabelled) text
    res = toMol(labelRecursivePatterns(sma), smarts_parse, sma);
  } else {
    res = toMol(labelRecursivePatterns(smarts), smarts_parse, smarts);
  }
  if (!res) {
    return nullptr;
  }

  // From here on this function owns res; release it on any exception.
  try {
    if (mergeHs) {
      MolOps::mergeQueryHs(*res);
    }
    MolOps::setBondStereoFromDirections(*res);
  } catch (...) {
    delete res;
    throw;
  }
  return res;
}
} // namespace RDKit