// $Id$ // // Copyright (C) 2004-2008 Greg Landrum and Rational Discovery LLC // // @@ All Rights Reserved @@ // This file is part of the RDKit. // The contents are covered by the terms of the BSD license // which is included in the file license.txt, found at the root // of the RDKit source tree. // #include "FeatureParser.h" #include "MolChemicalFeatureDef.h" #include #include #include #include #include #include #include #include typedef boost::tokenizer> tokenizer; #include #include #include namespace RDKit { namespace Local { typedef boost::tokenizer> CommaTokenizer; void getNextLine(std::istream &inStream, std::string &line, unsigned int &lineNo) { if (inStream.eof() || inStream.fail()) { return; } line = ""; bool continuationLine = false; while (!inStream.eof() && !inStream.fail()) { std::string tmpLine; std::getline(inStream, tmpLine); lineNo++; // std::cerr << ">> " << lineNo << " " << tmpLine << std::endl; if (tmpLine == "") { continue; } if (tmpLine[0] != '#') { // strip space at the end to check for a continuation line: std::string stripLine = boost::trim_right_copy_if(tmpLine, boost::is_any_of(" \t\r\n")); if (stripLine == "") { continue; } if (stripLine[stripLine.size() - 1] != '\\') { if (continuationLine) { // if it's a continuation line, strip any whitespace: boost::trim_if(tmpLine, boost::is_any_of(" \t\r\n")); } line += tmpLine; return; } else { continuationLine = true; boost::trim_if(tmpLine, boost::is_any_of(" \t\r\n")); line += tmpLine.substr(0, tmpLine.size() - 1); } } } } // ------------------------------------------------------ bool expandAndTestSmarts( std::string &smarts, const std::map &atomTypeDefs) { for (const auto &atomTypeDef : atomTypeDefs) { std::string atomName = atomTypeDef.first; std::string atomSma = atomTypeDef.second; boost::replace_all(smarts, atomName, atomSma); } RWMol *mol = nullptr; try { mol = SmartsToMol(smarts); } catch (SmilesParseException &) { return false; } if (mol) { delete mol; } else { return false; } return true; } // ------------------------------------------------------ void parseAtomType(const std::string &inLine, std::map &atomTypeDefs, const unsigned int &lineNo) { boost::char_separator sep(" \t"); boost::tokenizer> tok(inLine, sep); boost::tokenizer>::iterator tokIt = tok.begin(); if (tokIt == tok.end()) { throw FeatureFileParseException(lineNo, inLine, "empty input line for AtomType"); } std::string keyword = boost::to_upper_copy(*tokIt); if (keyword != "ATOMTYPE") { throw FeatureFileParseException(lineNo, inLine, "bad input line for AtomType"); } tokIt++; if (tokIt == tok.end()) { throw FeatureFileParseException(lineNo, inLine, "bad AtomType line, missing label"); } std::string atomType = *tokIt; bool negater = false; if (atomType[0] == '!') { atomType.erase(0, 1); negater = true; } atomType = "{" + atomType + "}"; tokIt++; if (tokIt == tok.end()) { throw FeatureFileParseException(lineNo, inLine, "bad AtomType line, missing definition"); } std::string sma; if (atomTypeDefs.count(atomType)) { std::string base = atomTypeDefs[atomType]; sma = "$(" + *tokIt + ")"; if (negater) { std::string toAdd = "[!" + sma + ";"; boost::replace_first(base, "[", toAdd); } else { std::string toAdd = "," + sma + "]"; boost::replace_last(base, "]", toAdd); } sma = base; } else { sma = "$(" + *tokIt + ")"; } // make it a valid smarts definition for an atom: sma = "[" + sma + "]"; // make sure we get sensible SMARTS: if (!expandAndTestSmarts(sma, atomTypeDefs)) { std::string msg = "invalid SMARTS in AtomType (" + atomType + "): " + sma; throw FeatureFileParseException(lineNo, inLine, msg); } // now cut the brackets back off: sma = sma.substr(1, sma.size() - 2); atomTypeDefs[atomType] = sma; } // ------------------------------------------------------ MolChemicalFeatureDef *parseFeatureDef( std::istream &inStream, const std::string &inLine, unsigned int &lineNo, const std::map &atomTypeDefs) { std::string nextLine = inLine; MolChemicalFeatureDef *res = nullptr; // handle a blank or comment first line: boost::trim_if(nextLine, boost::is_any_of(" \t\r\n")); while (nextLine.empty() || nextLine[0] == '#') { Local::getNextLine(inStream, nextLine, lineNo); // need to check for EOS before we strip: if (nextLine.empty()) { // we hit EOS: throw FeatureFileParseException(lineNo, inLine, "EOF hit parsing feature definition"); } boost::trim_if(nextLine, boost::is_any_of(" \t\r\n")); } boost::char_separator sep(" \t"); boost::tokenizer> tok(nextLine, sep); if (tok.begin() == tok.end()) { throw FeatureFileParseException(lineNo, inLine, "bad DefineFeature line, no tokens found"); } boost::tokenizer>::iterator tokIt = tok.begin(); tokIt++; if (tokIt == tok.end()) { throw FeatureFileParseException(lineNo, inLine, "bad DefineFeature line, missing subtype"); } std::string subType = *tokIt; tokIt++; if (tokIt == tok.end()) { throw FeatureFileParseException(lineNo, inLine, "bad DefineFeature line, missing pattern"); } std::string pattern = *tokIt; //--------------- // make sure we get sensible SMARTS: // if (!expandAndTestSmarts(pattern, atomTypeDefs)) { std::string msg = "invalid SMARTS in DefineFeature for type " + subType + ": " + pattern; throw FeatureFileParseException(lineNo, inLine, msg); } //--------------- // read out the rest of the definition // std::vector weights; std::string family = ""; bool foundEnd = false; Local::getNextLine(inStream, nextLine, lineNo); // std::getline(inStream,nextLine); while (nextLine != "") { boost::trim_if(nextLine, boost::is_any_of(" \t\r\n")); if (nextLine != "" && nextLine[0] != '#') { tok.assign(nextLine, sep); tokIt = tok.begin(); std::string token = boost::to_upper_copy(*tokIt); if (token == "ENDFEATURE") { foundEnd = true; break; } else if (token == "FAMILY") { tokIt++; if (tokIt == tok.end()) { std::string msg = "bad Type line for feature: " + subType; throw FeatureFileParseException(lineNo, inLine, msg); } family = *tokIt; } else if (token == "WEIGHTS") { tokIt++; if (tokIt == tok.end()) { std::string msg = "bad Weights line for feature: " + subType; throw FeatureFileParseException(lineNo, inLine, msg); } CommaTokenizer commaTok(*tokIt); for (CommaTokenizer::const_iterator commaTokIt = commaTok.begin(); commaTokIt != commaTok.end(); commaTokIt++) { std::string number = *commaTokIt; try { weights.push_back(boost::lexical_cast(number)); } catch (boost::bad_lexical_cast &) { std::string msg = "bad weight value (" + number + ") for feature: " + subType; throw FeatureFileParseException(lineNo, inLine, msg); } } } else { std::string msg = "bad input line for feature: " + subType; throw FeatureFileParseException(lineNo, inLine, msg); } } Local::getNextLine(inStream, nextLine, lineNo); // std::getline(inStream,nextLine); } if (!foundEnd) { std::string msg = "could not find EndFeature line for feature: " + subType; throw FeatureFileParseException(lineNo, inLine, msg); } if (family == "") { std::string msg = "did not find Family definition for feature: " + subType; throw FeatureFileParseException(lineNo, inLine, msg); } //--------------- // Build the feature definition // res = new MolChemicalFeatureDef(pattern, family, subType); if (weights.size()) { res->setWeights(weights); res->normalizeWeights(); } return res; } } // end of namespace Local // ------------------------------------------------------ int parseFeatureData(const std::string &defnText, MolChemicalFeatureDef::CollectionType &featDefs) { std::stringstream ss(defnText); return parseFeatureData(ss, featDefs); } // ------------------------------------------------------ int parseFeatureData(std::istream &inStream, MolChemicalFeatureDef::CollectionType &res) { unsigned int lineNo = 0; std::string inLine; Local::getNextLine(inStream, inLine, lineNo); std::map atomTypeDefs; while (!inStream.eof() && !inStream.fail()) { // clean any whitespace off the line: boost::trim_if(inLine, boost::is_any_of(" \t\r\n")); if (!inLine.empty() && inLine[0] != '#' && inLine[0] != '\n') { boost::tokenizer<> tok(inLine); boost::tokenizer<>::iterator tokIt = tok.begin(); std::string token = boost::to_upper_copy(*tokIt); if (token == "ATOMTYPE") { Local::parseAtomType(inLine, atomTypeDefs, lineNo); } else if (token == "DEFINEFEATURE") { MolChemicalFeatureDef *fDef = Local::parseFeatureDef(inStream, inLine, lineNo, atomTypeDefs); if (fDef) { res.push_back(boost::shared_ptr(fDef)); } } else { throw FeatureFileParseException(lineNo, inLine, "bad or missing keyword"); } } // std::getline(inStream,inLine); Local::getNextLine(inStream, inLine, lineNo); } return 0; } // ------------------------------------------------------ int parseFeatureFile(const std::string &fileName, MolChemicalFeatureDef::CollectionType &res) { std::ifstream inStream(fileName.c_str()); if (!inStream || inStream.eof()) { return -1; } return parseFeatureData(inStream, res); } } // namespace RDKit