// Files
// rdkit/Code/GraphMol/MolChemicalFeatures/FeatureParser.cpp
// 2010-09-26 17:04:37 +00:00
//
// 323 lines
// 11 KiB
// C++

// $Id$
//
// Copyright (C) 2004-2008 Greg Landrum and Rational Discovery LLC
//
// @@ All Rights Reserved @@
// This file is part of the RDKit.
// The contents are covered by the terms of the BSD license
// which is included in the file license.txt, found at the root
// of the RDKit source tree.
//
#include "FeatureParser.h"
#include "MolChemicalFeatureDef.h"
#include <RDGeneral/StreamOps.h>
#include <GraphMol/RDKitBase.h>
#include <GraphMol/SmilesParse/SmilesParse.h>
#include <boost/shared_ptr.hpp>
#include <boost/tokenizer.hpp>
#include <boost/algorithm/string.hpp>
#include <boost/lexical_cast.hpp>
#include <boost/tokenizer.hpp>
// Alias for a boost tokenizer driven by a char_separator (a caller-supplied
// set of separator characters).
typedef boost::tokenizer<boost::char_separator<char> > tokenizer;
#include <fstream>
#include <sstream>
#include <map>
namespace RDKit {
namespace Local {
// Tokenizer using boost's escaped_list_separator; parseFeatureDef() uses it
// to split the value list that follows the Weights keyword.
typedef boost::tokenizer<boost::escaped_list_separator<char> > CommaTokenizer;
// Reads the next logical line from inStream into line.
//
//  - empty lines, whitespace-only lines and lines whose first character is
//    '#' are skipped;
//  - a line whose last non-whitespace character is a backslash is joined with
//    the line(s) that follow it: every piece of such a continued line has its
//    leading and trailing whitespace removed and the backslash is dropped;
//  - a line that is not part of a continuation is returned untouched
//    (surrounding whitespace included).
//
// lineNo is incremented once for every read attempt, including the final
// attempt that discovers the end of the stream.
//
// On return line is ALWAYS overwritten: it is left empty when no further
// content could be read (end of stream or stream failure). Callers use an
// empty result as their end-of-input signal, so stale content must never
// survive a call.
void getNextLine(std::istream &inStream,std::string &line,unsigned int &lineNo){
  static const char *const blanks=" \t\r\n";

  // reset first so that a call made at EOF reports "" instead of silently
  // handing back whatever the caller's buffer held before:
  line.clear();
  if(!inStream.good()) return;

  bool continuationLine=false;
  // good() (rather than !eof()) also terminates the loop if the stream
  // fails without reaching EOF:
  while(inStream.good()){
    std::string tmpLine;
    std::getline(inStream,tmpLine);
    lineNo++;
    if(tmpLine.empty() || tmpLine[0]=='#') continue;

    // locate the content with surrounding whitespace ignored:
    const std::string::size_type lastPos=tmpLine.find_last_not_of(blanks);
    if(lastPos==std::string::npos) continue;  // whitespace only
    const std::string::size_type firstPos=tmpLine.find_first_not_of(blanks);

    if(tmpLine[lastPos]!='\\'){
      if(continuationLine){
        // final piece of a continued line: strip surrounding whitespace
        line.append(tmpLine,firstPos,lastPos-firstPos+1);
      } else {
        line += tmpLine;
      }
      return;
    }
    // continuation: keep the stripped text minus the trailing backslash
    continuationLine=true;
    line.append(tmpLine,firstPos,lastPos-firstPos);
  }
}
// ------------------------------------------------------
bool expandAndTestSmarts(std::string &smarts,
const std::map<std::string,std::string> &atomTypeDefs) {
for(std::map<std::string,std::string>::const_iterator mapIt=atomTypeDefs.begin();
mapIt!=atomTypeDefs.end();mapIt++){
std::string atomName=mapIt->first;
std::string atomSma=mapIt->second;
boost::replace_all(smarts,atomName,atomSma);
}
RWMol *mol=0;
try{
mol=SmartsToMol(smarts);
} catch (SmilesParseException &) {
return false;
}
if(mol){
delete mol;
} else {
return false;
}
return true;
}
// ------------------------------------------------------
void parseAtomType(const std::string &inLine,
std::map<std::string,std::string> &atomTypeDefs,
const unsigned int &lineNo) {
boost::char_separator<char> sep(" \t");
boost::tokenizer<boost::char_separator<char> > tok(inLine,sep);
boost::tokenizer<boost::char_separator<char> >::iterator tokIt=tok.begin();
if(tokIt==tok.end()){
throw FeatureFileParseException(lineNo,inLine,
"empty input line for AtomType");
}
std::string keyword=boost::to_upper_copy(*tokIt);
if(keyword!="ATOMTYPE"){
throw FeatureFileParseException(lineNo,inLine,
"bad input line for AtomType");
}
tokIt++;
if(tokIt==tok.end()){
throw FeatureFileParseException(lineNo,inLine,
"bad AtomType line, missing label");
}
std::string atomType=*tokIt;
bool negater=false;
if(atomType[0]=='!'){
atomType.erase(0,1);
negater=true;
}
atomType="{" + atomType + "}";
tokIt++;
if(tokIt==tok.end()){
throw FeatureFileParseException(lineNo,inLine,
"bad AtomType line, missing definition");
}
std::string sma;
if(atomTypeDefs.count(atomType)){
std::string base=atomTypeDefs[atomType];
sma="$(" + *tokIt + ")";
if(negater){
std::string toAdd="[!"+sma+";";
boost::replace_first(base,"[",toAdd);
} else {
std::string toAdd=","+sma+"]";
boost::replace_last(base,"]",toAdd);
}
sma = base;
} else {
sma="$(" + *tokIt + ")";
}
// make it a valid smarts definition for an atom:
sma = "["+sma+"]";
// make sure we get sensible SMARTS:
if(!expandAndTestSmarts(sma,atomTypeDefs)){
std::string msg="invalid SMARTS in AtomType (" + atomType + "): "+sma;
throw FeatureFileParseException(lineNo,inLine,msg);
}
// now cut the brackets back off:
sma = sma.substr(1,sma.size()-2);
atomTypeDefs[atomType] = sma;
}
// ------------------------------------------------------
// Parses one feature definition: a header line of the form
// "<keyword> <subtype> <pattern>" followed by Family / Weights lines and a
// closing EndFeature line (keywords of the body are case-insensitive).
//
//  inStream     - stream the remaining lines of the definition are read from
//  inLine       - the header line; if it is blank or a comment, lines are
//                 read from inStream until a usable one is found
//  lineNo       - running line counter, advanced as lines are consumed
//  atomTypeDefs - label -> replacement text, expanded into the pattern
//
// The first token of the header is not inspected here. Family is required;
// Weights is optional and takes a single comma-separated token of numbers
// which are stored and then normalized.
//
// Returns a heap-allocated MolChemicalFeatureDef; the caller takes ownership.
//
// Throws FeatureFileParseException on any malformed or missing element. The
// exception always carries inLine (the line the function was called with)
// together with the current value of lineNo.
MolChemicalFeatureDef *
parseFeatureDef(std::istream &inStream,
                const std::string &inLine,
                unsigned int &lineNo,
                const std::map<std::string,std::string> &atomTypeDefs){
  std::string nextLine=inLine;
  MolChemicalFeatureDef *res=0;

  // handle a blank or comment first line:
  boost::trim_if(nextLine,boost::is_any_of(" \t\r\n"));
  while(nextLine=="" || nextLine[0]=='#'){
    Local::getNextLine(inStream,nextLine,lineNo);
    // need to check for EOS before we strip:
    if(nextLine=="") {
      // we hit EOS:
      throw FeatureFileParseException(lineNo,inLine,
                                      "EOF hit parsing feature definition");
    }
    boost::trim_if(nextLine,boost::is_any_of(" \t\r\n"));
  }

  //---------------
  // header line: keyword, subtype, pattern
  //
  boost::char_separator<char> sep(" \t");
  boost::tokenizer<boost::char_separator<char> > tok(nextLine,sep);
  if(tok.begin()==tok.end()){
    throw FeatureFileParseException(lineNo,inLine,
                                    "bad DefineFeature line, no tokens found");
  }
  boost::tokenizer<boost::char_separator<char> >::iterator tokIt=tok.begin();
  tokIt++;
  if(tokIt==tok.end()){
    throw FeatureFileParseException(lineNo,inLine,
                                    "bad DefineFeature line, missing subtype");
  }
  std::string subType=*tokIt;
  tokIt++;
  if(tokIt==tok.end()){
    throw FeatureFileParseException(lineNo,inLine,
                                    "bad DefineFeature line, missing pattern");
  }
  std::string pattern=*tokIt;

  //---------------
  // make sure we get sensible SMARTS:
  //
  if(!expandAndTestSmarts(pattern,atomTypeDefs)){
    std::string msg="invalid SMARTS in DefineFeature for type "+subType+": "+pattern;
    throw FeatureFileParseException(lineNo,inLine,msg);
  }

  //---------------
  // read out the rest of the definition
  //
  std::vector<double> weights;
  std::string family="";
  bool foundEnd=false;
  Local::getNextLine(inStream,nextLine,lineNo);
  // an empty line from getNextLine() is treated as end of input:
  while(nextLine != ""){
    boost::trim_if(nextLine,boost::is_any_of(" \t\r\n"));
    if(nextLine != "" && nextLine[0]!='#'){
      // nextLine is non-empty and trimmed here, so there is at least one token
      tok.assign(nextLine,sep);
      tokIt = tok.begin();
      std::string token=boost::to_upper_copy(*tokIt);
      if(token=="ENDFEATURE"){
        foundEnd=true;
        break;
      } else if(token=="FAMILY"){
        tokIt++;
        if(tokIt==tok.end()){
          std::string msg="bad Family line for feature: "+subType;
          throw FeatureFileParseException(lineNo,inLine,msg);
        }
        family = *tokIt;
      } else if(token=="WEIGHTS"){
        tokIt++;
        if(tokIt==tok.end()){
          std::string msg="bad Weights line for feature: "+subType;
          throw FeatureFileParseException(lineNo,inLine,msg);
        }
        // only this one token is split on commas
        CommaTokenizer commaTok(*tokIt);
        for(CommaTokenizer::const_iterator commaTokIt=commaTok.begin();
            commaTokIt!=commaTok.end();commaTokIt++){
          std::string number=*commaTokIt;
          try {
            weights.push_back(boost::lexical_cast<double>(number));
          } catch (boost::bad_lexical_cast &){
            std::string msg="bad weight value (" + number + ") for feature: "+subType;
            throw FeatureFileParseException(lineNo,inLine,msg);
          }
        }
      } else {
        std::string msg="bad input line for feature: "+subType;
        throw FeatureFileParseException(lineNo,inLine,msg);
      }
    }
    Local::getNextLine(inStream,nextLine,lineNo);
  }

  if(!foundEnd){
    std::string msg="could not find EndFeature line for feature: "+subType;
    throw FeatureFileParseException(lineNo,inLine,msg);
  }
  if(family==""){
    std::string msg="did not find Family definition for feature: "+subType;
    throw FeatureFileParseException(lineNo,inLine,msg);
  }

  //---------------
  // Build the feature definition
  //
  res = new MolChemicalFeatureDef(pattern,family,subType);
  if(!weights.empty()){
    // res is a raw owning pointer until it is returned: release it if
    // applying the weights throws so that the definition is not leaked.
    try {
      res->setWeights(weights);
      res->normalizeWeights();
    } catch (...) {
      delete res;
      throw;
    }
  }
  return res;
}
} // end of namespace Local
// ------------------------------------------------------
// Parses feature definitions held in a string.
//
//  defnText - the complete text of the definitions
//  featDefs - passed through to the stream overload
//
// Returns whatever the std::istream overload of parseFeatureData() returns.
int parseFeatureData(const std::string &defnText,
                     MolChemicalFeatureDef::CollectionType &featDefs) {
  // expose the text as a stream and hand it to the stream-based parser
  std::stringstream textStream(defnText);
  std::istream &textInput=textStream;
  return parseFeatureData(textInput,featDefs);
}
// ------------------------------------------------------
// Parses feature definitions from a stream.
//
//  inStream - source of the definitions
//  res      - every successfully parsed DefineFeature block is appended here
//             (wrapped in a boost::shared_ptr)
//
// Each logical line (as produced by Local::getNextLine) is trimmed; blank
// lines and lines starting with '#' are ignored. The first token of any other
// line must be AtomType or DefineFeature (case-insensitive).
//
// Returns 0.
// Throws FeatureFileParseException for an unrecognized or missing keyword,
// and propagates the exceptions thrown while parsing AtomType and
// DefineFeature entries.
int parseFeatureData(std::istream &inStream,
                     MolChemicalFeatureDef::CollectionType &res){
  unsigned int lineNo=0;
  std::string inLine;
  std::map<std::string,std::string> atomTypeDefs;

  Local::getNextLine(inStream,inLine,lineNo);
  for(;;){
    // clean any whitespace off the line:
    boost::trim_if(inLine,boost::is_any_of(" \t\r\n"));
    if(!inLine.empty() && inLine[0]!='#'){
      boost::tokenizer<> tok(inLine);
      boost::tokenizer<>::iterator tokIt=tok.begin();
      // the default tokenizer can yield no tokens at all (e.g. a line made
      // up of punctuation only), so never dereference without checking;
      // an empty token falls through to the "bad keyword" error below.
      std::string token;
      if(tokIt!=tok.end()){
        token=boost::to_upper_copy(*tokIt);
      }
      if(token=="ATOMTYPE"){
        Local::parseAtomType(inLine,atomTypeDefs,lineNo);
      } else if(token=="DEFINEFEATURE"){
        MolChemicalFeatureDef *fDef=Local::parseFeatureDef(inStream,inLine,
                                                           lineNo,atomTypeDefs);
        if(fDef) res.push_back(boost::shared_ptr<MolChemicalFeatureDef>(fDef));
      } else {
        throw FeatureFileParseException(lineNo,inLine,"bad or missing keyword");
      }
    }
    // The stream state is tested only AFTER the current line was handled:
    // reading a final line that lacks a trailing newline sets eofbit while
    // still delivering content, and that line must not be dropped.
    if(!inStream.good()) break;
    Local::getNextLine(inStream,inLine,lineNo);
  }
  return 0;
}
// ------------------------------------------------------
// Parses feature definitions from the file named fileName.
//
//  fileName - path of the file to read
//  res      - passed through to parseFeatureData()
//
// Returns -1 if the freshly opened stream is in a failed state or already at
// EOF; otherwise returns the result of parseFeatureData().
int parseFeatureFile(const std::string &fileName,
                     MolChemicalFeatureDef::CollectionType &res){
  std::ifstream featFile(fileName.c_str());
  const bool usable = !featFile.fail() && !featFile.eof();
  if(usable){
    return parseFeatureData(featFile,res);
  }
  return -1;
}
}