mirror of
https://github.com/rdkit/rdkit.git
synced 2026-06-04 21:54:27 +08:00
320 lines
9.9 KiB
C++
Executable File
320 lines
9.9 KiB
C++
Executable File
// $Id$
|
|
//
|
|
// Copyright (C) 2007 Greg Landrum
|
|
//
|
|
// @@ All Rights Reserved @@
|
|
//
|
|
|
|
#include "FileParsers.h"
|
|
#include "MolFileStereochem.h"
|
|
#include <RDGeneral/StreamOps.h>
|
|
|
|
#include <fstream>
|
|
#include <boost/lexical_cast.hpp>
|
|
#include <boost/algorithm/string.hpp>
|
|
#include <boost/algorithm/string/trim.hpp>
|
|
#include <RDGeneral/FileParseException.h>
|
|
#include <typeinfo>
|
|
|
|
namespace RDKit{
|
|
// it's kind of stinky that we have to do this, but as of g++3.2 and
|
|
// boost 1.30, on linux calls to lexical_cast<int>(std::string)
|
|
// crash if the string starts with spaces.
|
|
template <typename T>
|
|
T stripSpacesAndCast(const std::string &input,bool acceptSpaces=false){
|
|
T res;
|
|
std::string trimmed=boost::trim_copy(input);
|
|
if(acceptSpaces && trimmed==""){
|
|
return 0;
|
|
} else {
|
|
return boost::lexical_cast<T>(trimmed);
|
|
}
|
|
try {
|
|
res = boost::lexical_cast<T>(trimmed);
|
|
}
|
|
catch (boost::bad_lexical_cast &) {
|
|
std::ostringstream errout;
|
|
errout << "Cannot convert " << input << " to " << typeid(T).name();
|
|
|
|
throw FileParseException(errout.str()) ;
|
|
}
|
|
return res;
|
|
}
|
|
|
|
void ParseTPLAtomLine(std::string text,unsigned int lineNum,RWMol *mol,
|
|
Conformer *conf){
|
|
PRECONDITION(mol,"no molecule");
|
|
PRECONDITION(conf,"no conformer");
|
|
std::vector<std::string> splitLine;
|
|
boost::split(splitLine,text,boost::is_any_of(" \t"),boost::token_compress_on);
|
|
if(splitLine.size()<8){
|
|
std::ostringstream errout;
|
|
errout << "Atom line " << lineNum << " only has " << splitLine.size() << " tokens. 8 are required."<<std::endl;
|
|
throw FileParseException(errout.str()) ;
|
|
}
|
|
Atom *atom=new Atom(splitLine[1]);
|
|
unsigned int atomId;
|
|
atomId=mol->addAtom(atom,false,true);
|
|
|
|
atom->setFormalCharge(stripSpacesAndCast<int>(splitLine[2]));
|
|
double partialChg=stripSpacesAndCast<double>(splitLine[3]);
|
|
atom->setProp("TPLCharge",partialChg);
|
|
double xp=stripSpacesAndCast<double>(splitLine[4]);
|
|
double yp=stripSpacesAndCast<double>(splitLine[5]);
|
|
double zp=stripSpacesAndCast<double>(splitLine[6]);
|
|
// coords in TPL files are in picometers, adjust:
|
|
xp/=100.;
|
|
yp/=100.;
|
|
zp/=100.;
|
|
conf->setAtomPos(atomId,RDGeom::Point3D(xp,yp,zp));
|
|
|
|
unsigned int nBonds=stripSpacesAndCast<unsigned int>(splitLine[7]);
|
|
// the only remaining info we care about is stereochem, and then only if
|
|
// the number of bonds is 4:
|
|
if(nBonds==4 && splitLine.size()>8+nBonds){
|
|
std::string stereoChem=splitLine[8+nBonds];
|
|
atom->setProp("TPLStereoFlag",stereoChem);
|
|
}
|
|
}
|
|
|
|
void ParseTPLBondLine(std::string text,unsigned int lineNum,RWMol *mol){
|
|
PRECONDITION(mol,"no molecule");
|
|
|
|
std::vector<std::string> splitLine;
|
|
boost::split(splitLine,text,boost::is_any_of(" \t"),boost::token_compress_on);
|
|
if(splitLine.size()<5){
|
|
std::ostringstream errout;
|
|
errout << "Bond line " << lineNum << " only has " << splitLine.size() << " tokens. 5 are required."<<std::endl;
|
|
throw FileParseException(errout.str()) ;
|
|
}
|
|
|
|
std::string tplOrder=boost::trim_copy(splitLine[1]).substr(0,3);
|
|
Bond::BondType bondOrder;
|
|
if(tplOrder=="1.5"){
|
|
bondOrder=Bond::AROMATIC;
|
|
} else if(tplOrder=="1.0"){
|
|
bondOrder=Bond::SINGLE;
|
|
} else if(tplOrder=="2.0"){
|
|
bondOrder=Bond::DOUBLE;
|
|
} else if(tplOrder=="3.0"){
|
|
bondOrder=Bond::TRIPLE;
|
|
} else {
|
|
std::ostringstream errout;
|
|
errout << "Bond line " << lineNum << " has unknown order: " << tplOrder <<std::endl;
|
|
throw FileParseException(errout.str()) ;
|
|
}
|
|
unsigned int idx1,idx2;
|
|
idx1 = stripSpacesAndCast<unsigned int>(splitLine[2])-1;
|
|
idx2 = stripSpacesAndCast<unsigned int>(splitLine[3])-1;
|
|
|
|
unsigned int bondIdx=mol->addBond(idx1,idx2,bondOrder)-1;
|
|
std::string stereoFlag1="";
|
|
std::string stereoFlag2="";
|
|
if(splitLine.size()>4){
|
|
stereoFlag1=splitLine[4];
|
|
if(splitLine.size()>5){
|
|
stereoFlag2=splitLine[5];
|
|
}
|
|
}
|
|
mol->getBondWithIdx(bondIdx)->setProp("TPLBondDir1",stereoFlag1);
|
|
mol->getBondWithIdx(bondIdx)->setProp("TPLBondDir2",stereoFlag2);
|
|
}
|
|
|
|
// Reads one additional conformer from the stream: a NAME line followed by one
// coordinate line per atom of the molecule.
//
// The text after the NAME tag is stored (trimmed) on the molecule as the
// property "Conf_<confId>_Name". Each coordinate line must hold at least
// three whitespace-separated numbers (x, y, z).
//
//   inStream - stream to read from (must not be null)
//   line     - running line counter; incremented once per line read
//   mol      - molecule the conformer belongs to (must not be null); its
//              atom count determines how many coordinate lines are read
//   confId   - conformer number, used in the property name and in messages
//
// Returns a newly allocated Conformer; the caller is responsible for it.
// Throws FileParseException if the NAME tag is missing, EOF is hit while
// reading coordinates, or a coordinate line has fewer than three fields.
// The conformer is released before any exception leaves this function.
Conformer *ParseConfData(std::istream *inStream,unsigned int &line,RWMol *mol,
                         unsigned int confId){
  PRECONDITION(inStream,"no stream");
  PRECONDITION(mol,"no mol");

  std::string tempStr;
  std::vector<std::string> splitLine;

  line++;
  tempStr = getLine(inStream);
  boost::split(splitLine,tempStr,boost::is_any_of(" \t"),boost::token_compress_on);
  if(splitLine[0]!="NAME"){
    std::ostringstream errout;
    errout << "Did not find NAME tag on line " << line << " while reading conformer " << confId << std::endl;
    throw FileParseException(errout.str()) ;
  }
  std::ostringstream propName;
  propName << "Conf_" <<confId<<"_Name";
  mol->setProp(propName.str(),boost::trim_copy(tempStr.substr(4,tempStr.size()-4)));

  const unsigned int nAtoms=mol->getNumAtoms();
  Conformer *conf=new Conformer(nAtoms);
  // everything below can throw, so guard the allocation to avoid leaking it:
  try {
    for(unsigned int i=0;i<nAtoms;++i){
      line++;
      tempStr = getLine(inStream);
      if(inStream->eof()){
        std::ostringstream errout;
        errout << "EOF hit while reading conformer " << confId << std::endl;
        throw FileParseException(errout.str()) ;
      }
      boost::trim(tempStr);
      boost::split(splitLine,tempStr,
                   boost::is_any_of(" \t"),boost::token_compress_on);
      if(splitLine.size()<3){
        std::ostringstream errout;
        errout << "Did not find enough fields on line " << line << " while reading conformer " << confId << std::endl;
        throw FileParseException(errout.str()) ;
      }
      double xp=stripSpacesAndCast<double>(splitLine[0]);
      double yp=stripSpacesAndCast<double>(splitLine[1]);
      double zp=stripSpacesAndCast<double>(splitLine[2]);
      // coords in TPL files are in picometers, adjust:
      xp/=100.;
      yp/=100.;
      zp/=100.;
      conf->setAtomPos(i,RDGeom::Point3D(xp,yp,zp));
    }
  } catch (...) {
    delete conf;
    throw;
  }
  return conf;
}
|
|
|
|
|
|
//*************************************
|
|
//
|
|
// Every effort has been made to adhere to the BioCad tpl definition
|
|
//
|
|
//*************************************
|
|
// Builds a molecule from TPL-format data read from a stream.
//
// Layout consumed, in order:
//   - a format line and a comment line (both ignored),
//   - an optional "NAME ..." line; the text after the tag is stored, trimmed,
//     as the "_Name" property,
//   - an optional "PROP ..." line (skipped),
//   - a counts line whose first two tokens are the atom and bond counts,
//   - one line per atom, then one line per bond,
//   - optionally a "CONFS <n>" line followed by n conformer blocks, each of
//     which must be followed by a blank line (or EOF).
// The coordinates on the atom lines become the conformer with id 0; the
// conformers from the CONFS section get the ids 1..n.
//
//   inStream      - stream to read from (must not be null)
//   line          - running line counter; incremented once per line read
//   sanitize      - when true, MolOps::sanitizeMol is run on the result
//   skipFirstConf - when true, the first conformer block of the CONFS section
//                   is parsed but not added to the molecule
//
// Returns NULL if EOF is hit while reading one of the first three lines.
// Otherwise returns a newly allocated molecule that the caller owns; if EOF is
// hit right after the NAME or PROP line that molecule has no atoms.
// Throws FileParseException on malformed input (too few tokens on the counts
// or CONFS line, EOF inside the atom or bond block, a non-blank line between
// conformers) and lets exceptions from the helpers it calls propagate.
// Everything allocated here is released before an exception leaves.
RWMol *TPLDataStreamToMol(std::istream *inStream, unsigned int &line,
                          bool sanitize,bool skipFirstConf){
  PRECONDITION(inStream,"no stream");
  std::string tempStr;
  std::vector<std::string> splitText;

  // format line:
  line++;
  tempStr = getLine(inStream);
  if(inStream->eof()){
    return NULL;
  }
  // comment line:
  line++;
  tempStr = getLine(inStream);
  if(inStream->eof()){
    return NULL;
  }
  // optional name line:
  line++;
  tempStr = getLine(inStream);
  if(inStream->eof()){
    return NULL;
  }

  RWMol *res = new RWMol();
  // a conformer that has been allocated but not yet handed to the molecule;
  // reset to NULL as soon as it has been added (or deleted):
  Conformer *pendingConf = NULL;
  try {
    if(tempStr.size()>=4 && tempStr.substr(0,4)=="NAME"){
      tempStr = boost::trim_copy(tempStr.substr(4,tempStr.size()-4));
      res->setProp("_Name",tempStr);
      line++;
      tempStr = getLine(inStream);
      if(inStream->eof()){
        return res;
      }
    }
    if(tempStr.size()>=4 && tempStr.substr(0,4)=="PROP"){
      line++;
      tempStr = getLine(inStream);
      if(inStream->eof()){
        return res;
      }
    }

    // we're at the counts line:
    boost::split(splitText,tempStr,boost::is_any_of(" \t"),boost::token_compress_on);
    if(splitText.size()<2){
      std::ostringstream errout;
      errout << "Counts line " << line << " only has " << splitText.size() << " tokens. 2 are required.";
      throw FileParseException(errout.str()) ;
    }
    const unsigned int nAtoms = stripSpacesAndCast<unsigned int>(splitText[0]);
    const unsigned int nBonds = stripSpacesAndCast<unsigned int>(splitText[1]);

    pendingConf = new Conformer(nAtoms);
    pendingConf->setId(0);
    for(unsigned int i=0;i<nAtoms;++i){
      line++;
      tempStr = getLine(inStream);
      if(inStream->eof()){
        throw FileParseException("EOF hit while reading atoms.") ;
      }
      ParseTPLAtomLine(tempStr,line,res,pendingConf);
    }
    res->addConformer(pendingConf, true);
    pendingConf = NULL;

    for(unsigned int i=0;i<nBonds;++i){
      line++;
      tempStr = getLine(inStream);
      if(inStream->eof()){
        throw FileParseException("EOF hit while reading bonds.") ;
      }
      ParseTPLBondLine(tempStr,line,res);
    }

    line++;
    tempStr = getLine(inStream);
    if(inStream->eof()){
      return res;
    }
    unsigned int nConfs=0;
    if(tempStr.size()>=5 && tempStr.substr(0,5)=="CONFS"){
      boost::split(splitText,tempStr,boost::is_any_of(" \t"),boost::token_compress_on);
      if(splitText.size()<2){
        std::ostringstream errout;
        errout << "CONFS line " << line << " does not provide the number of conformers.";
        throw FileParseException(errout.str()) ;
      }
      nConfs = stripSpacesAndCast<unsigned int>(splitText[1]);
    }
    for(unsigned int i=0;i<nConfs;++i){
      pendingConf = ParseConfData(inStream,line,res,i+1);
      if(i>0 || !skipFirstConf){
        pendingConf->setId(i+1);
        res->addConformer(pendingConf, true);
      } else {
        delete pendingConf;
      }
      pendingConf = NULL;
      // there should be a blank line:
      line++;
      tempStr = getLine(inStream);
      boost::trim(tempStr);
      if(!inStream->eof() && tempStr != ""){
        throw FileParseException("Found a non-blank line between conformers.") ;
      }
    }
    if(sanitize){
      MolOps::sanitizeMol(*res);
    }
  } catch (...) {
    // nothing has been returned to the caller yet, so clean up here:
    delete pendingConf;
    delete res;
    throw;
  }

  return res;
}
|
|
|
|
//------------------------------------------------
|
|
//
|
|
// Read a molecule from a file
|
|
//
|
|
//------------------------------------------------
|
|
// Opens the named file and parses a single molecule from it with
// TPLDataStreamToMol.
//
//   fName         - path of the TPL file
//   sanitize      - forwarded to TPLDataStreamToMol
//   skipFirstConf - forwarded to TPLDataStreamToMol
//
// Returns NULL if the stream is not usable after opening or is already at
// EOF; otherwise returns whatever TPLDataStreamToMol returns. Exceptions
// thrown by TPLDataStreamToMol propagate to the caller.
RWMol *TPLFileToMol(std::string fName, bool sanitize,bool skipFirstConf){
  std::ifstream inStream(fName.c_str());
  if(!inStream || inStream.eof()){
    return NULL;
  }
  // line numbering starts fresh for every file:
  unsigned int lineCount = 0;
  return TPLDataStreamToMol(&inStream, lineCount, sanitize,skipFirstConf);
}
|
|
|
|
#if 0
|
|
|
|
// Convenience overload taking the stream by reference: passes the stream's
// address on to the MolDataStreamToMol overload that accepts it.
// (This code sits inside an "#if 0" section and is currently not compiled.)
RWMol *MolDataStreamToMol(std::istream &inStream, unsigned int &line,
                          bool sanitize){
  std::istream *streamPtr=&inStream;
  return MolDataStreamToMol(streamPtr,line,sanitize);
}
|
|
//------------------------------------------------
|
|
//
|
|
// Read a molecule from a string
|
|
//
|
|
//------------------------------------------------
|
|
// Parses a molecule from the text of a mol block by wrapping the string in an
// input stream and handing it to MolDataStreamToMol.
// (This code sits inside an "#if 0" section and is currently not compiled.)
//
//   molBlock - the text to parse
//   sanitize - forwarded to MolDataStreamToMol
//
// Returns whatever MolDataStreamToMol returns.
RWMol *MolBlockToMol(const std::string &molBlock, bool sanitize){
  std::istringstream inStream(molBlock);
  // line counting starts at zero for every block:
  unsigned int line = 0;
  return MolDataStreamToMol(inStream, line, sanitize);
}
|
|
|
|
#endif
|
|
|
|
}
|
|
|