Files
rdkit/Code/GraphMol/FileParsers/PDBParser.cpp
2013-10-03 09:52:48 +02:00

494 lines
15 KiB
C++

//
// Copyright (C) 2013 Greg Landrum and NextMove Software
//
// @@ All Rights Reserved @@
// This file is part of the RDKit.
// The contents are covered by the terms of the BSD license
// which is included in the file license.txt, found at the root
// of the RDKit source tree.
//
#include <string.h>
#include <stdio.h>
#include <string>
#include <iostream>
#include <fstream>
#include <boost/lexical_cast.hpp>
#include <RDGeneral/BadFileException.h>
#include <RDGeneral/FileParseException.h>
#include <GraphMol/FileParsers/FileParsers.h>
#include <GraphMol/FileParsers/FileParserUtils.h>
#include "ProximityBonds.h"
#include <GraphMol/MonomerInfo.h>
namespace RDKit {
static Atom *PDBAtomFromSymbol(const char *symb) {
PRECONDITION(symb,"bad char ptr");
if (symb[0]=='D' && !symb[1]) {
Atom *result = new Atom(1);
result->setIsotope(2);
return result;
} else if (symb[0]=='T' && !symb[1]) {
Atom *result = new Atom(1);
result->setIsotope(3);
return result;
}
int elemno = PeriodicTable::getTable()->getAtomicNumber(symb);
return elemno > 0 ? new Atom(elemno) : (Atom*)0;
}
static void PDBAtomLine(RWMol *mol, const char *ptr, unsigned int len,
unsigned int flavor, std::map<int,Atom*> &amap)
{
PRECONDITION(mol,"bad mol");
PRECONDITION(ptr,"bad char ptr");
std::string tmp;
if (len < 16)
return;
if ((flavor & 1) == 0) {
// Ignore alternate locations of atoms.
if (len >= 17 && ptr[16]!=' ' && ptr[16]!='A' && ptr[16]!='1')
return;
// Ignore XPLOR pseudo atoms
if (len >= 54 && !memcmp(ptr+30,"9999.0009999.0009999.000",24))
return;
// Ignore NMR pseudo atoms
if (ptr[12]==' ' && ptr[13]=='Q')
return;
// Ignore PDB dummy residues
if (len >= 20 && !memcmp(ptr+18,"DUM",3))
return;
}
int serialno;
tmp = std::string(ptr+6,5);
try {
serialno = FileParserUtils::toInt(tmp);
} catch (boost::bad_lexical_cast &) {
std::ostringstream errout;
errout << "Non-integer PDB serial number " << tmp;
throw FileParseException(errout.str()) ;
}
Atom *atom = (Atom*)0;
char symb[3];
// Attempt #1: Atomic Symbol in columns 76 and 77
if (len >= 78) {
if (ptr[76]>='A' && ptr[76]<='Z') {
symb[0] = ptr[76];
if (ptr[77]>='A' && ptr[77]<='Z') {
symb[1] = ptr[77]+32; // tolower
symb[2] = '\0';
} else if (ptr[77]>='a' && ptr[77]<='z') {
symb[1] = ptr[77];
symb[2] = '\0';
} else symb[1] = '\0';
} else if (ptr[76]==' ' && ptr[77]>='A' && ptr[77]<='Z') {
symb[0] = ptr[77];
symb[1] = '\0';
} else symb[0] = '\0';
} else if (len==77) {
if (ptr[76]>='A' && ptr[76]<='Z') {
symb[0] = ptr[76];
symb[1] = '\0';
} else symb[0] = '\0';
} else symb[0] = '\0';
if (symb[0])
atom = PDBAtomFromSymbol(symb);
if (!atom) {
// Attempt #2: Atomic Symbol from PDB atom name
if (ptr[13]>='A' && ptr[13]<='Z') {
if (ptr[12]==' ') {
symb[0] = ptr[13];
if (ptr[14]>='a' && ptr[14]<='z') {
symb[1] = ptr[14];
symb[2] = '\0';
} else symb[1]='\0';
} else if (ptr[12]>='A' && ptr[12]<='Z') {
symb[0] = ptr[12];
symb[1] = ptr[13]+32; // tolower
symb[2] = '\0';
if (ptr[12]=='H' && ptr[0]=='A') {
// No He, Hf, Hg, Ho or Hs in ATOM records
symb[0] = 'H';
symb[1] = '\0';
}
} else if (ptr[12]>='0' && ptr[12]<='9') {
symb[0] = ptr[13];
symb[1] = '\0';
} else symb[0] = '\0';
} else symb[0] = '\0';
if (symb[0])
atom = PDBAtomFromSymbol(symb);
}
if (!atom) {
std::ostringstream errout;
errout << "Cannot determine element for PDB atom #" << serialno;
throw FileParseException(errout.str());
}
mol->addAtom(atom,true,true);
amap[serialno] = atom;
if (len >= 38) {
RDGeom::Point3D pos;
try {
pos.x = FileParserUtils::toDouble(std::string(ptr+30,8));
if (len >= 46)
pos.y = FileParserUtils::toDouble(std::string(ptr+38,8));
if (len >= 54)
pos.z = FileParserUtils::toDouble(std::string(ptr+46,8));
}
catch (boost::bad_lexical_cast &) {
std::ostringstream errout;
errout << "Problem with coordinates for PDB atom #" << serialno;
throw FileParseException(errout.str()) ;
}
Conformer *conf;
if (!mol->getNumConformers()) {
conf = new RDKit::Conformer(mol->getNumAtoms());
conf->set3D(pos.z!=0.0);
conf->setId(0);
mol->addConformer(conf,false);
} else {
conf = &mol->getConformer();
if (pos.z != 0.0)
conf->set3D(true);
}
conf->setAtomPos(atom->getIdx(),pos);
}
if (len >= 79) {
int charge = 0;
if (ptr[78]>='1' && ptr[78]<='9') {
if (ptr[79]=='-')
charge = -(ptr[78]-'0');
else if (ptr[79]=='+' || ptr[79]==' ' || !ptr[79])
charge = ptr[78]-'0';
} else if (ptr[78]=='+') {
if (ptr[79]>='1' && ptr[79]<='9')
charge = ptr[79]-'0';
else if (ptr[79]=='+')
charge = 2;
else if (ptr[79]!='0')
charge = 1;
} else if (ptr[78]=='-') {
if (ptr[79]>='1' && ptr[79]<='9')
charge = ptr[79]-'0';
else if (ptr[79]=='-')
charge = -2;
else if (ptr[79]!='0')
charge = -1;
} else if (ptr[78]==' ') {
if (ptr[79]>='1' && ptr[79]<='9')
charge = ptr[79]-'0';
else if (ptr[79]=='+')
charge = 1;
else if (ptr[79]=='-')
charge = -1;
}
if (charge != 0)
atom->setFormalCharge(charge);
}
tmp = std::string(ptr+12,4);
AtomPDBResidueInfo *info = new AtomPDBResidueInfo(tmp);
atom->setMonomerInfo(info);
if (len >= 20)
tmp = std::string(ptr+17,3);
else tmp = "UNL";
info->setResidueName(tmp);
if (ptr[0]=='H')
info->setIsHeteroAtom(true);
if (len >= 17)
tmp = std::string(ptr+16,1);
else tmp = " ";
info->setAltLoc(tmp);
if (len >= 22)
tmp = std::string(ptr+21,1);
else tmp = " ";
info->setChainId(tmp);
if (len >= 27)
tmp = std::string(ptr+26,1);
else tmp = " ";
info->setInsertionCode(tmp);
int resno = 1;
if (len >= 26) {
try {
resno = FileParserUtils::toInt(std::string(ptr+22,4));
} catch (boost::bad_lexical_cast &) {
std::ostringstream errout;
errout << "Problem with residue number for PDB atom #" << serialno;
throw FileParseException(errout.str()) ;
}
}
info->setSerialNumber(resno);
double occup = 1.0;
if (len >= 60) {
try {
occup = FileParserUtils::toDouble(std::string(ptr+54,6));
} catch (boost::bad_lexical_cast &) {
std::ostringstream errout;
errout << "Problem with occupancy for PDB atom #" << serialno;
throw FileParseException(errout.str()) ;
}
}
info->setOccupancy(occup);
double bfactor = 0.0;
if (len >= 66) {
try {
bfactor = FileParserUtils::toDouble(std::string(ptr+60,6));
} catch (boost::bad_lexical_cast &) {
std::ostringstream errout;
errout << "Problem with temperature factor for PDB atom #" << serialno;
throw FileParseException(errout.str()) ;
}
}
info->setTempFactor(bfactor);
}
static void PDBBondLine(RWMol *mol, const char *ptr, unsigned int len,
std::map<int,Atom*> &amap, std::map<Bond*,int> &bmap)
{
PRECONDITION(mol,"bad mol");
PRECONDITION(ptr,"bad char ptr");
if (len < 16)
return;
std::string tmp(ptr+6,5);
bool fail = false;
int src, dst;
try {
src = FileParserUtils::toInt(tmp);
if (amap.find(src) == amap.end())
return;
} catch (boost::bad_lexical_cast &) {
fail = true;
}
if (!fail) {
if (len > 41) len = 41;
for (unsigned int pos=11; pos+5<=len; pos+=5) {
if (!memcmp(ptr+pos," ",5))
break;
try {
dst = FileParserUtils::toInt(std::string(ptr+pos,5));
if (dst==src || amap.find(dst) == amap.end())
continue;
} catch (boost::bad_lexical_cast &) {
fail = true;
}
if (!fail) {
Bond *bond = mol->getBondBetweenAtoms(amap[src]->getIdx(),
amap[dst]->getIdx());
if (bond) {
// Here we use a single byte bitmap to count duplicates
// Low nibble counts src < dst, high nibble for src > dst
int seen = bmap[bond];
if (src < dst) {
if ((seen&0x0f) == 0x01) {
bmap[bond] = seen | 0x02;
if ((seen & 0x20) == 0)
bond->setBondType(Bond::DOUBLE);
} else if ((seen&0x0f) == 0x03) {
bmap[bond] = seen | 0x04;
if ((seen & 0x40) == 0)
bond->setBondType(Bond::TRIPLE);
} else if ((seen&0x0f) == 0x07) {
bmap[bond] = seen | 0x08;
if ((seen & 0x80) == 0)
bond->setBondType(Bond::QUADRUPLE);
}
} else /* src < dst */ {
if ((seen&0xf0) == 0x10) {
bmap[bond] = seen | 0x20;
if ((seen & 0x02) == 0)
bond->setBondType(Bond::DOUBLE);
} else if ((seen&0xf0) == 0x30) {
bmap[bond] = seen | 0x40;
if ((seen & 0x04) == 0)
bond->setBondType(Bond::TRIPLE);
} else if ((seen&0xf0) == 0x70) {
bmap[bond] = seen | 0x80;
if ((seen & 0x08) == 0)
bond->setBondType(Bond::QUADRUPLE);
}
}
} else {
bond = new Bond(Bond::SINGLE);
bond->setOwningMol(mol);
bond->setBeginAtom(amap[src]);
bond->setEndAtom(amap[dst]);
mol->addBond(bond,true);
bmap[bond] = (src < dst) ? 0x01 : 0x10;
}
} else break;
}
}
if (fail) {
std::ostringstream errout;
errout << "Problem with CONECT record for PDB atom #" << tmp;
throw FileParseException(errout.str()) ;
}
}
static void PDBTitleLine(RWMol *mol, const char *ptr, unsigned int len)
{
PRECONDITION(mol,"bad mol");
PRECONDITION(ptr,"bad char ptr");
std::string title;
while (ptr[len-1] == ' ')
len--;
if (ptr[len-1]==';')
len--;
if (len > 21 && !strncmp(ptr+10," MOLECULE: ",11))
title = std::string(ptr+21,len-21);
else if (len > 10)
title = std::string(ptr+10,len-10);
if (!title.empty())
mol->setProp("_Name",title);
}
RWMol *PDBBlockToMol(const char *str, bool sanitize,
bool removeHs, unsigned int flavor)
{
PRECONDITION(str,"bad char ptr");
std::map<int,Atom*> amap;
std::map<Bond*,int> bmap;
RWMol *mol = 0;
while (*str) {
unsigned int len;
const char *next = str+1;
for(;;) {
if (*next == '\r') {
len = (unsigned int)(next-str);
if (next[1]=='\n') {
next += 2;
} else next++;
break;
} else if(*next == '\n') {
len = (unsigned int)(next-str);
next++;
break;
} else if(*next == '\0') {
len = (unsigned int)(next-str);
break;
}
next++;
}
// ATOM records
if (str[0]=='A' && str[1]=='T' && str[2]=='O' &&
str[3]=='M' && str[4]==' ' && str[5]==' ') {
if (!mol) mol = new RWMol();
PDBAtomLine(mol,str,len,flavor,amap);
// HETATM records
} else if (str[0]=='H' && str[1]=='E' && str[2]=='T' &&
str[3]=='A' && str[4]=='T' && str[5]=='M') {
if (!mol) mol = new RWMol();
PDBAtomLine(mol,str,len,flavor,amap);
// CONECT records
} else if (str[0]=='C' && str[1]=='O' && str[2]=='N' &&
str[3]=='E' && str[4]=='C' && str[5]=='T') {
if (mol)
PDBBondLine(mol,str,len,amap,bmap);
// COMPND records
} else if (str[0]=='C' && str[1]=='O' && str[2]=='M' &&
str[3]=='P' && str[4]=='N' && str[5]=='D') {
if (!mol) mol = new RWMol();
if (len > 10 && (str[9]==' '||!strncmp(str+9,"2 MOLECULE: ",12)))
PDBTitleLine(mol,str,len);
// HEADER records
} else if (str[0]=='H' && str[1]=='E' && str[2]=='A' &&
str[3]=='D' && str[4]=='E' && str[5]=='R') {
if (!mol) mol = new RWMol();
PDBTitleLine(mol,str,len<50?len:50);
}
str = next;
}
if (!mol)
return (RWMol*)0;
ConnectTheDots(mol);
StandardPDBResidueBondOrders(mol);
for (std::map<int,Atom*>::iterator mi=amap.begin(); mi!=amap.end(); ++mi)
(*mi).second->calcExplicitValence(false);
if (sanitize) {
if (removeHs) {
ROMol *tmp = MolOps::removeHs(*mol,false,false);
if (tmp) {
delete mol;
mol = static_cast<RWMol *>(tmp);
}
} else {
MolOps::sanitizeMol(*mol);
}
}
return mol;
}
RWMol *PDBBlockToMol(const std::string &str, bool sanitize,
bool removeHs, unsigned int flavor)
{
return PDBBlockToMol(str.c_str(),sanitize,removeHs,flavor);
}
RWMol *PDBDataStreamToMol(std::istream *inStream, bool sanitize,
bool removeHs, unsigned int flavor)
{
PRECONDITION(inStream,"bad stream");
std::string buffer;
const char *ptr;
while (!inStream->eof()) {
std::string line;
std::getline(*inStream,line);
buffer += line;
buffer += '\n';
ptr = line.c_str();
if (ptr[0]=='E' && ptr[1]=='N' && ptr[2]=='D' &&
(ptr[3]==' ' || ptr[3]=='\r' || ptr[3]=='\n' || !ptr[3]))
break;
}
return PDBBlockToMol(buffer.c_str(),sanitize,removeHs,flavor);
}
RWMol *PDBDataStreamToMol(std::istream &inStream, bool sanitize,
bool removeHs, unsigned int flavor){
return PDBDataStreamToMol(&inStream,sanitize,removeHs,flavor);
}
RWMol *PDBFileToMol(const std::string &fileName, bool sanitize,
bool removeHs, unsigned int flavor)
{
std::ifstream ifs(fileName.c_str(),std::ios_base::binary);
if (!ifs || ifs.bad()) {
std::ostringstream errout;
errout << "Bad input file " << fileName;
throw BadFileException(errout.str());
}
return PDBDataStreamToMol(static_cast<std::istream*>(&ifs),sanitize,removeHs,flavor);
}
}