mirror of
https://github.com/rdkit/rdkit.git
synced 2026-06-04 21:54:27 +08:00
492 lines
14 KiB
C++
492 lines
14 KiB
C++
//
|
|
// Copyright (C) 2013 Greg Landrum and NextMove Software
|
|
//
|
|
// @@ All Rights Reserved @@
|
|
// This file is part of the RDKit.
|
|
// The contents are covered by the terms of the BSD license
|
|
// which is included in the file license.txt, found at the root
|
|
// of the RDKit source tree.
|
|
//
|
|
#include "PDBParser.h"
|
|
#include <string.h>
|
|
#include <stdio.h>
|
|
#include <string>
|
|
#include <iostream>
|
|
#include <fstream>
|
|
#include <boost/lexical_cast.hpp>
|
|
#include <RDGeneral/BadFileException.h>
|
|
#include <RDGeneral/FileParseException.h>
|
|
#include <GraphMol/FileParsers/FileParsers.h>
|
|
#include <GraphMol/FileParsers/FileParserUtils.h>
|
|
|
|
#define HAVE_MONOMERINFO
|
|
#ifdef HAVE_MONOMERINFO
|
|
#include <GraphMol/MonomerInfo.h>
|
|
#endif
|
|
|
|
|
|
namespace RDKit {
|
|
|
|
static Atom *PDBAtomFromSymbol(const char *symb) {
|
|
PRECONDITION(symb,"bad char ptr");
|
|
if (symb[0]=='D' && !symb[1]) {
|
|
Atom *result = new Atom(1);
|
|
result->setIsotope(2);
|
|
return result;
|
|
} else if (symb[0]=='T' && !symb[1]) {
|
|
Atom *result = new Atom(1);
|
|
result->setIsotope(3);
|
|
return result;
|
|
}
|
|
int elemno = PeriodicTable::getTable()->getAtomicNumber(symb);
|
|
return elemno > 0 ? new Atom(elemno) : (Atom*)0;
|
|
}
|
|
|
|
static void PDBAtomLine(RWMol *mol, const char *ptr, unsigned int len,
|
|
unsigned int flavor, std::map<int,Atom*> &amap)
|
|
{
|
|
PRECONDITION(mol,"bad mol");
|
|
PRECONDITION(ptr,"bad char ptr");
|
|
std::string tmp;
|
|
|
|
if (len < 16)
|
|
return;
|
|
|
|
if ((flavor & 1) == 0) {
|
|
// Ignore alternate locations of atoms.
|
|
if (len >= 17 && ptr[16]!=' ' && ptr[16]!='A' && ptr[16]!='1')
|
|
return;
|
|
// Ignore XPLOR pseudo atoms
|
|
if (len >= 54 && !memcmp(ptr+30,"9999.0009999.0009999.000",24))
|
|
return;
|
|
// Ignore NMR pseudo atoms
|
|
if (ptr[12]==' ' && ptr[13]=='Q')
|
|
return;
|
|
// Ignore PDB dummy residues
|
|
if (len >= 20 && !memcmp(ptr+18,"DUM",3))
|
|
return;
|
|
}
|
|
|
|
int serialno;
|
|
tmp = std::string(ptr+6,5);
|
|
try {
|
|
serialno = FileParserUtils::toInt(tmp);
|
|
} catch (boost::bad_lexical_cast &) {
|
|
std::ostringstream errout;
|
|
errout << "Non-integer PDB serial number " << tmp;
|
|
throw FileParseException(errout.str()) ;
|
|
}
|
|
|
|
Atom *atom = (Atom*)0;
|
|
char symb[3];
|
|
|
|
// Attempt #1: Atomic Symbol in columns 76 and 77
|
|
if (len >= 78) {
|
|
if (ptr[76]>='A' && ptr[76]<='Z') {
|
|
symb[0] = ptr[76];
|
|
if (ptr[77]>='A' && ptr[77]<='Z') {
|
|
symb[1] = ptr[77]+32; // tolower
|
|
symb[2] = '\0';
|
|
} else if (ptr[77]>='a' && ptr[77]<='z') {
|
|
symb[1] = ptr[77];
|
|
symb[2] = '\0';
|
|
} else symb[1] = '\0';
|
|
} else if (ptr[76]==' ' && ptr[77]>='A' && ptr[77]<='Z') {
|
|
symb[0] = ptr[77];
|
|
symb[1] = '\0';
|
|
} else symb[0] = '\0';
|
|
} else if (len==77) {
|
|
if (ptr[76]>='A' && ptr[76]<='Z') {
|
|
symb[0] = ptr[76];
|
|
symb[1] = '\0';
|
|
} else symb[0] = '\0';
|
|
} else symb[0] = '\0';
|
|
|
|
if (symb[0])
|
|
atom = PDBAtomFromSymbol(symb);
|
|
|
|
if (!atom) {
|
|
// Attempt #2: Atomic Symbol from PDB atom name
|
|
if (ptr[13]>='A' && ptr[13]<='Z') {
|
|
if (ptr[12]==' ') {
|
|
symb[0] = ptr[13];
|
|
if (ptr[14]>='a' && ptr[14]<='z') {
|
|
symb[1] = ptr[14];
|
|
symb[2] = '\0';
|
|
} else symb[1]='\0';
|
|
} else if (ptr[12]>='A' && ptr[12]<='Z') {
|
|
symb[0] = ptr[12];
|
|
symb[1] = ptr[13]+32; // tolower
|
|
symb[2] = '\0';
|
|
if (ptr[12]=='H' && ptr[0]=='A') {
|
|
// No He, Hf, Hg, Ho or Hs in ATOM records
|
|
symb[0] = 'H';
|
|
symb[1] = '\0';
|
|
}
|
|
} else if (ptr[12]>='0' && ptr[12]<='9') {
|
|
symb[0] = ptr[13];
|
|
symb[1] = '\0';
|
|
} else symb[0] = '\0';
|
|
} else symb[0] = '\0';
|
|
|
|
if (symb[0])
|
|
atom = PDBAtomFromSymbol(symb);
|
|
}
|
|
|
|
if (!atom) {
|
|
std::ostringstream errout;
|
|
errout << "Cannot determine element for PDB atom #" << serialno;
|
|
throw FileParseException(errout.str());
|
|
}
|
|
|
|
mol->addAtom(atom,true,true);
|
|
amap[serialno] = atom;
|
|
|
|
if (len >= 38) {
|
|
RDGeom::Point3D pos;
|
|
try {
|
|
pos.x = FileParserUtils::toDouble(std::string(ptr+30,8));
|
|
if (len >= 46)
|
|
pos.y = FileParserUtils::toDouble(std::string(ptr+38,8));
|
|
if (len >= 54)
|
|
pos.z = FileParserUtils::toDouble(std::string(ptr+46,8));
|
|
}
|
|
catch (boost::bad_lexical_cast &) {
|
|
std::ostringstream errout;
|
|
errout << "Problem with coordinates for PDB atom #" << serialno;
|
|
throw FileParseException(errout.str()) ;
|
|
}
|
|
|
|
Conformer *conf;
|
|
if (!mol->getNumConformers()) {
|
|
conf = new RDKit::Conformer(mol->getNumAtoms());
|
|
conf->set3D(pos.z!=0.0);
|
|
conf->setId(0);
|
|
mol->addConformer(conf,false);
|
|
} else {
|
|
conf = &mol->getConformer();
|
|
if (pos.z != 0.0)
|
|
conf->set3D(true);
|
|
}
|
|
conf->setAtomPos(atom->getIdx(),pos);
|
|
}
|
|
|
|
if (len >= 79) {
|
|
int charge = 0;
|
|
if (ptr[78]>='1' && ptr[78]<='9') {
|
|
if (ptr[79]=='-')
|
|
charge = -(ptr[78]-'0');
|
|
else if (ptr[79]=='+' || ptr[79]==' ' || !ptr[79])
|
|
charge = ptr[78]-'0';
|
|
} else if (ptr[78]=='+') {
|
|
if (ptr[79]>='1' && ptr[79]<='9')
|
|
charge = ptr[79]-'0';
|
|
else if (ptr[79]=='+')
|
|
charge = 2;
|
|
else if (ptr[79]!='0')
|
|
charge = 1;
|
|
} else if (ptr[78]=='-') {
|
|
if (ptr[79]>='1' && ptr[79]<='9')
|
|
charge = ptr[79]-'0';
|
|
else if (ptr[79]=='-')
|
|
charge = -2;
|
|
else if (ptr[79]!='0')
|
|
charge = -1;
|
|
} else if (ptr[78]==' ') {
|
|
if (ptr[79]>='1' && ptr[79]<='9')
|
|
charge = ptr[79]-'0';
|
|
else if (ptr[79]=='+')
|
|
charge = 1;
|
|
else if (ptr[79]=='-')
|
|
charge = -1;
|
|
}
|
|
if (charge != 0)
|
|
atom->setFormalCharge(charge);
|
|
}
|
|
|
|
#ifdef HAVE_MONOMERINFO
|
|
tmp = std::string(ptr+12,4);
|
|
AtomPDBResidueInfo *info = new AtomPDBResidueInfo(tmp);
|
|
atom->setMonomerInfo(info);
|
|
|
|
if (len >= 20)
|
|
tmp = std::string(ptr+17,3);
|
|
else tmp = "UNL";
|
|
info->setResidueName(tmp);
|
|
if (ptr[0]=='H')
|
|
info->setIsHeteroAtom(true);
|
|
if (len >= 17)
|
|
tmp = std::string(ptr+16,1);
|
|
else tmp = " ";
|
|
info->setAltLoc(tmp);
|
|
if (len >= 22)
|
|
tmp = std::string(ptr+21,1);
|
|
else tmp = " ";
|
|
info->setChainId(tmp);
|
|
if (len >= 27)
|
|
tmp = std::string(ptr+26,1);
|
|
else tmp = " ";
|
|
info->setInsertionCode(tmp);
|
|
|
|
int resno = 1;
|
|
if (len >= 26) {
|
|
try {
|
|
resno = FileParserUtils::toInt(std::string(ptr+22,4));
|
|
} catch (boost::bad_lexical_cast &) {
|
|
std::ostringstream errout;
|
|
errout << "Problem with residue number for PDB atom #" << serialno;
|
|
throw FileParseException(errout.str()) ;
|
|
}
|
|
}
|
|
info->setSerialNumber(resno);
|
|
|
|
double occup = 1.0;
|
|
if (len >= 60) {
|
|
try {
|
|
occup = FileParserUtils::toDouble(std::string(ptr+54,6));
|
|
} catch (boost::bad_lexical_cast &) {
|
|
std::ostringstream errout;
|
|
errout << "Problem with occupancy for PDB atom #" << serialno;
|
|
throw FileParseException(errout.str()) ;
|
|
}
|
|
}
|
|
info->setOccupancy(occup);
|
|
|
|
double bfactor = 0.0;
|
|
if (len >= 66) {
|
|
try {
|
|
bfactor = FileParserUtils::toDouble(std::string(ptr+60,6));
|
|
} catch (boost::bad_lexical_cast &) {
|
|
std::ostringstream errout;
|
|
errout << "Problem with temperature factor for PDB atom #" << serialno;
|
|
throw FileParseException(errout.str()) ;
|
|
}
|
|
}
|
|
info->setTempFactor(bfactor);
|
|
#endif
|
|
}
|
|
|
|
static void PDBBondLine(RWMol *mol, const char *ptr, unsigned int len,
|
|
std::map<int,Atom*> &amap, std::map<Bond*,int> &bmap)
|
|
{
|
|
PRECONDITION(mol,"bad mol");
|
|
PRECONDITION(ptr,"bad char ptr");
|
|
|
|
if (len < 16)
|
|
return;
|
|
|
|
std::string tmp(ptr+6,5);
|
|
bool fail = false;
|
|
int src, dst;
|
|
|
|
try {
|
|
src = FileParserUtils::toInt(tmp);
|
|
if (amap.find(src) == amap.end())
|
|
fail = true;
|
|
} catch (boost::bad_lexical_cast &) {
|
|
fail = true;
|
|
}
|
|
|
|
if (!fail) {
|
|
if (len > 41) len = 41;
|
|
for (unsigned int pos=11; pos+5<=len; pos+=5) {
|
|
if (!memcmp(ptr+pos," ",5))
|
|
break;
|
|
try {
|
|
dst = FileParserUtils::toInt(std::string(ptr+pos,5));
|
|
if (dst==src || amap.find(dst) == amap.end())
|
|
fail = true;
|
|
} catch (boost::bad_lexical_cast &) {
|
|
fail = true;
|
|
}
|
|
if (!fail) {
|
|
Bond *bond = mol->getBondBetweenAtoms(amap[src]->getIdx(),
|
|
amap[dst]->getIdx());
|
|
if (bond) {
|
|
// Here we use a single byte bitmap to count duplicates
|
|
// Low nibble counts src < dst, high nibble for src > dst
|
|
int seen = bmap[bond];
|
|
if (src < dst) {
|
|
if ((seen&0x0f) == 0x01) {
|
|
bmap[bond] = seen | 0x02;
|
|
if ((seen & 0x20) == 0)
|
|
bond->setBondType(Bond::DOUBLE);
|
|
} else if ((seen&0x0f) == 0x03) {
|
|
bmap[bond] = seen | 0x04;
|
|
if ((seen & 0x40) == 0)
|
|
bond->setBondType(Bond::TRIPLE);
|
|
} else if ((seen&0x0f) == 0x07) {
|
|
bmap[bond] = seen | 0x08;
|
|
if ((seen & 0x80) == 0)
|
|
bond->setBondType(Bond::QUADRUPLE);
|
|
}
|
|
} else /* src < dst */ {
|
|
if ((seen&0xf0) == 0x10) {
|
|
bmap[bond] = seen | 0x20;
|
|
if ((seen & 0x02) == 0)
|
|
bond->setBondType(Bond::DOUBLE);
|
|
} else if ((seen&0xf0) == 0x30) {
|
|
bmap[bond] = seen | 0x40;
|
|
if ((seen & 0x04) == 0)
|
|
bond->setBondType(Bond::TRIPLE);
|
|
} else if ((seen&0xf0) == 0x70) {
|
|
bmap[bond] = seen | 0x80;
|
|
if ((seen & 0x08) == 0)
|
|
bond->setBondType(Bond::QUADRUPLE);
|
|
}
|
|
}
|
|
} else {
|
|
bond = new Bond(Bond::SINGLE);
|
|
bond->setOwningMol(mol);
|
|
bond->setBeginAtom(amap[src]);
|
|
bond->setEndAtom(amap[dst]);
|
|
mol->addBond(bond,true);
|
|
bmap[bond] = (src < dst) ? 0x01 : 0x10;
|
|
}
|
|
} else break;
|
|
}
|
|
}
|
|
|
|
if (fail) {
|
|
std::ostringstream errout;
|
|
errout << "Problem with CONECT record for PDB atom #" << tmp;
|
|
throw FileParseException(errout.str()) ;
|
|
}
|
|
}
|
|
|
|
static void PDBTitleLine(RWMol *mol, const char *ptr, unsigned int len)
|
|
{
|
|
PRECONDITION(mol,"bad mol");
|
|
PRECONDITION(ptr,"bad char ptr");
|
|
std::string title;
|
|
while (ptr[len-1] == ' ')
|
|
len--;
|
|
if (ptr[len-1]==';')
|
|
len--;
|
|
if (len > 21 && !strncmp(ptr+10," MOLECULE: ",11))
|
|
title = std::string(ptr+21,len-21);
|
|
else if (len > 10)
|
|
title = std::string(ptr+10,len-10);
|
|
if (!title.empty())
|
|
mol->setProp("_Name",title);
|
|
}
|
|
|
|
RWMol *PDBBlockToMol(const char *str, bool sanitize,
|
|
bool removeHs, unsigned int flavor)
|
|
{
|
|
PRECONDITION(str,"bad char ptr");
|
|
std::map<int,Atom*> amap;
|
|
std::map<Bond*,int> bmap;
|
|
RWMol *mol = 0;
|
|
|
|
while (*str) {
|
|
unsigned int len;
|
|
const char *next = str+1;
|
|
for(;;) {
|
|
if (*next == '\r') {
|
|
len = (unsigned int)(next-str);
|
|
if (next[1]=='\n') {
|
|
next += 2;
|
|
} else next++;
|
|
break;
|
|
} else if(*next == '\n') {
|
|
len = (unsigned int)(next-str);
|
|
next++;
|
|
break;
|
|
} else if(*next == '\0') {
|
|
len = (unsigned int)(next-str);
|
|
break;
|
|
}
|
|
next++;
|
|
}
|
|
|
|
// ATOM records
|
|
if (str[0]=='A' && str[1]=='T' && str[2]=='O' &&
|
|
str[3]=='M' && str[4]==' ' && str[5]==' ') {
|
|
if (!mol) mol = new RWMol();
|
|
PDBAtomLine(mol,str,len,flavor,amap);
|
|
|
|
// HETATM records
|
|
} else if (str[0]=='H' && str[1]=='E' && str[2]=='T' &&
|
|
str[3]=='A' && str[4]=='T' && str[5]=='M') {
|
|
if (!mol) mol = new RWMol();
|
|
PDBAtomLine(mol,str,len,flavor,amap);
|
|
// CONECT records
|
|
} else if (str[0]=='C' && str[1]=='O' && str[2]=='N' &&
|
|
str[3]=='E' && str[4]=='C' && str[5]=='T') {
|
|
if (mol)
|
|
PDBBondLine(mol,str,len,amap,bmap);
|
|
// COMPND records
|
|
} else if (str[0]=='C' && str[1]=='O' && str[2]=='M' &&
|
|
str[3]=='P' && str[4]=='N' && str[5]=='D') {
|
|
if (!mol) mol = new RWMol();
|
|
if (len > 10 && (str[9]==' '||!strncmp(str+9,"2 MOLECULE: ",12)))
|
|
PDBTitleLine(mol,str,len);
|
|
// HEADER records
|
|
} else if (str[0]=='H' && str[1]=='E' && str[2]=='A' &&
|
|
str[3]=='D' && str[4]=='E' && str[5]=='R') {
|
|
if (!mol) mol = new RWMol();
|
|
PDBTitleLine(mol,str,len<50?len:50);
|
|
}
|
|
str = next;
|
|
}
|
|
|
|
if (!mol)
|
|
return (RWMol*)0;
|
|
|
|
for (std::map<int,Atom*>::iterator mi=amap.begin(); mi!=amap.end(); ++mi)
|
|
(*mi).second->calcExplicitValence(false);
|
|
|
|
if (sanitize) {
|
|
if (removeHs) {
|
|
ROMol *tmp = MolOps::removeHs(*mol,false,false);
|
|
if (tmp) {
|
|
delete mol;
|
|
mol = static_cast<RWMol *>(tmp);
|
|
}
|
|
} else {
|
|
MolOps::sanitizeMol(*mol);
|
|
}
|
|
}
|
|
return mol;
|
|
}
|
|
|
|
RWMol *PDBBlockToMol(const std::string &str, bool sanitize,
|
|
bool removeHs, unsigned int flavor)
|
|
{
|
|
return PDBBlockToMol(str.c_str(),sanitize,removeHs,flavor);
|
|
}
|
|
|
|
RWMol *PDBDataStreamToMol(std::istream *inStream, bool sanitize,
|
|
bool removeHs, unsigned int flavor)
|
|
{
|
|
PRECONDITION(inStream,"bad stream");
|
|
std::string buffer;
|
|
const char *ptr;
|
|
while (!inStream->eof()) {
|
|
std::string line;
|
|
std::getline(*inStream,line);
|
|
buffer += line;
|
|
buffer += '\n';
|
|
ptr = line.c_str();
|
|
if (ptr[0]=='E' && ptr[1]=='N' && ptr[2]=='D' &&
|
|
(ptr[3]==' ' || ptr[3]=='\r' || ptr[3]=='\n' || !ptr[3]))
|
|
break;
|
|
}
|
|
return PDBBlockToMol(buffer.c_str(),sanitize,removeHs,flavor);
|
|
}
|
|
|
|
RWMol *PDBFileToMol(const std::string &fileName, bool sanitize,
|
|
bool removeHs, unsigned int flavor)
|
|
{
|
|
std::ifstream ifs(fileName.c_str(),std::ios_base::binary);
|
|
if (!ifs || ifs.bad()) {
|
|
std::ostringstream errout;
|
|
errout << "Bad input file " << fileName;
|
|
throw BadFileException(errout.str());
|
|
}
|
|
return PDBDataStreamToMol(static_cast<std::istream*>(&ifs),sanitize,removeHs,flavor);
|
|
}
|
|
}
|
|
|