Files
libcifpp/src/pdb/pdb2cif.cpp
2025-10-21 13:06:09 +02:00

6472 lines
182 KiB
C++
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
/*-
* SPDX-License-Identifier: BSD-2-Clause
*
* Copyright (c) 2020 NKI/AVL, Netherlands Cancer Institute
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "pdb2cif_remark_3.hpp"
#include "cif++.hpp"
#include <iomanip>
#include <map>
#include <set>
#include <stack>
#include <stdexcept>
using cif::category;
using cif::datablock;
using cif::iequals;
using cif::key;
using cif::to_lower;
using cif::to_lower_copy;
// --------------------------------------------------------------------
// attempt to come up with better error handling
namespace error
{
enum pdbErrors
{
residueNotFound = 1000,
invalidDate
};
namespace detail
{
class pdbCategory : public std::error_category
{
public:
const char *name() const noexcept
{
return "pdb";
}
std::string message(int value) const
{
switch (value)
{
case residueNotFound:
return "Residue not found";
case invalidDate:
return "Invalid date";
default:
return "Error in PDB format";
}
}
};
} // namespace detail
std::error_category &pdbCategory()
{
static detail::pdbCategory impl;
return impl;
}
inline std::error_code make_error_code(pdbErrors e)
{
return std::error_code(static_cast<int>(e), pdbCategory());
}
} // namespace error
namespace std
{
template <>
struct is_error_code_enum<error::pdbErrors>
{
static const bool value = true;
};
} // namespace std
namespace cif::pdb
{
// --------------------------------------------------------------------
const std::map<std::string, int> kMonths{
{ "JAN", 1 },
{ "FEB", 2 },
{ "MAR", 3 },
{ "APR", 4 },
{ "MAY", 5 },
{ "JUN", 6 },
{ "JUL", 7 },
{ "AUG", 8 },
{ "SEP", 9 },
{ "OCT", 10 },
{ "NOV", 11 },
{ "DEC", 12 },
};
const std::set<std::string> kSupportedRecords{
"HEADER", "OBSLTE", "TITLE ", "SPLIT ", "CAVEAT", "COMPND", "SOURCE",
"KEYWDS", "EXPDTA", "NUMMDL", "MDLTYP", "AUTHOR", "REVDAT", "SPRSDE",
"JRNL ", "REMARK", "DBREF ", "DBREF1", "DBREF2", "SEQADV", "SEQRES",
"MODRES", "HET ", "HETNAM", "HETSYN", "FORMUL", "HELIX ", "SHEET ",
"SSBOND", "LINK ", "CISPEP", "SITE ", "CRYST1", "ORIGX1", "SCALE1",
"MTRIX1", "ORIGX2", "SCALE2", "MTRIX2", "ORIGX3", "SCALE3", "MTRIX3",
"MODEL ", "ATOM ", "ANISOU", "TER ", "HETATM", "ENDMDL", "CONECT",
"MASTER", "END ",
// bah...
"LINKR "
};
bool isWater(const std::string &resname)
{
return resname == "HOH" or resname == "H2O" or resname == "OH2" or resname == "WAT" or resname == "DOD" or resname == "WAT";
}
// --------------------------------------------------------------------
// Unfortunately, parsing a PDB file requires several passes over the
// data. Therefore we first obtain all records where a record has the
// value flattened out for continuation.
PDBRecord::PDBRecord(uint32_t lineNr, const std::string &name, const std::string &value)
: mNext(nullptr)
, mLineNr(lineNr)
, mVlen(value.length())
{
assert(name.length() <= 10);
strcpy(mName, name.c_str());
strcpy(mValue, value.c_str());
}
PDBRecord::~PDBRecord()
{
}
void *PDBRecord::operator new(std::size_t size, std::size_t vLen)
{
return malloc(size + vLen + 1);
}
void PDBRecord::operator delete(void *p)
{
free(p);
}
void PDBRecord::operator delete(void *p, std::size_t vLen)
{
free(p);
}
bool PDBRecord::is(const char *name) const
{
return iequals(mName, name);
}
char PDBRecord::vC(std::size_t column)
{
char result = ' ';
if (column - 7 < mVlen)
result = mValue[column - 7];
return result;
}
std::string PDBRecord::vS(std::size_t columnFirst, std::size_t columnLast)
{
std::string result;
if (columnLast > mVlen + 6)
columnLast = mVlen + 6;
if (columnFirst < mVlen + 7)
{
result = std::string{ mValue + columnFirst - 7, mValue + columnLast - 7 + 1 };
cif::trim(result);
}
return result;
}
int PDBRecord::vI(int columnFirst, int columnLast)
{
int result = 0;
const char *e = mValue + mVlen;
if (e > mValue + columnLast - 7 + 1)
e = mValue + columnLast - 7 + 1;
enum
{
start,
digit,
tail
} state = start;
bool negate = false;
try
{
for (const char *p = mValue + columnFirst - 7; p < e; ++p)
{
switch (state)
{
case start:
if (*p == '+')
state = digit;
else if (*p == '-')
{
negate = true;
state = digit;
}
else if (isdigit(*p))
{
result = *p - '0';
state = digit;
}
else if (not isspace(*p))
throw std::runtime_error("Not a valid integer in PDB record");
break;
case digit:
if (isspace(*p))
state = tail;
else if (not isdigit(*p))
throw std::runtime_error("Not a valid integer in PDB record");
else
result = result * 10 + *p - '0';
break;
case tail:
if (not isspace(*p))
throw std::runtime_error("Not a valid integer in PDB record");
break;
}
}
}
catch (const std::exception &ex)
{
if (cif::VERBOSE >= 0)
std::cerr << "Trying to parse '" << std::string(mValue + columnFirst - 7, mValue + columnLast - 7) << '\'' << '\n';
throw;
}
if (negate)
result = -result;
return result;
}
std::string PDBRecord::vF(std::size_t columnFirst, std::size_t columnLast)
{
// for now... TODO: check format?
return vS(columnFirst, columnLast);
}
// --------------------------------------------------------------------
class SpecificationListParser
{
public:
SpecificationListParser(const std::string &text)
: mText(text)
, mP(mText.begin())
{
}
std::tuple<std::string, std::string> GetNextSpecification();
private:
std::string mText;
std::string::iterator mP;
};
std::tuple<std::string, std::string> SpecificationListParser::GetNextSpecification()
{
std::string id, value;
std::string::iterator start = mP, backup;
enum
{
eStart,
eID,
eColon,
eValue,
eNL,
eNL_ID,
eSemiColon,
eError,
eDone
} state = eStart;
while (mP != mText.end() and state != eDone)
{
char ch = *mP++;
switch (state)
{
case eStart:
if (isalnum(ch) or ch == '_')
{
id = { ch };
value.clear();
state = eID;
start = mP;
}
else if (not isspace(ch))
{
if (cif::VERBOSE > 0)
std::cerr << "skipping invalid character in SOURCE ID: " << ch << '\n';
}
break;
case eID:
if (isalnum(ch) or ch == '_')
id += ch;
else if (ch == ':')
state = eColon;
else
state = eError;
break;
case eColon:
if (ch == ';')
{
if (cif::VERBOSE > 0)
std::cerr << "Empty value for SOURCE: " << id << '\n';
state = eStart;
}
else if (not isspace(ch))
{
value = { ch };
state = eValue;
}
break;
case eValue:
if (ch == '\n')
{
backup = mP;
state = eNL;
}
else if (ch == ';')
{
backup = mP;
state = eSemiColon;
}
else
value += ch;
break;
case eSemiColon:
if (ch == '\n')
state = eDone;
else if (ch != ' ')
{
value.insert(value.end(), backup, mP);
state = eValue;
}
break;
case eNL:
if (isalnum(ch))
{
value += ' ';
state = eNL_ID;
}
else if (isspace(ch))
state = eValue;
break;
case eNL_ID:
if (ch == ':')
{
mP = backup;
state = eDone;
}
else if (ch == ';')
state = eSemiColon;
else if (not(isalnum(ch) or ch == '_'))
{
value.insert(value.end(), backup, mP);
state = eValue;
}
break;
case eError:
if (ch == ';')
{
if (cif::VERBOSE > 0)
std::cerr << "Skipping invalid header line: '" << std::string(start, mP) << '\n';
state = eStart;
}
break;
case eDone: break; // keep compiler happy
}
}
cif::trim(value);
return std::make_tuple(id, value);
}
// --------------------------------------------------------------------
class PDBFileParser
{
public:
PDBFileParser()
: mData(nullptr)
, mRec(nullptr)
{
}
~PDBFileParser()
{
PDBRecord *r = mData;
while (r != nullptr)
{
PDBRecord *d = r;
r = d->mNext;
delete d;
}
}
void Parse(std::istream &is, cif::file &result);
private:
// ----------------------------------------------------------------
struct DBREF
{
std::string PDBIDCode;
char chainID;
int seqBegin;
char insertBegin = ' ';
int seqEnd;
char insertEnd = ' ';
std::string database;
std::string dbAccession;
std::string dbIdCode;
int dbSeqBegin;
char dbinsBeg;
int dbSeqEnd;
char dbinsEnd;
};
struct HET
{
std::string hetID;
char chainID;
int seqNum;
char iCode;
int numHetAtoms = 0;
std::string text;
std::string asymID;
std::vector<PDBRecord *> atoms;
bool processed = false;
bool branch = false;
PDBRecord *asn = nullptr;
HET(const std::string &hetID, char chainID, int seqNum, char iCode, int numHetAtoms = 0, const std::string &text = {})
: hetID(hetID)
, chainID(chainID)
, seqNum(seqNum)
, iCode(iCode)
, numHetAtoms(numHetAtoms)
, text(text)
{
}
};
struct UNOBS
{
int modelNr;
std::string res;
char chain;
int seq;
char iCode;
std::vector<std::string> atoms;
};
struct ATOM_REF
{
std::string name;
std::string resName;
int resSeq;
char chainID;
char iCode;
char altLoc;
bool operator==(const ATOM_REF &rhs) const
{
return name == rhs.name and
resName == rhs.resName and
resSeq == rhs.resSeq and
(altLoc == rhs.altLoc or altLoc == ' ' or rhs.altLoc == ' ') and
chainID == rhs.chainID and
iCode == rhs.iCode;
}
bool operator!=(const ATOM_REF &rhs) const
{
return not operator==(rhs);
}
bool operator<(const ATOM_REF &rhs) const
{
int d = chainID - rhs.chainID;
if (d == 0)
d = resSeq - rhs.resSeq;
if (d == 0)
d = iCode - rhs.iCode;
// if (d == 0) d = resName.compare(rhs.resName);
if (d == 0)
d = name.compare(rhs.name);
if (d == 0 and altLoc != ' ' and rhs.altLoc != ' ')
d = altLoc - rhs.altLoc;
return d < 0;
}
friend std::ostream &operator<<(std::ostream &os, const ATOM_REF &a)
{
os << a.name << ' ' << a.resName << ' ' << a.chainID << ' ' << a.resSeq << (a.iCode == ' ' ? "" : std::string{ a.iCode }) << (a.altLoc != ' ' ? std::string{ ' ', a.altLoc } : "");
return os;
}
};
struct LINK
{
ATOM_REF a, b;
std::string symOpA, symOpB;
float distance;
};
struct SUGAR
{
ATOM_REF c1;
int leaving_o;
ATOM_REF next;
};
class SUGAR_TREE : public std::vector<SUGAR>
{
public:
std::string entityName() const
{
return empty() ? "" : entityName(begin());
}
private:
std::string entityName(const_iterator sugar) const
{
std::string result;
for (auto i = begin(); i != end(); ++i)
{
if (i->next != sugar->c1)
continue;
auto n = entityName(i) + "-(1-" + std::to_string(i->leaving_o) + ")";
if (result.empty())
result = n;
else
result += "-[" + n + ']';
}
if (not result.empty() and result.back() != ']')
result += '-';
auto compound = cif::compound_factory::instance().create(sugar->c1.resName);
if (compound)
result += compound->name();
else if (sugar->c1.resName == "MAN")
result += "alpha-D-mannopyranose";
else if (sugar->c1.resName == "BMA")
result += "beta-D-mannopyranose";
else if (sugar->c1.resName == "NAG")
result += "2-acetamido-2-deoxy-beta-D-glucopyranose";
else if (sugar->c1.resName == "NDG")
result += "2-acetamido-2-deoxy-alpha-D-glucopyranose";
else if (sugar->c1.resName == "FUC")
result += "alpha-L-fucopyranose";
else if (sugar->c1.resName == "FUL")
result += "beta-L-fucopyranose";
else
result += sugar->c1.resName;
return result;
}
};
// ----------------------------------------------------------------
/*
To get from PDB chains to CIF entity and poly records we take the following steps:
First check if there is a Primary Structure Section. If there is, it should contain
a valid DBREF/SEQRES pair that allows the reconstruction of numbering of residues.
If that fails, we fall back to:
1. Collect the chains from the PDB file.
2. For each chain, split out the residues and waters, assign those to new entities
3. If there are multiple chains containing residues, align those to find unique polymers
4. Annotate the entity records with available information in the PDB file (COMPND e.g.)
5. Create the mapping structures from PDB numbering to CIF numbering.
*/
struct PDBCompound
{
int mMolID;
std::string mTitle;
std::set<char> mChains;
std::map<std::string, std::string> mInfo;
std::map<std::string, std::string> mSource;
int mCount = 0;
};
struct PDBSeqRes
{
std::string mMonID;
int mSeqNum;
char mIcode;
int mDbSeqNum = 0;
bool mSeen = false;
std::set<std::string> mAlts;
bool operator==(const PDBSeqRes &rhs) const
{
return mSeqNum == rhs.mSeqNum and mMonID == rhs.mMonID and mIcode == rhs.mIcode;
}
};
struct PDBChain
{
PDBChain(const std::string &structureID, char chainID, int molID)
: mDbref{ structureID, chainID }
, mWaters(0)
, mTerIndex(0)
, mMolID(molID)
, mNextSeqNum(1)
, mNextDbSeqNum(1)
{
}
DBREF mDbref;
std::vector<PDBSeqRes> mSeqres, mHet;
int mWaters;
int mTerIndex;
int mMolID;
// scratch values for reading SEQRES records
int mNextSeqNum;
int mNextDbSeqNum;
// scratch value for aligning
struct AtomRes
{
std::string mMonID;
int mSeqNum;
char mIcode;
bool operator==(const AtomRes &rhs) const { return mSeqNum == rhs.mSeqNum and mIcode == rhs.mIcode; }
bool operator!=(const AtomRes &rhs) const { return mSeqNum != rhs.mSeqNum or mIcode != rhs.mIcode; }
};
std::vector<AtomRes> mResiduesSeen;
int AlignResToSeqRes();
bool SameSequence(const PDBChain &rhs) const;
};
// ----------------------------------------------------------------
PDBCompound &GetOrCreateCompound(int molID)
{
auto i = std::find_if(mCompounds.begin(), mCompounds.end(), [molID](PDBCompound &comp) -> bool
{ return comp.mMolID == molID; });
if (i == mCompounds.end())
{
mCompounds.push_back(PDBCompound{ molID });
mMolID2EntityID[molID] = std::to_string(mNextEntityNr++);
i = prev(mCompounds.end());
}
return *i;
}
// locate the PDBChain record for a chain ID, or create it with dummy data if missing
PDBChain &GetChainForID(char chainID, int numRes = 0)
{
auto i = std::find_if(mChains.begin(), mChains.end(), [chainID](PDBChain &ch) -> bool
{ return ch.mDbref.chainID == chainID; });
if (i == mChains.end())
{
// locate the compound for this chain, if any (does that happen?)
int molID = 0;
for (auto &cmp : mCompounds)
{
if (cmp.mChains.count(chainID) > 0)
{
molID = cmp.mMolID;
break;
}
}
mChains.emplace_back(mStructureID, chainID, molID);
i = prev(mChains.end());
}
return *i;
};
void InsertChemComp(const std::string &chemComp)
{
if (find(mChemComp.begin(), mChemComp.end(), chemComp) == mChemComp.end())
mChemComp.push_back(chemComp);
}
void InsertAtomType(const std::string &atomType)
{
if (find(mAtomTypes.begin(), mAtomTypes.end(), atomType) == mAtomTypes.end())
mAtomTypes.push_back(atomType);
}
// ----------------------------------------------------------------
template <typename Predicate>
PDBRecord *FindRecord(Predicate &&pred)
{
PDBRecord *result;
for (result = mData; result != nullptr; result = result->mNext)
{
if (pred(*result))
break;
}
return result;
}
PDBRecord *FindRecord(const char *name)
{
return FindRecord([name](PDBRecord &rec) -> bool
{ return rec.is(name); });
}
// ----------------------------------------------------------------
char vC(std::size_t column) const
{
return mRec->vC(column);
}
std::string vS(std::size_t columnFirst, std::size_t columnLast = std::numeric_limits<std::size_t>::max()) const
{
return mRec->vS(columnFirst, columnLast);
}
std::string vF(std::size_t columnFirst, std::size_t columnLast) const
{
return mRec->vF(columnFirst, columnLast);
}
int vI(int columnFirst, int columnLast) const
{
return mRec->vI(columnFirst, columnLast);
}
// ----------------------------------------------------------------
// Map a PDB residue location to a seqnum in a struct_asym
std::tuple<std::string, int, bool> MapResidue(char chainID, int resSeq, char iCode) const
{
auto key = std::make_tuple(chainID, resSeq, iCode);
try
{
return mChainSeq2AsymSeq.at(key);
}
catch (const std::exception &ex)
{
throw_with_nested(std::runtime_error(std::string("Residue ") + chainID + std::to_string(resSeq) + iCode + " could not be mapped"));
}
}
std::tuple<std::string, int, bool> MapResidue(char chainID, int resSeq, char iCode, std::error_code &ec) const
{
auto key = std::make_tuple(chainID, resSeq, iCode);
std::tuple<std::string, int, bool> result;
if (not mChainSeq2AsymSeq.count(key))
{
ec = error::make_error_code(error::pdbErrors::residueNotFound);
if (cif::VERBOSE > 0)
std::cerr << "Residue " << chainID << resSeq << iCode << " could not be mapped\n";
}
else
result = mChainSeq2AsymSeq.at(key);
return result;
}
// ----------------------------------------------------------------
void PreParseInput(std::istream &is);
void GetNextRecord();
void Match(const std::string &expected, bool throwIfMissing);
void ParseTitle();
void ParseCitation(const std::string &id);
void ParseRemarks();
// void ParseRemark3();
// std::size_t ParseRemark3(const std::string& program, const Remark3Template templ[], std::size_t N);
// std::string NextRemark3Line();
void ParseRemark200();
void ParseRemark350();
void ParsePrimaryStructure();
void ParseHeterogen();
void ConstructEntities();
void ConstructSugarTrees(int &asymNr);
void ParseSecondaryStructure();
void ParseConnectivtyAnnotation();
void ParseMiscellaneousFeatures();
void ParseCrystallographic();
void ParseCoordinateTransformation();
void ParseCoordinate(int modelNr);
void ParseConnectivty();
void ParseBookkeeping();
// ----------------------------------------------------------------
category *getCategory(std::string name)
{
return &mDatablock[name];
}
std::vector<std::string> SplitCSV(const std::string &value);
std::string pdb2cifDate(std::string s, std::error_code &ec)
{
std::smatch m;
const std::regex
rx1(R"((\d{2})-(JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC)-(\d{2}))"),
rx2(R"((JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC)-(\d{2}))");
try
{
if (regex_match(s, m, rx1))
{
int day = stoi(m[1].str());
auto mi = kMonths.find(m[2].str());
if (mi == kMonths.end())
throw std::runtime_error("Invalid month: '" + m[2].str() + '\'');
int month = mi->second;
int year = 1900 + stoi(m[3].str());
if (year < 1950)
year += 100;
s = cif::format("{:04}-{:02}-{:02}", year, month, day);
}
else if (regex_match(s, m, rx2))
{
auto mi = kMonths.find(m[1].str());
if (mi == kMonths.end())
throw std::runtime_error("Invalid month: '" + m[1].str() + '\'');
int month = mi->second;
int year = 1900 + stoi(m[2].str());
if (year < 1950)
year += 100;
s = cif::format("{:04}-{:02}", year, month);
}
else
ec = error::make_error_code(error::pdbErrors::invalidDate);
}
catch (const std::exception &ex)
{
if (cif::VERBOSE > 0)
std::cerr << ex.what() << '\n';
ec = error::make_error_code(error::pdbErrors::invalidDate);
}
return s;
}
std::string pdb2cifDate(std::string s)
{
std::error_code ec;
auto result = pdb2cifDate(s, ec);
if (ec and cif::VERBOSE > 0)
std::cerr << "Invalid date(" << s << "): " << ec.message() << '\n';
return result;
}
std::string pdb2cifAuth(std::string author)
{
cif::trim(author);
const std::regex rx(R"(((?:[A-Z]+\.)+)(.+))");
std::smatch m;
if (regex_match(author, m, rx))
author = m[2].str() + ", " + m[1].str();
bool upper = true;
for (auto &c : author)
{
if (ispunct(c) or isspace(c))
upper = true;
else if (upper)
upper = false;
else
c = cif::tolower(c);
}
return author;
}
std::string pdb2cifSymmetry(std::string s)
{
static const std::regex sgRx(R"((\d{1,3})(\d{3}))");
if (not s.empty())
{
std::smatch m;
if (not std::regex_match(s, m, sgRx))
throw std::runtime_error("invalid symmetry value '" + s + '\'');
s = m[1].str() + "_" + m[2].str();
}
return s;
}
std::string pdb2cifCharge(std::string c)
{
std::regex rx(R"((\d+)(\+|-))");
std::smatch m;
if (std::regex_match(c, m, rx))
{
if (m[2].str() == "-")
c = '-' + m[1].str();
else
c = m[1].str();
}
return c;
}
std::vector<char> altLocsForAtom(char chainID, int seqNum, char iCode, std::string atomName);
void MapChainID2AsymIDS(char chainID, std::vector<std::string> &asymIds);
std::tuple<ATOM_REF, bool> FindLink(const std::string &name1, const std::string &resName1, int resSeq1, char altLoc1, char chainID1, char iCode1,
const std::string &name2, const std::string &resName2 = "")
{
return FindLink(ATOM_REF{ name1, resName1, resSeq1, altLoc1, chainID1, iCode1 }, name2, resName2);
}
std::tuple<ATOM_REF, bool> FindLink(const ATOM_REF &atom, const std::string &name2, const std::string &resName2 = "") const
{
auto i = std::find_if(mLinks.begin(), mLinks.end(), [&](const LINK &link)
{ return (link.a == atom and link.b.name == name2 and (resName2.empty() or link.b.resName == resName2)) or
(link.b == atom and link.a.name == name2 and (resName2.empty() or link.a.resName == resName2)); });
if (i != mLinks.end())
return { i->a == atom ? i->b : i->a, true };
return {};
}
// ----------------------------------------------------------------
PDBRecord *mData;
PDBRecord *mRec;
cif::datablock mDatablock;
std::string mStructureID;
std::string mModelTypeDetails;
std::string mOriginalDate;
std::string mExpMethod = "X-RAY DIFFRACTION";
int mCitationAuthorNr = 1, mCitationEditorNr = 1;
int mNextMolID = 1, mNextEntityNr = 1;
int mNextSoftwareOrd = 1;
struct SEQADV
{
std::string resName;
char chainID;
int seqNum;
char iCode;
std::string database;
std::string dbAccession;
std::string dbRes;
int dbSeq;
std::string conflict;
};
std::vector<SEQADV> mSeqadvs;
std::list<PDBCompound> mCompounds;
std::list<PDBChain> mChains;
std::vector<HET> mHets;
std::map<std::string, std::string> mHetnams;
std::map<std::string, std::string> mHetsyns;
std::map<std::string, std::string> mFormuls;
std::string mWaterHetID;
std::vector<std::string> mChemComp, mAtomTypes;
std::map<std::string, std::string> mRemark200;
std::string mRefinementSoftware;
int mAtomID = 0;
int mPdbxDifOrdinal = 0;
std::vector<UNOBS> mUnobs;
std::vector<LINK> mLinks;
// various maps between numbering schemes
std::map<std::tuple<char, int, char>, std::tuple<std::string, int, bool>> mChainSeq2AsymSeq;
std::map<int, std::string> mMolID2EntityID;
std::map<std::string, std::string> mHet2EntityID;
std::map<std::string, std::string> mBranch2EntityID;
std::map<std::string, std::string> mAsymID2EntityID;
std::map<std::string, std::string> mMod2parent;
std::set<std::string> mSugarEntities;
};
// --------------------------------------------------------------------
std::vector<char> PDBFileParser::altLocsForAtom(char inChainID, int inResSeq, char inICode, std::string inAtomName)
{
// well, maybe this could be optimized...
std::set<char> result;
for (auto r = mData; r != nullptr; r = r->mNext)
{
if (r->is("ATOM ") or r->is("HETATM")) // 1 - 6 Record name "ATOM "
{ // ...
std::string name = r->vS(13, 16); // 13 - 16 Atom name Atom name.
char altLoc = r->vC(17); // 17 Character altLoc Alternate location indicator.
char chainID = r->vC(22); // 22 Character chainID Chain identifier.
int resSeq = r->vI(23, 26); // 23 - 26 Integer resSeq Residue sequence number.
char iCode = r->vC(27); // 27 AChar iCode Code for insertion of residues.
if (chainID == inChainID and resSeq == inResSeq and iCode == inICode and name == inAtomName and altLoc != ' ')
result.insert(altLoc);
}
}
return { result.begin(), result.end() };
}
void PDBFileParser::MapChainID2AsymIDS(char chainID, std::vector<std::string> &asymIds)
{
for (const auto &[key, value] : mChainSeq2AsymSeq)
{
if (std::get<0>(key) == chainID)
asymIds.push_back(std::get<0>(value));
}
std::sort(asymIds.begin(), asymIds.end(), [](const std::string &a, const std::string &b)
{
int d = static_cast<int>(a.length() - b.length());
if (d == 0)
d = a.compare(b);
return d < 0; });
asymIds.erase(std::unique(asymIds.begin(), asymIds.end()), asymIds.end());
}
// --------------------------------------------------------------------
void PDBFileParser::PreParseInput(std::istream &is)
{
std::string lookahead;
uint32_t lineNr = 1;
getline(is, lookahead);
if (lookahead.back() == '\r')
lookahead.pop_back();
auto contNr = [&lookahead](int offset, int len) -> int
{
std::string cs = lookahead.substr(offset, len);
cif::trim(cs);
int result = 0;
if (not cs.empty())
{
auto r = std::from_chars(cs.data(), cs.data() + cs.length(), result);
if ((bool)r.ec)
throw std::runtime_error("Continuation std::string '" + cs + "' is not valid");
}
return result;
};
PDBRecord *last = nullptr;
std::set<std::string> dropped;
for (;;)
{
if (lookahead.empty())
{
if (is.eof())
break;
if (cif::VERBOSE > 0)
std::cerr << "Line number " << lineNr << " is empty!\n";
getline(is, lookahead);
++lineNr;
continue;
}
std::string type = lookahead.substr(0, 6);
std::string value;
if (lookahead.length() > 6)
value = cif::trim_right_copy(lookahead.substr(6));
lookahead.clear();
uint32_t curLineNr = lineNr;
getline(is, lookahead);
++lineNr;
if (kSupportedRecords.count(type) == 0)
{
cif::trim(type);
if (type != "END") // special case
dropped.insert(type);
lookahead.clear();
continue;
}
// see if we need to append continuation values
if (type == "AUTHOR" or
type == "EXPDTA" or
type == "MDLTYP" or
type == "KEYWDS" or
type == "SPLIT " or
type == "SPRSDE" or
type == "TITLE ")
{
int n = 2;
while (lookahead.substr(0, 6) == type and contNr(7, 3) == n)
{
value += cif::trim_right_copy(lookahead.substr(10));
getline(is, lookahead);
++lineNr;
++n;
}
}
else if (type == "COMPND")
{
int n = 2;
value += '\n';
while (lookahead.substr(0, 6) == type and contNr(7, 3) == n)
{
value += cif::trim_right_copy(lookahead.substr(10));
value += '\n';
getline(is, lookahead);
++lineNr;
++n;
}
}
else if (type == "REVDAT")
{
int revNr = stoi(value.substr(1, 3));
int n = 2;
while (lookahead.substr(0, 6) == type and
stoi(lookahead.substr(7, 3)) == revNr and
contNr(10, 2) == n)
{
value += lookahead.substr(38);
getline(is, lookahead);
++lineNr;
++n;
}
}
else if (type == "CAVEAT")
{
int n = 2;
while (lookahead.substr(0, 6) == type and contNr(7, 3) == n)
{
value += cif::trim_right_copy(lookahead.substr(13));
getline(is, lookahead);
++lineNr;
++n;
}
}
else if (type == "OBSLTE")
{
while (lookahead.substr(0, 6) == type)
{
value += lookahead.substr(31);
getline(is, lookahead);
++lineNr;
}
}
else if (type == "SOURCE")
{
value += '\n';
int n = 2;
while (lookahead.substr(0, 6) == type and contNr(7, 3) == n)
{
value += cif::trim_copy(lookahead.substr(10));
value += '\n';
getline(is, lookahead);
++lineNr;
++n;
}
}
else if (type == "FORMUL")
{
try
{
int compNr;
try
{
compNr = stoi(value.substr(1, 3));
}
catch (const std::exception &ex)
{
if (cif::VERBOSE >= 0)
std::cerr << "Dropping FORMUL line (" << (lineNr - 1) << ") with invalid component number '" << value.substr(1, 3) << '\'' << '\n';
continue;
// throw_with_nested(std::runtime_error("Invalid component number '" + value.substr(1, 3) + '\''));
}
int n = 2;
try
{
while (lookahead.substr(0, 6) == type and
stoi(lookahead.substr(7, 3)) == compNr and
contNr(16, 2) == n)
{
value += cif::trim_right_copy(lookahead.substr(19));
;
getline(is, lookahead);
++lineNr;
++n;
}
}
catch (const std::invalid_argument &ex)
{
continue;
// throw_with_nested(std::runtime_error("Invalid component number '" + lookahead.substr(7, 3) + '\''));
}
}
catch (const std::exception &ex)
{
if (cif::VERBOSE >= 0)
std::cerr << "Error parsing FORMUL at line " << lineNr << '\n';
throw;
}
}
else if (type == "HETNAM" or
type == "HETSYN")
{
int n = 2;
while (lookahead.substr(0, 6) == type and contNr(8, 2) == n)
{
value += cif::trim_right_copy(lookahead.substr(16));
;
getline(is, lookahead);
++lineNr;
++n;
}
}
else if (type == "SITE ")
{
std::string siteName = value.substr(5, 3);
cif::trim_right(value);
std::size_t n = value.length() - 12;
value += std::string(11 - (n % 11), ' ');
while (lookahead.substr(0, 6) == type and lookahead.substr(11, 3) == siteName)
{
std::string s = lookahead.substr(18);
cif::trim_right(s);
s += std::string(11 - (s.length() % 11), ' ');
value += s;
// TODO: improve this... either use numRes or don't lump together all text
// value += " " + cif::trim_right_copy();
getline(is, lookahead);
++lineNr;
}
}
else if (type == "REMARK")
{
type += value.substr(0, 4);
// parse it now, makes life easier later on
if (type == "REMARK 200" or type == "REMARK 240")
{
auto i = value.find(":");
if (i != std::string::npos)
{
std::string k = value.substr(4, i - 4);
std::string v = value.substr(i + 1);
cif::trim(k);
while (k.find(" ") != std::string::npos)
cif::replace_all(k, " ", " ");
cif::trim(v);
if (iequals(v, "NONE") or iequals(v, "N/A") or iequals(v, "NAN"))
mRemark200[k] = ".";
else if (not iequals(v, "NULL"))
mRemark200[k] = v;
}
}
}
PDBRecord *cur = new (value.length()) PDBRecord(curLineNr, type, value);
if (last == nullptr)
last = mData = cur;
else
last->mNext = cur;
last = cur;
cif::trim(type);
if (type == "LINK" or type == "LINKR")
{
LINK link = {};
link.a.name = cur->vS(13, 16); // 13 - 16 Atom name1 Atom name.
link.a.altLoc = cur->vC(17); // 17 Character altLoc1 Alternate location indicator.
link.a.resName = cur->vS(18, 20); // 18 - 20 Residue name resName1 Residue name.
link.a.chainID = cur->vC(22); // 22 Character chainID1 Chain identifier.
link.a.resSeq = cur->vI(23, 26); // 23 - 26 Integer resSeq1 Residue sequence number.
link.a.iCode = cur->vC(27); // 27 AChar iCode1 Insertion code.
link.b.name = cur->vS(43, 46); // 43 - 46 Atom name2 Atom name.
link.b.altLoc = cur->vC(47); // 47 Character altLoc2 Alternate location indicator.
link.b.resName = cur->vS(48, 50); // 48 - 50 Residue name resName2 Residue name.
link.b.chainID = cur->vC(52); // 52 Character chainID2 Chain identifier.
link.b.resSeq = cur->vI(53, 56); // 53 - 56 Integer resSeq2 Residue sequence number.
link.b.iCode = cur->vC(57); // 57 AChar iCode2 Insertion code.
link.symOpA = cur->vS(60, 65); // 60 - 65 SymOP sym1 Symmetry operator atom 1.
link.symOpB = cur->vS(67, 72); // 67 - 72 SymOP sym2 Symmetry operator atom 2.
if (type == "LINK") // 1 - 6 Record name "LINK "
{
auto f = cur->vF(74, 78);
auto r = cif::from_chars(f.data(), f.data() + f.length(), link.distance);
if ((bool)r.ec and cif::VERBOSE > 0)
std::cerr << "Error parsing link distance at line " << cur->mLineNr << '\n';
}
// 74 78 Real(5.2) Length Link distance
mLinks.push_back(link);
}
if (type == "END")
break;
}
if (not dropped.empty())
{
if (cif::VERBOSE >= 0)
std::cerr << "Dropped unsupported records: " << cif::join(dropped, ", ") << '\n';
}
if (mData == nullptr)
throw std::runtime_error("Empty file?");
mRec = mData;
}
void PDBFileParser::GetNextRecord()
{
if (mRec != nullptr)
mRec = mRec->mNext;
if (mRec == nullptr)
{
static PDBRecord *end = new (0) PDBRecord({ 0, "END ", "" });
mRec = end;
}
}
void PDBFileParser::Match(const std::string &expected, bool throwIfMissing)
{
assert(mRec);
if (mRec->mName != expected)
{
if (throwIfMissing)
throw std::runtime_error("Expected record " + expected + " but found " + mRec->mName);
if (cif::VERBOSE > 0)
std::cerr << "Expected record " + expected + " but found " + mRec->mName << '\n';
}
}
std::vector<std::string> PDBFileParser::SplitCSV(const std::string &value)
{
auto vs = cif::split<std::string>(value, ",");
for (auto &v : vs)
cif::trim(v);
return vs;
}
void PDBFileParser::ParseTitle()
{
// strict ordering required
// HEADER
// 1 - 6 Record name "HEADER"
// 11 - 50 String(40) classification Classifies the molecule(s).
// 51 - 59 Date depDate Deposition date. This is the date the
// coordinates were received at the PDB.
// 63 - 66 IDcode idCode This identifier is unique within the PDB.
Match("HEADER", false);
std::string keywords;
if (mRec->is("HEADER"))
{
mStructureID = vS(63, 66);
keywords = vS(11, 50);
mOriginalDate = pdb2cifDate(vS(51, 59));
cif::trim(keywords);
GetNextRecord();
}
cif::trim(mStructureID);
if (mStructureID.empty())
mStructureID = "nohd";
mDatablock.set_name(mStructureID);
auto cat = getCategory("entry");
// cat->addColumn("id");
cat->emplace({ { "id", mStructureID } });
// OBSLTE
if (mRec->is("OBSLTE"))
{
// 1 - 6 Record name "OBSLTE"
// 9 - 10 Continuation continuation Allows concatenation of multiple records
// 12 - 20 Date repDate Date that this datablock was replaced.
// 22 - 25 IDcode idCode ID code of this datablock.
// 32 - 35 IDcode rIdCode ID code of datablock that replaced this one.
// 37 - 40 ...
std::string old = vS(22, 25);
std::string date = pdb2cifDate(vS(12, 20));
cat = getCategory("pdbx_database_PDB_obs");
std::string value = mRec->vS(32);
for (auto i : cif::split<std::string>(value, " ", true))
{
cat->emplace({ { "id", "OBSLTE" },
{ "date", date },
{ "replace_pdb_id", old },
{ "pdb_id", i } });
}
GetNextRecord();
}
// TITLE
Match("TITLE ", false);
std::string title;
if (mRec->is("TITLE ")) // 1 - 6 Record name "TITLE "
{ // 9 - 10 Continuation continuation Allows concatenation of multiple records.
title = vS(11); // 11 - 80 String title Title of the experiment.
GetNextRecord();
}
// SPLIT
if (mRec->is("SPLIT "))
{
// 1 - 6 Record name "SPLIT "
// 9 - 10 Continuation continuation Allows concatenation of multiple records.
// 12 - 15 IDcode idCode ID code of related datablock.
throw std::runtime_error("SPLIT PDB files are not supported");
}
// CAVEAT
int caveatID = 1;
while (mRec->is("CAVEAT")) // 1 - 6 Record name "CAVEAT"
{
// clang-format off
getCategory("database_PDB_caveat")->emplace({
{ "id", caveatID++ },
{ "text", std::string{ mRec->vS(20) } } // 20 - 79 String comment Free text giving the reason for the CAVEAT.
});
// clang-format on
GetNextRecord();
}
// COMPND
Match("COMPND", false);
// 1 - 6 Record name "COMPND"
// 8 - 10 Continuation continuation Allows concatenation of multiple records.
// 11 - 80 Specification compound Description of the molecular components.
// list
if (mRec->is("COMPND"))
{
std::string value{ mRec->vS(11) };
if (value.find(':') == std::string::npos)
{
// special case for dumb, stripped files
auto &comp = GetOrCreateCompound(1);
comp.mInfo["MOLECULE"] = value;
}
else
{
SpecificationListParser p(value);
for (;;)
{
std::string key, val;
std::tie(key, val) = p.GetNextSpecification();
if (key.empty())
break;
if (not iequals(key, "MOL_ID") and mCompounds.empty())
{
if (cif::VERBOSE > 0)
std::cerr << "Ignoring invalid COMPND record\n";
break;
}
if (key == "MOL_ID")
{
auto &comp = GetOrCreateCompound(stoi(val));
comp.mTitle = title;
}
else if (key == "CHAIN")
{
for (auto c : cif::split<std::string>(val, ","))
{
cif::trim(c);
mCompounds.back().mChains.insert(c[0]);
}
}
else
mCompounds.back().mInfo[key] = val;
}
}
GetNextRecord();
}
// SOURCE
Match("SOURCE", false);
if (mRec->is("SOURCE"))
{
// 1 - 6 Record name "SOURCE"
// 8 - 10 Continuation continuation Allows concatenation of multiple records.
// 11 - 79 Specification srcName Identifies the source of the
// List macromolecule in a token: value format.
std::map<std::string, std::string> *source = nullptr;
// value = { mRec->vS(11) };
// for (auto si = ba::make_split_iterator(value, ba::token_finder(ba::is_any_of(";"), ba::token_compress_on)); not si.eof(); ++si)
// {
// std::string s(si->begin(), si->end());
// if (s.empty())
// continue;
//
// auto colon = s.find(": ");
// if (colon == std::string::npos)
// {
// if (cif::VERBOSE > 0)
// std::cerr << "invalid source field, missing colon (" << s << ')' << '\n';
// continue;
// }
SpecificationListParser p(vS(11));
for (;;)
{
std::string key, val;
std::tie(key, val) = p.GetNextSpecification();
if (key.empty())
break;
if (key == "MOL_ID")
{
for (auto &c : mCompounds)
{
if (c.mMolID == stoi(val))
{
source = &c.mSource;
break;
}
}
continue;
}
if (source == nullptr)
throw std::runtime_error("At line " + std::to_string(mRec->mLineNr) + ": missing MOL_ID in SOURCE");
(*source)[key] = val;
}
GetNextRecord();
}
// KEYWDS
Match("KEYWDS", false);
std::string pdbxKeywords;
if (mRec->is("KEYWDS")) // 1 - 6 Record name "KEYWDS"
{ // 9 - 10 Continuation continuation Allows concatenation of records if necessary.
pdbxKeywords = vS(11); // 11 - 79 List keywds Comma-separated list of keywords relevant
// to the datablock.
GetNextRecord();
}
if (not(keywords.empty() and pdbxKeywords.empty()))
{
// clang-format off
getCategory("struct_keywords")->emplace({
{ "entry_id", mStructureID },
{ "pdbx_keywords", keywords },
{ "text", pdbxKeywords }
});
// clang-format on
}
// EXPDTA
Match("EXPDTA", false);
if (mRec->is("EXPDTA"))
{
mExpMethod = vS(11);
cat = getCategory("exptl");
auto crystals = cif::split<std::string>(mRemark200["NUMBER OF CRYSTALS USED"], "; ");
if (crystals.empty())
crystals.push_back("");
auto ci = crystals.begin();
for (auto expMethod : cif::split<std::string>(mExpMethod, ";"))
{
cif::trim(expMethod);
if (expMethod.empty())
continue;
// clang-format off
cat->emplace({
{ "entry_id", mStructureID },
{ "method", expMethod },
{ "crystals_number", ci != crystals.end() ? *ci : "" }
});
// clang-format ob
}
GetNextRecord();
}
// NUMMDL
if (mRec->is("NUMMDL"))
{
if (cif::VERBOSE > 0)
std::cerr << "skipping unimplemented NUMMDL record\n";
GetNextRecord();
}
// MDLTYP
if (mRec->is("MDLTYP"))
{
mModelTypeDetails = vS(11);
GetNextRecord();
}
// AUTHOR
Match("AUTHOR", false);
if (mRec->is("AUTHOR"))
{
int n = 1;
cat = getCategory("audit_author");
std::string value = { mRec->vS(11) };
for (auto author : cif::split<std::string>(value, ",", true))
{
// clang-format off
cat->emplace({
{ "name", pdb2cifAuth(author) },
{ "pdbx_ordinal", n }
});
// clang-format on
++n;
}
GetNextRecord();
}
// REVDAT
bool firstRevDat = true;
struct RevDat
{
int revNum;
std::string date, dateOriginal, replaces;
int modType;
std::vector<std::string> types;
bool operator<(const RevDat &rhs) const { return revNum < rhs.revNum; }
};
std::vector<RevDat> revdats;
while (mRec->is("REVDAT"))
{
// 1 - 6 Record name "REVDAT"
int revNum = vI(8, 10); // 8 - 10 Integer modNum Modification number.
// 11 - 12 Continuation continuation Allows concatenation of multiple records.
std::string date = pdb2cifDate(vS(14, 22)); // 14 - 22 Date modDate Date of modification (or release for
// new entries) in DD-MMM-YY format. This is
// not repeated on continued lines.
std::string modID = vS(24, 27); // 24 - 27 IDCode modID ID code of this datablock. This is not repeated on
// continuation lines.
int modType = vI(32, 32); // 32 Integer modType An integer identifying the type of
// modification. For all revisions, the
// modification type is listed as 1
std::string detail = vS(40); // 40 - 45 LString(6) record Modification detail.
// 47 - 52 LString(6) record Modification detail.
// 54 - 59 LString(6) record Modification detail.
// 61 - 66 LString(6) record Modification detail.
revdats.push_back({ revNum, date, modType == 0 ? mOriginalDate : "", modID, modType });
revdats.back().types = cif::split<std::string>(detail, " ");
if (firstRevDat)
{
// clang-format off
getCategory("database_2")->emplace({
{ "database_id", "PDB" },
{ "database_code", modID }
});
// clang-format on
}
GetNextRecord();
firstRevDat = false;
}
/*
This is internal stuff for PDB, don't write it ???
*/
sort(revdats.begin(), revdats.end());
for (auto &revdat : revdats)
{
// clang-format off
getCategory("database_PDB_rev")->emplace({
{ "num", revdat.revNum },
{ "date", revdat.date },
{ "date_original", revdat.dateOriginal },
{ "replaces", revdat.replaces },
{ "mod_type", revdat.modType }
});
// clang-format on
for (auto &type : revdat.types)
{
if (type.empty())
continue;
// clang-format off
getCategory("database_PDB_rev_record")->emplace({
{ "rev_num", revdat.revNum },
{ "type", type }
});
// clang-format on
}
}
//*/
// SPRSDE
if (mRec->is("SPRSDE"))
{
if (cif::VERBOSE > 0)
std::cerr << "skipping unimplemented SPRSDE record\n";
GetNextRecord();
}
// JRNL
if (mRec->is("JRNL "))
ParseCitation("primary");
}
void PDBFileParser::ParseCitation(const std::string &id)
{
const char *rec = mRec->mName;
std::string auth, titl, edit, publ, refn, pmid, doi;
std::string pubname, volume, astm, country, issn, csd;
std::string pageFirst;
int year = 0;
auto extend = [](std::string &s, const std::string &p)
{
if (not s.empty())
s += ' ';
s += cif::trim_copy(p);
};
while (mRec->is(rec) and (id == "primary" or vC(12) == ' '))
{
std::string k = vS(13, 16);
if (k == "AUTH")
extend(auth, vS(20, 79));
else if (k == "TITL")
extend(titl, vS(20, 79));
else if (k == "EDIT")
extend(edit, vS(20, 79));
else if (k == "REF")
{
if (pubname.empty())
{
extend(pubname, vS(20, 47));
if (vS(50, 51) == "V.")
volume = cif::trim_copy(vS(52, 55));
pageFirst = vS(57, 61);
year = vI(63, 66);
}
else
extend(pubname, vS(20, 47));
}
else if (k == "PUBL")
extend(publ, vS(20, 70));
else if (k == "REFN")
{
if (vS(20, 23) == "ASTN")
astm = vS(25, 30);
country = vS(33, 34);
if (vS(36, 39) == "ISSN")
issn = vS(41, 65);
}
else if (k == "PMID")
pmid = vS(20, 79);
else if (k == "DOI")
doi = vS(20, 79);
GetNextRecord();
}
auto cat = getCategory("citation");
// clang-format off
cat->emplace({
{ "id", id },
{ "title", titl },
{ "journal_abbrev", pubname },
{ "journal_volume", volume },
{ "page_first", pageFirst },
{ "year", year > 0 ? std::to_string(year) : "" },
{ "journal_id_ASTM", astm },
{ "country", country },
{ "journal_id_ISSN", issn },
{ "journal_id_CSD", csd },
{ "book_publisher", publ },
{ "pdbx_database_id_PubMed", pmid },
{ "pdbx_database_id_DOI", doi }
});
// clang-format on
if (not auth.empty())
{
cat = getCategory("citation_author");
for (auto author : cif::split<std::string>(auth, ",", true))
{
cat->emplace({ { "citation_id", id },
{ "name", pdb2cifAuth(author) },
{ "ordinal", mCitationAuthorNr } });
++mCitationAuthorNr;
}
}
if (not edit.empty())
{
cat = getCategory("citation_editor");
for (auto editor : cif::split<std::string>(edit, ",", true))
{
cat->emplace({ { "citation_id", id },
{ "name", pdb2cifAuth(editor) },
{ "ordinal", mCitationEditorNr } });
++mCitationEditorNr;
}
}
}
void PDBFileParser::ParseRemarks()
{
std::string sequenceDetails, compoundDetails, sourceDetails;
while (cif::starts_with(mRec->mName, "REMARK"))
{
int remarkNr = vI(8, 10);
try
{
switch (remarkNr)
{
case 1:
while (mRec->is("REMARK 1") or mRec->is("REMARK 001"))
{
if (mRec->mVlen > 15 and vS(12, 20) == "REFERENCE")
{
std::string id = vS(22, 70);
GetNextRecord();
ParseCitation(id);
}
else
GetNextRecord();
}
break;
case 3:
// we skip REMARK 3 until we know the mapping
while (mRec->is("REMARK 3"))
GetNextRecord();
break;
case 4:
// who cares...
while (mRec->is("REMARK 4"))
GetNextRecord();
break;
case 100:
{
const std::regex rx(R"(THE (\S+) ID CODE IS (\S+?)\.?\s*)");
std::smatch m;
std::string r = vS(12);
if (std::regex_match(r, m, rx))
{
auto cat = getCategory("database_2");
cat->emplace({ { "database_id", m[1].str() },
{ "database_code", m[2].str() } });
}
GetNextRecord();
break;
}
case 200:
{
// we already parsed most of this remark, but the "REMARK:" part might have been split.
bool remark = false;
do
{
std::string r = mRec->vS(12);
if (cif::starts_with(r, "REMARK: "))
{
mRemark200["REMARK"] = r.substr(8);
remark = true;
}
else if (remark)
{
if (r.empty())
remark = false;
else
mRemark200["REMARK"] += r;
}
GetNextRecord();
} while (mRec->is("REMARK 200"));
break;
}
case 280:
{
std::string density_Matthews, densityPercentSol, conditions;
const std::regex rx1(R"(SOLVENT CONTENT, VS +\(%\): *(.+))"),
rx2(R"(MATTHEWS COEFFICIENT, VM \(ANGSTROMS\*\*3/DA\): *(.+))");
std::smatch m;
do
{
std::string r = vS(12);
if (conditions.empty())
{
if (std::regex_match(r, m, rx1))
densityPercentSol = m[1].str();
else if (std::regex_match(r, m, rx2))
density_Matthews = m[1].str();
else if (cif::starts_with(r, "CRYSTALLIZATION CONDITIONS: "))
conditions = r.substr(28);
}
else
conditions = conditions + ' ' + r;
GetNextRecord();
} while (mRec->is("REMARK 280"));
std::string desc = mRemark200["REMARK"];
if (desc == "NULL")
desc.clear();
// clang-format off
getCategory("exptl_crystal")->emplace({
{ "id", 1 },
{ "density_Matthews", iequals(density_Matthews, "NULL") ? "" : density_Matthews },
{ "density_percent_sol", iequals(densityPercentSol, "NULL") ? "" : densityPercentSol },
{ "description", desc }
});
// clang-format on
// now try to parse the conditions
const std::regex rx3(R"(TEMPERATURE +(\d+)K)"), rx4(R"(PH *(?:: *)?(\d+(?:\.\d+)?))") /*, rx5(R"(\b(\d+)C\b)")*/;
std::string temp, ph, method;
for (auto s : cif::split<std::string>(conditions, ",", true))
{
cif::trim(s);
if (std::regex_search(s, m, rx3))
temp = m[1].str();
if (std::regex_search(s, m, rx4))
ph = m[1].str();
if (s.length() < 60 and
(cif::icontains(s, "drop") or cif::icontains(s, "vapor") or cif::icontains(s, "batch")))
{
if (not method.empty())
method = method + ", " + s;
else
method = s;
}
}
if (not(method.empty() and temp.empty() and ph.empty() and (conditions.empty() or conditions == "NULL")))
{
// clang-format off
getCategory("exptl_crystal_grow")->emplace({
{ "crystal_id", 1 },
{ "method", method },
{ "temp", temp },
{ "pH", ph },
{ "pdbx_details", conditions }
});
// clang-format on
}
break;
}
// case 290:
//
// break;
case 350:
// postponed since we don't have the required information yet
for (; mRec->is("REMARK 350"); GetNextRecord())
;
break;
case 400:
{
std::stringstream s;
GetNextRecord();
if (vS(12) == "COMPOUND")
GetNextRecord();
while (mRec->is("REMARK 400"))
{
s << vS(12) << '\n';
GetNextRecord();
}
compoundDetails = s.str();
break;
}
case 450:
{
std::stringstream s;
GetNextRecord();
if (vS(12) == "SOURCE")
GetNextRecord();
while (mRec->is("REMARK 450"))
{
s << vS(12) << '\n';
GetNextRecord();
}
sourceDetails = s.str();
break;
}
case 465:
{
bool headerSeen = false;
std::regex rx(R"( *MODELS *(\d+)-(\d+))");
int models[2] = { -1, -1 };
for (; mRec->is("REMARK 465"); GetNextRecord())
{
if (not headerSeen)
{
std::string line = vS(12);
std::smatch m;
if (std::regex_match(line, m, rx))
{
models[0] = std::stoi(m[1].str());
models[1] = stoi(m[2].str());
}
else
headerSeen = cif::contains(line, "RES C SSSEQI");
continue;
}
if (models[0] == models[1])
models[0] = models[1] = vI(12, 14);
std::string res = vS(16, 18);
char chain = vC(20);
int seq = vI(22, 26);
char iCode = vC(27);
for (int modelNr = models[0]; modelNr <= models[1]; ++modelNr)
mUnobs.push_back({ modelNr, res, chain, seq, iCode });
}
break;
}
case 470:
{
bool headerSeen = false;
std::regex rx(R"( *MODELS *(\d+)-(\d+))");
int models[2] = { -1, -1 };
for (; mRec->is("REMARK 470"); GetNextRecord())
{
if (not headerSeen)
{
std::string line = vS(12);
std::smatch m;
if (std::regex_match(line, m, rx))
{
models[0] = stoi(m[1].str());
models[1] = stoi(m[2].str());
}
else
headerSeen = cif::contains(line, "RES CSSEQI ATOMS");
continue;
}
if (models[0] == models[1])
models[0] = models[1] = vI(12, 14);
std::string res = vS(16, 18);
char chain = vC(20);
int seq = vI(21, 24);
char iCode = vC(25);
std::string atomStr = mRec->vS(29);
auto atoms = cif::split<std::string>(atomStr, " ", true);
for (int modelNr = models[0]; modelNr <= models[1]; ++modelNr)
mUnobs.push_back({ modelNr, res, chain, seq, iCode, atoms });
}
break;
}
case 500:
{
GetNextRecord();
enum State
{
eStart,
eCCinSAU,
eCC,
eCBL,
eCBA,
eTA,
eCTg,
ePG,
eMCP,
eChC
} state = eStart;
bool headerSeen = false;
int id = 0;
for (; mRec->is("REMARK 500"); GetNextRecord())
{
std::string line = vS(12);
if (line == "GEOMETRY AND STEREOCHEMISTRY")
continue;
switch (state)
{
case eStart:
{
if (line.empty() or not cif::starts_with(line, "SUBTOPIC: "))
continue;
std::string subtopic = line.substr(10);
if (subtopic == "CLOSE CONTACTS IN SAME ASYMMETRIC UNIT")
state = eCCinSAU;
else if (subtopic == "CLOSE CONTACTS")
state = eCC;
else if (subtopic == "COVALENT BOND LENGTHS")
state = eCBL;
else if (subtopic == "COVALENT BOND ANGLES")
state = eCBA;
else if (subtopic == "TORSION ANGLES")
state = eTA;
else if (subtopic == "NON-CIS, NON-TRANS")
state = eCTg;
else if (subtopic == "PLANAR GROUPS")
state = ePG;
else if (subtopic == "MAIN CHAIN PLANARITY")
state = eMCP;
else if (subtopic == "CHIRAL CENTERS")
state = eChC;
else if (cif::VERBOSE > 0)
throw std::runtime_error("Unknown subtopic in REMARK 500: " + subtopic);
headerSeen = false;
id = 0;
break;
}
case eCCinSAU:
{
if (not headerSeen)
headerSeen =
line == "ATM1 RES C SSEQI ATM2 RES C SSEQI DISTANCE";
else if (line.empty())
state = eStart;
else
{
std::string atom1 = vS(13, 16);
std::string res1 = vS(19, 21);
std::string alt1 = vS(17, 17);
char chain1 = vC(23);
int seq1 = vI(25, 29);
std::string iCode1 = vS(30, 30);
std::string atom2 = vS(34, 37);
std::string alt2 = vS(38, 38);
std::string res2 = vS(40, 42);
char chain2 = vC(44);
int seq2 = vI(46, 50);
std::string iCode2 = vS(51, 51);
std::string distance = vF(63, 71);
// clang-format off
getCategory("pdbx_validate_close_contact")->emplace({
{ "id", std::to_string(++id) },
{ "PDB_model_num", 1 },
{ "auth_atom_id_1", atom1 },
{ "auth_asym_id_1", std::string{ chain1 } },
{ "auth_comp_id_1", res1 },
{ "auth_seq_id_1", seq1 },
{ "PDB_ins_code_1", iCode1 },
{ "label_alt_id_1", alt1 },
{ "auth_atom_id_2", atom2 },
{ "auth_asym_id_2", std::string{ chain2 } },
{ "auth_comp_id_2", res2 },
{ "auth_seq_id_2", seq2 },
{ "PDB_ins_code_2", iCode2 },
{ "label_alt_id_2", alt2 },
{ "dist", distance }
});
// clang-format on
}
break;
}
case eCC:
{
if (not headerSeen)
headerSeen = line == "ATM1 RES C SSEQI ATM2 RES C SSEQI SSYMOP DISTANCE";
else if (line.empty())
state = eStart;
else
{
std::string atom1 = vS(13, 16);
std::string res1 = vS(19, 21);
char chain1 = vC(23);
int seq1 = vI(25, 29);
std::string atom2 = vS(34, 37);
std::string res2 = vS(40, 42);
char chain2 = vC(44);
int seq2 = vI(46, 50);
std::string symop;
try
{
symop = pdb2cifSymmetry(vS(54, 59));
}
catch (const std::exception &ex)
{
if (cif::VERBOSE > 0)
std::cerr << "Dropping REMARK 500 at line " << mRec->mLineNr << " due to invalid symmetry operation\n";
continue;
}
std::string distance = vF(63, 71);
// clang-format off
getCategory("pdbx_validate_symm_contact")->emplace({
{ "id", std::to_string(++id) },
{ "PDB_model_num", 1 },
{ "auth_atom_id_1", atom1 },
{ "auth_asym_id_1", std::string{ chain1 } },
{ "auth_comp_id_1", res1 },
{ "auth_seq_id_1", seq1 },
// { "PDB_ins_code_1", "" },
// { "label_alt_id_1", "" },
{ "site_symmetry_1", "1_555" },
{ "auth_atom_id_2", atom2 },
{ "auth_asym_id_2", std::string{ chain2 } },
{ "auth_comp_id_2", res2 },
{ "auth_seq_id_2", seq2 },
// { "PDB_ins_code_2", "" },
// { "label_alt_id_2", "" },
{ "site_symmetry_2", symop },
{ "dist", distance }
});
// clang-format on
}
break;
}
case eCBL:
{
if (not headerSeen)
{
if (cif::starts_with(line, "FORMAT: ") and line != "FORMAT: (10X,I3,1X,2(A3,1X,A1,I4,A1,1X,A4,3X),1X,F6.3)")
throw std::runtime_error("Unexpected format in REMARK 500");
headerSeen = line == "M RES CSSEQI ATM1 RES CSSEQI ATM2 DEVIATION";
}
else if (line.empty())
state = eStart;
else
{
int model = vI(11, 13);
std::string resNam1 = vS(15, 17);
std::string chainID1{ vC(19) };
int seqNum1 = vI(20, 23);
std::string iCode1{ vC(24) };
std::string alt1 = vS(30, 30);
std::string atm1 = vS(26, 29);
std::string resNam2 = vS(33, 35);
std::string chainID2{ vC(37) };
int seqNum2 = vI(38, 41);
std::string iCode2{ vC(42) };
std::string alt2 = vS(48, 48);
std::string atm2 = vS(44, 47);
std::string deviation = vF(51, 57);
if (iCode1 == " ")
iCode1.clear();
if (iCode2 == " ")
iCode2.clear();
// clang-format off
getCategory("pdbx_validate_rmsd_bond")->emplace({
{ "id", std::to_string(++id) },
{ "PDB_model_num", model ? model : 1 },
{ "auth_atom_id_1", atm1 },
{ "auth_asym_id_1", chainID1 },
{ "auth_comp_id_1", resNam1 },
{ "auth_seq_id_1", seqNum1 },
{ "PDB_ins_code_1", iCode1 },
{ "label_alt_id_1", alt1 },
{ "auth_atom_id_2", atm2 },
{ "auth_asym_id_2", chainID2 },
{ "auth_comp_id_2", resNam2 },
{ "auth_seq_id_2", seqNum2 },
{ "PDB_ins_code_2", iCode2 },
{ "label_alt_id_2", alt2 },
{ "bond_deviation", deviation }
});
// clang-format on
}
break;
}
case eCBA:
if (not headerSeen)
{
if (cif::starts_with(line, "FORMAT: ") and line != "FORMAT: (10X,I3,1X,A3,1X,A1,I4,A1,3(1X,A4,2X),12X,F5.1)")
throw std::runtime_error("Unexpected format in REMARK 500");
headerSeen = line == "M RES CSSEQI ATM1 ATM2 ATM3";
}
else if (line.empty())
state = eStart;
else if (vS(64) == "DEGREES")
{
int model = vI(11, 13);
std::string resNam = vS(15, 17);
std::string chainID{ vC(19) };
int seqNum = vI(20, 23);
std::string iCode{ vC(24) };
if (iCode == " ")
iCode.clear();
std::string atoms[3] = { vS(27, 30), vS(34, 37), vS(41, 44) };
std::string deviation = vF(57, 62);
if (deviation == "*****")
deviation.clear();
// clang-format off
getCategory("pdbx_validate_rmsd_angle")->emplace({
{ "id", std::to_string(++id) },
{ "PDB_model_num", model ? model : 1 },
{ "auth_atom_id_1", atoms[0] },
{ "auth_asym_id_1", chainID },
{ "auth_comp_id_1", resNam },
{ "auth_seq_id_1", seqNum },
{ "PDB_ins_code_1", iCode },
{ "auth_atom_id_2", atoms[1] },
{ "auth_asym_id_2", chainID },
{ "auth_comp_id_2", resNam },
{ "auth_seq_id_2", seqNum },
{ "PDB_ins_code_2", iCode },
{ "auth_atom_id_3", atoms[2] },
{ "auth_asym_id_3", chainID },
{ "auth_comp_id_3", resNam },
{ "auth_seq_id_3", seqNum },
{ "PDB_ins_code_3", iCode },
{ "angle_deviation", deviation }
});
// clang-format on
}
break;
case eTA:
if (not headerSeen)
{
if (cif::starts_with(line, "FORMAT: ") and line != "FORMAT:(10X,I3,1X,A3,1X,A1,I4,A1,4X,F7.2,3X,F7.2)")
throw std::runtime_error("Unexpected format in REMARK 500");
headerSeen = line == "M RES CSSEQI PSI PHI";
}
else if (line.empty())
state = eStart;
else
{
int model = vI(11, 13);
std::string resNam = vS(15, 17);
std::string chainID{ vC(19) };
int seqNum = vI(20, 23);
std::string iCode{ vC(24) };
if (iCode == " ")
iCode.clear();
std::string psi = vF(27, 35);
std::string phi = vF(37, 45);
// clang-format off
getCategory("pdbx_validate_torsion")->emplace({
{ "id", std::to_string(++id) },
{ "PDB_model_num", model ? model : 1 },
{ "auth_comp_id", resNam },
{ "auth_asym_id", chainID },
{ "auth_seq_id", seqNum },
{ "PDB_ins_code", iCode },
{ "phi", phi },
{ "psi", psi }
});
// clang-format on
}
break;
case eCTg:
if (not headerSeen)
headerSeen = line == "MODEL OMEGA";
else if (line.empty())
state = eStart;
else
{
int model = vI(45, 48);
std::string resNam1 = vS(12, 14);
std::string chainID1{ vC(16) };
int seqNum1 = vI(17, 21);
std::string iCode1{ vC(22) };
if (iCode1 == " ")
iCode1.clear();
std::string resNam2 = vS(27, 29);
std::string chainID2{ vC(31) };
int seqNum2 = vI(32, 36);
std::string iCode2{ vC(37) };
if (iCode2 == " ")
iCode2.clear();
std::string omega = vF(54, 60);
// clang-format off
getCategory("pdbx_validate_peptide_omega")->emplace({
{ "id", std::to_string(++id) },
{ "PDB_model_num", model ? model : 1 },
{ "auth_comp_id_1", resNam1 },
{ "auth_asym_id_1", chainID1 },
{ "auth_seq_id_1", seqNum1 },
{ "PDB_ins_code_1", iCode1 },
{ "auth_comp_id_2", resNam2 },
{ "auth_asym_id_2", chainID2 },
{ "auth_seq_id_2", seqNum2 },
{ "PDB_ins_code_2", iCode2 },
{ "omega", omega }
});
// clang-format on
}
break;
case ePG:
if (not headerSeen)
headerSeen = line == "M RES CSSEQI RMS TYPE";
else if (line.empty())
state = eStart;
else
{
int model = vI(11, 13);
std::string resNam = vS(15, 17);
std::string chainID{ vC(19) };
int seqNum = vI(20, 23);
std::string iCode{ vC(24) };
if (iCode == " ")
iCode.clear();
std::string rmsd = vF(32, 36);
std::string type = vS(41);
// clang-format off
getCategory("pdbx_validate_planes")->emplace({
{ "id", std::to_string(++id) },
{ "PDB_model_num", model ? model : 1 },
{ "auth_comp_id", resNam },
{ "auth_asym_id", chainID },
{ "auth_seq_id", seqNum },
{ "PDB_ins_code", iCode },
{ "rmsd", rmsd },
{ "type", type }
});
// clang-format on
}
break;
default:
state = eStart;
break;
}
}
break;
}
case 610:
{
bool headerSeen = false;
for (; mRec->is("REMARK 610"); GetNextRecord())
{
if (not headerSeen)
{
std::string line = vS(12);
headerSeen = cif::contains(line, "RES C SSEQI");
continue;
}
int modelNr = vI(12, 14);
if (modelNr == 0)
modelNr = 1;
std::string res = vS(16, 18);
char chain = vC(20);
int seq = vI(22, 25);
char iCode = vC(26);
auto compound = cif::compound_factory::instance().create(res);
if (compound == nullptr)
continue;
std::vector<std::string> atoms;
for (auto atom : compound->atoms())
{
if (atom.type_symbol != cif::H)
atoms.push_back(atom.id);
}
mUnobs.push_back({ modelNr, res, chain, seq, iCode, { atoms } });
}
break;
}
case 800:
{
const std::regex rx1(R"(SITE_IDENTIFIER: (.+))"),
rx2(R"(EVIDENCE_CODE: (.+))"),
rx3(R"(SITE_DESCRIPTION: (binding site for residue ([[:alnum:]]{1,3}) ([[:alnum:]]) (\d+)|.+))", std::regex_constants::icase);
std::string id, evidence, desc;
std::string pdbxAuthAsymID, pdbxAuthCompID, pdbxAuthSeqID, pdbxAuthInsCode;
std::smatch m;
enum State
{
sStart,
sID,
sEvidence,
sDesc,
sDesc2
} state = sStart;
auto store = [&]()
{
// Find the matching SITE record
auto site = FindRecord([id](PDBRecord &r) -> bool
{ return r.is("SITE ") and r.vS(12, 14) == id; });
if (site == nullptr)
throw std::runtime_error("Invalid REMARK 800, no SITE record for id " + id);
// next record, store what we have
// clang-format off
getCategory("struct_site")->emplace({
{ "id", id },
{ "details", desc },
{ "pdbx_auth_asym_id", pdbxAuthAsymID },
{ "pdbx_auth_comp_id", pdbxAuthCompID },
{ "pdbx_auth_seq_id", pdbxAuthSeqID },
{ "pdbx_num_residues", site->vI(16, 17) },
{ "pdbx_evidence_code", evidence }
});
// clang-format on
};
for (; mRec->is("REMARK 800"); GetNextRecord())
{
std::string s = mRec->vS(12);
if (s.empty())
continue;
switch (state)
{
case sStart:
if (s == "SITE")
state = sID;
else if (cif::VERBOSE > 0)
throw std::runtime_error("Invalid REMARK 800 record, expected SITE");
break;
case sID:
if (std::regex_match(s, m, rx1))
{
id = m[1].str();
state = sEvidence;
}
else if (cif::VERBOSE > 0)
throw std::runtime_error("Invalid REMARK 800 record, expected SITE_IDENTIFIER");
break;
case sEvidence:
if (regex_match(s, m, rx2))
{
evidence = m[1].str();
state = sDesc;
}
else if (cif::VERBOSE > 0)
throw std::runtime_error("Invalid REMARK 800 record, expected SITE_IDENTIFIER");
break;
case sDesc:
if (regex_match(s, m, rx3))
{
desc = m[1].str();
pdbxAuthCompID = m[2].str();
pdbxAuthAsymID = m[3].str();
pdbxAuthSeqID = m[4].str();
state = sDesc2;
}
break;
case sDesc2:
if (regex_match(s, m, rx1))
{
store();
id = m[1].str();
state = sEvidence;
evidence.clear();
desc.clear();
}
else
desc = desc + ' ' + s;
break;
}
}
if (not id.empty())
store();
break;
}
case 999:
{
std::stringstream s;
GetNextRecord();
if (vS(12) == "SEQUENCE")
GetNextRecord();
while (mRec->is("REMARK 999"))
{
s << vS(12) << '\n';
GetNextRecord();
}
sequenceDetails = s.str();
break;
}
// these are skipped
case 2:
case 290:
case 300:
case 620:
GetNextRecord();
break;
default:
{
std::string skipped = mRec->mName;
std::stringstream s;
if (not mRec->vS(11).empty())
s << mRec->vS(11) << '\n';
GetNextRecord();
while (mRec->is(skipped.c_str()))
{
s << mRec->vS(11) << '\n';
GetNextRecord();
}
// clang-format off
getCategory("pdbx_database_remark")->emplace({
{ "id", remarkNr },
{ "text", s.str() }
});
// clang-format on
break;
}
}
}
catch (const std::exception &ex)
{
std::throw_with_nested(std::runtime_error("Error parsing REMARK " + std::to_string(remarkNr)));
}
}
if (not(compoundDetails.empty() and sequenceDetails.empty() and sourceDetails.empty()))
{
// clang-format off
getCategory("pdbx_entry_details")->emplace({
{ "entry_id", mStructureID },
{ "compound_details", compoundDetails },
{ "sequence_details", sequenceDetails },
{ "source_details", sourceDetails }
});
// clang-format on
}
// store remark 200 info (special case)
if (not mRemark200.empty())
ParseRemark200();
}
void PDBFileParser::ParseRemark200()
{
auto rm200 = [&](const char *name, int diffrnNr) -> std::string
{
int nr = 0;
std::string result;
for (auto s : cif::split<std::string>(mRemark200[name], ";"))
{
if (++nr != diffrnNr)
continue;
cif::trim(s);
if (s == "NULL")
s.clear();
result = std::move(s);
break;
}
return result;
};
auto inRM200 = [this](std::initializer_list<const char *> s) -> bool
{
bool result = false;
for (auto *n : s)
{
if (not this->mRemark200[n].empty())
{
result = true;
break;
}
}
return result;
};
/*
The category computing is no longer used.
if (inRM200({"INTENSITY-INTEGRATION SOFTWARE", "DATA SCALING SOFTWARE", "SOFTWARE USED"}) or
not mRefinementSoftware.empty())
getCategory("computing")->emplace({
{ "entry_id", mStructureID },
{ "pdbx_data_reduction_ii", mRemark200["INTENSITY-INTEGRATION SOFTWARE"] },
{ "pdbx_data_reduction_ds", mRemark200["DATA SCALING SOFTWARE"] },
{ "structure_solution", mRemark200["SOFTWARE USED"] },
{ "structure_refinement", mRefinementSoftware }
});
*/
struct
{
const char *a;
const char *b;
} kSWMap[] = {
{ "data reduction", "INTENSITY-INTEGRATION SOFTWARE" },
{ "data scaling", "DATA SCALING SOFTWARE" },
{ "phasing", "SOFTWARE USED" },
};
for (auto &sw : kSWMap)
{
if (mRemark200[sw.b].empty())
continue;
// clang-format off
getCategory("software")->emplace({
{ "name", mRemark200[sw.b] },
{ "classification", sw.a },
{ "version", "." },
{ "pdbx_ordinal", mNextSoftwareOrd++ }
});
// clang-format on
}
std::string scatteringType;
if (mRemark200["EXPERIMENT TYPE"] == "X-RAY DIFFRACTION")
scatteringType = "x-ray";
else if (mRemark200["EXPERIMENT TYPE"] == "NEUTRON DIFFRACTION")
scatteringType = "neutron";
std::set<std::string> diffrnWaveLengths;
for (int diffrnNr = 1;; ++diffrnNr)
{
std::string ambientTemp = rm200("TEMPERATURE (KELVIN)", diffrnNr);
if (ambientTemp.empty())
break;
if (cif::ends_with(ambientTemp, "K"))
ambientTemp.erase(ambientTemp.length() - 1, 1);
// clang-format off
getCategory("diffrn")->emplace({
{ "id", diffrnNr },
{ "ambient_temp", ambientTemp },
// { "ambient_temp_details", seqID },
{ "crystal_id", 1 } });
// clang-format on
std::string collectionDate;
std::error_code ec;
collectionDate = pdb2cifDate(rm200("DATE OF DATA COLLECTION", diffrnNr), ec);
if (ec)
{
if (cif::VERBOSE > 0)
std::cerr << ec.message() << " for pdbx_collection_date\n";
// The date field can become truncated when multiple values are available
if (diffrnNr != 1)
collectionDate.clear();
}
// clang-format off
getCategory("diffrn_detector")->emplace({
{ "diffrn_id", diffrnNr },
{ "detector", rm200("DETECTOR TYPE", diffrnNr) },
{ "type", rm200("DETECTOR MANUFACTURER", diffrnNr) },
{ "pdbx_collection_date", collectionDate },
{ "details", rm200("OPTICS", diffrnNr) }
});
// clang-format on
if (inRM200({ "MONOCHROMATIC OR LAUE (M/L)", "MONOCHROMATOR", "DIFFRACTION PROTOCOL" }) or not scatteringType.empty())
// clang-format off
getCategory("diffrn_radiation")->emplace({
{ "diffrn_id", diffrnNr },
{ "wavelength_id", 1 },
{ "pdbx_monochromatic_or_laue_m_l", rm200("MONOCHROMATIC OR LAUE (M/L)", diffrnNr) },
{ "monochromator", rm200("MONOCHROMATOR", diffrnNr) },
{ "pdbx_diffrn_protocol", rm200("DIFFRACTION PROTOCOL", diffrnNr) },
{ "pdbx_scattering_type", scatteringType }
});
// clang-format on
std::string wl = rm200("WAVELENGTH OR RANGE (A)", diffrnNr);
auto wavelengths = cif::split<std::string>(wl, ", -", true);
diffrnWaveLengths.insert(wavelengths.begin(), wavelengths.end());
std::string source;
if (rm200("SYNCHROTRON (Y/N)", diffrnNr) == "Y")
{
// clang-format off
getCategory("diffrn_source")->emplace({
{ "diffrn_id", diffrnNr },
{ "source", "SYNCHROTRON" },
{ "type", rm200("RADIATION SOURCE", diffrnNr) + " BEAMLINE " + rm200("BEAMLINE", diffrnNr) },
{ "pdbx_synchrotron_site", rm200("RADIATION SOURCE", diffrnNr) },
{ "pdbx_synchrotron_beamline", rm200("BEAMLINE", diffrnNr) },
{ "pdbx_wavelength", wavelengths.size() == 1 ? wavelengths[0] : "" },
{ "pdbx_wavelength_list", wavelengths.size() == 1 ? "" : cif::join(wavelengths, ", ") },
});
// clang-format on
}
else if (inRM200({ "X-RAY GENERATOR MODEL", "RADIATION SOURCE", "BEAMLINE", "WAVELENGTH OR RANGE (A)" }))
{
// clang-format off
getCategory("diffrn_source")->emplace({
{ "diffrn_id", diffrnNr },
{ "source", rm200("RADIATION SOURCE", diffrnNr) },
{ "type", rm200("X-RAY GENERATOR MODEL", diffrnNr) },
{ "pdbx_wavelength", wavelengths.size() == 1 ? wavelengths[0] : "" },
{ "pdbx_wavelength_list", wavelengths.size() == 1 ? "" : cif::join(wavelengths, ", ") },
});
// clang-format on
}
}
int wavelengthNr = 1;
for (auto wl : diffrnWaveLengths)
{
if (cif::ends_with(wl, "A"))
wl.erase(wl.length() - 1, 1);
// clang-format off
getCategory("diffrn_radiation_wavelength")->emplace({
{ "id", wavelengthNr++ },
{ "wavelength", wl.empty() ? "." : wl },
{ "wt", "1.0" }
});
// clang-format on
}
if (inRM200({ "METHOD USED TO DETERMINE THE STRUCTURE", "STARTING MODEL" }))
{
auto cat = getCategory("refine");
assert(cat->empty());
std::string resolution = mRemark200["RESOLUTION RANGE HIGH (A)"];
if (resolution.empty())
resolution = ".";
// clang-format off
cat->emplace({
{ "pdbx_method_to_determine_struct", mRemark200["METHOD USED TO DETERMINE THE STRUCTURE"] },
{ "pdbx_starting_model", mRemark200["STARTING MODEL"] },
{ "ls_d_res_high", resolution },
{ "pdbx_diffrn_id", 1 },
{ "pdbx_refine_id", mExpMethod },
{ "entry_id", mStructureID } });
// clang-format on
}
if (inRM200({ "REJECTION CRITERIA (SIGMA(I))", "RESOLUTION RANGE HIGH (A)", "RESOLUTION RANGE LOW (A)", "NUMBER OF UNIQUE REFLECTIONS", "COMPLETENESS FOR RANGE (%)", "<I/SIGMA(I)> FOR THE DATA SET", "R MERGE (I)", "R SYM (I)", "DATA REDUNDANCY" }))
{
auto cat = getCategory("reflns");
// clang-format off
cat->emplace({
{ "entry_id", mStructureID },
{ "observed_criterion_sigma_I", mRemark200["REJECTION CRITERIA (SIGMA(I))"] },
{ "d_resolution_high", mRemark200["RESOLUTION RANGE HIGH (A)"] },
{ "d_resolution_low", mRemark200["RESOLUTION RANGE LOW (A)"] },
{ "number_obs", mRemark200["NUMBER OF UNIQUE REFLECTIONS"] },
{ "percent_possible_obs", mRemark200["COMPLETENESS FOR RANGE (%)"] },
{ "pdbx_netI_over_sigmaI", mRemark200["<I/SIGMA(I)> FOR THE DATA SET"] },
{ "pdbx_Rmerge_I_obs", mRemark200["R MERGE (I)"] },
{ "pdbx_Rsym_value", mRemark200["R SYM (I)"] },
{ "pdbx_redundancy", mRemark200["DATA REDUNDANCY"] },
{ "pdbx_ordinal", 1 },
{ "pdbx_diffrn_id", 1 }
});
// clang-format on
}
if (inRM200({ "HIGHEST RESOLUTION SHELL, RANGE HIGH (A)" })) // that one field is mandatory...
{
// clang-format off
getCategory("reflns_shell")->emplace({
{ "d_res_high", mRemark200["HIGHEST RESOLUTION SHELL, RANGE HIGH (A)"] },
{ "d_res_low", mRemark200["HIGHEST RESOLUTION SHELL, RANGE LOW (A)"] },
{ "percent_possible_all", mRemark200["COMPLETENESS FOR SHELL (%)"] },
{ "Rmerge_I_obs", mRemark200["R MERGE FOR SHELL (I)"] },
{ "pdbx_Rsym_value", mRemark200["R SYM FOR SHELL (I)"] },
{ "meanI_over_sigI_obs", mRemark200["<I/SIGMA(I)> FOR SHELL"] },
{ "pdbx_redundancy", mRemark200["DATA REDUNDANCY IN SHELL"] },
{ "pdbx_ordinal", 1 },
{ "pdbx_diffrn_id", 1 }
});
// clang-format on
}
else if (inRM200({ "HIGHEST RESOLUTION SHELL, RANGE LOW (A)", "COMPLETENESS FOR SHELL (%)",
"R MERGE FOR SHELL (I)", "R SYM FOR SHELL (I)", "<I/SIGMA(I)> FOR SHELL", "DATA REDUNDANCY IN SHELL" }))
{
if (cif::VERBOSE > 0)
std::cerr << "Not writing reflns_shell record since d_res_high is missing\n";
}
}
void PDBFileParser::ParseRemark350()
{
auto saved = mRec;
enum State
{
eStart,
eInfo,
eAnd,
eApply,
eBioMT
} state = eStart;
const std::regex
kRX1(R"(BIOMOLECULE: (\d+))"),
kRX2(R"(([^:]+): (.+?)(?: (ANGSTROM\*\*2|KCAL/MOL))?)"),
kRX8(R"(APPLY THE FOLLOWING TO CHAINS: (.+))"),
kRX9(R"(AND CHAINS: (.+))"),
kRX10(R"(BIOMT([123])\s+(\d+)\s+(-?\d+(?:\.\d+)?)\s+(-?\d+(?:\.\d+)?)\s+(-?\d+(?:\.\d+)?)\s+(-?\d+(?:\.\d+)?))");
int biomolecule = 0, operID = 0;
std::vector<std::string> operExpression;
std::map<std::string, std::string> values;
std::vector<std::string> asymIdList;
std::smatch m;
std::vector<double> mat, vec;
for (mRec = FindRecord("REMARK 350"); mRec != nullptr and mRec->is("REMARK 350"); GetNextRecord())
{
std::string line = vS(11);
switch (state)
{
case eStart:
if (regex_match(line, m, kRX1))
{
biomolecule = stoi(m[1].str());
state = eInfo;
}
break;
case eInfo:
if (regex_match(line, m, kRX8))
{
state = eApply;
std::string value = m[1].str();
for (auto chain : cif::split<std::string>(value, ", ", true))
{
if (chain.empty()) // happens when we have a AND CHAIN line
{
state = eAnd;
break;
}
if (chain.length() != 1)
throw std::runtime_error("Invalid REMARK 350");
MapChainID2AsymIDS(chain[0], asymIdList);
}
}
else if (regex_match(line, m, kRX2))
values[m[1].str()] = m[2].str();
break;
case eAnd:
if (regex_match(line, m, kRX9))
{
state = eApply;
std::string value = m[1].str();
for (auto chain : cif::split<std::string>(value, ", ", true))
{
if (chain.empty()) // happens when we have another AND CHAIN line
{
state = eAnd;
break;
}
MapChainID2AsymIDS(chain[0], asymIdList);
}
continue;
}
// fall through
case eApply:
if (regex_match(line, m, kRX10))
{
int mt = stoi(m[1].str());
if (mt != 1)
throw std::runtime_error("Invalid REMARK 350");
operID = stoi(m[2].str());
operExpression.push_back(std::to_string(operID));
mat.push_back(stod(m[3].str()));
mat.push_back(stod(m[4].str()));
mat.push_back(stod(m[5].str()));
vec.push_back(stod(m[6].str()));
state = eBioMT;
}
break;
case eBioMT:
if (regex_match(line, m, kRX10))
{
int mt = stoi(m[1].str());
if (mt == 1)
{
operID = stoi(m[2].str());
operExpression.push_back(std::to_string(operID));
}
else if (operID != stoi(m[2].str()))
throw std::runtime_error("Invalid REMARK 350");
mat.push_back(stod(m[3].str()));
mat.push_back(stod(m[4].str()));
mat.push_back(stod(m[5].str()));
vec.push_back(stod(m[6].str()));
if (mt == 3)
{
if (vec.size() != 3 or mat.size() != 9)
throw std::runtime_error("Invalid REMARK 350");
if (operID == 1)
{
std::string oligomer = values["AUTHOR DETERMINED BIOLOGICAL UNIT"];
if (oligomer.empty())
oligomer = values["SOFTWARE DETERMINED QUATERNARY STRUCTURE"];
to_lower(oligomer);
int count = 0;
std::smatch m2;
if (std::regex_match(oligomer, m2, std::regex(R"((\d+)-meric)")))
{
count = stoi(m2[1].str());
}
else if (cif::ends_with(oligomer, "meric"))
{
std::string cs = oligomer.substr(0, oligomer.length() - 5);
if (cs == "mono")
count = 1;
else if (cs == "di")
count = 2;
else if (cs == "tri")
count = 3;
else if (cs == "tetra")
count = 4;
else if (cs == "hexa")
count = 6;
else if (cs == "octa")
count = 8;
else if (cs == "dodeca")
count = 12;
}
std::string details;
if (values["AUTHOR DETERMINED BIOLOGICAL UNIT"].empty())
{
if (not values["SOFTWARE DETERMINED QUATERNARY STRUCTURE"].empty())
details = "software_defined_assembly";
}
else if (values["SOFTWARE DETERMINED QUATERNARY STRUCTURE"].empty())
details = "author_defined_assembly";
else
details = "author_and_software_defined_assembly";
// clang-format off
getCategory("pdbx_struct_assembly")->emplace({
{ "id", biomolecule },
{ "details", details },
{ "method_details", values["SOFTWARE USED"] },
{ "oligomeric_details", oligomer },
{ "oligomeric_count", count > 0 ? std::to_string(count) : "" }
});
auto cat = getCategory("pdbx_struct_assembly_prop");
if (not values["TOTAL BURIED SURFACE AREA"].empty())
cat->emplace({
{ "biol_id", biomolecule },
{ "type", "ABSA (A^2)" },
{ "value", values["TOTAL BURIED SURFACE AREA"] }
});
if (not values["CHANGE IN SOLVENT FREE ENERGY"].empty())
cat->emplace({
{ "biol_id", biomolecule },
{ "type", "MORE" },
{ "value", values["CHANGE IN SOLVENT FREE ENERGY"] }
});
if (not values["SURFACE AREA OF THE COMPLEX"].empty())
cat->emplace({
{ "biol_id", biomolecule },
{ "type", "SSA (A^2)" },
{ "value", values["SURFACE AREA OF THE COMPLEX"] }
});
// clang-format on
values.clear();
}
std::string type = mat == std::vector<double>{ 1, 0, 0, 0, 1, 0, 0, 0, 1 } and vec == std::vector<double>{ 0, 0, 0 } ? "identity operation" : "crystal symmetry operation";
auto pdbx_struct_oper_list = getCategory("pdbx_struct_oper_list");
if (not pdbx_struct_oper_list->contains(cif::key("id") == operID))
getCategory("pdbx_struct_oper_list")->emplace({ // clang-format off
{ "id", operID },
{ "type", type },
// { "name", "" },
// { "symmetryOperation", "" },
{ "matrix[1][1]", cif::format("{:12.10f}", mat[0]) },
{ "matrix[1][2]", cif::format("{:12.10f}", mat[1]) },
{ "matrix[1][3]", cif::format("{:12.10f}", mat[2]) },
{ "vector[1]", cif::format("{:12.10f}", vec[0]) },
{ "matrix[2][1]", cif::format("{:12.10f}", mat[3]) },
{ "matrix[2][2]", cif::format("{:12.10f}", mat[4]) },
{ "matrix[2][3]", cif::format("{:12.10f}", mat[5]) },
{ "vector[2]", cif::format("{:12.10f}", vec[1]) },
{ "matrix[3][1]", cif::format("{:12.10f}", mat[6]) },
{ "matrix[3][2]", cif::format("{:12.10f}", mat[7]) },
{ "matrix[3][3]", cif::format("{:12.10f}", mat[8]) },
{ "vector[3]", cif::format("{:12.10f}", vec[2]) }
});
// clang-format on
mat.clear();
vec.clear();
}
}
else if (regex_match(line, m, kRX1))
{
if (not(vec.empty() and mat.empty()))
throw std::runtime_error("Invalid REMARK 350");
// clang-format off
getCategory("pdbx_struct_assembly_gen")->emplace({
{ "assembly_id", biomolecule },
{ "oper_expression", cif::join(operExpression, ",") },
{ "asym_id_list", cif::join(asymIdList, ",") }
});
// clang-format on
biomolecule = stoi(m[1].str());
asymIdList.clear();
operExpression.clear();
state = eInfo;
}
break;
}
}
if (not operExpression.empty())
{
// clang-format off
getCategory("pdbx_struct_assembly_gen")->emplace({
{ "assembly_id", biomolecule },
{ "oper_expression", cif::join(operExpression, ",") },
{ "asym_id_list", cif::join(asymIdList, ",") }
});
// clang-format on
}
mRec = saved;
}
void PDBFileParser::ParsePrimaryStructure()
{
// First locate the DBREF record. Might be missing
DBREF cur = { mStructureID };
while (cif::starts_with(mRec->mName, "DBREF"))
{
if (mRec->is("DBREF ")) // 1 - 6 Record name "DBREF "
{
cur.PDBIDCode = vS(8, 11); // 8 - 11 IDcode idCode ID code of this datablock.
cur.chainID = vC(13); // 13 Character chainID Chain identifier.
cur.seqBegin = vI(15, 18); // 15 - 18 Integer seqBegin Initial sequence number of the
// PDB sequence segment.
cur.insertBegin = vC(19); // 19 AChar insertBegin Initial insertion code of the
// PDB sequence segment.
cur.seqEnd = vI(21, 24); // 21 - 24 Integer seqEnd Ending sequence number of the
// PDB sequence segment.
cur.insertEnd = vC(25); // 25 AChar insertEnd Ending insertion code of the
// PDB sequence segment.
cur.database = vS(27, 32); // 27 - 32 LString database Sequence database name.
cur.dbAccession = vS(34, 41); // 34 - 41 LString dbAccession Sequence database accession code.
cur.dbIdCode = vS(43, 54); // 43 - 54 LString dbIdCode Sequence database identification code.
cur.dbSeqBegin = vI(56, 60); // 56 - 60 Integer dbseqBegin Initial sequence number of the
// database seqment.
cur.dbinsBeg = vC(61); // 61 AChar idbnsBeg Insertion code of initial residue of the
// segment, if PDB is the reference.
cur.dbSeqEnd = vI(63, 67); // 63 - 67 Integer dbseqEnd Ending sequence number of the
// database segment.
cur.dbinsEnd = vC(68); // 68 AChar dbinsEnd Insertion code of the ending residue of
// the segment, if PDB is the reference.
auto &chain = GetChainForID(cur.chainID);
chain.mDbref = cur;
}
else if (mRec->is("DBREF1")) // 1 - 6 Record name "DBREF1"
{
cur.PDBIDCode = vS(8, 11); // 8 - 11 IDcode idCode ID code of this datablock.
cur.chainID = vC(13); // 13 Character chainID Chain identifier.
cur.seqBegin = vI(15, 18); // 15 - 18 Integer seqBegin Initial sequence number of the
// PDB sequence segment, right justified.
cur.insertBegin = vC(19); // 19 AChar insertBegin Initial insertion code of the
// PDB sequence segment.
cur.seqEnd = vI(21, 24); // 21 - 24 Integer seqEnd Ending sequence number of the
// PDB sequence segment, right justified.
cur.insertEnd = vC(25); // 25 AChar insertEnd Ending insertion code of the
// PDB sequence segment.
cur.database = vS(27, 32); // 27 - 32 LString database Sequence database name.
cur.dbIdCode = vS(48, 67); // 48 - 67 LString dbIdCode Sequence database identification code,
}
else if (mRec->is("DBREF2")) // 1 - 6 Record name "DBREF2"
{ // 8 - 11 IDcode idCode ID code of this datablock.
if (vC(13) != cur.chainID) // 13 Character chainID Chain identifier.
throw std::runtime_error("Chain ID's for DBREF1/DBREF2 records do not match");
cur.dbAccession = vS(19, 40); // 19 - 40 LString dbAccession Sequence database accession code,
// left justified.
cur.dbSeqBegin = vI(46, 55); // 46 - 55 Integer seqBegin Initial sequence number of the
// Database segment, right justified.
cur.dbSeqEnd = vI(58, 67); // 58 - 67 Integer seqEnd Ending sequence number of the
// Database segment, right justified.
auto &chain = GetChainForID(cur.chainID);
chain.mDbref = cur;
}
GetNextRecord();
}
// update chains
for (auto &chain : mChains)
{
chain.mNextSeqNum = chain.mDbref.seqBegin;
chain.mNextDbSeqNum = chain.mDbref.dbSeqBegin;
}
while (mRec->is("SEQADV"))
{ // 1 - 6 Record name "SEQADV"
mSeqadvs.push_back({
// 8 - 11 IDcode idCode ID code of this datablock.
vS(13, 15), // 13 - 15 Residue name resName Name of the PDB residue in conflict.
vC(17), // 17 Character chainID PDB chain identifier.
vI(19, 22), // 19 - 22 Integer seqNum PDB sequence number.
vC(23), // 23 AChar iCode PDB insertion code.
vS(25, 28), // 25 - 28 LString database
vS(30, 38), // 30 - 38 LString dbAccession Sequence database accession number.
vS(40, 42), // 40 - 42 Residue name dbRes Sequence database residue name.
vI(44, 48), // 44 - 48 Integer dbSeq Sequence database sequence number.
vS(50, 70) // 50 - 70 LString conflict Conflict comment.
});
GetNextRecord();
}
while (mRec->is("SEQRES")) // 1 - 6 Record name "SEQRES"
{ // 8 - 10 Integer serNum Serial number of the SEQRES record for the
// current chain. Starts at 1 and increments
// by one each line. Reset to 1 for each chain.
char chainID = vC(12); // 12 Character chainID Chain identifier. This may be any single
// legal character, including a blank which is
// is used if there is only one chain.
int numRes = vI(14, 17); // 14 - 17 Integer numRes Number of residues in the chain.
// This value is repeated on every record.
std::string monomers = vS(20, 70); // 20 - 22 Residue name resName Residue name.
// ...
auto &chain = GetChainForID(chainID, numRes);
for (auto monID : cif::split<std::string>(monomers, " ", true))
{
if (monID.empty())
continue;
chain.mSeqres.push_back({ monID, chain.mNextSeqNum++, ' ', chain.mNextDbSeqNum++ });
InsertChemComp(monID);
}
GetNextRecord();
}
// First pass over MODRES, only store relevant information required in ConstructEntities
while (mRec->is("MODRES")) // 1 - 6 Record name "MODRES"
{ // 8 - 11 IDcode idCode ID code of this datablock.
std::string resName = vS(13, 15); // 13 - 15 Residue name resName Residue name used in this datablock.
// char chainID = vC(17); // 17 Character chainID Chain identifier.
// int seqNum = vI(19, 22); // 19 - 22 Integer seqNum Sequence number.
// char iCode = vC(23); // 23 AChar iCode Insertion code.
std::string stdRes = vS(25, 27); // 25 - 27 Residue name stdRes Standard residue name.
// std::string comment = vS(30, 70); // 30 - 70 String comment Description of the residue modification.
mMod2parent[resName] = stdRes;
GetNextRecord();
}
}
void PDBFileParser::ParseHeterogen()
{
while (mRec->is("HET "))
{ // 1 - 6 Record name "HET "
std::string hetID = vS(8, 10); // 8 - 10 LString(3) hetID Het identifier, right-justified.
char chainID = vC(13); // 13 Character ChainID Chain identifier.
int seqNum = vI(14, 17); // 14 - 17 Integer seqNum Sequence number.
char iCode = vC(18); // 18 AChar iCode Insertion code.
int numHetAtoms = vI(21, 25); // 21 - 25 Integer numHetAtoms Number of HETATM records for the group
// present in the datablock.
std::string text = vS(31, 70); // 31 - 70 String text Text describing Het group.
mHets.emplace_back(hetID, chainID, seqNum, iCode, numHetAtoms, text);
GetNextRecord();
}
for (;;)
{
if (mRec->is("HETNAM")) // 1 - 6 Record name "HETNAM"
{ // 9 - 10 Continuation continuation Allows concatenation of multiple records.
std::string hetID = vS(12, 14); // 12 - 14 LString(3) hetID Het identifier, right-justified.
std::string text = vS(16); // 16 - 70 String text Chemical name.
mHetnams[hetID] = text;
InsertChemComp(hetID);
GetNextRecord();
continue;
}
if (mRec->is("HETSYN")) // 1 - 6 Record name "HETSYN"
{ // 9 - 10 Continuation continuation Allows concatenation of multiple records.
std::string hetID = vS(12, 14); // 12 - 14 LString(3) hetID Het identifier, right-justified.
std::string syn = vS(16); // 16 - 70 SList hetSynonyms List of synonyms.
mHetsyns[hetID] = syn;
GetNextRecord();
continue;
}
break;
}
while (mRec->is("FORMUL")) // 1 - 6 Record name "FORMUL"
{ // 9 - 10 Integer compNum Component number.
std::string hetID = vS(13, 15); // 13 - 15 LString(3) hetID Het identifier.
// 17 - 18 Integer continuation Continuation number.
char waterMark = vC(19); // 19 Character asterisk "*" for water.
std::string formula = vS(20); // 20 - 70 String text Chemical formula.
mFormuls[hetID] = formula;
if (waterMark == '*')
mWaterHetID = hetID;
GetNextRecord();
}
}
void PDBFileParser::ConstructEntities()
{
// We parsed the Primary Structure and Heterogen sections, if available.
// But if we didn't parse anything, we need to fake the data based on residues in ATOM records
// First iterate all ATOM records and store the residues as found in these records
int modelNr = 1;
typedef std::map<std::tuple<char, int, char, char>, std::string> CompTypeMap;
CompTypeMap residuesSeen; // used to validate PDB files...
for (auto r = mData; r != nullptr; r = r->mNext)
{
if (r->is("MODEL "))
{
modelNr = r->vI(11, 14);
if (modelNr != 1)
break;
continue;
}
if (r->is("ATOM ") or r->is("HETATM")) // 1 - 6 Record name "ATOM "
{ // ...
std::string name = r->vS(13, 16); // 13 - 16 Atom name Atom name.
char altLoc = r->vC(17); // 17 Character altLoc Alternate location indicator.
std::string resName = r->vS(18, 20); // 18 - 20 Residue name resName Residue name.
char chainID = r->vC(22); // 22 Character chainID Chain identifier.
int resSeq = r->vI(23, 26); // 23 - 26 Integer resSeq Residue sequence number.
char iCode = r->vC(27); // 27 AChar iCode Code for insertion of residues.
// first validate, too sad this is required...
CompTypeMap::key_type k = std::make_tuple(chainID, resSeq, iCode, altLoc);
if (residuesSeen.count(k) == 0)
residuesSeen[k] = resName;
else if (residuesSeen[k] != resName)
throw std::runtime_error("inconsistent residue type for " + std::string{ chainID } + std::to_string(resSeq) + iCode + altLoc + "\n" +
" (" + residuesSeen[k] + " != " + resName + ")");
auto &chain = GetChainForID(chainID);
PDBChain::AtomRes ar{ resName, resSeq, iCode };
if ((chain.mResiduesSeen.empty() or chain.mResiduesSeen.back() != ar) and
cif::compound_factory::instance().is_monomer(resName))
{
chain.mResiduesSeen.push_back(ar);
}
// now that we're iterating atoms anyway, clean up the mUnobs array
mUnobs.erase(remove_if(mUnobs.begin(), mUnobs.end(), [=](UNOBS &a)
{
bool result = false;
if (modelNr == a.modelNr and
resName == a.res and
chainID == a.chain and
resSeq == a.seq and
iCode == a.iCode)
{
auto i = find(a.atoms.begin(), a.atoms.end(), name);
if (i != a.atoms.end())
{
a.atoms.erase(i);
result = a.atoms.empty();
}
}
return result; }),
mUnobs.end());
continue;
}
if (r->is("TER ")) // 1 - 6 Record name "TER "
{ // 7 - 11 Integer serial Serial number.
// 18 - 20 Residue name resName Residue name.
char chainID = r->vC(22); // 22 Character chainID Chain identifier.
// 23 - 26 Integer resSeq Residue sequence number.
// 27 AChar iCode Insertion code.
auto &chain = GetChainForID(chainID);
if (chain.mTerIndex == 0) // Is this the first TER record? (Refmac writes out multiple TER records...)
chain.mTerIndex = static_cast<int>(chain.mResiduesSeen.size());
continue;
}
}
// prune completely empty chains?
mChains.erase(remove_if(mChains.begin(), mChains.end(), [](auto &chain)
{ return chain.mResiduesSeen.empty() and chain.mSeqres.empty(); }),
mChains.end());
for (auto &chain : mChains)
{
if (not(chain.mSeqres.empty() or chain.mResiduesSeen.empty()))
{
// seems safe to assume TER record is at the right location...
// However, some files don't have them at all.
// When mTerIndex == 0 this is most likely the case. Right?
if (chain.mTerIndex > 0)
chain.mResiduesSeen.erase(chain.mResiduesSeen.begin() + chain.mTerIndex, chain.mResiduesSeen.end());
int lastResidueIndex = chain.AlignResToSeqRes();
if (lastResidueIndex > 0 and lastResidueIndex + 1 < static_cast<int>(chain.mResiduesSeen.size()))
{
auto &r = chain.mResiduesSeen[lastResidueIndex + 1];
if (cif::VERBOSE > 0)
{
std::cerr << "Detected residues that cannot be aligned to SEQRES\n"
<< "First residue is " << chain.mDbref.chainID << ':' << r.mSeqNum << r.mIcode << '\n';
}
chain.mTerIndex = lastResidueIndex + 1;
}
}
else
{
// So, we did not have a SEQRES for this chain. Try to reconstruct it.
// Problem here is that TER records may be located incorrectly. So
// first lets shift the ter index until it is past the last known
// aminoacid or base.
for (int ix = chain.mTerIndex; ix < static_cast<int>(chain.mResiduesSeen.size()); ++ix)
{
std::string resName = chain.mResiduesSeen[ix].mMonID;
if (cif::compound_factory::instance().is_monomer(resName))
chain.mTerIndex = ix + 1;
InsertChemComp(resName);
}
// And now construct our 'SEQRES'...
for (int ix = 0; ix < chain.mTerIndex; ++ix)
{
auto &ar = chain.mResiduesSeen[ix];
chain.mSeqres.push_back({ ar.mMonID, ar.mSeqNum, ar.mIcode, ar.mSeqNum, true });
}
}
}
std::set<char> terminatedChains;
std::map<char, int> residuePerChainCounter;
for (auto r = mData; r != nullptr; r = r->mNext)
{
if (r->is("MODEL "))
{
modelNr = r->vI(11, 14);
if (modelNr != 1)
break;
continue;
}
if (r->is("ATOM ") or r->is("HETATM"))
{ // 1 - 6 Record name "ATOM "
// int serial = r->vI(7, 11); // 7 - 11 Integer serial Atom serial number.
// ...
char altLoc = vC(17); // 17 Character altLoc Alternate location indicator.
std::string resName = r->vS(18, 20); // 18 - 20 Residue name resName Residue name.
char chainID = r->vC(22); // 22 Character chainID Chain identifier.
int resSeq = r->vI(23, 26); // 23 - 26 Integer resSeq Residue sequence number.
char iCode = r->vC(27); // 27 AChar iCode Code for insertion of residues.
auto &chain = GetChainForID(chainID);
auto i = find(chain.mSeqres.begin(), chain.mSeqres.end(), PDBSeqRes{ resName, resSeq, iCode });
// might be a hetero
if (altLoc != ' ' and i == chain.mSeqres.end())
{
i = find_if(chain.mSeqres.begin(), chain.mSeqres.end(),
[resSeq, iCode](const PDBSeqRes &r) -> bool
{
return r.mSeqNum == resSeq and r.mIcode == iCode;
});
}
if (i != chain.mSeqres.end())
{
i->mSeen = true;
if (i->mMonID != resName)
i->mAlts.insert(resName);
}
else
{
auto &residues = chain.mHet;
if (residues.empty() or residues.back().mSeqNum != resSeq)
{
i = lower_bound(residues.begin(), residues.end(),
PDBSeqRes{ resName, resSeq, iCode },
[=](const PDBSeqRes &r1, const PDBSeqRes &r2) -> bool
{
return r1.mSeqNum < r2.mSeqNum;
});
residues.insert(i, { resName, resSeq, iCode, resSeq, true });
InsertChemComp(resName);
}
}
int residueCount = (residuePerChainCounter[chainID] += 1);
// There appears to be a program that writes out HETATM records as ATOM records....
if (not cif::compound_factory::instance().is_monomer(resName) or
terminatedChains.count(chainID) or
(chain.mTerIndex > 0 and residueCount >= chain.mTerIndex))
{
if (isWater(resName))
mWaterHetID = resName;
auto h = find_if(mHets.begin(), mHets.end(), [=](const HET &het) -> bool
{ return het.hetID == resName and het.chainID == chainID and
het.seqNum == resSeq and het.iCode == iCode; });
if (h == mHets.end())
{
mHets.push_back({ resName, chainID, resSeq, iCode, 0 }); // double perhaps, but that does not care
h = prev(mHets.end());
}
h->atoms.push_back(r);
}
continue;
}
if (r->is("TER "))
{
char chainID = r->vC(22); // 22 Character chainID Chain identifier.
terminatedChains.insert(chainID);
}
}
// Create missing compounds
for (auto &chain : mChains)
{
if (chain.mMolID != 0 or chain.mSeqres.empty())
continue;
// now this chain may contain the same residues as another one
for (auto &other : mChains)
{
if (&other == &chain or other.mMolID == 0)
continue;
if (chain.SameSequence(other))
{
chain.mMolID = other.mMolID;
break;
}
}
if (chain.mMolID != 0)
continue;
auto &comp = GetOrCreateCompound(mNextMolID++);
comp.mChains.insert(chain.mDbref.chainID);
chain.mMolID = comp.mMolID;
}
std::set<std::string> structTitle, structDescription;
// Create poly_scheme and write pdbx_poly_seq_scheme and create mapping table
auto cat = getCategory("pdbx_poly_seq_scheme");
int asymNr = 0;
for (auto &chain : mChains)
{
std::string asymID = cif::cif_id_for_number(asymNr++);
if (mMolID2EntityID.count(chain.mMolID) == 0)
continue;
std::string entityID = mMolID2EntityID[chain.mMolID];
mAsymID2EntityID[asymID] = entityID;
// clang-format off
getCategory("struct_asym")->emplace({
{ "id", asymID },
{ "pdbx_blank_PDB_chainid_flag", chain.mDbref.chainID == ' ' ? "Y" : "N" },
// pdbx_modified
{ "entity_id", entityID },
// details
});
// clang-format on
int seqNr = 1;
for (auto &res : chain.mSeqres)
{
mChainSeq2AsymSeq[std::make_tuple(chain.mDbref.chainID, res.mSeqNum, res.mIcode)] = std::make_tuple(asymID, seqNr, true);
std::string seqID = std::to_string(seqNr);
++seqNr;
std::set<std::string> monIds = { res.mMonID };
monIds.insert(res.mAlts.begin(), res.mAlts.end());
for (std::string monID : monIds)
{
std::string authMonID, authSeqNum, authInsCode{ '.' };
if (res.mSeen)
{
authMonID = monID;
authSeqNum = std::to_string(res.mSeqNum);
if (res.mIcode != ' ' and res.mIcode != 0)
authInsCode = std::string{ res.mIcode };
// clang-format off
cat->emplace({
{ "asym_id", asymID },
{ "entity_id", mMolID2EntityID[chain.mMolID] },
{ "seq_id", seqID },
{ "mon_id", monID },
{ "ndb_seq_num", seqID },
{ "pdb_seq_num", res.mSeqNum },
{ "auth_seq_num", authSeqNum },
{ "pdb_mon_id", authMonID },
{ "auth_mon_id", authMonID },
{ "pdb_strand_id", std::string{ chain.mDbref.chainID } },
{ "pdb_ins_code", authInsCode },
{ "hetero", res.mAlts.empty() ? "n" : "y" }
});
// clang-format on
}
else
{
if (res.mIcode != ' ' and res.mIcode != 0)
authInsCode = std::string{ res.mIcode } + "A";
// clang-format off
cat->emplace({
{ "asym_id", asymID },
{ "entity_id", mMolID2EntityID[chain.mMolID] },
{ "seq_id", seqID },
{ "mon_id", monID },
{ "ndb_seq_num", seqID },
{ "pdb_seq_num", res.mSeqNum },
{ "auth_seq_num", "." },
{ "pdb_mon_id", "." },
{ "auth_mon_id", "." },
{ "pdb_strand_id", std::string{ chain.mDbref.chainID } },
{ "pdb_ins_code", authInsCode },
{ "hetero", res.mAlts.empty() ? "n" : "y" }
});
// clang-format on
}
}
}
}
// We have now created all compounds, write them out
uint32_t structRefID = 0, structRefSeqAlignID = 0;
for (auto &cmp : mCompounds)
{
++structRefID;
std::string srcMethod;
if (not cmp.mSource["SYNTHETIC"].empty())
{
srcMethod = "syn";
// clang-format off
getCategory("pdbx_entity_src_syn")->emplace({
{ "entity_id", mMolID2EntityID[cmp.mMolID] },
{ "pdbx_src_id", structRefID },
{ "organism_scientific", cmp.mSource["ORGANISM_SCIENTIFIC"] },
{ "ncbi_taxonomy_id", cmp.mSource["ORGANISM_TAXID"] },
});
// clang-format on
}
else if (cmp.mInfo["ENGINEERED"] == "YES" or
not cmp.mSource["EXPRESSION_SYSTEM"].empty())
{
srcMethod = "man";
// clang-format off
getCategory("entity_src_gen")->emplace({
{ "entity_id", mMolID2EntityID[cmp.mMolID] },
{ "pdbx_src_id", structRefID },
{ "gene_src_common_name", cmp.mSource["ORGANISM_COMMON"] },
{ "pdbx_gene_src_gene", cmp.mSource["GENE"] },
{ "gene_src_strain", cmp.mSource["STRAIN"] },
{ "gene_src_tissue", cmp.mSource["TISSUE"] },
{ "gene_src_tissue_fraction", cmp.mSource["TISSUE_FRACTION"] },
{ "pdbx_gene_src_cell_line", cmp.mSource["CELL_LINE"] },
{ "pdbx_gene_src_organelle", cmp.mSource["ORGANELLE"] },
{ "pdbx_gene_src_cell", cmp.mSource["CELL"] },
{ "pdbx_gene_src_cellular_location", cmp.mSource["CELLULAR_LOCATION"] },
{ "host_org_common_name", cmp.mSource["EXPRESSION_SYSTEM_COMMON"] },
{ "pdbx_gene_src_scientific_name", cmp.mSource["ORGANISM_SCIENTIFIC"] },
{ "pdbx_gene_src_ncbi_taxonomy_id", cmp.mSource["ORGANISM_TAXID"] },
{ "pdbx_host_org_scientific_name", cmp.mSource["EXPRESSION_SYSTEM"] },
{ "pdbx_host_org_ncbi_taxonomy_id", cmp.mSource["EXPRESSION_SYSTEM_TAXID"] },
{ "pdbx_host_org_strain", cmp.mSource["EXPRESSION_SYSTEM_STRAIN"] },
{ "pdbx_host_org_variant", cmp.mSource["EXPRESSION_SYSTEM_VARIANT"] },
{ "pdbx_host_org_cell_line", cmp.mSource["EXPRESSION_SYSTEM_CELL_LINE"] },
{ "pdbx_host_org_cellular_location", cmp.mSource["EXPRESSION_SYSTEM_CELLULAR_LOCATION"] },
{ "pdbx_host_org_vector_type", cmp.mSource["EXPRESSION_SYSTEM_VECTOR_TYPE"] },
{ "pdbx_host_org_vector", cmp.mSource["EXPRESSION_SYSTEM_VECTOR"] },
{ "pdbx_host_org_gene", cmp.mSource["EXPRESSION_SYSTEM_GENE"] },
{ "plasmid_name", cmp.mSource["EXPRESSION_SYSTEM_PLASMID"] },
{ "pdbx_description", cmp.mSource["OTHER_DETAILS"] }
});
// clang-format on
}
else if (not cmp.mSource["ORGANISM_SCIENTIFIC"].empty())
{
srcMethod = "nat";
// clang-format off
getCategory("entity_src_nat")->emplace({
{ "entity_id", mMolID2EntityID[cmp.mMolID] },
{ "pdbx_src_id", structRefID },
{ "common_name", cmp.mSource["ORGANISM_COMMON"] },
{ "strain", cmp.mSource["STRAIN"] },
{ "pdbx_secretion", cmp.mSource["SECRETION"] },
{ "pdbx_organism_scientific", cmp.mSource["ORGANISM_SCIENTIFIC"] },
{ "pdbx_ncbi_taxonomy_id", cmp.mSource["ORGANISM_TAXID"] },
{ "pdbx_cellular_location", cmp.mSource["CELLULAR_LOCATION"] },
{ "pdbx_plasmid_name", cmp.mSource["PLASMID"] },
{ "pdbx_organ", cmp.mSource["ORGAN"] },
});
// clang-format on
}
// clang-format off
getCategory("entity")->emplace({
{ "id", mMolID2EntityID[cmp.mMolID] },
{ "type", "polymer" },
{ "src_method", srcMethod },
{ "pdbx_description", cmp.mInfo["MOLECULE"] },
// { "pdbx_formula_weight", },
{ "pdbx_number_of_molecules", cmp.mChains.size() },
{ "details", cmp.mInfo["OTHER_DETAILS"] },
{ "pdbx_mutation", cmp.mInfo["MUTATION"] },
{ "pdbx_fragment", cmp.mInfo["FRAGMENT"] },
{ "pdbx_ec", cmp.mInfo["EC"] }
});
// clang-format on
if (not cmp.mInfo["SYNONYM"].empty())
{
// clang-format off
getCategory("entity_name_com")->emplace({
{ "entity_id", mMolID2EntityID[cmp.mMolID] },
{ "name", cmp.mInfo["SYNONYM"] }
});
// clang-format on
}
std::string desc = cmp.mInfo["MOLECULE"];
if (not cmp.mInfo["EC"].empty())
desc += " (E.C." + cmp.mInfo["EC"] + ")";
if (not cmp.mTitle.empty())
structTitle.insert(cmp.mTitle);
if (not desc.empty())
structDescription.insert(desc);
auto ci = find_if(mChains.begin(), mChains.end(),
[cmp](PDBChain &c) -> bool
{ return cmp.mChains.count(c.mDbref.chainID); });
if (ci != mChains.end() and not ci->mDbref.dbIdCode.empty())
{
// clang-format off
getCategory("struct_ref")->emplace({
{ "id", structRefID },
{ "entity_id", mMolID2EntityID[cmp.mMolID] },
{ "db_name", ci->mDbref.database },
{ "db_code", ci->mDbref.dbIdCode },
{ "pdbx_db_accession", ci->mDbref.dbAccession },
// { "pdbx_align_begin", ci->mDbref.dbSeqBegin }
});
// clang-format on
}
bool nstdMonomer = false, nonstandardLinkage = false;
bool mightBePolyPeptide = true, mightBeDNA = true;
std::vector<std::string> chains;
std::string seq, seqCan;
// write out the chains for this compound
for (auto &chain : mChains)
{
if (chain.mMolID != cmp.mMolID)
continue;
// chain.mEntityID = cmp.mEntityID;
++structRefSeqAlignID;
DBREF &dbref = chain.mDbref;
if (not dbref.database.empty())
{
auto insToStr = [](char i) -> std::string
{
return i == ' ' or not isprint(i) ? "" : std::string{ i };
};
auto &pdbxPolySeqScheme = *getCategory("pdbx_poly_seq_scheme");
int seqAlignBeg = 0, seqAlignEnd = 0;
try
{
seqAlignBeg = pdbxPolySeqScheme.find1<int>(key("pdb_strand_id") == std::string{ dbref.chainID } and
key("pdb_seq_num") == dbref.seqBegin and
(key("pdb_ins_code") == insToStr(dbref.insertBegin) or key("pdb_ins_code") == cif::null),
"seq_id");
seqAlignEnd = pdbxPolySeqScheme.find1<int>(key("pdb_strand_id") == std::string{ dbref.chainID } and
key("pdb_seq_num") == dbref.seqEnd and
(key("pdb_ins_code") == insToStr(dbref.insertEnd) or key("pdb_ins_code") == cif::null),
"seq_id");
}
catch (...)
{
}
// clang-format off
getCategory("struct_ref_seq")->emplace({
{ "align_id", structRefSeqAlignID },
{ "ref_id", structRefID },
{ "pdbx_PDB_id_code", dbref.PDBIDCode },
{ "pdbx_strand_id", std::string{ chain.mDbref.chainID } },
{ "seq_align_beg", seqAlignBeg },
{ "pdbx_seq_align_beg_ins_code", insToStr(dbref.insertBegin) },
{ "seq_align_end", seqAlignEnd },
{ "pdbx_seq_align_end_ins_code", insToStr(dbref.insertEnd) },
{ "pdbx_db_accession", dbref.dbAccession },
{ "db_align_beg", dbref.dbSeqBegin },
{ "pdbx_db_align_beg_ins_code", insToStr(dbref.dbinsBeg) },
{ "db_align_end", dbref.dbSeqEnd },
{ "pdbx_db_align_end_ins_code", insToStr(dbref.dbinsEnd) },
{ "pdbx_auth_seq_align_beg", dbref.seqBegin },
{ "pdbx_auth_seq_align_end", dbref.seqEnd }
});
// clang-format on
// write the struct_ref_seq_dif
for (auto &seqadv : mSeqadvs)
{
if (seqadv.chainID != chain.mDbref.chainID or seqadv.resName.empty())
continue;
std::string asym, seqNum;
int labelSeq = -1;
std::error_code ec;
std::tie(asym, labelSeq, std::ignore) = MapResidue(seqadv.chainID, seqadv.seqNum, seqadv.iCode, ec);
if (ec)
{
if (cif::VERBOSE > 0)
std::cerr << "dropping unmatched SEQADV record\n";
continue;
}
seqNum = std::to_string(labelSeq);
// clang-format off
getCategory("struct_ref_seq_dif")->emplace({
{ "align_id", structRefSeqAlignID },
{ "pdbx_PDB_id_code", dbref.PDBIDCode },
{ "mon_id", seqadv.resName },
{ "pdbx_pdb_strand_id", seqadv.chainID },
{ "seq_num", seqNum },
{ "pdbx_pdb_ins_code", seqadv.iCode == ' ' ? std::string{} : std::string{ seqadv.iCode } },
{ "pdbx_seq_db_name", seqadv.database },
{ "pdbx_seq_db_accession_code", seqadv.dbAccession },
{ "db_mon_id", seqadv.dbRes },
{ "pdbx_seq_db_seq_num", seqadv.dbSeq },
{ "details", seqadv.conflict },
{ "pdbx_auth_seq_num", seqadv.seqNum },
{ "pdbx_ordinal", ++mPdbxDifOrdinal }
});
// clang-format on
}
}
if (not chains.empty()) // not the first one for this molID
{
chains.push_back(std::string{ chain.mDbref.chainID });
continue;
}
chains.push_back(std::string{ chain.mDbref.chainID });
std::size_t seqLen = 0, seqCanLen = 0;
for (auto &res : chain.mSeqres)
{
std::string letter, stdRes;
if (mMod2parent.count(res.mMonID))
stdRes = mMod2parent.at(res.mMonID);
if (cif::compound_factory::kAAMap.count(res.mMonID))
{
letter = cif::compound_factory::kAAMap.at(res.mMonID);
mightBeDNA = false;
}
else if (cif::compound_factory::kBaseMap.count(res.mMonID))
{
letter = cif::compound_factory::kBaseMap.at(res.mMonID);
mightBePolyPeptide = false;
}
else
{
nstdMonomer = true;
letter = '(' + res.mMonID + ')';
// sja...
auto compound = cif::compound_factory::instance().create(stdRes.empty() ? res.mMonID : stdRes);
if (compound != nullptr and
not iequals(compound->type(), "L-peptide linking") and
not iequals(compound->type(), "RNA linking"))
{
nonstandardLinkage = true;
}
}
if (seqLen + letter.length() > 80)
{
seq += '\n';
seqLen = 0;
}
seq += letter;
seqLen += letter.length();
if (letter.length() > 1)
{
if (not stdRes.empty() and cif::compound_factory::kAAMap.count(stdRes))
letter = cif::compound_factory::kAAMap.at(stdRes);
else if (cif::compound_factory::kBaseMap.count(res.mMonID))
letter = cif::compound_factory::kBaseMap.at(res.mMonID);
else
letter = 'X';
}
if (seqCanLen + letter.length() > 80)
{
seqCan += '\n';
seqCanLen = 0;
}
seqCan += letter;
seqCanLen += letter.length();
}
auto cat_ps = getCategory("entity_poly_seq");
for (std::size_t i = 0; i < chain.mSeqres.size(); ++i)
{
auto &rs = chain.mSeqres[i];
if (std::find(mChemComp.begin(), mChemComp.end(), rs.mMonID) == mChemComp.end())
mChemComp.emplace_back(rs.mMonID);
// clang-format off
cat_ps->emplace({
{ "entity_id", mMolID2EntityID[cmp.mMolID] },
{ "num", i + 1 },
{ "mon_id", rs.mMonID },
{ "hetero", rs.mAlts.empty() ? "n" : "y" }
});
// clang-format on
for (auto &a : rs.mAlts)
{
// clang-format off
cat_ps->emplace({
{ "entity_id", mMolID2EntityID[cmp.mMolID] },
{ "num", i + 1 },
{ "mon_id", a },
{ "hetero", "y" }
});
// clang-format on
}
}
}
std::string type;
if (mightBePolyPeptide and not mightBeDNA)
type = "polypeptide(L)";
else if (mightBeDNA and not mightBePolyPeptide)
type = "polyribonucleotide";
else
type = "other";
// clang-format off
getCategory("entity_poly")->emplace({
{ "entity_id", mMolID2EntityID[cmp.mMolID] },
{ "pdbx_seq_one_letter_code", seq },
{ "pdbx_seq_one_letter_code_can", seqCan },
{ "nstd_monomer", (nstdMonomer ? "yes" : "no") },
{ "pdbx_strand_id", cif::join(chains, ",") },
{ "nstd_linkage", nonstandardLinkage ? "yes" : "no" },
{ "type", type }
});
// clang-format on
}
if (not(structTitle.empty() and structDescription.empty()))
{
// clang-format off
getCategory("struct")->emplace({
{ "entry_id", mStructureID },
{ "title", cif::join(structTitle, ", ") },
{ "pdbx_descriptor", cif::join(structDescription, ", ") },
{ "pdbx_model_type_details", mModelTypeDetails }
});
// clang-format on
}
// build sugar trees first
// ConstructSugarTrees(asymNr);
// done with the sugar, resume operation as before
std::map<char, std::string> waterChains;
std::map<std::tuple<std::string, std::string>, int> ndbSeqNum; // for nonpoly scheme
std::map<std::string, int> entityAuthSeqNum; // for nonpoly scheme too
for (std::size_t i = 0; i < mHets.size(); ++i)
{
auto &heti = mHets[i];
if (not heti.asymID.empty())
continue;
if (heti.hetID == mWaterHetID or isWater(heti.hetID))
continue;
// See if this residue is part of SEQRES
auto &chain = GetChainForID(heti.chainID);
auto ih = find(chain.mSeqres.begin(), chain.mSeqres.end(), PDBSeqRes{ heti.hetID, heti.seqNum, heti.iCode });
// If so, skip it, it is not an entity then
if (ih != chain.mSeqres.end())
continue;
heti.asymID = cif::cif_id_for_number(asymNr++);
}
std::set<std::string> writtenAsyms;
std::map<std::string, int> hetCount; // for pdbx_number_of_molecules
for (auto &het : mHets)
hetCount[het.hetID] += 1;
for (auto &het : mHets)
{
std::string hetID = het.hetID;
auto &chain = GetChainForID(het.chainID);
// See if this residue is part of SEQRES
auto i = find(chain.mSeqres.begin(), chain.mSeqres.end(), PDBSeqRes{ hetID, het.seqNum, het.iCode });
// If so, skip it, it is not an entity then
if (i != chain.mSeqres.end())
continue;
// See if we've already added it to the entities
if (mHet2EntityID.count(hetID) == 0)
{
std::string entityID = std::to_string(mNextEntityNr++);
mHet2EntityID[hetID] = entityID;
if (hetID == mWaterHetID)
{
// clang-format off
getCategory("entity")->emplace({
{ "id", entityID },
{ "type", "water" },
{ "src_method", "nat" },
{ "pdbx_description", "water" },
{ "pdbx_number_of_molecules", hetCount[hetID] }
});
// clang-format on
}
else
{
if (mHetnams[hetID].empty())
{
auto compound = cif::compound_factory::instance().create(hetID);
if (compound != nullptr)
mHetnams[hetID] = compound->name();
}
// clang-format off
getCategory("entity")->emplace({
{ "id", entityID },
{ "type", "non-polymer" },
{ "src_method", "syn" },
{ "pdbx_description", mHetnams[hetID] },
{ "details", mHetsyns[hetID] },
{ "pdbx_number_of_molecules", hetCount[hetID] }
});
// clang-format on
}
// write a pdbx_entity_nonpoly record
std::string name = mHetnams[hetID];
if (name.empty() and hetID == mWaterHetID)
name = "water";
// clang-format off
getCategory("pdbx_entity_nonpoly")->emplace({
{ "entity_id", entityID },
{ "name", name },
{ "comp_id", hetID }
});
// clang-format on
}
// create an asym for this het/chain combo, if needed
std::string asymID = het.asymID;
auto k = std::make_tuple(het.chainID, het.seqNum, het.iCode);
if (mChainSeq2AsymSeq.count(k) == 0)
{
if (hetID == mWaterHetID or isWater(hetID))
{
if (waterChains.count(het.chainID) == 0)
{
asymID = cif::cif_id_for_number(asymNr++);
waterChains[het.chainID] = asymID;
}
else
asymID = waterChains[het.chainID];
}
else
asymID = het.asymID;
assert(asymID.empty() == false);
mAsymID2EntityID[asymID] = mHet2EntityID[hetID];
// NOTE, a nonpoly residue has no label_seq_id
// but in pdbx_nonpoly_scheme there is such a number.
// Since this number is not used anywhere else we
// just use it here and do not store it in the table
mChainSeq2AsymSeq[k] = std::make_tuple(asymID, 0, false);
if (writtenAsyms.count(asymID) == 0)
{
writtenAsyms.insert(asymID);
// clang-format off
getCategory("struct_asym")->emplace({
{ "id", asymID },
{ "pdbx_blank_PDB_chainid_flag", het.chainID == ' ' ? "Y" : "N" },
// pdbx_modified
{ "entity_id", mHet2EntityID[hetID] },
// details
});
// clang-format on
}
}
int seqNr = ++ndbSeqNum[std::make_tuple(hetID, asymID)];
int authSeqNr = ++entityAuthSeqNum[hetID];
std::string iCode{ het.iCode };
cif::trim(iCode);
if (iCode.empty())
iCode = { '.' };
// clang-format off
getCategory("pdbx_nonpoly_scheme")->emplace({
{ "asym_id", asymID },
{ "entity_id", mHet2EntityID[hetID] },
{ "mon_id", hetID },
{ "ndb_seq_num", seqNr },
{ "pdb_seq_num", het.seqNum },
{ "auth_seq_num", authSeqNr }, // Yes
{ "pdb_mon_id", hetID },
{ "auth_mon_id", hetID },
{ "pdb_strand_id", std::string{ het.chainID } },
{ "pdb_ins_code", iCode }
});
// clang-format on
// mapping needed?
mChainSeq2AsymSeq[std::make_tuple(het.chainID, het.seqNum, het.iCode)] = std::make_tuple(asymID, seqNr, false);
}
int modResID = 1;
std::set<std::string> modResSet;
for (auto rec = FindRecord("MODRES"); rec != nullptr and rec->is("MODRES");
rec = rec->mNext) // 1 - 6 Record name "MODRES"
{ // 8 - 11 IDcode idCode ID code of this datablock.
std::string resName = rec->vS(13, 15); // 13 - 15 Residue name resName Residue name used in this datablock.
char chainID = rec->vC(17); // 17 Character chainID Chain identifier.
int seqNum = rec->vI(19, 22); // 19 - 22 Integer seqNum Sequence number.
char iCode = rec->vC(23); // 23 AChar iCode Insertion code.
std::string stdRes = rec->vS(25, 27); // 25 - 27 Residue name stdRes Standard residue name.
std::string comment = rec->vS(30, 70); // 30 - 70 String comment Description of the residue modification.
std::string asymID;
int seq;
std::error_code ec;
std::tie(asymID, seq, std::ignore) = MapResidue(chainID, seqNum, iCode, ec);
if (ec) // no need to write a modres if it could not be found
{
if (cif::VERBOSE > 0)
std::cerr << "dropping unmapped MODRES record\n";
continue;
}
// clang-format off
getCategory("pdbx_struct_mod_residue")->emplace({
{ "id", modResID++ },
{ "label_asym_id", asymID },
{ "label_seq_id", seq },
{ "label_comp_id", resName },
{ "auth_asym_id", std::string(1, chainID) },
{ "auth_seq_id", seqNum },
{ "auth_comp_id", resName },
{ "PDB_ins_code", iCode == ' ' ? "" : std::string{ iCode } },
{ "parent_comp_id", stdRes },
{ "details", comment }
});
// clang-format on
modResSet.insert(resName);
}
// // chem compounds
for (auto cc : mChemComp)
{
auto compound = cif::compound_factory::instance().create(
mMod2parent.count(cc) ? mMod2parent[cc] : cc);
std::string name;
std::string formula;
std::string type;
std::string nstd = ".";
std::optional<float> formulaWeight;
if (compound != nullptr)
{
name = compound->name();
type = compound->type();
if (iequals(type, "L-peptide linking") or iequals(type, "peptide linking"))
nstd = "y";
formula = compound->formula();
formulaWeight = compound->formula_weight();
}
if (name.empty())
name = mHetnams[cc];
if (type.empty())
type = "NON-POLYMER";
if (formula.empty())
{
formula = mFormuls[cc];
const std::regex rx(R"(\d+\((.+)\))");
std::smatch m;
if (std::regex_match(formula, m, rx))
formula = m[1].str();
}
if (modResSet.count(cc))
nstd = "n";
// clang-format off
getCategory("chem_comp")->emplace({
{ "id", cc },
{ "name", name },
{ "formula", formula },
{ "formula_weight", formulaWeight, 3 },
{ "mon_nstd_flag", nstd },
{ "type", type }
});
// clang-format on
}
getCategory("chem_comp")->reorder_by_index();
// unobserved can now be written as well
int idRes = 0, idAtom = 0;
sort(mUnobs.begin(), mUnobs.end(), [](const UNOBS &a, const UNOBS &b) -> bool
{
int d = a.modelNr - b.modelNr;
if (d == 0)
d = a.seq - b.seq;
return d < 0; });
for (auto &unobs : mUnobs)
{
bool isPolymer = false;
std::string asymID, compID = unobs.res;
int seqNr = 0;
std::error_code ec;
std::tie(asymID, seqNr, isPolymer) = MapResidue(unobs.chain, unobs.seq, unobs.iCode, ec);
if (ec)
{
if (cif::VERBOSE > 0)
std::cerr << "error mapping unobserved residue\n";
continue;
}
if (unobs.atoms.empty())
{
// clang-format off
getCategory("pdbx_unobs_or_zero_occ_residues")->emplace({
{ "id", std::to_string(++idRes) },
{ "polymer_flag", isPolymer ? "Y" : "N" },
{ "occupancy_flag", 1 },
{ "PDB_model_num", unobs.modelNr ? unobs.modelNr : 1 },
{ "auth_asym_id", std::string{ unobs.chain } },
{ "auth_comp_id", unobs.res },
{ "auth_seq_id", unobs.seq },
{ "PDB_ins_code", unobs.iCode == ' ' ? "" : std::string{ unobs.iCode } },
{ "label_asym_id", asymID },
{ "label_comp_id", compID }, // TODO: change to correct comp_id
{ "label_seq_id", seqNr > 0 ? std::to_string(seqNr) : "" }
});
// clang-format on
}
else
{
for (auto &atom : unobs.atoms)
{
// clang-format off
getCategory("pdbx_unobs_or_zero_occ_atoms")->emplace({
{ "id", std::to_string(++idAtom) },
{ "polymer_flag", isPolymer ? "Y" : "N" },
{ "occupancy_flag", 1 },
{ "PDB_model_num", unobs.modelNr ? unobs.modelNr : 1 },
{ "auth_asym_id", std::string{ unobs.chain } },
{ "auth_comp_id", unobs.res },
{ "auth_seq_id", unobs.seq },
{ "PDB_ins_code", unobs.iCode == ' ' ? "" : std::string{ unobs.iCode } },
{ "auth_atom_id", atom },
{ "label_asym_id", asymID },
{ "label_comp_id", compID }, // TODO: change to correct comp_id
{ "label_seq_id", seqNr > 0 ? std::to_string(seqNr) : "" },
{ "label_atom_id", atom }
});
// clang-format on
}
}
}
}
void PDBFileParser::ConstructSugarTrees(int &asymNr)
{
for (;;)
{
// find a first NAG/NDG
auto si = std::find_if(mHets.begin(), mHets.end(), [](const HET &h)
{ return (h.hetID == "NAG" or h.hetID == "NDG") and not(h.processed or h.branch); });
if (si != mHets.end())
{
si->processed = true;
// take the location of the C1 atom(s?)
std::set<char> ci;
for (auto a : si->atoms)
{
std::string name = a->vS(13, 16); // 13 - 16 Atom name Atom name.
if (name != "C1")
continue;
ci.insert(a->vC(17)); // 17 Character altLoc Alternate location indicator.
}
if (ci.empty())
continue;
for (auto alt : ci)
{
ATOM_REF c1{ "C1", si->hetID, si->seqNum, si->chainID, si->iCode, alt };
const auto &[asn, linked] = FindLink(c1, "ND2", "ASN");
if (not linked)
continue;
std::stack<ATOM_REF> c1s;
c1s.push(c1);
SUGAR_TREE sugarTree;
sugarTree.push_back({ c1 });
// naive implementation
while (not c1s.empty())
{
c1 = c1s.top();
c1s.pop();
for (auto o : { "O1", "O2", "O3", "O4", "O5", "O6" })
{
ATOM_REF leaving = c1;
leaving.name = o;
const auto &[nc1, linked_c1] = FindLink(leaving, "C1");
if (linked_c1)
{
sugarTree.push_back({ nc1, o[1] - '0', c1 });
c1s.push(nc1);
}
}
}
if (sugarTree.size() < 2) // not really a tree
continue;
auto branchName = sugarTree.entityName();
auto entityID = mBranch2EntityID[branchName];
// See if we've already added it to the entities
if (entityID.empty())
{
entityID = std::to_string(mNextEntityNr++);
mBranch2EntityID[branchName] = entityID;
// clang-format off
getCategory("entity")->emplace({
{ "id", entityID },
{ "type", "branched" },
{ "src_method", "man" },
{ "pdbx_description", branchName }
});
getCategory("pdbx_entity_branch")->emplace({
{ "entity_id", entityID },
{ "type", "oligosaccharide" }
});
// clang-format on
int num = 0;
std::map<ATOM_REF, int> branch_list;
for (auto &s : sugarTree)
{
// clang-format off
getCategory("pdbx_entity_branch_list")->emplace({
{ "entity_id", entityID },
{ "comp_id", s.c1.resName },
{ "num", ++num },
{ "hetero", ci.size() == 1 ? "n" : "y" }
});
// clang-format on
branch_list[s.c1] = num;
}
auto &branch_link = *getCategory("pdbx_entity_branch_link");
for (auto &s : sugarTree)
{
if (s.leaving_o == 0)
continue;
// clang-format off
branch_link.emplace({
{ "link_id", branch_link.size() + 1 },
{ "entity_id", entityID },
{ "entity_branch_list_num_1", branch_list[s.c1] },
{ "comp_id_1", s.c1.resName },
{ "atom_id_1", s.c1.name },
{ "leaving_atom_id_1", "O1" },
{ "entity_branch_list_num_2", branch_list[s.next] },
{ "comp_id_2", s.next.resName },
{ "atom_id_2", "O" + std::to_string(s.leaving_o) },
{ "leaving_atom_id_2", "HO" + std::to_string(s.leaving_o) },
{ "value_order", "sing" } /// ??
});
// clang-format on
}
}
mSugarEntities.insert(entityID);
// create an asym for this sugar tree
std::string asymID = cif::cif_id_for_number(asymNr++);
mAsymID2EntityID[asymID] = entityID;
// clang-format off
getCategory("struct_asym")->emplace({
{ "id", asymID },
{ "pdbx_blank_PDB_chainid_flag", si->chainID == ' ' ? "Y" : "N" },
{ "pdbx_modified", "N" },
{ "entity_id", entityID }
});
// clang-format on
std::string iCode{ si->iCode };
cif::trim(iCode);
if (iCode.empty())
iCode = { '.' };
int num = 0;
for (auto s : sugarTree)
{
// clang-format off
getCategory("pdbx_branch_scheme")->emplace({
{ "asym_id", asymID },
{ "entity_id", entityID },
{ "mon_id", s.c1.resName },
{ "num", ++num },
{ "pdb_asym_id", asymID },
{ "pdb_mon_id", s.c1.resName },
{ "pdb_seq_num", num },
{ "auth_asym_id", std::string{ s.c1.chainID } },
{ "auth_mon_id", s.next.resName },
{ "auth_seq_num", s.c1.resSeq },
{ "hetero", ci.size() == 1 ? "n" : "y" }
});
// clang-format on
auto k = std::make_tuple(s.c1.chainID, s.c1.resSeq, s.c1.iCode);
assert(mChainSeq2AsymSeq.count(k) == 0);
mChainSeq2AsymSeq[k] = std::make_tuple(asymID, num, false);
// mark all hets as part of tree
for (auto &h : mHets)
{
if (h.hetID == s.c1.resName and h.chainID == s.c1.chainID and h.seqNum == s.c1.resSeq and h.iCode == s.c1.iCode)
{
h.branch = true;
break; // should be only one of course... right?
}
}
}
break;
}
continue;
}
break;
}
// remove the branched HET's
mHets.erase(std::remove_if(mHets.begin(), mHets.end(), [](auto &h)
{ return h.branch; }),
mHets.end());
}
void PDBFileParser::ParseSecondaryStructure()
{
bool firstHelix = true;
while (mRec->is("HELIX "))
{
// 1 - 6 Record name "HELIX "
// 8 - 10 Integer serNum Serial number of the helix. This starts
// at 1 and increases incrementally.
// 12 - 14 LString(3) helixID Helix identifier. In addition to a serial
// number, each helix is given an
// alphanumeric character helix identifier.
// 16 - 18 Residue name initResName Name of the initial residue.
// 20 Character initChainID Chain identifier for the chain containing
// this helix.
// 22 - 25 Integer initSeqNum Sequence number of the initial residue.
// 26 AChar initICode Insertion code of the initial residue.
// 28 - 30 Residue name endResName Name of the terminal residue of the helix.
// 32 Character endChainID Chain identifier for the chain containing
// this helix.
// 34 - 37 Integer endSeqNum Sequence number of the terminal residue.
// 38 AChar endICode Insertion code of the terminal residue.
// 39 - 40 Integer helixClass Helix class (see below).
// 41 - 70 String comment Comment about this helix.
// 72 - 76 Integer length Length of this helix.
std::string begAsymID, endAsymID;
int begSeq, endSeq;
std::error_code ec;
std::tie(begAsymID, begSeq, std::ignore) = MapResidue(vC(20), vI(22, 25), vC(26), ec);
if (not ec)
std::tie(endAsymID, endSeq, std::ignore) = MapResidue(vC(32), vI(34, 37), vC(38), ec);
if (ec)
{
if (cif::VERBOSE > 0)
std::cerr << "Could not map residue for HELIX " << vI(8, 10) << '\n';
}
else
{
auto cat = getCategory("struct_conf");
// clang-format off
cat->emplace({
{ "conf_type_id", "HELX_P" },
{ "id", "HELX_P" + std::to_string(vI(8, 10)) },
{ "pdbx_PDB_helix_id", vS(12, 14) },
{ "beg_label_comp_id", vS(16, 18) },
{ "beg_label_asym_id", begAsymID },
{ "beg_label_seq_id", begSeq },
{ "pdbx_beg_PDB_ins_code", vS(26, 26) },
{ "end_label_comp_id", vS(28, 30) },
{ "end_label_asym_id", endAsymID },
{ "end_label_seq_id", endSeq },
{ "pdbx_end_PDB_ins_code", vS(38, 38) },
{ "beg_auth_comp_id", vS(16, 18) },
{ "beg_auth_asym_id", vS(20, 20) },
{ "beg_auth_seq_id", vI(22, 25) },
{ "end_auth_comp_id", vS(28, 30) },
{ "end_auth_asym_id", vS(32, 32) },
{ "end_auth_seq_id", vI(34, 37) },
{ "pdbx_PDB_helix_class", vS(39, 40) },
{ "details", vS(41, 70) },
{ "pdbx_PDB_helix_length", vI(72, 76) }
});
// clang-format off
if (firstHelix)
{
cat = getCategory("struct_conf_type");
cat->emplace({ { "id", "HELX_P" } });
firstHelix = false;
}
}
GetNextRecord();
}
std::set<std::string> sheetsSeen;
int rangeID = 1;
while (mRec->is("SHEET "))
{
// 1 - 6 Record name "SHEET "
// 8 - 10 Integer strand Strand number which starts at 1 for each
// strand within a sheet and increases by one.
// 12 - 14 LString(3) sheetID Sheet identifier.
// 15 - 16 Integer numStrands Number of strands in sheet.
// 18 - 20 Residue name initResName Residue name of initial residue.
// 22 Character initChainID Chain identifier of initial residue
// in strand.
// 23 - 26 Integer initSeqNum Sequence number of initial residue
// in strand.
// 27 AChar initICode Insertion code of initial residue
// in strand.
// 29 - 31 Residue name endResName Residue name of terminal residue.
// 33 Character endChainID Chain identifier of terminal residue.
// 34 - 37 Integer endSeqNum Sequence number of terminal residue.
// 38 AChar endICode Insertion code of terminal residue.
// 39 - 40 Integer sense Sense of strand with respect to previous
// strand in the sheet. 0 if first strand,
// 1 if parallel,and -1 if anti-parallel.
// 42 - 45 Atom curAtom Registration. Atom name in current strand.
// 46 - 48 Residue name curResName Registration. Residue name in current strand
// 50 Character curChainID Registration. Chain identifier in
// current strand.
// 51 - 54 Integer curResSeq Registration. Residue sequence number
// in current strand.
// 55 AChar curICode Registration. Insertion code in
// current strand.
// 57 - 60 Atom prevAtom Registration. Atom name in previous strand.
// 61 - 63 Residue name prevResName Registration. Residue name in
// previous strand.
// 65 Character prevChainID Registration. Chain identifier in
// previous strand.
// 66 - 69 Integer prevResSeq Registration. Residue sequence number
// in previous strand.
// 70 AChar prevICode Registration. Insertion code in
// previous strand.
std::string sheetID = cif::trim_copy(vS(12, 14));
if (sheetsSeen.count(sheetID) == 0)
{
sheetsSeen.insert(sheetID);
rangeID = 1;
getCategory("struct_sheet")->emplace({
{ "id", sheetID },
{ "number_strands", vI(15, 16) },
});
}
int sense = vI(39, 40);
if (sense != 0)
{
// clang-format off
getCategory("struct_sheet_order")->emplace({
{ "sheet_id", sheetID },
{ "range_id_1", rangeID },
{ "range_id_2", rangeID + 1 },
{ "sense", sense == -1 ? "anti-parallel" : "parallel" }
});
// clang-format on
}
std::string begAsymID, endAsymID;
int begSeq, endSeq;
std::error_code ec;
std::tie(begAsymID, begSeq, std::ignore) = MapResidue(vC(22), vI(23, 26), vC(27), ec);
if (not ec)
std::tie(endAsymID, endSeq, std::ignore) = MapResidue(vC(33), vI(34, 37), vC(38), ec);
if (ec)
{
if (cif::VERBOSE > 0)
std::cerr << "Dropping SHEET record " << vI(8, 10) << '\n';
}
else
{
// clang-format off
getCategory("struct_sheet_range")->emplace({
{ "sheet_id", sheetID },
{ "id", vI(8, 10) },
{ "beg_label_comp_id", vS(18, 20) },
{ "beg_label_asym_id", begAsymID },
{ "beg_label_seq_id", begSeq },
{ "pdbx_beg_PDB_ins_code", vS(27, 27) },
{ "end_label_comp_id", vS(29, 31) },
{ "end_label_asym_id", endAsymID },
{ "end_label_seq_id", endSeq },
{ "pdbx_end_PDB_ins_code", vS(38, 38) },
{ "beg_auth_comp_id", vS(18, 20) },
{ "beg_auth_asym_id", vS(22, 22) },
{ "beg_auth_seq_id", vI(23, 26) },
{ "end_auth_comp_id", vS(29, 31) },
{ "end_auth_asym_id", vS(33, 33) },
{ "end_auth_seq_id", vI(34, 37) },
});
// clang-format on
if (sense != 0 and mRec->mVlen > 34)
{
std::string r1AsymID, r2AsymID;
int r1Seq, r2Seq;
std::tie(r1AsymID, r1Seq, std::ignore) = MapResidue(vC(65), vI(66, 69), vC(70), ec);
if (not ec)
std::tie(r2AsymID, r2Seq, std::ignore) = MapResidue(vC(50), vI(51, 54), vC(55), ec);
if (ec)
{
if (cif::VERBOSE > 0)
std::cerr << "skipping unmatched pdbx_struct_sheet_hbond record\n";
}
else
// clang-format off
getCategory("pdbx_struct_sheet_hbond")->emplace({
{ "sheet_id", sheetID },
{ "range_id_1", rangeID },
{ "range_id_2", rangeID + 1 },
{ "range_1_label_atom_id", vS(57, 60) },
{ "range_1_label_comp_id", vS(61, 63) },
{ "range_1_label_asym_id", r1AsymID },
{ "range_1_label_seq_id", r1Seq },
{ "range_1_PDB_ins_code", vS(70, 70) },
{ "range_1_auth_atom_id", vS(57, 60) },
{ "range_1_auth_comp_id", vS(61, 63) },
{ "range_1_auth_asym_id", vS(65, 65) },
{ "range_1_auth_seq_id", vI(66, 69) },
{ "range_2_label_atom_id", vS(42, 45) },
{ "range_2_label_comp_id", vS(46, 48) },
{ "range_2_label_asym_id", r2AsymID },
{ "range_2_label_seq_id", r2Seq },
{ "range_2_PDB_ins_code", vS(55, 55) },
{ "range_2_auth_atom_id", vS(42, 45) },
{ "range_2_auth_comp_id", vS(46, 48) },
{ "range_2_auth_asym_id", vS(50, 50) },
{ "range_2_auth_seq_id", vI(51, 54) }
});
// clang-format on
}
if (sense != 0)
++rangeID;
}
GetNextRecord();
}
}
static bool IsMetal(const std::string &resName, const std::string &atomID)
{
bool result = false;
try
{
auto compound = cif::compound_factory::instance().create(resName);
if (compound != nullptr)
{
auto at = cif::atom_type_traits(compound->get_atom_by_atom_id(atomID).type_symbol);
result = at.is_metal();
}
}
catch (...)
{
}
return result;
}
void PDBFileParser::ParseConnectivtyAnnotation()
{
int ssBondNr = 0;
int linkNr = 0;
bool firstCovale = true, firstMetalc = true;
// Aaargh... Coot writes the records in the wrong order...
for (;; GetNextRecord())
{
if (mRec->is("SSBOND"))
{
if (ssBondNr == 0)
{
getCategory("struct_conn_type")->emplace({
{ "id", "disulf" },
});
}
// 1 - 6 Record name "SSBOND"
// 8 - 10 Integer serNum Serial number.
// 12 - 14 LString(3) "CYS" Residue name.
// 16 Character chainID1 Chain identifier.
// 18 - 21 Integer seqNum1 Residue sequence number.
// 22 AChar icode1 Insertion code.
// 26 - 28 LString(3) "CYS" Residue name.
// 30 Character chainID2 Chain identifier.
// 32 - 35 Integer seqNum2 Residue sequence number.
// 36 AChar icode2 Insertion code.
// 60 - 65 SymOP sym1 Symmetry operator for residue 1.
// 67 - 72 SymOP sym2 Symmetry operator for residue 2.
// 74 78 Real(5.2) Length Disulfide bond distance
std::string p1Asym, p2Asym;
int p1Seq = 0, p2Seq = 0;
std::error_code ec;
std::tie(p1Asym, p1Seq, std::ignore) = MapResidue(vC(16), vI(18, 21), vC(22), ec);
if (not ec)
std::tie(p2Asym, p2Seq, std::ignore) = MapResidue(vC(30), vI(32, 35), vC(36), ec);
if (ec)
{
if (cif::VERBOSE > 0)
std::cerr << "Dropping SSBOND " << vI(8, 10) << '\n';
continue;
}
std::vector<char> alt1 = altLocsForAtom(vC(16), vI(18, 21), vC(22), "SG");
std::vector<char> alt2 = altLocsForAtom(vC(30), vI(32, 35), vC(36), "SG");
if (alt1.empty())
alt1.push_back(0);
if (alt2.empty())
alt2.push_back(0);
std::string sym1, sym2;
try
{
sym1 = pdb2cifSymmetry(vS(60, 65));
sym2 = pdb2cifSymmetry(vS(67, 72));
}
catch (const std::exception &ex)
{
if (cif::VERBOSE > 0)
std::cerr << "Dropping SSBOND " << vI(8, 10) << " due to invalid symmetry operation\n";
continue;
}
for (auto a1 : alt1)
{
for (auto a2 : alt2)
{
// clang-format off
getCategory("struct_conn")->emplace({
{ "id", "disulf" + std::to_string(++ssBondNr) },
{ "conn_type_id", "disulf" },
{ "ptnr1_label_asym_id", p1Asym },
{ "pdbx_ptnr1_label_alt_id", a1 ? std::string{ a1 } : std::string() },
{ "ptnr1_label_comp_id", vS(12, 14) },
{ "ptnr1_label_seq_id", p1Seq ? std::to_string(p1Seq) : "." },
{ "ptnr1_label_atom_id", "SG" },
{ "ptnr1_symmetry", sym1 },
{ "ptnr2_label_asym_id", p2Asym },
{ "pdbx_ptnr2_label_alt_id", a2 ? std::string{ a2 } : std::string() },
{ "ptnr2_label_comp_id", vS(26, 28) },
{ "ptnr2_label_seq_id", p2Seq ? std::to_string(p2Seq) : "." },
{ "ptnr2_label_atom_id", "SG" },
{ "ptnr1_auth_asym_id", vS(16, 16) },
{ "ptnr1_auth_comp_id", vS(12, 14) },
{ "ptnr1_auth_seq_id", vI(18, 21) },
{ "ptnr2_auth_asym_id", vS(30, 30) },
{ "ptnr2_auth_comp_id", vS(26, 28) },
{ "ptnr2_auth_seq_id", vI(32, 35) },
{ "ptnr2_symmetry", sym2 },
{ "pdbx_dist_value", vS(74, 78) },
});
// clang-format on
}
}
continue;
}
if (mRec->is("LINK ") or mRec->is("LINKR "))
{
if (cif::VERBOSE > 0 and mRec->is("LINKR "))
std::cerr << "Accepting non-standard LINKR record, but ignoring extra information\n";
// 1 - 6 Record name "LINK "
std::string name1 = vS(13, 16); // 13 - 16 Atom name1 Atom name.
// 17 Character altLoc1 Alternate location indicator.
std::string resName1 = vS(18, 20); // 18 - 20 Residue name resName1 Residue name.
// 22 Character chainID1 Chain identifier.
// 23 - 26 Integer resSeq1 Residue sequence number.
// 27 AChar iCode1 Insertion code.
std::string name2 = vS(43, 46); // 43 - 46 Atom name2 Atom name.
// 47 Character altLoc2 Alternate location indicator.
std::string resName2 = vS(48, 50); // 48 - 50 Residue name resName2 Residue name.
// 52 Character chainID2 Chain identifier.
// 53 - 56 Integer resSeq2 Residue sequence number.
// 57 AChar iCode2 Insertion code.
// 60 - 65 SymOP sym1 Symmetry operator atom 1.
// 67 - 72 SymOP sym2 Symmetry operator atom 2.
// 74 78 Real(5.2) Length Link distance
std::string type = "covale";
if (IsMetal(resName1, name1) or IsMetal(resName2, name2))
type = "metalc";
if (type == "covale" and firstCovale)
{
getCategory("struct_conn_type")->emplace({
{ "id", type },
});
firstCovale = false;
}
if (type == "metalc" and firstMetalc)
{
getCategory("struct_conn_type")->emplace({
{ "id", type },
});
firstMetalc = false;
}
++linkNr;
std::string p1Asym, p2Asym;
int p1Seq = 0, p2Seq = 0;
bool isResseq1 = false, isResseq2 = false;
std::error_code ec;
std::tie(p1Asym, p1Seq, isResseq1) = MapResidue(vC(22), vI(23, 26), vC(27), ec);
if (not ec)
std::tie(p2Asym, p2Seq, isResseq2) = MapResidue(vC(52), vI(53, 56), vC(57), ec);
if (ec)
{
if (cif::VERBOSE > 0)
std::cerr << "Dropping LINK record at line " << mRec->mLineNr << '\n';
continue;
}
std::string distance, ccp4LinkID;
if (mRec->is("LINK "))
{
distance = vS(74, 78);
double d;
auto r = cif::from_chars(distance.data(), distance.data() + distance.length(), d);
if ((bool)r.ec)
{
if (cif::VERBOSE > 0)
std::cerr << "Distance value '" << distance << "' is not a valid float in LINK record\n";
swap(ccp4LinkID, distance); // assume this is a ccp4_link_id... oh really?
}
}
else // LINKR
ccp4LinkID = vS(74, 78); // the link ID
std::string sym1, sym2;
try
{
sym1 = pdb2cifSymmetry(vS(60, 65));
sym2 = pdb2cifSymmetry(vS(67, 72));
}
catch (const std::exception &ex)
{
if (cif::VERBOSE > 0)
std::cerr << "Dropping LINK record at line " << mRec->mLineNr << " due to invalid symmetry operation\n";
continue;
}
// clang-format off
getCategory("struct_conn")->emplace({
{ "id", type + std::to_string(linkNr) },
{ "conn_type_id", type },
// { "ccp4_link_id", ccp4LinkID },
{ "ptnr1_label_asym_id", p1Asym },
{ "ptnr1_label_comp_id", vS(18, 20) },
{ "ptnr1_label_seq_id", (isResseq1 and p1Seq) ? std::to_string(p1Seq) : "." },
{ "ptnr1_label_atom_id", vS(13, 16) },
{ "pdbx_ptnr1_label_alt_id", vS(17, 17) },
{ "pdbx_ptnr1_PDB_ins_code", vS(27, 27) },
{ "pdbx_ptnr1_standard_comp_id", "" },
{ "ptnr1_symmetry", sym1 },
{ "ptnr2_label_asym_id", p2Asym },
{ "ptnr2_label_comp_id", vS(48, 50) },
{ "ptnr2_label_seq_id", (isResseq2 and p2Seq) ? std::to_string(p2Seq) : "." },
{ "ptnr2_label_atom_id", vS(43, 46) },
{ "pdbx_ptnr2_label_alt_id", vS(47, 47) },
{ "pdbx_ptnr2_PDB_ins_code", vS(57, 57) },
{ "ptnr1_auth_asym_id", vS(22, 22) },
{ "ptnr1_auth_comp_id", vS(18, 20) },
{ "ptnr1_auth_seq_id", vI(23, 26) },
{ "ptnr2_auth_asym_id", vS(52, 52) },
{ "ptnr2_auth_comp_id", vS(48, 50) },
{ "ptnr2_auth_seq_id", vI(53, 56) },
// { "ptnr1_auth_atom_id", vS(13, 16) },
// { "ptnr2_auth_atom_id", vS(43, 46) },
{ "ptnr2_symmetry", sym2 },
{ "pdbx_dist_value", distance }
});
// clang-format on
continue;
}
if (mRec->is("CISPEP"))
{
// 1 - 6 Record name "CISPEP"
int serNum = vI(8, 10); // 8 - 10 Integer serNum Record serial number.
std::string pep1 = vS(12, 14); // 12 - 14 LString(3) pep1 Residue name.
char chainID1 = vC(16); // 16 Character chainID1 Chain identifier.
int seqNum1 = vI(18, 21); // 18 - 21 Integer seqNum1 Residue sequence number.
char iCode1 = vC(22); // 22 AChar icode1 Insertion code.
std::string pep2 = vS(26, 28); // 26 - 28 LString(3) pep2 Residue name.
char chainID2 = vC(30); // 30 Character chainID2 Chain identifier.
int seqNum2 = vI(32, 35); // 32 - 35 Integer seqNum2 Residue sequence number.
char iCode2 = vC(36); // 36 AChar icode2 Insertion code.
int modNum = vI(44, 46); // 44 - 46 Integer modNum Identifies the specific model.
std::string measure = vF(54, 59); // 54 - 59 Real(6.2) measure Angle measurement in degrees.
if (modNum == 0)
modNum = 1;
std::string lAsym1, lAsym2;
int lResSeq1, lResSeq2;
std::error_code ec;
std::tie(lAsym1, lResSeq1, std::ignore) = MapResidue(chainID1, seqNum1, iCode1, ec);
if (not ec)
std::tie(lAsym2, lResSeq2, std::ignore) = MapResidue(chainID2, seqNum2, iCode2, ec);
if (ec)
{
if (cif::VERBOSE > 0)
std::cerr << "Dropping CISPEP record at line " << mRec->mLineNr << '\n';
continue;
}
std::string iCode1str = iCode1 == ' ' ? std::string() : std::string{ iCode1 };
std::string iCode2str = iCode2 == ' ' ? std::string() : std::string{ iCode2 };
// clang-format off
getCategory("struct_mon_prot_cis")->emplace({
{ "pdbx_id", serNum },
{ "label_comp_id", pep1 },
{ "label_seq_id", lResSeq1 },
{ "label_asym_id", lAsym1 },
{ "label_alt_id", "." },
{ "pdbx_PDB_ins_code", iCode1str },
{ "auth_comp_id", pep1 },
{ "auth_seq_id", seqNum1 },
{ "auth_asym_id", std::string{ chainID1 } },
{ "pdbx_label_comp_id_2", pep2 },
{ "pdbx_label_seq_id_2", lResSeq2 },
{ "pdbx_label_asym_id_2", lAsym2 },
{ "pdbx_PDB_ins_code_2", iCode2str },
{ "pdbx_auth_comp_id_2", pep2 },
{ "pdbx_auth_seq_id_2", seqNum2 },
{ "pdbx_auth_asym_id_2", std::string{ chainID2 } },
{ "pdbx_PDB_model_num", modNum },
{ "pdbx_omega_angle", measure }
});
// clang-format on
continue;
}
break;
}
}
void PDBFileParser::ParseMiscellaneousFeatures()
{
int structSiteGenID = 1;
while (mRec->is("SITE "))
{ // 1 - 6 Record name "SITE "
// 8 - 10 Integer seqNum Sequence number.
std::string siteID = vS(12, 14); // 12 - 14 LString(3) siteID Site name.
int numRes = vI(16, 17); // 16 - 17 Integer numRes Number of residues that compose the site.
int o = 19;
auto cat = getCategory("struct_site_gen");
for (int i = 0; i < numRes; ++i)
{
std::string resName = vS(o, o + 2); // 19 - 21 Residue name resName1 Residue name for first residue that
// creates the site.
char chainID = vC(o + 4); // 23 Character chainID1 Chain identifier for first residue of site.
int seq = vI(o + 5, o + 8); // 24 - 27 Integer seq1 Residue sequence number for first residue
// of the site.
char iCode = vC(o + 9); // 28 AChar iCode1 Insertion code for first residue of the site.
int labelSeq;
std::string asym;
bool isResseq;
std::error_code ec;
std::tie(asym, labelSeq, isResseq) = MapResidue(chainID, seq, iCode, ec);
if (ec)
{
if (cif::VERBOSE > 0)
std::cerr << "skipping struct_site_gen record\n";
}
else
// clang-format off
cat->emplace({
{ "id", structSiteGenID++ },
{ "site_id", siteID },
{ "pdbx_num_res", numRes },
{ "label_comp_id", resName },
{ "label_asym_id", asym },
{ "label_seq_id", (labelSeq > 0 and isResseq) ? std::to_string(labelSeq) : std::string(".") },
{ "pdbx_auth_ins_code", iCode == ' ' ? "" : std::string{ iCode } },
{ "auth_comp_id", resName },
{ "auth_asym_id", std::string{ chainID } },
{ "auth_seq_id", seq },
{ "label_atom_id", "." },
{ "label_alt_id", "." },
});
// clang-format on
o += 11;
}
GetNextRecord();
}
}
void PDBFileParser::ParseCrystallographic()
{
if (mRec->is("CRYST1"))
{
Match("CRYST1", true);
// clang-format off
getCategory("cell")->emplace({
{ "entry_id", mStructureID }, // 1 - 6 Record name "CRYST1"
{ "length_a", vF(7, 15) }, // 7 - 15 Real(9.3) a a (Angstroms).
{ "length_b", vF(16, 24) }, // 16 - 24 Real(9.3) b b (Angstroms).
{ "length_c", vF(25, 33) }, // 25 - 33 Real(9.3) c c (Angstroms).
{ "angle_alpha", vF(34, 40) }, // 34 - 40 Real(7.2) alpha alpha (degrees).
{ "angle_beta", vF(41, 47) }, // 41 - 47 Real(7.2) beta beta (degrees).
{ "angle_gamma", vF(48, 54) }, // 48 - 54 Real(7.2) gamma gamma (degrees).
/* goes into symmetry */ // 56 - 66 LString sGroup Space group.
{ "Z_PDB", vF(67, 70) } // 67 - 70 Integer z Z value.
});
// clang-format on
std::string spaceGroup, intTablesNr;
try
{
spaceGroup = vS(56, 66);
intTablesNr = std::to_string(get_space_group_number(spaceGroup));
}
catch (...)
{
}
// clang-format off
getCategory("symmetry")->emplace({
{ "entry_id", mStructureID },
{ "space_group_name_H-M", spaceGroup },
{ "Int_Tables_number", intTablesNr }
});
GetNextRecord();
}
}
void PDBFileParser::ParseCoordinateTransformation()
{
std::string m[3][3], v[3];
if (cif::starts_with(mRec->mName, "ORIGX"))
{
for (std::string n : { "1", "2", "3" })
{
int x = stoi(n) - 1;
Match("ORIGX" + n, true); // 1 - 6 Record name "ORIGXn" n=1, 2, or 3
m[x][0] = vF(11, 20); // 11 - 20 Real(10.6) o[n][1] On1
m[x][1] = vF(21, 30); // 21 - 30 Real(10.6) o[n][2] On2
m[x][2] = vF(31, 40); // 31 - 40 Real(10.6) o[n][3] On3
v[x] = vF(46, 55); // 46 - 55 Real(10.5) t[n] Tn
GetNextRecord();
}
// clang-format off
getCategory("database_PDB_matrix")->emplace({
{ "entry_id", mStructureID },
{ "origx[1][1]", m[0][0] },
{ "origx[1][2]", m[0][1] },
{ "origx[1][3]", m[0][2] },
{ "origx[2][1]", m[1][0] },
{ "origx[2][2]", m[1][1] },
{ "origx[2][3]", m[1][2] },
{ "origx[3][1]", m[2][0] },
{ "origx[3][2]", m[2][1] },
{ "origx[3][3]", m[2][2] },
{ "origx_vector[1]", v[0] },
{ "origx_vector[2]", v[1] },
{ "origx_vector[3]", v[2] },
});
// clang-format on
}
if (cif::starts_with(mRec->mName, "SCALE"))
{
for (std::string n : { "1", "2", "3" })
{
int x = stoi(n) - 1;
Match("SCALE" + n, true); // 1 - 6 Record name "SCALEn" n=1, 2, or 3
m[x][0] = vF(11, 20); // 11 - 20 Real(10.6) s[n][1] Sn1
m[x][1] = vF(21, 30); // 21 - 30 Real(10.6) s[n][2] Sn2
m[x][2] = vF(31, 40); // 31 - 40 Real(10.6) s[n][3] Sn3
v[x] = vF(46, 55); // 46 - 55 Real(10.5) u[n] Un
GetNextRecord();
}
// clang-format off
getCategory("atom_sites")->emplace({
{ "entry_id", mStructureID },
{ "fract_transf_matrix[1][1]", m[0][0] },
{ "fract_transf_matrix[1][2]", m[0][1] },
{ "fract_transf_matrix[1][3]", m[0][2] },
{ "fract_transf_matrix[2][1]", m[1][0] },
{ "fract_transf_matrix[2][2]", m[1][1] },
{ "fract_transf_matrix[2][3]", m[1][2] },
{ "fract_transf_matrix[3][1]", m[2][0] },
{ "fract_transf_matrix[3][2]", m[2][1] },
{ "fract_transf_matrix[3][3]", m[2][2] },
{ "fract_transf_vector[1]", v[0] },
{ "fract_transf_vector[2]", v[1] },
{ "fract_transf_vector[3]", v[2] },
});
// clang-format on
}
while (cif::starts_with(mRec->mName, "MTRIX1"))
{
int serial = 0, igiven = 0;
for (std::string n : { "1", "2", "3" })
{
int x = stoi(n) - 1;
Match("MTRIX" + n, true); // 1 - 6 Record name "MTRIXn" n=1, 2, or 3
serial = vI(8, 10); // 8 - 10 Integer serial Serial number.
m[x][0] = vF(11, 20); // 11 - 20 Real(10.6) m[n][1] Mn1
m[x][1] = vF(21, 30); // 21 - 30 Real(10.6) m[n][2] Mn2
m[x][2] = vF(31, 40); // 31 - 40 Real(10.6) m[n][3] Mn3
v[x] = vF(46, 55); // 46 - 55 Real(10.5) v[n] Vn
igiven = vC(60) == '1'; // 60 Integer iGiven 1 if coordinates for the representations
// which are approximately related by the
GetNextRecord(); // transformations of the molecule are
} // contained in the datablock. Otherwise, blank.
// clang-format off
getCategory("struct_ncs_oper")->emplace({
{ "id", serial },
{ "matrix[1][1]", m[0][0] },
{ "matrix[1][2]", m[0][1] },
{ "matrix[1][3]", m[0][2] },
{ "matrix[2][1]", m[1][0] },
{ "matrix[2][2]", m[1][1] },
{ "matrix[2][3]", m[1][2] },
{ "matrix[3][1]", m[2][0] },
{ "matrix[3][2]", m[2][1] },
{ "matrix[3][3]", m[2][2] },
{ "vector[1]", v[0] },
{ "vector[2]", v[1] },
{ "vector[3]", v[2] },
{ "code", igiven ? "given" : "" }
});
// clang-format on
}
}
void PDBFileParser::ParseCoordinate(int modelNr)
{
// oh oh, we have to sort our atom_site records by ascending asym_id
// This routine used to be so trivial...
typedef std::tuple<std::string, int, bool, PDBRecord *, PDBRecord *> atomRec;
std::vector<atomRec> atoms;
while (mRec->is("ATOM ") or mRec->is("HETATM")) // 1 - 6 Record name "ATOM "
{
char chainID = vC(22); // 22 Character chainID Chain identifier.
int resSeq = vI(23, 26); // 23 - 26 Integer resSeq Residue sequence number.
char iCode = vC(27);
std::string asymID;
int seqID;
bool isResseq;
std::tie(asymID, seqID, isResseq) = MapResidue(chainID, resSeq, iCode);
PDBRecord *atom = mRec;
PDBRecord *anisou = nullptr;
GetNextRecord();
if (mRec->is("ANISOU"))
{
anisou = mRec;
GetNextRecord();
}
atoms.emplace_back(asymID, seqID, isResseq, atom, anisou);
/*if?... */ while (mRec->is("TER "))
{
Match("TER ", true);
GetNextRecord();
}
}
auto last = mRec;
// use stable sort here
auto rLess = [](const atomRec &a, const atomRec &b) -> bool
{
int d;
std::string chainA = std::get<0>(a);
std::string chainB = std::get<0>(b);
if (chainA.length() != chainB.length())
d = static_cast<int>(chainA.length() - chainB.length());
else
d = std::get<0>(a).compare(std::get<0>(b));
if (d == 0)
d = std::get<1>(a) - std::get<1>(b);
return d < 0;
};
stable_sort(atoms.begin(), atoms.end(), rLess);
// now reiterate the atoms to reorder alternates
for (std::size_t i = 0; i + 1 < atoms.size(); ++i)
{
char altLoc = std::get<3>(atoms[i])->vC(17);
if (altLoc == ' ' or altLoc == 0)
continue;
auto b = atoms.begin() + i;
auto e = b;
std::map<std::string, int> atomIndex; // index number of first occurrence of a atom name
while (e != atoms.end() and rLess(*b, *e) == false)
{
std::string name = std::get<3>(*e)->vS(13, 16);
if (atomIndex.count(name) == 0)
atomIndex[name] = static_cast<int>(atomIndex.size() + 1);
++e;
}
auto aLess = [&](atomRec &a, atomRec &b) -> bool
{
std::string na = std::get<3>(a)->vS(13, 16);
std::string nb = std::get<3>(b)->vS(13, 16);
int d = atomIndex[na] - atomIndex[nb];
if (d == 0)
d = std::get<3>(a)->vC(17) - std::get<3>(b)->vC(17);
assert(d != 0);
return d < 0;
};
sort(b, e, aLess);
i += distance(b, e) - 1;
}
// while (mRec->is("ATOM ") or mRec->is("HETATM")) // 1 - 6 Record name "ATOM "
for (auto &a : atoms)
{
std::string asymID;
int seqID;
bool isResseq;
PDBRecord *atom;
PDBRecord *anisou;
std::tie(asymID, seqID, isResseq, atom, anisou) = a;
mRec = atom;
++mAtomID;
std::string groupPDB = mRec->is("ATOM ") ? "ATOM" : "HETATM";
// int serial = vI(7, 11); // 7 - 11 Integer serial Atom serial number.
std::string name = vS(13, 16); // 13 - 16 Atom name Atom name.
char altLoc = vC(17); // 17 Character altLoc Alternate location indicator.
std::string resName = vS(18, 20); // 18 - 20 Residue name resName Residue name.
char chainID = vC(22); // 22 Character chainID Chain identifier.
int resSeq = vI(23, 26); // 23 - 26 Integer resSeq Residue sequence number.
char iCode = vC(27); // 27 AChar iCode Code for insertion of residues.
std::string x = vF(31, 38); // 31 - 38 Real(8.3) x Orthogonal coordinates for X in Angstroms.
std::string y = vF(39, 46); // 39 - 46 Real(8.3) y Orthogonal coordinates for Y in Angstroms.
std::string z = vF(47, 54); // 47 - 54 Real(8.3) z Orthogonal coordinates for Z in Angstroms.
std::string occupancy = vF(55, 60); // 55 - 60 Real(6.2) occupancy Occupancy.
std::string tempFactor = vF(61, 66); // 61 - 66 Real(6.2) tempFactor Temperature factor.
std::string element = vS(77, 78); // 77 - 78 LString(2) element Element symbol, right-justified.
std::string charge = vS(79, 80); // 79 - 80 LString(2) charge Charge on the atom.
if (element.empty())
throw std::runtime_error("Empty element column in PDB file at line " + std::to_string(mRec->mLineNr));
std::string entityID = mAsymID2EntityID[asymID];
charge = pdb2cifCharge(charge);
// if (cif::compound_factory::instance().is_known_peptide(resName) or cif::compound_factory::instance().is_known_base(resName))
if (resName == "UNK" or cif::compound_factory::kAAMap.count(resName) or cif::compound_factory::kBaseMap.count(resName))
{
if (groupPDB == "HETATM")
{
if (cif::VERBOSE > 0)
std::cerr << "Changing atom from HETATM to ATOM at line " << mRec->mLineNr << '\n';
groupPDB = "ATOM";
}
}
else
{
if (groupPDB == "ATOM")
{
if (cif::VERBOSE > 0)
std::cerr << "Changing atom from ATOM to HETATM at line " << mRec->mLineNr << '\n';
groupPDB = "HETATM";
}
}
// if the atom is part of a sugar, we need to replace the auth_seq_id/resSeq
if (mSugarEntities.count(entityID))
{
using namespace cif::literals;
auto &branch_scheme = *getCategory("pdbx_branch_scheme");
resSeq = branch_scheme.find1<int>("asym_id"_key == asymID and "auth_seq_num"_key == resSeq, "pdb_seq_num");
}
// clang-format off
getCategory("atom_site")->emplace({
{ "group_PDB", groupPDB },
{ "id", mAtomID },
{ "type_symbol", element },
{ "label_atom_id", name },
{ "label_alt_id", altLoc != ' ' ? std::string{ altLoc } : "." },
{ "label_comp_id", resName },
{ "label_asym_id", asymID },
{ "label_entity_id", entityID },
{ "label_seq_id", (isResseq and seqID > 0) ? std::to_string(seqID) : "." },
{ "pdbx_PDB_ins_code", iCode == ' ' ? "" : std::string{ iCode } },
{ "Cartn_x", x },
{ "Cartn_y", y },
{ "Cartn_z", z },
{ "occupancy", occupancy },
{ "B_iso_or_equiv", tempFactor },
{ "pdbx_formal_charge", charge },
{ "auth_seq_id", resSeq },
{ "auth_comp_id", resName },
{ "auth_asym_id", std::string{ chainID } },
{ "auth_atom_id", name },
{ "pdbx_PDB_model_num", modelNr }
});
// clang-format on
InsertAtomType(element);
std::string check = vS(7, 11) + vS(77, 80);
if (anisou != nullptr)
{
mRec = anisou; // 1 - 6 Record name "ANISOU"
int u11 = vI(29, 35); // 29 - 35 Integer u[0][0] U(1,1)
int u22 = vI(36, 42); // 36 - 42 Integer u[1][1] U(2,2)
int u33 = vI(43, 49); // 43 - 49 Integer u[2][2] U(3,3)
int u12 = vI(50, 56); // 50 - 56 Integer u[0][1] U(1,2)
int u13 = vI(57, 63); // 57 - 63 Integer u[0][2] U(1,3)
int u23 = vI(64, 70); // 64 - 70 Integer u[1][2] U(2,3)
if (vS(7, 11) + vS(77, 80) != check)
throw std::runtime_error("ANISOU record should follow corresponding ATOM record");
auto f = [](float f) -> std::string
{
return cif::format("{:6.4f}", f);
};
// clang-format off
getCategory("atom_site_anisotrop")->emplace({
{ "id", mAtomID },
{ "type_symbol", element },
{ "pdbx_label_atom_id", name },
{ "pdbx_label_alt_id", altLoc != ' ' ? std::string{ altLoc } : "." },
{ "pdbx_label_comp_id", resName },
{ "pdbx_label_asym_id", asymID },
{ "pdbx_label_seq_id", (isResseq and seqID > 0) ? std::to_string(seqID) : "." },
{ "U[1][1]", f(u11 / 10000.f) },
{ "U[2][2]", f(u22 / 10000.f) },
{ "U[3][3]", f(u33 / 10000.f) },
{ "U[1][2]", f(u12 / 10000.f) },
{ "U[1][3]", f(u13 / 10000.f) },
{ "U[2][3]", f(u23 / 10000.f) },
{ "pdbx_auth_seq_id", resSeq },
{ "pdbx_auth_comp_id", resName },
{ "pdbx_auth_asym_id", std::string{ chainID } },
{ "pdbx_auth_atom_id", name }
});
// clang-format on
}
}
mRec = last;
}
void PDBFileParser::ParseConnectivty()
{
while (mRec->is("CONECT"))
GetNextRecord();
}
void PDBFileParser::ParseBookkeeping()
{
if (mRec->is("MASTER"))
{
Match("MASTER", false);
GetNextRecord();
}
Match("END ", false);
}
void PDBFileParser::Parse(std::istream &is, cif::file &result)
{
try
{
if (mDatablock.get_validator() == nullptr)
mDatablock.load_dictionary();
PreParseInput(is);
mRec = mData;
ParseTitle();
ParseRemarks();
ParsePrimaryStructure();
ParseHeterogen();
ConstructEntities();
ParseRemark350();
ParseSecondaryStructure();
ParseConnectivtyAnnotation();
ParseMiscellaneousFeatures();
ParseCrystallographic();
ParseCoordinateTransformation();
uint32_t modelNr = 1;
bool hasAtoms = false;
while (mRec->is("MODEL ") or mRec->is("ATOM ") or mRec->is("HETATM"))
{
bool model = false;
if (mRec->is("MODEL "))
{
model = true;
modelNr = vI(11, 14);
GetNextRecord();
}
hasAtoms = hasAtoms or mRec->is("ATOM ") or mRec->is("HETATM");
ParseCoordinate(modelNr);
if (model)
{
Match("ENDMDL", true);
GetNextRecord();
}
}
if (not hasAtoms)
throw std::runtime_error("Either the PDB file has no atom records, or the field " + std::string(mRec->mName) + " is not at the correct location");
for (auto e : mAtomTypes)
getCategory("atom_type")->emplace({ { "symbol", e } });
// in V5, atom_type is sorted
getCategory("atom_type")->reorder_by_index();
ParseConnectivty();
ParseBookkeeping();
// almost done, now fix some outstanding issued that could not be done before
try
{
auto r = FindRecord("REMARK 3");
if (r != nullptr and Remark3Parser::parse(mExpMethod, r, mDatablock))
{
// make sure the "exptl" category is created
auto exptl = getCategory("exptl");
if (exptl->empty())
{
exptl->emplace({ { "entry_id", mStructureID },
{ "method", mExpMethod },
{ "crystals_number", mRemark200["NUMBER OF CRYSTALS USED"] } });
}
}
}
catch (const std::exception &ex)
{
if (cif::VERBOSE >= 0)
std::cerr << "Error parsing REMARK 3\n";
throw;
}
//
// auto cat = getCategory("pdbx_refine_tls_group");
// for (Row r: *cat)
// {
// // add the mapped locations
//
// try
// {
// std::string asymID;
// int resNum;
//
// cif::tie(asymID, resNum) = r.get("beg_auth_asym_id", "beg_auth_seq_id");
//
// r["beg_label_asym_id"] = asymID;
// r["beg_label_seq_id"] = resNum;
//
// cif::tie(asymID, resNum) = r.get("end_auth_asym_id", "end_auth_seq_id");
//
// r["end_label_asym_id"] = asymID;
// r["end_label_seq_id"] = resNum;
// }
// catch (const std::exception& ex)
// {
// continue;
// }
// }
using namespace cif::literals;
auto &atom_site = *getCategory("atom_site");
for (auto r : getCategory("struct_conn")->find("pdbx_dist_value"_key == 0 or "pdbx_dist_value"_key == cif::null))
{
const auto &[asym1, seq1, atom1, symm1, asym2, seq2, atom2, symm2] = r.get<std::string, std::string, std::string, std::string, std::string, std::string, std::string, std::string>(
"ptnr1_label_asym_id", "ptnr1_label_seq_id", "ptnr1_label_atom_id", "ptnr1_symmetry",
"ptnr2_label_asym_id", "ptnr2_label_seq_id", "ptnr2_label_atom_id", "ptnr2_symmetry");
float distance = 1.0f;
try
{
auto a1 = atom_site.find1("label_asym_id"_key == asym1 and "label_seq_id"_key == seq1 and "label_atom_id"_key == atom1);
auto a2 = atom_site.find1("label_asym_id"_key == asym2 and "label_seq_id"_key == seq2 and "label_atom_id"_key == atom2);
if (not a1 or not a2)
throw std::runtime_error("cannot find atom");
const auto &[x1, y1, z1] = a1.get<float, float, float>("cartn_x", "cartn_y", "cartn_z");
const auto &[x2, y2, z2] = a2.get<float, float, float>("cartn_x", "cartn_y", "cartn_z");
if ((symm1.empty() or symm1 == "1_555") and (symm2.empty() or symm2 == "1_555"))
distance = std::sqrt(
(x1 - x2) * (x1 - x2) +
(y1 - y2) * (y1 - y2) +
(z1 - z2) * (z1 - z2));
else if (cif::VERBOSE > 0)
std::cerr << "Cannot calculate distance for link since one of the atoms is in another dimension\n";
}
catch (std::exception &ex)
{
if (cif::VERBOSE > 0)
std::cerr << "Error finding atom for LINK distance calculation: " << ex.what() << '\n';
}
r["pdbx_dist_value"] = distance;
}
result.emplace_back(std::move(mDatablock));
}
catch (const std::exception &ex)
{
if (cif::VERBOSE >= 0)
{
std::cerr << "Error parsing PDB";
if (mRec != nullptr)
std::cerr << " at line " << mRec->mLineNr;
std::cerr << '\n';
}
throw;
}
}
// ----------------------------------------------------------------
// A blast like alignment. Returns index of last aligned residue.
// matrix is m x n, addressing i,j is 0 <= i < m and 0 <= j < n
// element m i,j is mapped to [i * n + j] and thus storage is row major
template <typename T>
class matrix
{
public:
using value_type = T;
matrix() = delete;
matrix(const matrix &) = delete;
matrix &operator=(const matrix &) = delete;
matrix(uint32_t m, uint32_t n, T v = T())
: m_m(m)
, m_n(n)
{
m_data = new value_type[m_m * m_n];
std::fill(m_data, m_data + (m_m * m_n), v);
}
~matrix()
{
delete[] m_data;
}
uint32_t dim_m() const { return m_m; }
uint32_t dim_n() const { return m_n; }
value_type operator()(uint32_t i, uint32_t j) const
{
assert(i < m_m);
assert(j < m_n);
return m_data[i * m_n + j];
}
value_type &operator()(uint32_t i, uint32_t j)
{
assert(i < m_m);
assert(j < m_n);
return m_data[i * m_n + j];
}
private:
value_type *m_data;
uint32_t m_m, m_n;
};
int PDBFileParser::PDBChain::AlignResToSeqRes()
{
// Use dynamic programming to align the found residues (in ATOM records) against
// the residues in the SEQRES records in order to obtain the residue numbering.
// sigh...
auto &rx = mSeqres;
auto &ry = mResiduesSeen;
int dimX = static_cast<int>(mSeqres.size());
if (dimX == 0)
throw std::runtime_error(std::string("SEQRES for chain ") + mDbref.chainID + " is empty");
int dimY = static_cast<int>(mResiduesSeen.size());
if (dimY == 0)
throw std::runtime_error(std::string("Number of residues in ATOM records for chain ") + mDbref.chainID + " is zero");
matrix<float> B(dimX, dimY), Ix(dimX, dimY), Iy(dimX, dimY);
matrix<int8_t> tb(dimX, dimY);
int x, y;
const float
kMatchReward = 5,
kMismatchCost = -10,
kGapOpen = 10, gapExtend = 0.1f;
float high = 0;
int highX = 0, highY = 0;
for (x = 0; x < dimX; ++x)
{
for (y = 0; y < dimY; ++y)
{
auto &a = rx[x];
auto &b = ry[y];
float Ix1 = x > 0 ? Ix(x - 1, y) : 0;
float Iy1 = y > 0 ? Iy(x, y - 1) : 0;
// score for alignment
float M;
if (a.mMonID == b.mMonID)
M = kMatchReward;
else
M = kMismatchCost;
// gap open cost is zero if the PDB ATOM records indicate that a gap
// should be here.
float gapOpen = kGapOpen;
if (y == 0 or (y + 1 < dimY and ry[y + 1].mSeqNum > ry[y].mSeqNum + 1))
gapOpen = 0;
if (x > 0 and y > 0)
M += B(x - 1, y - 1);
float s;
if (M >= Ix1 and M >= Iy1)
{
tb(x, y) = 0;
B(x, y) = s = M;
Ix(x, y) = M - (x < dimX - 1 ? gapOpen : 0);
Iy(x, y) = M - (y < dimY - 1 ? gapOpen : 0);
}
else if (Ix1 >= Iy1)
{
tb(x, y) = 1;
B(x, y) = s = Ix1;
Ix(x, y) = Ix1 - gapExtend;
Iy(x, y) = M - (y < dimY - 1 ? gapOpen : 0);
if (Iy(x, y) < Iy1 - gapExtend)
Iy(x, y) = Iy1 - gapExtend;
}
else
{
tb(x, y) = -1;
B(x, y) = s = Iy1;
Ix(x, y) = M - (x < dimX - 1 ? gapOpen : 0);
if (Ix(x, y) < Ix1 - gapExtend)
Ix(x, y) = Ix1 - gapExtend;
Iy(x, y) = Iy1 - gapExtend;
}
if (/*(x == dimX - 1 or y == dimY - 1) and */ high < s)
{
high = s;
highX = x;
highY = y;
}
}
}
const int kFlagSeqNr = std::numeric_limits<int>::min();
// reset positions of seqres
for (auto &sr : rx)
{
sr.mSeqNum = kFlagSeqNr;
sr.mIcode = ' ';
}
// assign numbers
x = highX;
y = highY;
// C++ is getting closer to Pascal :-)
auto printAlignment = [&tb, highX, highY, &rx, &ry, this]()
{
std::cerr << std::string(22, '-') << '\n'
<< "Alignment for chain " << mDbref.chainID << '\n'
<< '\n';
std::vector<std::pair<std::string, std::string>> alignment;
int x = highX;
int y = highY;
for (x = highX, y = highY; x >= 0 and y >= 0;)
{
switch (tb(x, y))
{
case -1:
alignment.push_back(make_pair("...", ry[y].mMonID));
--y;
break;
case 1:
alignment.push_back(make_pair(rx[x].mMonID, "..."));
--x;
break;
case 0:
alignment.push_back(make_pair(rx[x].mMonID, ry[y].mMonID));
--x;
--y;
break;
}
}
while (x >= 0)
{
alignment.push_back(make_pair(rx[x].mMonID, "..."));
--x;
}
while (y >= 0)
{
alignment.push_back(make_pair("...", ry[y].mMonID));
--y;
}
reverse(alignment.begin(), alignment.end());
for (auto a : alignment)
std::cerr << " " << a.first << " -- " << a.second << '\n';
std::cerr << '\n';
};
if (cif::VERBOSE > 1)
printAlignment();
try
{
while (x >= 0 and y >= 0)
{
switch (tb(x, y))
{
case -1:
throw std::runtime_error("A residue found in the ATOM records (" + ry[y].mMonID +
" @ " + std::string{ mDbref.chainID } + ":" + std::to_string(ry[y].mSeqNum) +
((ry[y].mIcode == ' ' or ry[y].mIcode == 0) ? "" : std::string{ ry[y].mIcode }) +
") was not found in the SEQRES records");
break;
case 1:
if (cif::VERBOSE > 3)
std::cerr << "Missing residue in ATOM records: " << rx[x].mMonID << " at " << rx[x].mSeqNum << '\n';
--x;
break;
case 0:
if (rx[x].mMonID != ry[y].mMonID)
{
std::cerr << "Warning, unaligned residues at " << x << "/" << y << "(" << rx[x].mMonID << '/' << ry[y].mMonID << ") SEQRES does not agree with ATOM records\n";
rx[x].mMonID = ry[y].mMonID;
}
rx[x].mSeqNum = ry[y].mSeqNum;
rx[x].mIcode = ry[y].mIcode;
--x;
--y;
}
}
}
catch (const std::exception &ex)
{
if (cif::VERBOSE == 1)
printAlignment();
throw;
}
// assign numbers to the residues that don't have them yet
std::stack<int> unnumbered;
for (x = 0; x < dimX; ++x)
{
if (rx[x].mSeqNum == kFlagSeqNr)
{
if (x > 0 and rx[x - 1].mSeqNum != kFlagSeqNr)
rx[x].mSeqNum = rx[x - 1].mSeqNum + 1;
else
unnumbered.push(x);
}
}
while (unnumbered.empty() == false)
{
x = unnumbered.top();
if (x >= dimX - 1)
throw std::runtime_error("Could not assign sequence numbers");
rx[x].mSeqNum = rx[x + 1].mSeqNum - 1;
unnumbered.pop();
}
return highY;
}
bool PDBFileParser::PDBChain::SameSequence(const PDBChain &rhs) const
{
bool result = mSeqres.size() == rhs.mSeqres.size();
for (std::size_t i = 0; result and i < mSeqres.size(); ++i)
result = mSeqres[i].mMonID == rhs.mSeqres[i].mMonID;
return result;
}
// --------------------------------------------------------------------
void read_pdb_file(std::istream &pdbFile, cif::file &cifFile)
{
PDBFileParser p;
p.Parse(pdbFile, cifFile);
if (cifFile.empty())
{
if (VERBOSE >= 0)
std::cerr << "PDB is empty!\n";
}
else
{
cifFile.front().load_dictionary();
if (cifFile.front().get_validator() == nullptr)
cifFile.front().set_validator(&validator_factory::instance().get("mmcif_pdbx.dic"));
if (not cifFile.is_valid() and cif::VERBOSE >= 0)
std::cerr << "Resulting mmCIF file is not valid!\n";
}
}
// --------------------------------------------------------------------
file read(std::istream &is)
{
file result;
auto *buffer = is.rdbuf();
if (buffer)
{
char ch = std::char_traits<char>::to_char_type(buffer->sgetc());
// All PDB files should always start with a HEADER line
// and so the very first character in a valid PDB file
// is 'H'. It is as simple as that.
// Well, not quite, Unfortunately... People insisted that
// having only ATOM records also makes up a valid PDB file...
// Since mmCIF files cannot validly start with a letter character
// apart from the letter 'd', the test has changed into the following:
if (std::isalpha(ch) and std::toupper(ch) != 'D')
{
read_pdb_file(is, result);
fixup_pdbx(result);
}
else
{
try
{
result.load(is);
}
catch (const std::exception &ex)
{
std::throw_with_nested(std::runtime_error("Since the file did not start with a valid PDB HEADER line mmCIF was assumed, but that failed."));
}
if (not(result.empty() or result.front().empty()))
{
if (auto &db = result.front(); db.get("audit_conform") == nullptr)
reconstruct_pdbx(result);
else
{
try
{
// Try to see if we can create an mm::structure out of this data.
// If that fails, we need to reconstruct a PDBx file out of it.
cif::mm::structure s(result);
}
catch (const std::exception &e)
{
reconstruct_pdbx(result);
}
}
}
}
}
// Must be a PDB like file, right?
if (not result.empty())
{
auto &db = result.front();
if (db.get_validator() == nullptr)
db.set_validator(&validator_factory::instance().get("mmcif_pdbx.dic"));
if (db.is_valid())
db.get_validator()->fill_audit_conform(db["audit_conform"]);
}
return result;
}
file read(const std::filesystem::path &file)
{
try
{
gzio::ifstream in(file);
if (not in.is_open())
throw std::runtime_error("Could not open file " + file.string() + " for input");
return read(in);
}
catch (const std::exception &ex)
{
throw_with_nested(std::runtime_error("Error reading file " + file.string()));
}
}
} // namespace cif::pdb