libcifpp/src/pdb/pdb2cif.cpp

/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2020 NKI/AVL, Netherlands Cancer Institute
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "pdb2cif_remark_3.hpp"

#include "cif++.hpp"

#include <iomanip>
#include <map>
#include <new>
#include <set>
#include <stack>
#include <stdexcept>

using cif::category;
using cif::datablock;
using cif::iequals;
using cif::key;
using cif::to_lower;
using cif::to_lower_copy;

// --------------------------------------------------------------------
// attempt to come up with better error handling

namespace error
{
/// Error codes specific to the PDB reader; numbering starts at 1000.
enum pdbErrors
{
	residueNotFound = 1000,
	invalidDate
};

namespace detail
{
	/// std::error_category implementation that names and describes pdbErrors values.
	class pdbCategory : public std::error_category
	{
	  public:
		/// Short identifier of this category.
		const char *name() const noexcept override
		{
			return "pdb";
		}

		/// Human readable text for @a value; any value that is not a known
		/// pdbErrors enumerator yields a generic message.
		std::string message(int value) const override
		{
			switch (value)
			{
				case residueNotFound:
					return "Residue not found";

				case invalidDate:
					return "Invalid date";

				default:
					return "Error in PDB format";
			}
		}
	};
} // namespace detail

/// Access the single category instance (a function-local static).
std::error_category &pdbCategory()
{
	static detail::pdbCategory impl;
	return impl;
}

/// Wrap a pdbErrors value in a std::error_code bound to pdbCategory().
inline std::error_code make_error_code(pdbErrors e)
{
	return std::error_code(static_cast<int>(e), pdbCategory());
}
} // namespace error

namespace std
{

template <>
struct is_error_code_enum<error::pdbErrors>
{
	static const bool value = true;
};

} // namespace std

namespace cif::pdb
{

// --------------------------------------------------------------------

// Upper case three letter month abbreviation -> month number (1..12).
// Used by pdb2cifDate to translate the month part of PDB style dates.
const std::map<std::string, int> kMonths{
	{ "JAN", 1 },
	{ "FEB", 2 },
	{ "MAR", 3 },
	{ "APR", 4 },
	{ "MAY", 5 },
	{ "JUN", 6 },
	{ "JUL", 7 },
	{ "AUG", 8 },
	{ "SEP", 9 },
	{ "OCT", 10 },
	{ "NOV", 11 },
	{ "DEC", 12 },
};

// Record names recognised by the pre-parser. Each entry is exactly six
// characters wide (space padded) because it is compared with the first six
// characters of an input line; lines whose name is not listed are dropped.
const std::set<std::string> kSupportedRecords{
	"HEADER", "OBSLTE", "TITLE ", "SPLIT ", "CAVEAT", "COMPND", "SOURCE",
	"KEYWDS", "EXPDTA", "NUMMDL", "MDLTYP", "AUTHOR", "REVDAT", "SPRSDE",
	"JRNL  ", "REMARK", "DBREF ", "DBREF1", "DBREF2", "SEQADV", "SEQRES",
	"MODRES", "HET   ", "HETNAM", "HETSYN", "FORMUL", "HELIX ", "SHEET ",
	"SSBOND", "LINK  ", "CISPEP", "SITE  ", "CRYST1", "ORIGX1", "SCALE1",
	"MTRIX1", "ORIGX2", "SCALE2", "MTRIX2", "ORIGX3", "SCALE3", "MTRIX3",
	"MODEL ", "ATOM  ", "ANISOU", "TER   ", "HETATM", "ENDMDL", "CONECT",
	"MASTER", "END   ",

	// bah... LINKR is accepted as well; the pre-parser handles it like LINK
	// except that no link distance is read for it
	"LINKR "
};

/// Return true when @a resname is one of the residue names used for water.
/// The comparison is case sensitive.
bool isWater(const std::string &resname)
{
	for (const char *water : { "HOH", "H2O", "OH2", "WAT", "DOD" })
	{
		if (resname == water)
			return true;
	}

	return false;
}

// --------------------------------------------------------------------
//	Unfortunately, parsing a PDB file requires several passes over the
//	data. Therefore we first obtain all records where a record has the
//	value flattened out for continuation.

// Construct a record for input line lineNr with the given record name and
// (flattened) value text.
//
// Both strings are copied with strcpy into the character buffers mName and
// mValue. The value buffer is only large enough when the object was allocated
// through the class specific operator new with vLen == value.length(), which
// reserves vLen + 1 extra bytes.
PDBRecord::PDBRecord(uint32_t lineNr, const std::string &name, const std::string &value)
	: mNext(nullptr)
	, mLineNr(lineNr)
	, mVlen(value.length())
{
	// names are at most 10 characters, e.g. "REMARK 200"
	// (presumably mName holds 11 chars; confirm against the header)
	assert(name.length() <= 10);

	strcpy(mName, name.c_str());
	strcpy(mValue, value.c_str());
}

// Nothing to release here; the storage is returned by operator delete.
PDBRecord::~PDBRecord() = default;

// Allocate storage for a record followed by room for a value of vLen
// characters plus the terminating NUL.
//
// Throws std::bad_alloc when the allocation fails; a throwing allocation
// function must never return a null pointer since the constructor would then
// run on it.
void *PDBRecord::operator new(std::size_t size, std::size_t vLen)
{
	void *p = malloc(size + vLen + 1);
	if (p == nullptr)
		throw std::bad_alloc();
	return p;
}

// Release storage obtained from the class specific operator new (malloc based).
void PDBRecord::operator delete(void *p)
{
	free(p);
}

// Placement form matching operator new(size, vLen); the compiler calls it when
// the constructor throws. vLen is not needed to free the block.
void PDBRecord::operator delete(void *p, std::size_t vLen)
{
	free(p);
}

// Return true when this record's name equals name according to cif::iequals.
bool PDBRecord::is(const char *name) const
{
	return iequals(mName, name);
}

// Return the character in PDB column `column` (1-based, the value text starts
// at column 7), or a space when that column lies outside the stored value.
char PDBRecord::vC(std::size_t column)
{
	// for column < 7 the unsigned subtraction wraps to a huge offset which
	// fails the bounds test below, so a blank is returned
	const std::size_t offset = column - 7;
	return offset < mVlen ? mValue[offset] : ' ';
}

std::string PDBRecord::vS(std::size_t columnFirst, std::size_t columnLast)
{
	std::string result;

	if (columnLast > mVlen + 6)
		columnLast = mVlen + 6;

	if (columnFirst < mVlen + 7)
	{
		result = std::string{ mValue + columnFirst - 7, mValue + columnLast - 7 + 1 };
		cif::trim(result);
	}

	return result;
}

// Parse the integer stored in PDB columns columnFirst..columnLast (1-based,
// inclusive; the value text starts at column 7).
//
// Accepted syntax: optional leading white space, an optional '+' or '-',
// digits, optional trailing white space. A field that is empty, blank or
// lies beyond the end of the stored value yields 0.
//
// Throws std::runtime_error when the field contains anything else; before
// rethrowing, the offending text is written to std::cerr unless
// cif::VERBOSE is negative.
int PDBRecord::vI(int columnFirst, int columnLast)
{
	int result = 0;

	// [b, e) is the part of the requested field that actually holds data
	const char *const b = mValue + columnFirst - 7;
	const char *e = mValue + mVlen;
	if (e > mValue + columnLast - 7 + 1)
		e = mValue + columnLast - 7 + 1;

	enum
	{
		start,
		digit,
		tail
	} state = start;
	bool negate = false;

	try
	{
		for (const char *p = b; p < e; ++p)
		{
			// the <cctype> functions require a value representable as
			// unsigned char; a plain (signed) char above 0x7f is not
			const auto ch = static_cast<unsigned char>(*p);

			switch (state)
			{
				case start:
					if (*p == '+')
						state = digit;
					else if (*p == '-')
					{
						negate = true;
						state = digit;
					}
					else if (isdigit(ch))
					{
						result = *p - '0';
						state = digit;
					}
					else if (not isspace(ch))
						throw std::runtime_error("Not a valid integer in PDB record");
					break;

				case digit:
					if (isspace(ch))
						state = tail;
					else if (not isdigit(ch))
						throw std::runtime_error("Not a valid integer in PDB record");
					else
						result = result * 10 + *p - '0';
					break;

				case tail:
					if (not isspace(ch))
						throw std::runtime_error("Not a valid integer in PDB record");
					break;
			}
		}
	}
	catch (const std::exception &ex)
	{
		// Report exactly the text that was scanned. Using the clamped end
		// avoids reading past the stored value for short records.
		if (cif::VERBOSE >= 0)
			std::cerr << "Trying to parse '" << std::string(b, e) << '\'' << '\n';
		throw;
	}

	if (negate)
		result = -result;

	return result;
}

// Return the text of a floating point field in columns columnFirst..columnLast.
// The text is returned as is (trimmed, see vS); it is not validated as a number.
std::string PDBRecord::vF(std::size_t columnFirst, std::size_t columnLast)
{
	// for now... TODO: check format?
	return vS(columnFirst, columnLast);
}

// --------------------------------------------------------------------

// Tokenizer for "ID: value;" style specification lists. The SOURCE wording
// in the diagnostics below suggests SOURCE (and presumably COMPND) records —
// the callers are not visible here.
//
// The parser keeps its own copy of the text and an iterator into that copy;
// every call to GetNextSpecification continues where the previous one
// stopped.
//
// NOTE(review): the implicitly generated copy operations would leave mP
// pointing into the source object's string, so instances should not be
// copied.
class SpecificationListParser
{
  public:
	// mText is declared before mP, hence mP is initialised from the copy
	SpecificationListParser(const std::string &text)
		: mText(text)
		, mP(mText.begin())
	{
	}

	// Return the next (id, value) pair; both are empty when nothing is left.
	std::tuple<std::string, std::string> GetNextSpecification();

  private:
	std::string mText;           // private copy of the text being parsed
	std::string::iterator mP;    // current read position in mText
};

// Extract the next "ID: value" specification from the text.
//
// The text is scanned one character at a time by a small state machine.
// An id consists of alphanumeric characters and underscores and is followed
// by a colon. The value runs until a ';' that is followed (ignoring spaces)
// by a newline, until a following line starts with something that looks
// like a new "ID:", or until the end of the text.
//
// Returns the id and the trimmed value. Both are empty when the text is
// exhausted without finding another specification.
std::tuple<std::string, std::string> SpecificationListParser::GetNextSpecification()
{
	std::string id, value;

	// start: used for the diagnostic about an invalid line
	// backup: position to fall back to while deciding what a ';' or a
	//         newline inside a value means
	std::string::iterator start = mP, backup;

	enum
	{
		eStart,     // looking for the first character of an id
		eID,        // collecting the id, waiting for ':'
		eColon,     // ':' seen, skipping white space in front of the value
		eValue,     // collecting the value
		eNL,        // newline seen inside a value
		eNL_ID,     // the new line starts with a word, might be a new id
		eSemiColon, // ';' seen inside a value
		eError,     // invalid id, skipping up to the next ';'
		eDone
	} state = eStart;

	while (mP != mText.end() and state != eDone)
	{
		// note: mP already points at the next character from here on
		char ch = *mP++;

		switch (state)
		{
			case eStart:
				if (isalnum(ch) or ch == '_')
				{
					id = { ch };
					value.clear();
					state = eID;
					start = mP;
				}
				else if (not isspace(ch))
				{
					if (cif::VERBOSE > 0)
						std::cerr << "skipping invalid character in SOURCE ID: " << ch << '\n';
				}
				break;

			case eID:
				if (isalnum(ch) or ch == '_')
					id += ch;
				else if (ch == ':')
					state = eColon;
				else
					state = eError;
				break;

			case eColon:
				if (ch == ';')
				{
					// "ID:;" carries no value, look for the next specification
					if (cif::VERBOSE > 0)
						std::cerr << "Empty value for SOURCE: " << id << '\n';
					state = eStart;
				}
				else if (not isspace(ch))
				{
					value = { ch };
					state = eValue;
				}
				break;

			case eValue:
				if (ch == '\n')
				{
					backup = mP;
					state = eNL;
				}
				else if (ch == ';')
				{
					backup = mP;
					state = eSemiColon;
				}
				else
					value += ch;
				break;

			case eSemiColon:
				if (ch == '\n')
					state = eDone;
				else if (ch != ' ')
				{
					// the ';' did not end the value: append what was read
					// after it (the ';' itself is not part of that range)
					value.insert(value.end(), backup, mP);
					state = eValue;
				}
				break;

			case eNL:
				if (isalnum(ch))
				{
					value += ' ';
					state = eNL_ID;
				}
				else if (isspace(ch))
					state = eValue;
				break;

			case eNL_ID:
				if (ch == ':')
				{
					// the line starts a new specification: rewind to the
					// start of that line and stop
					mP = backup;
					state = eDone;
				}
				else if (ch == ';')
					state = eSemiColon;
				else if (not(isalnum(ch) or ch == '_'))
				{
					// not an id after all, the line continues the value
					value.insert(value.end(), backup, mP);
					state = eValue;
				}
				break;

			case eError:
				if (ch == ';')
				{
					if (cif::VERBOSE > 0)
						std::cerr << "Skipping invalid header line: '" << std::string(start, mP) << '\n';
					state = eStart;
				}
				break;

			case eDone: break; // keep compiler happy
		}
	}

	cif::trim(value);

	return std::make_tuple(id, value);
}

// --------------------------------------------------------------------

class PDBFileParser
{
  public:
	// Create a parser without any records; Parse fills it.
	PDBFileParser()
		: mData(nullptr)
		, mRec(nullptr)
	{
	}

	// Free every record of the singly linked list that starts at mData.
	~PDBFileParser()
	{
		for (PDBRecord *rec = mData; rec != nullptr;)
		{
			// fetch the successor first, rec is gone after the delete
			PDBRecord *next = rec->mNext;
			delete rec;
			rec = next;
		}
	}

	void Parse(std::istream &is, cif::file &result);

  private:
	// ----------------------------------------------------------------

	// Cross reference of a chain to a sequence database entry; the field
	// names follow the DBREF record of the PDB format.
	// PDBChain initialises only PDBIDCode and chainID; as this is an
	// aggregate, the remaining members are then value initialised.
	struct DBREF
	{
		std::string PDBIDCode;
		char chainID;
		int seqBegin;
		char insertBegin = ' ';
		int seqEnd;
		char insertEnd = ' ';
		std::string database;
		std::string dbAccession;
		std::string dbIdCode;
		int dbSeqBegin;
		char dbinsBeg;
		int dbSeqEnd;
		char dbinsEnd;
	};

	// A hetero residue, identified by its ID, chain, sequence number and
	// insertion code, together with bookkeeping filled in while parsing.
	struct HET
	{
		std::string hetID;
		char chainID;
		int seqNum;
		char iCode;
		int numHetAtoms = 0;
		std::string text;
		std::string asymID;
		std::vector<PDBRecord *> atoms; // non-owning; the parser deletes records via the mData list
		bool processed = false;
		bool branch = false;
		PDBRecord *asn = nullptr;

		// The members not mentioned here keep their default initialisers.
		HET(const std::string &hetID, char chainID, int seqNum, char iCode, int numHetAtoms = 0, const std::string &text = {})
			: hetID(hetID)
			, chainID(chainID)
			, seqNum(seqNum)
			, iCode(iCode)
			, numHetAtoms(numHetAtoms)
			, text(text)
		{
		}
	};

	// An unobserved residue in a model, with an optional list of atom names.
	// NOTE(review): presumably an empty atom list means the whole residue
	// is missing — confirm at the place where mUnobs is filled.
	struct UNOBS
	{
		int modelNr;
		std::string res;
		char chain;
		int seq;
		char iCode;
		std::vector<std::string> atoms;
	};

	// Reference to a single atom by name and residue location.
	//
	// This is an aggregate and it is initialised positionally elsewhere, so
	// the member order (name, resName, resSeq, chainID, iCode, altLoc) is
	// part of its interface.
	struct ATOM_REF
	{
		std::string name;
		std::string resName;
		int resSeq;
		char chainID;
		char iCode;
		char altLoc;

		// Equal when all fields match; a blank altLoc on either side acts
		// as a wildcard.
		bool operator==(const ATOM_REF &rhs) const
		{
			if (name != rhs.name or resName != rhs.resName or resSeq != rhs.resSeq)
				return false;

			if (chainID != rhs.chainID or iCode != rhs.iCode)
				return false;

			return altLoc == rhs.altLoc or altLoc == ' ' or rhs.altLoc == ' ';
		}

		bool operator!=(const ATOM_REF &rhs) const
		{
			return not(*this == rhs);
		}

		// Order by chain, sequence number, insertion code, atom name and
		// finally altLoc (only when both sides have one). The residue name
		// does not take part in the ordering.
		bool operator<(const ATOM_REF &rhs) const
		{
			if (chainID != rhs.chainID)
				return chainID < rhs.chainID;

			if (resSeq != rhs.resSeq)
				return resSeq < rhs.resSeq;

			if (iCode != rhs.iCode)
				return iCode < rhs.iCode;

			const int byName = name.compare(rhs.name);
			if (byName != 0)
				return byName < 0;

			if (altLoc != ' ' and rhs.altLoc != ' ')
				return altLoc < rhs.altLoc;

			return false;
		}

		// Writes "name resName chainID resSeq[iCode][ altLoc]".
		friend std::ostream &operator<<(std::ostream &os, const ATOM_REF &a)
		{
			os << a.name << ' ' << a.resName << ' ' << a.chainID << ' ' << a.resSeq;

			std::string suffix;
			if (a.iCode != ' ')
				suffix += a.iCode;
			os << suffix;

			std::string alt;
			if (a.altLoc != ' ')
			{
				alt += ' ';
				alt += a.altLoc;
			}
			os << alt;

			return os;
		}
	};

	// A LINK/LINKR record: the two linked atoms, their symmetry operators
	// and the link distance (the distance is only read for LINK records).
	struct LINK
	{
		ATOM_REF a, b;
		std::string symOpA, symOpB;
		float distance;
	};

	// One node of a SUGAR_TREE. entityName() treats a sugar whose `next`
	// equals another sugar's `c1` as attached to that other sugar and uses
	// leaving_o as the N in the "(1-N)" part of the generated name.
	struct SUGAR
	{
		ATOM_REF c1;
		int leaving_o;
		ATOM_REF next;
	};

	// A collection of linked sugars, stored as a flat vector, that can
	// produce a descriptive name for the whole tree.
	class SUGAR_TREE : public std::vector<SUGAR>
	{
	  public:
		// Name of the tree starting at the first element, or an empty
		// string for an empty tree.
		std::string entityName() const
		{
			return empty() ? "" : entityName(begin());
		}

	  private:
		// Recursively build the name for `sugar`: first the names of all
		// sugars that point at it through `next`, each followed by
		// "-(1-<leaving_o>)", then the name of the sugar itself.
		// The first attached sugar is written as is, further ones are
		// wrapped in "-[...]".
		// NOTE(review): there is no cycle detection, this assumes the
		// links really form a tree.
		std::string entityName(const_iterator sugar) const
		{
			std::string result;

			for (auto i = begin(); i != end(); ++i)
			{
				if (i->next != sugar->c1)
					continue;

				auto n = entityName(i) + "-(1-" + std::to_string(i->leaving_o) + ")";

				if (result.empty())
					result = n;
				else
					result += "-[" + n + ']';
			}

			if (not result.empty() and result.back() != ']')
				result += '-';

			// prefer the name from the compound factory, fall back to a
			// few hard coded names and finally to the residue name itself
			auto compound = cif::compound_factory::instance().create(sugar->c1.resName);
			if (compound)
				result += compound->name();
			else if (sugar->c1.resName == "MAN")
				result += "alpha-D-mannopyranose";
			else if (sugar->c1.resName == "BMA")
				result += "beta-D-mannopyranose";
			else if (sugar->c1.resName == "NAG")
				result += "2-acetamido-2-deoxy-beta-D-glucopyranose";
			else if (sugar->c1.resName == "NDG")
				result += "2-acetamido-2-deoxy-alpha-D-glucopyranose";
			else if (sugar->c1.resName == "FUC")
				result += "alpha-L-fucopyranose";
			else if (sugar->c1.resName == "FUL")
				result += "beta-L-fucopyranose";
			else
				result += sugar->c1.resName;

			return result;
		}
	};

	// ----------------------------------------------------------------

	/*
	    To get from PDB chains to CIF entity and poly records we take the following steps:

	    First check if there is a Primary Structure Section. If there is, it should contain
	    a valid DBREF/SEQRES pair that allows the reconstruction of numbering of residues.

	    If that fails, we fall back to:

	    1. Collect the chains from the PDB file.
	    2. For each chain, split out the residues and waters, assign those to new entities
	    3. If there are multiple chains containing residues, align those to find unique polymers
	    4. Annotate the entity records with available information in the PDB file (COMPND e.g.)
	    5. Create the mapping structures from PDB numbering to CIF numbering.
	*/

	// A compound identified by its MOL_ID, with the chains that belong to
	// it and key/value information collected for it.
	// GetOrCreateCompound initialises only mMolID.
	struct PDBCompound
	{
		int mMolID;
		std::string mTitle;
		std::set<char> mChains;
		std::map<std::string, std::string> mInfo;
		std::map<std::string, std::string> mSource;
		int mCount = 0;
	};

	// One residue in the sequence of a chain.
	struct PDBSeqRes
	{
		std::string mMonID;
		int mSeqNum;
		char mIcode;

		int mDbSeqNum = 0;
		bool mSeen = false;
		std::set<std::string> mAlts;

		// Only monomer id, sequence number and insertion code are compared;
		// the bookkeeping members are ignored.
		bool operator==(const PDBSeqRes &rhs) const
		{
			return mSeqNum == rhs.mSeqNum and mMonID == rhs.mMonID and mIcode == rhs.mIcode;
		}
	};

	// Everything that is known about a single PDB chain.
	struct PDBChain
	{
		// Only the PDB ID code and the chain ID of mDbref are set here, the
		// other DBREF members are value initialised.
		PDBChain(const std::string &structureID, char chainID, int molID)
			: mDbref{ structureID, chainID }
			, mWaters(0)
			, mTerIndex(0)
			, mMolID(molID)
			, mNextSeqNum(1)
			, mNextDbSeqNum(1)
		{
		}

		DBREF mDbref;
		std::vector<PDBSeqRes> mSeqres, mHet;
		int mWaters;
		int mTerIndex;

		int mMolID;

		// scratch values for reading SEQRES records
		int mNextSeqNum;
		int mNextDbSeqNum;

		// scratch value for aligning
		struct AtomRes
		{
			std::string mMonID;
			int mSeqNum;
			char mIcode;

			// note: the comparisons look at the position only, not at mMonID
			bool operator==(const AtomRes &rhs) const { return mSeqNum == rhs.mSeqNum and mIcode == rhs.mIcode; }
			bool operator!=(const AtomRes &rhs) const { return mSeqNum != rhs.mSeqNum or mIcode != rhs.mIcode; }
		};
		std::vector<AtomRes> mResiduesSeen;

		// both are defined outside of this class
		int AlignResToSeqRes();
		bool SameSequence(const PDBChain &rhs) const;
	};

	// ----------------------------------------------------------------

	// Return the compound with the given MOL_ID. When there is none yet, a
	// new one is appended and the next free entity number is recorded for
	// it in mMolID2EntityID.
	PDBCompound &GetOrCreateCompound(int molID)
	{
		for (PDBCompound &known : mCompounds)
		{
			if (known.mMolID == molID)
				return known;
		}

		mCompounds.push_back(PDBCompound{ molID });
		mMolID2EntityID[molID] = std::to_string(mNextEntityNr++);

		return mCompounds.back();
	}

	// locate the PDBChain record for a chain ID, or create it with dummy data if missing
	// Return the PDBChain for chainID, creating it with dummy data when it
	// does not exist yet. A new chain gets the MOL_ID of the first compound
	// that lists the chain, or 0 when no compound does.
	// numRes is currently not used.
	PDBChain &GetChainForID(char chainID, int numRes = 0)
	{
		for (PDBChain &known : mChains)
		{
			if (known.mDbref.chainID == chainID)
				return known;
		}

		// locate the compound for this chain, if any (does that happen?)
		const auto owner = std::find_if(mCompounds.begin(), mCompounds.end(), [chainID](const PDBCompound &cmp) -> bool
			{ return cmp.mChains.count(chainID) > 0; });
		const int molID = owner != mCompounds.end() ? owner->mMolID : 0;

		mChains.emplace_back(mStructureID, chainID, molID);

		return mChains.back();
	}

	// Remember chemComp in mChemComp, keeping first-seen order and no duplicates.
	void InsertChemComp(const std::string &chemComp)
	{
		for (const std::string &known : mChemComp)
		{
			if (known == chemComp)
				return;
		}

		mChemComp.push_back(chemComp);
	}

	// Remember atomType in mAtomTypes, keeping first-seen order and no duplicates.
	void InsertAtomType(const std::string &atomType)
	{
		for (const std::string &known : mAtomTypes)
		{
			if (known == atomType)
				return;
		}

		mAtomTypes.push_back(atomType);
	}

	// ----------------------------------------------------------------

	// Return the first record in the list for which pred returns true, or
	// nullptr when there is no such record.
	template <typename Predicate>
	PDBRecord *FindRecord(Predicate &&pred)
	{
		PDBRecord *rec = mData;

		while (rec != nullptr and not pred(*rec))
			rec = rec->mNext;

		return rec;
	}

	// Return the first record with the given name, or nullptr.
	PDBRecord *FindRecord(const char *name)
	{
		auto hasName = [name](PDBRecord &rec) -> bool
		{
			return rec.is(name);
		};

		return FindRecord(hasName);
	}

	// ----------------------------------------------------------------

	// Character at the given column of the current record (mRec).
	char vC(std::size_t column) const
	{
		return mRec->vC(column);
	}

	// Trimmed text in the given columns of the current record (mRec); by
	// default the text runs to the end of the record.
	std::string vS(std::size_t columnFirst, std::size_t columnLast = std::numeric_limits<std::size_t>::max()) const
	{
		return mRec->vS(columnFirst, columnLast);
	}

	// Text of a floating point field of the current record (mRec).
	std::string vF(std::size_t columnFirst, std::size_t columnLast) const
	{
		return mRec->vF(columnFirst, columnLast);
	}

	// Integer stored in the given columns of the current record (mRec);
	// throws when the field does not contain a valid integer.
	int vI(int columnFirst, int columnLast) const
	{
		return mRec->vI(columnFirst, columnLast);
	}

	// ----------------------------------------------------------------

	// Map a PDB residue location to a seqnum in a struct_asym
	// Look up the entry stored in mChainSeq2AsymSeq for a PDB residue
	// location (chain, sequence number, insertion code).
	// Throws std::runtime_error, nested around the exception thrown by the
	// map lookup, when the residue is not known.
	std::tuple<std::string, int, bool> MapResidue(char chainID, int resSeq, char iCode) const
	{
		auto key = std::make_tuple(chainID, resSeq, iCode);

		try
		{
			return mChainSeq2AsymSeq.at(key);
		}
		catch (const std::exception &ex)
		{
			// must stay inside the handler: throw_with_nested captures the
			// exception that is currently being handled
			throw_with_nested(std::runtime_error(std::string("Residue ") + chainID + std::to_string(resSeq) + iCode + " could not be mapped"));
		}
	}

	// Non-throwing variant of MapResidue. For an unknown residue ec is set
	// to residueNotFound and a default constructed tuple is returned; ec is
	// left untouched on success.
	std::tuple<std::string, int, bool> MapResidue(char chainID, int resSeq, char iCode, std::error_code &ec) const
	{
		const auto hit = mChainSeq2AsymSeq.find(std::make_tuple(chainID, resSeq, iCode));
		if (hit != mChainSeq2AsymSeq.end())
			return hit->second;

		ec = error::make_error_code(error::pdbErrors::residueNotFound);
		if (cif::VERBOSE > 0)
			std::cerr << "Residue " << chainID << resSeq << iCode << " could not be mapped\n";

		return {};
	}

	// ----------------------------------------------------------------

	// Read all lines from the stream and build the list of records
	void PreParseInput(std::istream &is);

	// Navigation over the record list built by PreParseInput
	void GetNextRecord();
	void Match(const std::string &expected, bool throwIfMissing);

	// The parse steps below are declared here and defined outside of this
	// class; each takes its input from the record list.
	void ParseTitle();
	void ParseCitation(const std::string &id);
	void ParseRemarks();

	//	void ParseRemark3();
	//	std::size_t ParseRemark3(const std::string& program, const Remark3Template templ[], std::size_t N);
	//	std::string NextRemark3Line();

	void ParseRemark200();
	void ParseRemark350();

	void ParsePrimaryStructure();
	void ParseHeterogen();
	void ConstructEntities();
	void ConstructSugarTrees(int &asymNr);
	void ParseSecondaryStructure();
	void ParseConnectivtyAnnotation();
	void ParseMiscellaneousFeatures();
	void ParseCrystallographic();
	void ParseCoordinateTransformation();
	void ParseCoordinate(int modelNr);
	void ParseConnectivty();
	void ParseBookkeeping();

	// ----------------------------------------------------------------

	// Return a pointer to the category called name in mDatablock.
	// NOTE(review): presumably datablock::operator[] creates the category
	// when it does not exist yet — confirm in cif++.
	category *getCategory(std::string name)
	{
		return &mDatablock[name];
	}

	// Split value at each comma and trim the resulting fields
	std::vector<std::string> SplitCSV(const std::string &value);

	// Convert a PDB date to the CIF form.
	//
	//   "DD-MMM-YY" becomes "YYYY-MM-DD"
	//   "MMM-YY"    becomes "YYYY-MM"
	//
	// Month names must be upper case. A two digit year yy is read as 19yy,
	// unless that is before 1950, in which case it is read as 20yy.
	//
	// When s has neither form it is returned unchanged and ec is set to
	// invalidDate. ec is not modified on success.
	std::string pdb2cifDate(std::string s, std::error_code &ec)
	{
		// Compiling a std::regex is expensive, so do it once instead of on
		// every call (same approach as in pdb2cifSymmetry).
		static const std::regex
			rx1(R"((\d{2})-(JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC)-(\d{2}))"),
			rx2(R"((JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC)-(\d{2}))");

		const auto monthNumber = [](const std::string &name) -> int
		{
			auto mi = kMonths.find(name);
			if (mi == kMonths.end())
				throw std::runtime_error("Invalid month: '" + name + '\'');
			return mi->second;
		};

		const auto fullYear = [](const std::string &yy) -> int
		{
			int year = 1900 + std::stoi(yy);
			if (year < 1950)
				year += 100;
			return year;
		};

		std::smatch m;

		try
		{
			if (regex_match(s, m, rx1))
			{
				const int day = std::stoi(m[1].str());
				const int month = monthNumber(m[2].str());
				const int year = fullYear(m[3].str());

				s = cif::format("{:04}-{:02}-{:02}", year, month, day);
			}
			else if (regex_match(s, m, rx2))
			{
				const int month = monthNumber(m[1].str());
				const int year = fullYear(m[2].str());

				s = cif::format("{:04}-{:02}", year, month);
			}
			else
				ec = error::make_error_code(error::pdbErrors::invalidDate);
		}
		catch (const std::exception &ex)
		{
			if (cif::VERBOSE > 0)
				std::cerr << ex.what() << '\n';
			ec = error::make_error_code(error::pdbErrors::invalidDate);
		}

		return s;
	}

	// Convenience variant of pdb2cifDate: an invalid date is reported on
	// std::cerr (only when cif::VERBOSE > 0) instead of through an error
	// code. The result of the conversion is returned in both cases.
	std::string pdb2cifDate(std::string s)
	{
		std::error_code ec;
		auto result = pdb2cifDate(s, ec);
		if (ec and cif::VERBOSE > 0)
			std::cerr << "Invalid date(" << s << "): " << ec.message() << '\n';
		return result;
	}

	// Convert an author name as written in PDB files to the form used here.
	//
	// A name that starts with one or more upper case initials, each
	// followed by a period, is rewritten as "<rest>, <initials>". After
	// that every character that does not directly follow punctuation, white
	// space or the start of the string is converted to lower case.
	std::string pdb2cifAuth(std::string author)
	{
		// compiled once instead of on every call
		static const std::regex rx(R"(((?:[A-Z]+\.)+)(.+))");

		cif::trim(author);

		std::smatch m;
		if (regex_match(author, m, rx))
			author = m[2].str() + ", " + m[1].str();

		bool upper = true; // true when the next character starts a word
		for (auto &c : author)
		{
			// the <cctype> functions require a value representable as
			// unsigned char; a plain (signed) char above 0x7f is not
			const auto uc = static_cast<unsigned char>(c);

			if (ispunct(uc) or isspace(uc))
				upper = true;
			else if (upper)
				upper = false;
			else
				c = cif::tolower(c);
		}

		return author;
	}

	// Convert a PDB symmetry operator such as "1555" to the CIF notation
	// "1_555": an underscore is inserted in front of the last three digits.
	// An empty string is returned unchanged; anything else that is not four
	// to six digits results in a std::runtime_error.
	std::string pdb2cifSymmetry(std::string s)
	{
		static const std::regex sgRx(R"((\d{1,3})(\d{3}))");

		if (s.empty())
			return s;

		std::smatch parts;
		const bool valid = std::regex_match(s, parts, sgRx);
		if (not valid)
			throw std::runtime_error("invalid symmetry value '" + s + '\'');

		return parts[1].str() + "_" + parts[2].str();
	}

	// Convert a charge written as digits followed by a sign ("2+", "1-") to
	// a signed number as text ("2", "-1"). Any other text is returned
	// unchanged.
	std::string pdb2cifCharge(std::string c)
	{
		// compiled once instead of on every call, as in pdb2cifSymmetry
		static const std::regex rx(R"((\d+)(\+|-))");

		std::smatch m;
		if (std::regex_match(c, m, rx))
		{
			const bool negative = m[2].str() == "-";
			const std::string digits = m[1].str();

			c = negative ? '-' + digits : digits;
		}

		return c;
	}

	// All non-blank alternate location indicators found for the given atom
	std::vector<char> altLocsForAtom(char chainID, int seqNum, char iCode, std::string atomName);
	// Append the asym IDs known for chainID to asymIds, then sort and de-duplicate asymIds
	void MapChainID2AsymIDS(char chainID, std::vector<std::string> &asymIds);

	// Convenience overload of FindLink that takes the fields of the first
	// atom as separate arguments.
	//
	// ATOM_REF is an aggregate that is filled positionally in the order
	// name, resName, resSeq, chainID, iCode, altLoc. This differs from the
	// order of the parameters here, so the arguments are reordered to make
	// each value end up in the member it belongs to.
	std::tuple<ATOM_REF, bool> FindLink(const std::string &name1, const std::string &resName1, int resSeq1, char altLoc1, char chainID1, char iCode1,
		const std::string &name2, const std::string &resName2 = "")
	{
		return FindLink(ATOM_REF{ name1, resName1, resSeq1, chainID1, iCode1, altLoc1 }, name2, resName2);
	}

	std::tuple<ATOM_REF, bool> FindLink(const ATOM_REF &atom, const std::string &name2, const std::string &resName2 = "") const
	{
		auto i = std::find_if(mLinks.begin(), mLinks.end(), [&](const LINK &link)
			{ return (link.a == atom and link.b.name == name2 and (resName2.empty() or link.b.resName == resName2)) or
			         (link.b == atom and link.a.name == name2 and (resName2.empty() or link.a.resName == resName2)); });

		if (i != mLinks.end())
			return { i->a == atom ? i->b : i->a, true };

		return {};
	}

	// ----------------------------------------------------------------

	PDBRecord *mData; // head of the list of records, owned by this parser
	PDBRecord *mRec;  // record that is currently being processed
	cif::datablock mDatablock; // the categories are created in this datablock

	std::string mStructureID; // taken from the HEADER record, "nohd" when missing
	std::string mModelTypeDetails;
	std::string mOriginalDate; // deposition date from the HEADER record, converted by pdb2cifDate
	std::string mExpMethod = "X-RAY DIFFRACTION";
	int mCitationAuthorNr = 1, mCitationEditorNr = 1;
	int mNextMolID = 1, mNextEntityNr = 1;
	int mNextSoftwareOrd = 1;

	// Sequence difference; the field names follow the SEQADV record of the
	// PDB format.
	struct SEQADV
	{
		std::string resName;
		char chainID;
		int seqNum;
		char iCode;
		std::string database;
		std::string dbAccession;
		std::string dbRes;
		int dbSeq;
		std::string conflict;
	};

	std::vector<SEQADV> mSeqadvs;

	// std::list is used for these two: GetOrCreateCompound and
	// GetChainForID hand out references to the elements while adding new
	// ones later on
	std::list<PDBCompound> mCompounds;
	std::list<PDBChain> mChains;
	std::vector<HET> mHets;
	std::map<std::string, std::string> mHetnams;
	std::map<std::string, std::string> mHetsyns;
	std::map<std::string, std::string> mFormuls;
	std::string mWaterHetID;
	std::vector<std::string> mChemComp, mAtomTypes; // unique values in first-seen order

	std::map<std::string, std::string> mRemark200; // key/value pairs from REMARK 200 and REMARK 240
	std::string mRefinementSoftware;
	int mAtomID = 0;
	int mPdbxDifOrdinal = 0;

	std::vector<UNOBS> mUnobs;
	std::vector<LINK> mLinks; // filled by PreParseInput from LINK and LINKR records

	// various maps between numbering schemes

	// (chain ID, sequence number, insertion code) -> tuple whose first
	// element is the asym ID; NOTE(review): the other two are presumably the
	// sequence number in that asym and a polymer/monomer flag — confirm
	std::map<std::tuple<char, int, char>, std::tuple<std::string, int, bool>> mChainSeq2AsymSeq;

	std::map<int, std::string> mMolID2EntityID;
	std::map<std::string, std::string> mHet2EntityID;
	std::map<std::string, std::string> mBranch2EntityID;
	std::map<std::string, std::string> mAsymID2EntityID;
	std::map<std::string, std::string> mMod2parent;
	std::set<std::string> mSugarEntities;

// --------------------------------------------------------------------

std::vector<char> PDBFileParser::altLocsForAtom(char inChainID, int inResSeq, char inICode, std::string inAtomName)
{
	// well, maybe this could be optimized...
	std::set<char> result;

	for (auto r = mData; r != nullptr; r = r->mNext)
	{
		if (r->is("ATOM  ") or r->is("HETATM")) //	 1 -  6        Record name   "ATOM  "
		{                                       //	 ...
			std::string name = r->vS(13, 16);   //	13 - 16        Atom          name         Atom name.
			char altLoc = r->vC(17);            //	17             Character     altLoc       Alternate location indicator.
			char chainID = r->vC(22);           //	22             Character     chainID      Chain identifier.
			int resSeq = r->vI(23, 26);         //	23 - 26        Integer       resSeq       Residue sequence number.
			char iCode = r->vC(27);             //	27             AChar         iCode        Code for insertion of residues.

			if (chainID == inChainID and resSeq == inResSeq and iCode == inICode and name == inAtomName and altLoc != ' ')
				result.insert(altLoc);
		}
	}

	return { result.begin(), result.end() };
}

// Append the asym IDs of all residues mapped for chainID to asymIds. After
// that the complete vector, including what the caller already put in it, is
// sorted (shorter IDs first, IDs of equal length alphabetically) and
// duplicates are removed.
void PDBFileParser::MapChainID2AsymIDS(char chainID, std::vector<std::string> &asymIds)
{
	for (const auto &entry : mChainSeq2AsymSeq)
	{
		if (std::get<0>(entry.first) != chainID)
			continue;

		asymIds.push_back(std::get<0>(entry.second));
	}

	const auto shorterThenLess = [](const std::string &lhs, const std::string &rhs)
	{
		if (lhs.length() != rhs.length())
			return lhs.length() < rhs.length();
		return lhs.compare(rhs) < 0;
	};

	std::sort(asymIds.begin(), asymIds.end(), shorterThenLess);

	const auto firstDuplicate = std::unique(asymIds.begin(), asymIds.end());
	asymIds.erase(firstDuplicate, asymIds.end());
}

// --------------------------------------------------------------------

// Read the complete input and turn it into a linked list of PDBRecord objects
// starting at mData.
//
// Per line the first six characters are the record name, the rest is the
// value. For record types that use continuation lines the values of the
// continuation lines are appended to the value of the first line, so later
// passes see one record per logical item. Lines with a record name that is
// not in kSupportedRecords are dropped; the dropped names are reported once
// at the end. Reading stops after an END record or at the end of the stream.
//
// Side effects besides building the list:
//   - key/value pairs found in REMARK 200 / REMARK 240 are stored in mRemark200
//   - each LINK and LINKR record is stored in mLinks
//   - mRec is set to the first record
//
// Throws std::runtime_error when the input contains no usable record or when a
// continuation number is not a valid number. Exceptions thrown by stoi/substr
// on malformed REVDAT, FORMUL or SITE records propagate to the caller.
void PDBFileParser::PreParseInput(std::istream &is)
{
	std::string lookahead;
	uint32_t lineNr = 1;
	getline(is, lookahead);

	// calling back() on an empty string is undefined, the first line might
	// very well be empty (or the stream might be empty altogether)
	if (not lookahead.empty() and lookahead.back() == '\r')
		lookahead.pop_back();

	// parse the continuation number stored in lookahead at offset/len,
	// an empty field counts as 0
	auto contNr = [&lookahead](int offset, int len) -> int
	{
		std::string cs = lookahead.substr(offset, len);
		cif::trim(cs);
		int result = 0;

		if (not cs.empty())
		{
			auto r = std::from_chars(cs.data(), cs.data() + cs.length(), result);
			if ((bool)r.ec)
				throw std::runtime_error("Continuation std::string '" + cs + "' is not valid");
		}

		return result;
	};

	PDBRecord *last = nullptr;
	std::set<std::string> dropped;

	// Invariant at the top of the loop: lookahead contains the next line
	// that has not been processed yet and lineNr is the number of that line.
	for (;;)
	{
		if (lookahead.empty())
		{
			if (is.eof())
				break;

			if (cif::VERBOSE > 0)
				std::cerr << "Line number " << lineNr << " is empty!\n";

			getline(is, lookahead);
			++lineNr;

			continue;
		}

		std::string type = lookahead.substr(0, 6);
		std::string value;
		if (lookahead.length() > 6)
			value = cif::trim_right_copy(lookahead.substr(6));

		lookahead.clear();

		uint32_t curLineNr = lineNr;
		getline(is, lookahead);
		++lineNr;

		if (kSupportedRecords.count(type) == 0)
		{
			cif::trim(type);

			if (type != "END") // special case
				dropped.insert(type);

			// Only the current line is dropped. lookahead already holds the
			// following line and must be kept, otherwise the record after
			// each unsupported record would silently disappear.
			continue;
		}

		// see if we need to append continuation values
		if (type == "AUTHOR" or
			type == "EXPDTA" or
			type == "MDLTYP" or
			type == "KEYWDS" or
			type == "SPLIT " or
			type == "SPRSDE" or
			type == "TITLE ")
		{
			int n = 2;
			while (lookahead.substr(0, 6) == type and contNr(7, 3) == n)
			{
				value += cif::trim_right_copy(lookahead.substr(10));
				getline(is, lookahead);
				++lineNr;
				++n;
			}
		}
		else if (type == "COMPND")
		{
			// the line structure is kept, SpecificationListParser needs it
			int n = 2;
			value += '\n';
			while (lookahead.substr(0, 6) == type and contNr(7, 3) == n)
			{
				value += cif::trim_right_copy(lookahead.substr(10));
				value += '\n';
				getline(is, lookahead);
				++lineNr;
				++n;
			}
		}
		else if (type == "REVDAT")
		{
			// continuation lines belong to the same modification number
			int revNr = stoi(value.substr(1, 3));
			int n = 2;
			while (lookahead.substr(0, 6) == type and
				   stoi(lookahead.substr(7, 3)) == revNr and
				   contNr(10, 2) == n)
			{
				value += lookahead.substr(38);
				getline(is, lookahead);
				++lineNr;
				++n;
			}
		}
		else if (type == "CAVEAT")
		{
			int n = 2;
			while (lookahead.substr(0, 6) == type and contNr(7, 3) == n)
			{
				value += cif::trim_right_copy(lookahead.substr(13));
				getline(is, lookahead);
				++lineNr;
				++n;
			}
		}
		else if (type == "OBSLTE")
		{
			while (lookahead.substr(0, 6) == type)
			{
				value += lookahead.substr(31);
				getline(is, lookahead);
				++lineNr;
			}
		}
		else if (type == "SOURCE")
		{
			// the line structure is kept, SpecificationListParser needs it
			value += '\n';
			int n = 2;
			while (lookahead.substr(0, 6) == type and contNr(7, 3) == n)
			{
				value += cif::trim_copy(lookahead.substr(10));
				value += '\n';
				getline(is, lookahead);
				++lineNr;
				++n;
			}
		}
		else if (type == "FORMUL")
		{
			try
			{
				int compNr;
				try
				{
					compNr = stoi(value.substr(1, 3));
				}
				catch (const std::exception &ex)
				{
					// lineNr was already advanced past this line
					if (cif::VERBOSE >= 0)
						std::cerr << "Dropping FORMUL line (" << (lineNr - 1) << ") with invalid component number '" << value.substr(1, 3) << '\'' << '\n';
					continue;
					// throw_with_nested(std::runtime_error("Invalid component number '" + value.substr(1, 3) + '\''));
				}

				int n = 2;
				try
				{
					while (lookahead.substr(0, 6) == type and
						   stoi(lookahead.substr(7, 3)) == compNr and
						   contNr(16, 2) == n)
					{
						value += cif::trim_right_copy(lookahead.substr(19));
						getline(is, lookahead);
						++lineNr;
						++n;
					}
				}
				catch (const std::invalid_argument &ex)
				{
					// the record collected so far is dropped as well
					continue;
					// throw_with_nested(std::runtime_error("Invalid component number '" + lookahead.substr(7, 3) + '\''));
				}
			}
			catch (const std::exception &ex)
			{
				if (cif::VERBOSE >= 0)
					std::cerr << "Error parsing FORMUL at line " << lineNr << '\n';
				throw;
			}
		}
		else if (type == "HETNAM" or
				 type == "HETSYN")
		{
			int n = 2;
			while (lookahead.substr(0, 6) == type and contNr(8, 2) == n)
			{
				value += cif::trim_right_copy(lookahead.substr(16));
				getline(is, lookahead);
				++lineNr;
				++n;
			}
		}
		else if (type == "SITE  ")
		{
			// each residue takes 11 characters; every line is padded to a
			// multiple of 11 so the residues of all lines stay aligned
			std::string siteName = value.substr(5, 3);
			cif::trim_right(value);
			std::size_t n = value.length() - 12;
			value += std::string(11 - (n % 11), ' ');

			while (lookahead.substr(0, 6) == type and lookahead.substr(11, 3) == siteName)
			{
				std::string s = lookahead.substr(18);
				cif::trim_right(s);
				s += std::string(11 - (s.length() % 11), ' ');
				value += s;

				// TODO: improve this... either use numRes or don't lump together all text
				//				value += " " + cif::trim_right_copy();
				getline(is, lookahead);
				++lineNr;
			}
		}
		else if (type == "REMARK")
		{
			// the remark number becomes part of the name, e.g. "REMARK 200"
			type += value.substr(0, 4);

			// parse it now, makes life easier later on
			if (type == "REMARK 200" or type == "REMARK 240")
			{
				auto i = value.find(":");

				if (i != std::string::npos)
				{
					std::string k = value.substr(4, i - 4);
					std::string v = value.substr(i + 1);

					// normalise the key: trimmed, single spaces only
					cif::trim(k);
					while (k.find("  ") != std::string::npos)
						cif::replace_all(k, "  ", " ");
					cif::trim(v);

					if (iequals(v, "NONE") or iequals(v, "N/A") or iequals(v, "NAN"))
						mRemark200[k] = ".";
					else if (not iequals(v, "NULL"))
						mRemark200[k] = v;
				}
			}
		}

		// the extra argument reserves room for the value right behind the record
		PDBRecord *cur = new (value.length()) PDBRecord(curLineNr, type, value);

		if (last == nullptr)
			last = mData = cur;
		else
			last->mNext = cur;

		last = cur;

		cif::trim(type);

		if (type == "LINK" or type == "LINKR")
		{
			LINK link = {};

			link.a.name = cur->vS(13, 16);    //	13 - 16         Atom           name1           Atom name.
			link.a.altLoc = cur->vC(17);      //	17              Character      altLoc1         Alternate location indicator.
			link.a.resName = cur->vS(18, 20); //	18 - 20         Residue name   resName1        Residue  name.
			link.a.chainID = cur->vC(22);     //	22              Character      chainID1        Chain identifier.
			link.a.resSeq = cur->vI(23, 26);  //	23 - 26         Integer        resSeq1         Residue sequence number.
			link.a.iCode = cur->vC(27);       //	27              AChar          iCode1          Insertion code.
			link.b.name = cur->vS(43, 46);    //	43 - 46         Atom           name2           Atom name.
			link.b.altLoc = cur->vC(47);      //	47              Character      altLoc2         Alternate location indicator.
			link.b.resName = cur->vS(48, 50); //	48 - 50         Residue name   resName2        Residue name.
			link.b.chainID = cur->vC(52);     //	52              Character      chainID2        Chain identifier.
			link.b.resSeq = cur->vI(53, 56);  //	53 - 56         Integer        resSeq2         Residue sequence number.
			link.b.iCode = cur->vC(57);       //	57              AChar          iCode2          Insertion code.
			link.symOpA = cur->vS(60, 65);    //	60 - 65         SymOP          sym1            Symmetry operator atom 1.
			link.symOpB = cur->vS(67, 72);    //	67 - 72         SymOP          sym2            Symmetry operator atom 2.

			if (type == "LINK") //	 1 -  6         Record name    "LINK  "
			{
				auto f = cur->vF(74, 78);
				auto r = cif::from_chars(f.data(), f.data() + f.length(), link.distance);
				if ((bool)r.ec and cif::VERBOSE > 0)
					std::cerr << "Error parsing link distance at line " << cur->mLineNr << '\n';
			}
			//	74 – 78         Real(5.2)      Length          Link distance

			mLinks.push_back(link);
		}

		if (type == "END")
			break;
	}

	if (not dropped.empty())
	{
		if (cif::VERBOSE >= 0)
			std::cerr << "Dropped unsupported records: " << cif::join(dropped, ", ") << '\n';
	}

	if (mData == nullptr)
		throw std::runtime_error("Empty file?");

	mRec = mData;
}

void PDBFileParser::GetNextRecord()
{
	if (mRec != nullptr)
		mRec = mRec->mNext;

	if (mRec == nullptr)
	{
		static PDBRecord *end = new (0) PDBRecord({ 0, "END   ", "" });
		mRec = end;
	}
}

// Check that the current record is named `expected`.
//
// On a mismatch this throws std::runtime_error when throwIfMissing is set;
// otherwise the mismatch is only written to std::cerr, and only when
// cif::VERBOSE > 0. The current record is never advanced.
void PDBFileParser::Match(const std::string &expected, bool throwIfMissing)
{
	assert(mRec);

	// Nothing to report when the names agree.
	if (mRec->mName == expected)
		return;

	const std::string complaint = "Expected record " + expected + " but found " + mRec->mName;

	if (throwIfMissing)
		throw std::runtime_error(complaint);

	if (cif::VERBOSE > 0)
		std::cerr << complaint << '\n';
}

// Split `value` on commas and trim each resulting field with cif::trim.
std::vector<std::string> PDBFileParser::SplitCSV(const std::string &value)
{
	std::vector<std::string> fields = cif::split<std::string>(value, ",");

	for (auto field = fields.begin(); field != fields.end(); ++field)
		cif::trim(*field);

	return fields;
}

// Parse the title section of the PDB file, starting at the current record,
// and store the result in the corresponding mmCIF categories.
//
// Records are consumed in this fixed order: HEADER, OBSLTE, TITLE, SPLIT,
// CAVEAT, COMPND, SOURCE, KEYWDS, EXPDTA, NUMMDL, MDLTYP, AUTHOR, REVDAT,
// SPRSDE and JRNL. Each record is optional here: the records checked with
// Match() are only reported as missing (when cif::VERBOSE > 0), never
// treated as fatal.
//
// Throws std::runtime_error for SPLIT entries and for a SOURCE specification
// that appears before any matching MOL_ID. A malformed MOL_ID value makes
// stoi throw; that exception is not caught here.
void PDBFileParser::ParseTitle()
{
	// strict ordering required

	// HEADER
	//	 1 -  6       Record name    "HEADER"
	//	11 - 50       String(40)     classification    Classifies the molecule(s).
	//	51 - 59       Date           depDate           Deposition date. This is the date the
	//	                                               coordinates  were received at the PDB.
	//	63 - 66       IDcode         idCode            This identifier is unique within the PDB.

	Match("HEADER", false);

	std::string keywords;

	if (mRec->is("HEADER"))
	{
		mStructureID = vS(63, 66);
		keywords = vS(11, 50);
		mOriginalDate = pdb2cifDate(vS(51, 59));

		cif::trim(keywords);

		GetNextRecord();
	}

	// Fall back to a placeholder ID when the HEADER record is absent or has no ID code.
	cif::trim(mStructureID);
	if (mStructureID.empty())
		mStructureID = "nohd";

	mDatablock.set_name(mStructureID);

	auto cat = getCategory("entry");
	//	cat->addColumn("id");
	cat->emplace({ { "id", mStructureID } });

	// OBSLTE
	if (mRec->is("OBSLTE"))
	{
		//	 1 -  6       Record name   "OBSLTE"
		//	 9 - 10       Continuation  continuation  Allows concatenation of multiple records
		//	12 - 20       Date          repDate       Date that this datablock was replaced.
		//	22 - 25       IDcode        idCode        ID code of this datablock.
		//	32 - 35       IDcode        rIdCode       ID code of datablock that replaced this one.
		//	37 - 40       ...

		std::string old = vS(22, 25);
		std::string date = pdb2cifDate(vS(12, 20));
		cat = getCategory("pdbx_database_PDB_obs");

		// One row per replacing ID code listed from column 32 onwards.
		std::string value = mRec->vS(32);
		for (auto i : cif::split<std::string>(value, " ", true))
		{
			cat->emplace({ { "id", "OBSLTE" },
				{ "date", date },
				{ "replace_pdb_id", old },
				{ "pdb_id", i } });
		}

		GetNextRecord();
	}

	// TITLE
	Match("TITLE ", false);
	std::string title;
	if (mRec->is("TITLE ")) //	 1 -  6       Record name    "TITLE "
	{                       //	 9 - 10       Continuation   continuation  Allows concatenation of multiple records.
		title = vS(11);     //	11 - 80       String         title         Title of the  experiment.
		GetNextRecord();
	}

	// SPLIT
	if (mRec->is("SPLIT "))
	{
		//	 1 -  6        Record  name  "SPLIT "
		//	 9 - 10        Continuation  continuation  Allows concatenation of multiple records.
		//	12 - 15        IDcode        idCode        ID code of related datablock.

		throw std::runtime_error("SPLIT PDB files are not supported");
	}

	// CAVEAT
	int caveatID = 1;
	while (mRec->is("CAVEAT")) //	  1 - 6       Record name   "CAVEAT"
	{
		// clang-format off
		getCategory("database_PDB_caveat")->emplace({
			{ "id", caveatID++ },
			{ "text", std::string{ mRec->vS(20) } } //	20 - 79       String        comment        Free text giving the reason for the  CAVEAT.
		});
		// clang-format on

		GetNextRecord();
	}

	// COMPND
	Match("COMPND", false);
	//	 1 -  6       Record name     "COMPND"
	//	 8 - 10       Continuation    continuation  Allows concatenation of multiple records.
	//	11 - 80       Specification   compound      Description of the molecular components.
	//	              list

	if (mRec->is("COMPND"))
	{
		std::string value{ mRec->vS(11) };
		if (value.find(':') == std::string::npos)
		{
			// special case for dumb, stripped files
			auto &comp = GetOrCreateCompound(1);
			comp.mInfo["MOLECULE"] = value;
		}
		else
		{
			SpecificationListParser p(value);

			for (;;)
			{
				std::string key, val;
				std::tie(key, val) = p.GetNextSpecification();

				if (key.empty())
					break;

				// The guard and the dispatch below must agree on what counts as
				// a MOL_ID key. Otherwise a differently cased key passes the
				// guard and reaches mCompounds.back() on an empty container.
				const bool isMolID = iequals(key, "MOL_ID");

				if (not isMolID and mCompounds.empty())
				{
					if (cif::VERBOSE > 0)
						std::cerr << "Ignoring invalid COMPND record\n";
					break;
				}

				if (isMolID)
				{
					auto &comp = GetOrCreateCompound(stoi(val));
					comp.mTitle = title;
				}
				else if (key == "CHAIN")
				{
					for (auto c : cif::split<std::string>(val, ","))
					{
						cif::trim(c);

						// An empty token (e.g. a trailing comma) would
						// otherwise register '\0' as a chain ID.
						if (c.empty())
							continue;

						mCompounds.back().mChains.insert(c[0]);
					}
				}
				else
					mCompounds.back().mInfo[key] = val;
			}
		}

		GetNextRecord();
	}

	// SOURCE
	Match("SOURCE", false);

	if (mRec->is("SOURCE"))
	{
		//	 1 -  6      Record name    "SOURCE"
		//	 8 - 10      Continuation   continuation   Allows concatenation of multiple records.
		//	11 - 79      Specification  srcName        Identifies the source of the
		//	             List                          macromolecule in a  token: value format.

		// Source map of the compound selected by the most recent MOL_ID.
		std::map<std::string, std::string> *source = nullptr;

		SpecificationListParser p(vS(11));

		for (;;)
		{
			std::string key, val;
			std::tie(key, val) = p.GetNextSpecification();

			if (key.empty())
				break;

			if (key == "MOL_ID")
			{
				// NOTE(review): when no compound has this MOL_ID, `source` keeps
				// pointing at the previously selected compound (or stays null).
				for (auto &c : mCompounds)
				{
					if (c.mMolID == stoi(val))
					{
						source = &c.mSource;
						break;
					}
				}

				continue;
			}

			if (source == nullptr)
				throw std::runtime_error("At line " + std::to_string(mRec->mLineNr) + ": missing MOL_ID in SOURCE");

			(*source)[key] = val;
		}

		GetNextRecord();
	}

	// KEYWDS
	Match("KEYWDS", false);
	std::string pdbxKeywords;

	if (mRec->is("KEYWDS"))    //	 1 -  6       Record name    "KEYWDS"
	{                          //	 9 - 10       Continuation   continuation  Allows concatenation of records if necessary.
		pdbxKeywords = vS(11); //	11 - 79       List           keywds        Comma-separated list of keywords relevant
		                       //	                                           to the datablock.
		GetNextRecord();
	}

	// Only write struct_keywords when HEADER or KEYWDS supplied something.
	if (not(keywords.empty() and pdbxKeywords.empty()))
	{
		// clang-format off
		getCategory("struct_keywords")->emplace({
			{ "entry_id", mStructureID },
			{ "pdbx_keywords", keywords },
			{ "text", pdbxKeywords }
		});
		// clang-format on
	}

	// EXPDTA
	Match("EXPDTA", false);
	if (mRec->is("EXPDTA"))
	{
		mExpMethod = vS(11);

		cat = getCategory("exptl");

		auto crystals = cif::split<std::string>(mRemark200["NUMBER OF CRYSTALS USED"], "; ");
		if (crystals.empty())
			crystals.push_back("");

		// NOTE(review): ci is never advanced, so every method row below
		// receives the first entry of `crystals` - confirm this is intended.
		auto ci = crystals.begin();

		for (auto expMethod : cif::split<std::string>(mExpMethod, ";"))
		{
			cif::trim(expMethod);

			if (expMethod.empty())
				continue;

			// clang-format off
			cat->emplace({
				{ "entry_id", mStructureID },
				{ "method", expMethod },
				{ "crystals_number", ci != crystals.end() ? *ci : "" }
			});
			// clang-format on
		}

		GetNextRecord();
	}

	// NUMMDL
	if (mRec->is("NUMMDL"))
	{
		if (cif::VERBOSE > 0)
			std::cerr << "skipping unimplemented NUMMDL record\n";
		GetNextRecord();
	}

	// MDLTYP
	if (mRec->is("MDLTYP"))
	{
		mModelTypeDetails = vS(11);
		GetNextRecord();
	}

	// AUTHOR
	Match("AUTHOR", false);
	if (mRec->is("AUTHOR"))
	{
		int n = 1;
		cat = getCategory("audit_author");

		std::string value = { mRec->vS(11) };
		for (auto author : cif::split<std::string>(value, ",", true))
		{
			// clang-format off
			cat->emplace({
				{ "name", pdb2cifAuth(author) },
				{ "pdbx_ordinal", n }
			});
			// clang-format on
			++n;
		}

		GetNextRecord();
	}

	// REVDAT
	bool firstRevDat = true;
	struct RevDat
	{
		int revNum;
		std::string date, dateOriginal, replaces;
		int modType;
		std::vector<std::string> types;

		bool operator<(const RevDat &rhs) const { return revNum < rhs.revNum; }
	};
	std::vector<RevDat> revdats;

	while (mRec->is("REVDAT"))
	{
		//	 1 -  6       Record name    "REVDAT"
		int revNum = vI(8, 10);                     //	 8 - 10       Integer        modNum        Modification number.
		                                            //	11 - 12       Continuation   continuation  Allows concatenation of multiple records.
		std::string date = pdb2cifDate(vS(14, 22)); //	14 - 22       Date           modDate       Date of modification (or release  for
		                                            //	                                           new entries)  in DD-MMM-YY format. This is
		                                            //	                                           not repeated on continued lines.
		std::string modID = vS(24, 27);             //	24 - 27       IDCode         modID         ID code of this datablock. This is not repeated on
		                                            //	                                           continuation lines.
		int modType = vI(32, 32);                   //	32            Integer        modType       An integer identifying the type of
		                                            //	                                           modification. For all  revisions, the
		                                            //	                                           modification type is listed as 1
		std::string detail = vS(40);                //	40 - 45       LString(6)     record        Modification detail.
		                                            //	47 - 52       LString(6)     record        Modification detail.
		                                            //	54 - 59       LString(6)     record        Modification detail.
		                                            //	61 - 66       LString(6)     record        Modification detail.

		// Only a revision with modType 0 carries the original deposition date.
		revdats.push_back({ revNum, date, modType == 0 ? mOriginalDate : "", modID, modType });

		revdats.back().types = cif::split<std::string>(detail, " ");

		// The first REVDAT record encountered supplies the PDB database code.
		if (firstRevDat)
		{
			// clang-format off
			getCategory("database_2")->emplace({
				{ "database_id", "PDB" },
				{ "database_code", modID }
			});
			// clang-format on
		}

		GetNextRecord();
		firstRevDat = false;
	}

	/*
	This is internal stuff for PDB, don't write it ???
*/
	// Write the revisions in ascending revision number order.
	sort(revdats.begin(), revdats.end());
	for (auto &revdat : revdats)
	{
		// clang-format off
		getCategory("database_PDB_rev")->emplace({
			{ "num", revdat.revNum },
			{ "date", revdat.date },
			{ "date_original", revdat.dateOriginal },
			{ "replaces", revdat.replaces },
			{ "mod_type", revdat.modType }
		});
		// clang-format on

		for (auto &type : revdat.types)
		{
			if (type.empty())
				continue;

			// clang-format off
			getCategory("database_PDB_rev_record")->emplace({
				{ "rev_num", revdat.revNum },
				{ "type", type }
			});
			// clang-format on
		}
	}
	//*/

	// SPRSDE
	if (mRec->is("SPRSDE"))
	{
		if (cif::VERBOSE > 0)
			std::cerr << "skipping unimplemented SPRSDE record\n";
		GetNextRecord();
	}

	// JRNL
	if (mRec->is("JRNL  "))
		ParseCitation("primary");
}

// Parse one literature citation, starting at the current record, and store
// it under `id` in the citation, citation_author and citation_editor
// categories.
//
// Records are consumed while they carry the same record name as the first
// one. For any id other than "primary" the run also ends at the first record
// whose column 12 is not blank. The sub-record keyword in columns 13 - 16
// (AUTH, TITL, EDIT, REF, PUBL, REFN, PMID, DOI) selects how each line is
// interpreted; lines with any other keyword are skipped.
//
// A citation row is always written, even when no sub-record was recognised.
// Author and editor ordinals continue from mCitationAuthorNr and
// mCitationEditorNr, which are advanced here.
void PDBFileParser::ParseCitation(const std::string &id)
{
	// Name of the first record; the loop below stops at any other record name.
	const char *rec = mRec->mName;

	std::string auth, titl, edit, publ, refn, pmid, doi;
	std::string pubname, volume, astm, country, issn, csd;
	std::string pageFirst;
	int year = 0;

	// Append the trimmed text p to s, separating continuation lines by one space.
	auto extend = [](std::string &s, const std::string &p)
	{
		if (not s.empty())
			s += ' ';
		s += cif::trim_copy(p);
	};

	while (mRec->is(rec) and (id == "primary" or vC(12) == ' '))
	{
		std::string k = vS(13, 16);
		if (k == "AUTH")
			extend(auth, vS(20, 79));
		else if (k == "TITL")
			extend(titl, vS(20, 79));
		else if (k == "EDIT")
			extend(edit, vS(20, 79));
		else if (k == "REF")
		{
			// Volume, first page and year are taken from the first REF line
			// only; later REF lines just extend the publication name.
			if (pubname.empty())
			{
				extend(pubname, vS(20, 47));
				if (vS(50, 51) == "V.")
					volume = cif::trim_copy(vS(52, 55));
				pageFirst = vS(57, 61);
				year = vI(63, 66);
			}
			else
				extend(pubname, vS(20, 47));
		}
		else if (k == "PUBL")
			extend(publ, vS(20, 70));
		else if (k == "REFN")
		{
			// The keyword announcing the ASTM code in columns 20 - 23 is
			// "ASTM"; the code itself follows in columns 25 - 30.
			if (vS(20, 23) == "ASTM")
				astm = vS(25, 30);
			country = vS(33, 34);
			if (vS(36, 39) == "ISSN")
				issn = vS(41, 65);
		}
		else if (k == "PMID")
			pmid = vS(20, 79);
		else if (k == "DOI")
			doi = vS(20, 79);

		GetNextRecord();
	}

	auto cat = getCategory("citation");
	// clang-format off
	cat->emplace({
		{ "id", id },
		{ "title", titl },
		{ "journal_abbrev", pubname },
		{ "journal_volume", volume },
		{ "page_first", pageFirst },
		{ "year", year > 0 ? std::to_string(year) : "" },
		{ "journal_id_ASTM", astm },
		{ "country", country },
		{ "journal_id_ISSN", issn },
		{ "journal_id_CSD", csd },
		{ "book_publisher", publ },
		{ "pdbx_database_id_PubMed", pmid },
		{ "pdbx_database_id_DOI", doi }
	});
	// clang-format on

	if (not auth.empty())
	{
		cat = getCategory("citation_author");
		for (auto author : cif::split<std::string>(auth, ",", true))
		{
			cat->emplace({ { "citation_id", id },
				{ "name", pdb2cifAuth(author) },
				{ "ordinal", mCitationAuthorNr } });

			++mCitationAuthorNr;
		}
	}

	if (not edit.empty())
	{
		cat = getCategory("citation_editor");
		for (auto editor : cif::split<std::string>(edit, ",", true))
		{
			cat->emplace({ { "citation_id", id },
				{ "name", pdb2cifAuth(editor) },
				{ "ordinal", mCitationEditorNr } });

			++mCitationEditorNr;
		}
	}
}

void PDBFileParser::ParseRemarks()
{
	std::string sequenceDetails, compoundDetails, sourceDetails;

	while (cif::starts_with(mRec->mName, "REMARK"))
	{
		int remarkNr = vI(8, 10);

		try
		{
			switch (remarkNr)
			{
				case 1:
					while (mRec->is("REMARK   1") or mRec->is("REMARK 001"))
					{
						if (mRec->mVlen > 15 and vS(12, 20) == "REFERENCE")
						{
							std::string id = vS(22, 70);
							GetNextRecord();

							ParseCitation(id);
						}
						else
							GetNextRecord();
					}
					break;

				case 3:
					// we skip REMARK 3 until we know the mapping
					while (mRec->is("REMARK   3"))
						GetNextRecord();
					break;

				case 4:
					// who cares...
					while (mRec->is("REMARK   4"))
						GetNextRecord();
					break;

				case 100:
				{
					const std::regex rx(R"(THE (\S+) ID CODE IS (\S+?)\.?\s*)");
					std::smatch m;
					std::string r = vS(12);

					if (std::regex_match(r, m, rx))
					{
						auto cat = getCategory("database_2");
						cat->emplace({ { "database_id", m[1].str() },
							{ "database_code", m[2].str() } });
					}

					GetNextRecord();
					break;
				}

				case 200:
				{
					// we already parsed most of this remark, but the "REMARK:" part might have been split.

					bool remark = false;

					do
					{
						std::string r = mRec->vS(12);

						if (cif::starts_with(r, "REMARK: "))
						{
							mRemark200["REMARK"] = r.substr(8);
							remark = true;
						}
						else if (remark)
						{
							if (r.empty())
								remark = false;
							else
								mRemark200["REMARK"] += r;
						}

						GetNextRecord();
					} while (mRec->is("REMARK 200"));
					break;
				}

				case 280:
				{
					std::string density_Matthews, densityPercentSol, conditions;

					const std::regex rx1(R"(SOLVENT CONTENT, VS +\(%\): *(.+))"),
						rx2(R"(MATTHEWS COEFFICIENT, VM \(ANGSTROMS\*\*3/DA\): *(.+))");

					std::smatch m;

					do
					{
						std::string r = vS(12);

						if (conditions.empty())
						{
							if (std::regex_match(r, m, rx1))
								densityPercentSol = m[1].str();
							else if (std::regex_match(r, m, rx2))
								density_Matthews = m[1].str();
							else if (cif::starts_with(r, "CRYSTALLIZATION CONDITIONS: "))
								conditions = r.substr(28);
						}
						else
							conditions = conditions + ' ' + r;

						GetNextRecord();
					} while (mRec->is("REMARK 280"));

					std::string desc = mRemark200["REMARK"];
					if (desc == "NULL")
						desc.clear();

					// clang-format off
					getCategory("exptl_crystal")->emplace({
						{ "id", 1 },
						{ "density_Matthews", iequals(density_Matthews, "NULL") ? "" : density_Matthews },
						{ "density_percent_sol", iequals(densityPercentSol, "NULL") ? "" : densityPercentSol },
						{ "description", desc }
					});
					// clang-format on

					// now try to parse the conditions
					const std::regex rx3(R"(TEMPERATURE +(\d+)K)"), rx4(R"(PH *(?:: *)?(\d+(?:\.\d+)?))") /*, rx5(R"(\b(\d+)C\b)")*/;

					std::string temp, ph, method;

					for (auto s : cif::split<std::string>(conditions, ",", true))
					{
						cif::trim(s);

						if (std::regex_search(s, m, rx3))
							temp = m[1].str();
						if (std::regex_search(s, m, rx4))
							ph = m[1].str();
						if (s.length() < 60 and
							(cif::icontains(s, "drop") or cif::icontains(s, "vapor") or cif::icontains(s, "batch")))
						{
							if (not method.empty())
								method = method + ", " + s;
							else
								method = s;
						}
					}

					if (not(method.empty() and temp.empty() and ph.empty() and (conditions.empty() or conditions == "NULL")))
					{
						// clang-format off
						getCategory("exptl_crystal_grow")->emplace({
							{ "crystal_id", 1 },
							{ "method", method },
							{ "temp", temp },
							{ "pH", ph },
							{ "pdbx_details", conditions }
						});
						// clang-format on
					}

					break;
				}

					//			case 290:
					//
					//				break;

				case 350:
					// postponed since we don't have the required information yet
					for (; mRec->is("REMARK 350"); GetNextRecord())
						;
					break;

				case 400:
				{
					std::stringstream s;
					GetNextRecord();
					if (vS(12) == "COMPOUND")
						GetNextRecord();

					while (mRec->is("REMARK 400"))
					{
						s << vS(12) << '\n';
						GetNextRecord();
					}

					compoundDetails = s.str();
					break;
				}

				case 450:
				{
					std::stringstream s;
					GetNextRecord();
					if (vS(12) == "SOURCE")
						GetNextRecord();

					while (mRec->is("REMARK 450"))
					{
						s << vS(12) << '\n';
						GetNextRecord();
					}

					sourceDetails = s.str();
					break;
				}

				case 465:
				{
					bool headerSeen = false;
					std::regex rx(R"( *MODELS *(\d+)-(\d+))");
					int models[2] = { -1, -1 };

					for (; mRec->is("REMARK 465"); GetNextRecord())
					{
						if (not headerSeen)
						{
							std::string line = vS(12);
							std::smatch m;

							if (std::regex_match(line, m, rx))
							{
								models[0] = std::stoi(m[1].str());
								models[1] = stoi(m[2].str());
							}
							else
								headerSeen = cif::contains(line, "RES C SSSEQI");
							continue;
						}

						if (models[0] == models[1])
							models[0] = models[1] = vI(12, 14);

						std::string res = vS(16, 18);
						char chain = vC(20);
						int seq = vI(22, 26);
						char iCode = vC(27);

						for (int modelNr = models[0]; modelNr <= models[1]; ++modelNr)
							mUnobs.push_back({ modelNr, res, chain, seq, iCode });
					}

					break;
				}

				case 470:
				{
					bool headerSeen = false;
					std::regex rx(R"( *MODELS *(\d+)-(\d+))");
					int models[2] = { -1, -1 };

					for (; mRec->is("REMARK 470"); GetNextRecord())
					{
						if (not headerSeen)
						{
							std::string line = vS(12);
							std::smatch m;

							if (std::regex_match(line, m, rx))
							{
								models[0] = stoi(m[1].str());
								models[1] = stoi(m[2].str());
							}
							else
								headerSeen = cif::contains(line, "RES CSSEQI  ATOMS");
							continue;
						}

						if (models[0] == models[1])
							models[0] = models[1] = vI(12, 14);

						std::string res = vS(16, 18);
						char chain = vC(20);
						int seq = vI(21, 24);
						char iCode = vC(25);

						std::string atomStr = mRec->vS(29);
						auto atoms = cif::split<std::string>(atomStr, " ", true);

						for (int modelNr = models[0]; modelNr <= models[1]; ++modelNr)
							mUnobs.push_back({ modelNr, res, chain, seq, iCode, atoms });
					}

					break;
				}

				case 500:
				{
					GetNextRecord();

					enum State
					{
						eStart,
						eCCinSAU,
						eCC,
						eCBL,
						eCBA,
						eTA,
						eCTg,
						ePG,
						eMCP,
						eChC
					} state = eStart;
					bool headerSeen = false;
					int id = 0;

					for (; mRec->is("REMARK 500"); GetNextRecord())
					{
						std::string line = vS(12);

						if (line == "GEOMETRY AND STEREOCHEMISTRY")
							continue;

						switch (state)
						{
							case eStart:
							{
								if (line.empty() or not cif::starts_with(line, "SUBTOPIC: "))
									continue;

								std::string subtopic = line.substr(10);

								if (subtopic == "CLOSE CONTACTS IN SAME ASYMMETRIC UNIT")
									state = eCCinSAU;
								else if (subtopic == "CLOSE CONTACTS")
									state = eCC;
								else if (subtopic == "COVALENT BOND LENGTHS")
									state = eCBL;
								else if (subtopic == "COVALENT BOND ANGLES")
									state = eCBA;
								else if (subtopic == "TORSION ANGLES")
									state = eTA;
								else if (subtopic == "NON-CIS, NON-TRANS")
									state = eCTg;
								else if (subtopic == "PLANAR GROUPS")
									state = ePG;
								else if (subtopic == "MAIN CHAIN PLANARITY")
									state = eMCP;
								else if (subtopic == "CHIRAL CENTERS")
									state = eChC;
								else if (cif::VERBOSE > 0)
									throw std::runtime_error("Unknown subtopic in REMARK 500: " + subtopic);

								headerSeen = false;
								id = 0;
								break;
							}

							case eCCinSAU:
							{
								if (not headerSeen)
									headerSeen =
										line == "ATM1  RES C  SSEQI   ATM2  RES C  SSEQI           DISTANCE";
								else if (line.empty())
									state = eStart;
								else
								{
									std::string atom1 = vS(13, 16);
									std::string res1 = vS(19, 21);
									std::string alt1 = vS(17, 17);
									char chain1 = vC(23);
									int seq1 = vI(25, 29);
									std::string iCode1 = vS(30, 30);

									std::string atom2 = vS(34, 37);
									std::string alt2 = vS(38, 38);
									std::string res2 = vS(40, 42);
									char chain2 = vC(44);
									int seq2 = vI(46, 50);
									std::string iCode2 = vS(51, 51);

									std::string distance = vF(63, 71);

									// clang-format off
									getCategory("pdbx_validate_close_contact")->emplace({
										{ "id", std::to_string(++id) },
										{ "PDB_model_num", 1 },
										{ "auth_atom_id_1", atom1 },
										{ "auth_asym_id_1", std::string{ chain1 } },
										{ "auth_comp_id_1", res1 },
										{ "auth_seq_id_1", seq1 },
										{ "PDB_ins_code_1", iCode1 },
										{ "label_alt_id_1", alt1 },
										{ "auth_atom_id_2", atom2 },
										{ "auth_asym_id_2", std::string{ chain2 } },
										{ "auth_comp_id_2", res2 },
										{ "auth_seq_id_2", seq2 },
										{ "PDB_ins_code_2", iCode2 },
										{ "label_alt_id_2", alt2 },
										{ "dist", distance }
									});
									// clang-format on
								}
								break;
							}

							case eCC:
							{
								if (not headerSeen)
									headerSeen = line == "ATM1  RES C  SSEQI   ATM2  RES C  SSEQI  SSYMOP   DISTANCE";
								else if (line.empty())
									state = eStart;
								else
								{
									std::string atom1 = vS(13, 16);
									std::string res1 = vS(19, 21);
									char chain1 = vC(23);
									int seq1 = vI(25, 29);

									std::string atom2 = vS(34, 37);
									std::string res2 = vS(40, 42);
									char chain2 = vC(44);
									int seq2 = vI(46, 50);

									std::string symop;
									try
									{
										symop = pdb2cifSymmetry(vS(54, 59));
									}
									catch (const std::exception &ex)
									{
										if (cif::VERBOSE > 0)
											std::cerr << "Dropping REMARK 500 at line " << mRec->mLineNr << " due to invalid symmetry operation\n";
										continue;
									}

									std::string distance = vF(63, 71);

									// clang-format off
									getCategory("pdbx_validate_symm_contact")->emplace({
										{ "id", std::to_string(++id) },
										{ "PDB_model_num", 1 },
										{ "auth_atom_id_1", atom1 },
										{ "auth_asym_id_1", std::string{ chain1 } },
										{ "auth_comp_id_1", res1 },
										{ "auth_seq_id_1", seq1 },
//										{ "PDB_ins_code_1", "" },
//										{ "label_alt_id_1", "" },
										{ "site_symmetry_1", "1_555" },
										{ "auth_atom_id_2", atom2 },
										{ "auth_asym_id_2", std::string{ chain2 } },
										{ "auth_comp_id_2", res2 },
										{ "auth_seq_id_2", seq2 },
//										{ "PDB_ins_code_2", "" },
//										{ "label_alt_id_2", "" },
										{ "site_symmetry_2", symop },
										{ "dist", distance }
									});
									// clang-format on
								}
								break;
							}

							case eCBL:
							{
								if (not headerSeen)
								{
									if (cif::starts_with(line, "FORMAT: ") and line != "FORMAT: (10X,I3,1X,2(A3,1X,A1,I4,A1,1X,A4,3X),1X,F6.3)")
										throw std::runtime_error("Unexpected format in REMARK 500");

									headerSeen = line == "M RES CSSEQI ATM1   RES CSSEQI ATM2   DEVIATION";
								}
								else if (line.empty())
									state = eStart;
								else
								{
									int model = vI(11, 13);
									std::string resNam1 = vS(15, 17);
									std::string chainID1{ vC(19) };
									int seqNum1 = vI(20, 23);
									std::string iCode1{ vC(24) };
									std::string alt1 = vS(30, 30);
									std::string atm1 = vS(26, 29);

									std::string resNam2 = vS(33, 35);
									std::string chainID2{ vC(37) };
									int seqNum2 = vI(38, 41);
									std::string iCode2{ vC(42) };
									std::string alt2 = vS(48, 48);
									std::string atm2 = vS(44, 47);

									std::string deviation = vF(51, 57);

									if (iCode1 == " ")
										iCode1.clear();
									if (iCode2 == " ")
										iCode2.clear();

									// clang-format off
									getCategory("pdbx_validate_rmsd_bond")->emplace({
										{ "id", std::to_string(++id) },
										{ "PDB_model_num", model ? model : 1 },
										{ "auth_atom_id_1", atm1 },
										{ "auth_asym_id_1", chainID1 },
										{ "auth_comp_id_1", resNam1 },
										{ "auth_seq_id_1", seqNum1 },
										{ "PDB_ins_code_1", iCode1 },
										{ "label_alt_id_1", alt1 },
										{ "auth_atom_id_2", atm2 },
										{ "auth_asym_id_2", chainID2 },
										{ "auth_comp_id_2", resNam2 },
										{ "auth_seq_id_2", seqNum2 },
										{ "PDB_ins_code_2", iCode2 },
										{ "label_alt_id_2", alt2 },
										{ "bond_deviation", deviation }
									});
									// clang-format on
								}

								break;
							}

							case eCBA:
								if (not headerSeen)
								{
									if (cif::starts_with(line, "FORMAT: ") and line != "FORMAT: (10X,I3,1X,A3,1X,A1,I4,A1,3(1X,A4,2X),12X,F5.1)")
										throw std::runtime_error("Unexpected format in REMARK 500");

									headerSeen = line == "M RES CSSEQI ATM1   ATM2   ATM3";
								}
								else if (line.empty())
									state = eStart;
								else if (vS(64) == "DEGREES")
								{
									int model = vI(11, 13);
									std::string resNam = vS(15, 17);
									std::string chainID{ vC(19) };
									int seqNum = vI(20, 23);
									std::string iCode{ vC(24) };

									if (iCode == " ")
										iCode.clear();

									std::string atoms[3] = { vS(27, 30), vS(34, 37), vS(41, 44) };
									std::string deviation = vF(57, 62);
									if (deviation == "*****")
										deviation.clear();

									// clang-format off
									getCategory("pdbx_validate_rmsd_angle")->emplace({
										{ "id", std::to_string(++id) },
										{ "PDB_model_num", model ? model : 1 },
										{ "auth_atom_id_1", atoms[0] },
										{ "auth_asym_id_1", chainID },
										{ "auth_comp_id_1", resNam },
										{ "auth_seq_id_1", seqNum },
										{ "PDB_ins_code_1", iCode },
										{ "auth_atom_id_2", atoms[1] },
										{ "auth_asym_id_2", chainID },
										{ "auth_comp_id_2", resNam },
										{ "auth_seq_id_2", seqNum },
										{ "PDB_ins_code_2", iCode },
										{ "auth_atom_id_3", atoms[2] },
										{ "auth_asym_id_3", chainID },
										{ "auth_comp_id_3", resNam },
										{ "auth_seq_id_3", seqNum },
										{ "PDB_ins_code_3", iCode },
										{ "angle_deviation", deviation }
									});
									// clang-format on
								}

								break;

							case eTA:
								if (not headerSeen)
								{
									if (cif::starts_with(line, "FORMAT: ") and line != "FORMAT:(10X,I3,1X,A3,1X,A1,I4,A1,4X,F7.2,3X,F7.2)")
										throw std::runtime_error("Unexpected format in REMARK 500");

									headerSeen = line == "M RES CSSEQI        PSI       PHI";
								}
								else if (line.empty())
									state = eStart;
								else
								{
									int model = vI(11, 13);
									std::string resNam = vS(15, 17);
									std::string chainID{ vC(19) };
									int seqNum = vI(20, 23);
									std::string iCode{ vC(24) };

									if (iCode == " ")
										iCode.clear();

									std::string psi = vF(27, 35);
									std::string phi = vF(37, 45);

									// clang-format off
									getCategory("pdbx_validate_torsion")->emplace({
										{ "id", std::to_string(++id) },
										{ "PDB_model_num", model ? model : 1 },
										{ "auth_comp_id", resNam },
										{ "auth_asym_id", chainID },
										{ "auth_seq_id", seqNum },
										{ "PDB_ins_code", iCode },
										{ "phi", phi },
										{ "psi", psi }
									});
									// clang-format on
								}
								break;

							case eCTg:
								if (not headerSeen)
									headerSeen = line == "MODEL     OMEGA";
								else if (line.empty())
									state = eStart;
								else
								{
									int model = vI(45, 48);

									std::string resNam1 = vS(12, 14);
									std::string chainID1{ vC(16) };
									int seqNum1 = vI(17, 21);
									std::string iCode1{ vC(22) };

									if (iCode1 == " ")
										iCode1.clear();

									std::string resNam2 = vS(27, 29);
									std::string chainID2{ vC(31) };
									int seqNum2 = vI(32, 36);
									std::string iCode2{ vC(37) };

									if (iCode2 == " ")
										iCode2.clear();

									std::string omega = vF(54, 60);

									// clang-format off
									getCategory("pdbx_validate_peptide_omega")->emplace({
										{ "id", std::to_string(++id) },
										{ "PDB_model_num", model ? model : 1 },
										{ "auth_comp_id_1", resNam1 },
										{ "auth_asym_id_1", chainID1 },
										{ "auth_seq_id_1", seqNum1 },
										{ "PDB_ins_code_1", iCode1 },
										{ "auth_comp_id_2", resNam2 },
										{ "auth_asym_id_2", chainID2 },
										{ "auth_seq_id_2", seqNum2 },
										{ "PDB_ins_code_2", iCode2 },
										{ "omega", omega }
									});
									// clang-format on
								}
								break;

							case ePG:
								if (not headerSeen)
									headerSeen = line == "M RES CSSEQI        RMS     TYPE";
								else if (line.empty())
									state = eStart;
								else
								{
									int model = vI(11, 13);
									std::string resNam = vS(15, 17);
									std::string chainID{ vC(19) };
									int seqNum = vI(20, 23);
									std::string iCode{ vC(24) };

									if (iCode == " ")
										iCode.clear();

									std::string rmsd = vF(32, 36);
									std::string type = vS(41);

									// clang-format off
									getCategory("pdbx_validate_planes")->emplace({
										{ "id", std::to_string(++id) },
										{ "PDB_model_num", model ? model : 1 },
										{ "auth_comp_id", resNam },
										{ "auth_asym_id", chainID },
										{ "auth_seq_id", seqNum },
										{ "PDB_ins_code", iCode },
										{ "rmsd", rmsd },
										{ "type", type }
									});
									// clang-format on
								}
								break;

							default:
								state = eStart;
								break;
						}
					}

					break;
				}

				case 610:
				{
					bool headerSeen = false;

					for (; mRec->is("REMARK 610"); GetNextRecord())
					{
						if (not headerSeen)
						{
							std::string line = vS(12);
							headerSeen = cif::contains(line, "RES C SSEQI");
							continue;
						}

						int modelNr = vI(12, 14);
						if (modelNr == 0)
							modelNr = 1;
						std::string res = vS(16, 18);
						char chain = vC(20);
						int seq = vI(22, 25);
						char iCode = vC(26);

						auto compound = cif::compound_factory::instance().create(res);
						if (compound == nullptr)
							continue;

						std::vector<std::string> atoms;
						for (auto atom : compound->atoms())
						{
							if (atom.type_symbol != cif::H)
								atoms.push_back(atom.id);
						}

						mUnobs.push_back({ modelNr, res, chain, seq, iCode, { atoms } });
					}

					break;
				}

				case 800:
				{
					const std::regex rx1(R"(SITE_IDENTIFIER: (.+))"),
						rx2(R"(EVIDENCE_CODE: (.+))"),
						rx3(R"(SITE_DESCRIPTION: (binding site for residue ([[:alnum:]]{1,3}) ([[:alnum:]]) (\d+)|.+))", std::regex_constants::icase);

					std::string id, evidence, desc;
					std::string pdbxAuthAsymID, pdbxAuthCompID, pdbxAuthSeqID, pdbxAuthInsCode;
					std::smatch m;

					enum State
					{
						sStart,
						sID,
						sEvidence,
						sDesc,
						sDesc2
					} state = sStart;

					auto store = [&]()
					{
						// Find the matching SITE record
						auto site = FindRecord([id](PDBRecord &r) -> bool
							{ return r.is("SITE  ") and r.vS(12, 14) == id; });

						if (site == nullptr)
							throw std::runtime_error("Invalid REMARK 800, no SITE record for id " + id);

						// next record, store what we have
						// clang-format off
						getCategory("struct_site")->emplace({
							{ "id", id },
							{ "details", desc },
							{ "pdbx_auth_asym_id", pdbxAuthAsymID },
							{ "pdbx_auth_comp_id", pdbxAuthCompID },
							{ "pdbx_auth_seq_id", pdbxAuthSeqID },
							{ "pdbx_num_residues", site->vI(16, 17) },
							{ "pdbx_evidence_code", evidence }
						});
						// clang-format on
					};

					for (; mRec->is("REMARK 800"); GetNextRecord())
					{
						std::string s = mRec->vS(12);
						if (s.empty())
							continue;

						switch (state)
						{
							case sStart:
								if (s == "SITE")
									state = sID;
								else if (cif::VERBOSE > 0)
									throw std::runtime_error("Invalid REMARK 800 record, expected SITE");
								break;

							case sID:
								if (std::regex_match(s, m, rx1))
								{
									id = m[1].str();
									state = sEvidence;
								}
								else if (cif::VERBOSE > 0)
									throw std::runtime_error("Invalid REMARK 800 record, expected SITE_IDENTIFIER");
								break;

							case sEvidence:
								if (regex_match(s, m, rx2))
								{
									evidence = m[1].str();
									state = sDesc;
								}
								else if (cif::VERBOSE > 0)
									throw std::runtime_error("Invalid REMARK 800 record, expected SITE_IDENTIFIER");
								break;

							case sDesc:
								if (regex_match(s, m, rx3))
								{
									desc = m[1].str();
									pdbxAuthCompID = m[2].str();
									pdbxAuthAsymID = m[3].str();
									pdbxAuthSeqID = m[4].str();

									state = sDesc2;
								}
								break;

							case sDesc2:
								if (regex_match(s, m, rx1))
								{
									store();

									id = m[1].str();
									state = sEvidence;
									evidence.clear();
									desc.clear();
								}
								else
									desc = desc + ' ' + s;
								break;
						}
					}

					if (not id.empty())
						store();

					break;
				}

				case 999:
				{
					std::stringstream s;
					GetNextRecord();
					if (vS(12) == "SEQUENCE")
						GetNextRecord();

					while (mRec->is("REMARK 999"))
					{
						s << vS(12) << '\n';
						GetNextRecord();
					}

					sequenceDetails = s.str();
					break;
				}

					// these are skipped

				case 2:
				case 290:
				case 300:
				case 620:
					GetNextRecord();
					break;

				default:
				{
					std::string skipped = mRec->mName;

					std::stringstream s;

					if (not mRec->vS(11).empty())
						s << mRec->vS(11) << '\n';
					GetNextRecord();

					while (mRec->is(skipped.c_str()))
					{
						s << mRec->vS(11) << '\n';
						GetNextRecord();
					}

					// clang-format off
					getCategory("pdbx_database_remark")->emplace({
						{ "id", remarkNr },
						{ "text", s.str() }
					});
					// clang-format on

					break;
				}
			}
		}
		catch (const std::exception &ex)
		{
			std::throw_with_nested(std::runtime_error("Error parsing REMARK " + std::to_string(remarkNr)));
		}
	}

	if (not(compoundDetails.empty() and sequenceDetails.empty() and sourceDetails.empty()))
	{
		// clang-format off
		getCategory("pdbx_entry_details")->emplace({
			{ "entry_id", mStructureID },
			{ "compound_details", compoundDetails },
			{ "sequence_details", sequenceDetails },
			{ "source_details", sourceDetails }
		});
		// clang-format on
	}

	// store remark 200 info (special case)
	if (not mRemark200.empty())
		ParseRemark200();
}

// Convert the key/value pairs collected from REMARK 200 (mRemark200) into
// the software, diffrn, diffrn_detector, diffrn_radiation, diffrn_source,
// diffrn_radiation_wavelength, refine, reflns and reflns_shell categories.
void PDBFileParser::ParseRemark200()
{
	// A REMARK 200 field may contain several values separated by ';'.
	// Return the diffrnNr'th (1-based) of those, after cif::trim, or an
	// empty string when there is no such value or when it reads NULL.
	auto rm200 = [&](const char *name, int diffrnNr) -> std::string
	{
		int nr = 0;

		for (auto value : cif::split<std::string>(mRemark200[name], ";"))
		{
			if (++nr == diffrnNr)
			{
				cif::trim(value);

				if (value == "NULL")
					value.clear();

				return value;
			}
		}

		return {};
	};

	// True when at least one of the named REMARK 200 fields is not empty.
	auto inRM200 = [this](std::initializer_list<const char *> s) -> bool
	{
		for (auto *fieldName : s)
		{
			if (not this->mRemark200[fieldName].empty())
				return true;
		}

		return false;
	};

	// Note: the 'computing' category is no longer used, the programs
	// mentioned in REMARK 200 are stored in the 'software' category instead.

	struct
	{
		const char *classification;
		const char *remarkField;
	} kSWMap[] = {
		{ "data reduction", "INTENSITY-INTEGRATION SOFTWARE" },
		{ "data scaling", "DATA SCALING SOFTWARE" },
		{ "phasing", "SOFTWARE USED" },
	};

	for (auto &sw : kSWMap)
	{
		if (not mRemark200[sw.remarkField].empty())
		{
			// clang-format off
			getCategory("software")->emplace({
				{ "name", mRemark200[sw.remarkField] },
				{ "classification", sw.classification },
				{ "version", "." },
				{ "pdbx_ordinal", mNextSoftwareOrd++ }
			});
			// clang-format on
		}
	}

	// Only these two experiment types map to a scattering type
	const std::string &experimentType = mRemark200["EXPERIMENT TYPE"];

	std::string scatteringType;
	if (experimentType == "X-RAY DIFFRACTION")
		scatteringType = "x-ray";
	else if (experimentType == "NEUTRON DIFFRACTION")
		scatteringType = "neutron";

	// All wavelengths mentioned for any of the data sets
	std::set<std::string> diffrnWaveLengths;

	// One iteration per data set, the first data set without a temperature
	// value ends the loop.
	for (int diffrnNr = 1;; ++diffrnNr)
	{
		std::string ambientTemp = rm200("TEMPERATURE (KELVIN)", diffrnNr);
		if (ambientTemp.empty())
			break;

		// strip a trailing unit
		if (cif::ends_with(ambientTemp, "K"))
			ambientTemp.pop_back();

		// clang-format off
		getCategory("diffrn")->emplace({
			{ "id", diffrnNr },
			{ "ambient_temp", ambientTemp },
			{ "crystal_id", 1 }
		});
		// clang-format on

		std::error_code ec;
		std::string collectionDate;
		collectionDate = pdb2cifDate(rm200("DATE OF DATA COLLECTION", diffrnNr), ec);

		const bool invalidCollectionDate = static_cast<bool>(ec);

		if (invalidCollectionDate and cif::VERBOSE > 0)
			std::cerr << ec.message() << " for pdbx_collection_date\n";

		// The date field can become truncated when multiple values are
		// available, so only the first data set keeps a value that failed
		// to convert.
		if (invalidCollectionDate and diffrnNr != 1)
			collectionDate.clear();

		// clang-format off
		getCategory("diffrn_detector")->emplace({
			{ "diffrn_id", diffrnNr },
			{ "detector", rm200("DETECTOR TYPE", diffrnNr) },
			{ "type", rm200("DETECTOR MANUFACTURER", diffrnNr) },
			{ "pdbx_collection_date", collectionDate },
			{ "details", rm200("OPTICS", diffrnNr) }
		});
		// clang-format on

		if (inRM200({ "MONOCHROMATIC OR LAUE (M/L)", "MONOCHROMATOR", "DIFFRACTION PROTOCOL" }) or not scatteringType.empty())
		{
			// clang-format off
			getCategory("diffrn_radiation")->emplace({
				{ "diffrn_id", diffrnNr },
				{ "wavelength_id", 1 },
				{ "pdbx_monochromatic_or_laue_m_l", rm200("MONOCHROMATIC OR LAUE (M/L)", diffrnNr) },
				{ "monochromator", rm200("MONOCHROMATOR", diffrnNr) },
				{ "pdbx_diffrn_protocol", rm200("DIFFRACTION PROTOCOL", diffrnNr) },
				{ "pdbx_scattering_type", scatteringType }
			});
			// clang-format on
		}

		// The wavelength field is split on comma, space and dash
		std::string wl = rm200("WAVELENGTH OR RANGE (A)", diffrnNr);
		auto wavelengths = cif::split<std::string>(wl, ", -", true);

		diffrnWaveLengths.insert(wavelengths.begin(), wavelengths.end());

		const bool isSynchrotron = rm200("SYNCHROTRON (Y/N)", diffrnNr) == "Y";

		if (isSynchrotron)
		{
			// clang-format off
			getCategory("diffrn_source")->emplace({
				{ "diffrn_id", diffrnNr },
				{ "source", "SYNCHROTRON" },
				{ "type", rm200("RADIATION SOURCE", diffrnNr) + " BEAMLINE " + rm200("BEAMLINE", diffrnNr) },
				{ "pdbx_synchrotron_site", rm200("RADIATION SOURCE", diffrnNr) },
				{ "pdbx_synchrotron_beamline", rm200("BEAMLINE", diffrnNr) },

				// a single wavelength goes into pdbx_wavelength, anything else into the list
				{ "pdbx_wavelength", wavelengths.size() == 1 ? wavelengths[0] : "" },
				{ "pdbx_wavelength_list", wavelengths.size() == 1 ? "" : cif::join(wavelengths, ", ") },
			});
			// clang-format on
		}
		else if (inRM200({ "X-RAY GENERATOR MODEL", "RADIATION SOURCE", "BEAMLINE", "WAVELENGTH OR RANGE (A)" }))
		{
			// clang-format off
			getCategory("diffrn_source")->emplace({
				{ "diffrn_id", diffrnNr },
				{ "source", rm200("RADIATION SOURCE", diffrnNr) },
				{ "type", rm200("X-RAY GENERATOR MODEL", diffrnNr) },

				{ "pdbx_wavelength", wavelengths.size() == 1 ? wavelengths[0] : "" },
				{ "pdbx_wavelength_list", wavelengths.size() == 1 ? "" : cif::join(wavelengths, ", ") },
			});
			// clang-format on
		}
	}

	// One diffrn_radiation_wavelength record for each distinct wavelength
	int wavelengthNr = 1;
	for (auto wl : diffrnWaveLengths)
	{
		// strip a trailing unit
		if (cif::ends_with(wl, "A"))
			wl.pop_back();

		// clang-format off
		getCategory("diffrn_radiation_wavelength")->emplace({
			{ "id", wavelengthNr++ },
			{ "wavelength", wl.empty() ? "." : wl },
			{ "wt", "1.0" }
		});
		// clang-format on
	}

	if (inRM200({ "METHOD USED TO DETERMINE THE STRUCTURE", "STARTING MODEL" }))
	{
		auto cat = getCategory("refine");
		assert(cat->empty());

		// use the placeholder '.' when no resolution was given
		std::string resolution = mRemark200["RESOLUTION RANGE HIGH (A)"];
		if (resolution.empty())
			resolution = ".";

		// clang-format off
		cat->emplace({
			{ "pdbx_method_to_determine_struct", mRemark200["METHOD USED TO DETERMINE THE STRUCTURE"] },
			{ "pdbx_starting_model", mRemark200["STARTING MODEL"] },
			{ "ls_d_res_high", resolution },
			{ "pdbx_diffrn_id", 1 },
			{ "pdbx_refine_id", mExpMethod },
			{ "entry_id", mStructureID }
		});
		// clang-format on
	}

	const bool haveReflns = inRM200({
		"REJECTION CRITERIA (SIGMA(I))",
		"RESOLUTION RANGE HIGH (A)",
		"RESOLUTION RANGE LOW (A)",
		"NUMBER OF UNIQUE REFLECTIONS",
		"COMPLETENESS FOR RANGE (%)",
		"<I/SIGMA(I)> FOR THE DATA SET",
		"R MERGE (I)",
		"R SYM (I)",
		"DATA REDUNDANCY" });

	if (haveReflns)
	{
		// clang-format off
		getCategory("reflns")->emplace({
			{ "entry_id", mStructureID },
			{ "observed_criterion_sigma_I", mRemark200["REJECTION CRITERIA (SIGMA(I))"] },
			{ "d_resolution_high", mRemark200["RESOLUTION RANGE HIGH (A)"] },
			{ "d_resolution_low", mRemark200["RESOLUTION RANGE LOW (A)"] },
			{ "number_obs", mRemark200["NUMBER OF UNIQUE REFLECTIONS"] },
			{ "percent_possible_obs", mRemark200["COMPLETENESS FOR RANGE (%)"] },
			{ "pdbx_netI_over_sigmaI", mRemark200["<I/SIGMA(I)> FOR THE DATA SET"] },
			{ "pdbx_Rmerge_I_obs", mRemark200["R MERGE (I)"] },
			{ "pdbx_Rsym_value", mRemark200["R SYM (I)"] },
			{ "pdbx_redundancy", mRemark200["DATA REDUNDANCY"] },
			{ "pdbx_ordinal", 1 },
			{ "pdbx_diffrn_id", 1 }
		});
		// clang-format on
	}

	// d_res_high is mandatory for reflns_shell: without it no record is
	// written, even when other shell values are available.
	if (inRM200({ "HIGHEST RESOLUTION SHELL, RANGE HIGH (A)" }))
	{
		// clang-format off
		getCategory("reflns_shell")->emplace({
			{ "d_res_high", mRemark200["HIGHEST RESOLUTION SHELL, RANGE HIGH (A)"] },
			{ "d_res_low", mRemark200["HIGHEST RESOLUTION SHELL, RANGE LOW (A)"] },
			{ "percent_possible_all", mRemark200["COMPLETENESS FOR SHELL (%)"] },
			{ "Rmerge_I_obs", mRemark200["R MERGE FOR SHELL (I)"] },
			{ "pdbx_Rsym_value", mRemark200["R SYM FOR SHELL (I)"] },
			{ "meanI_over_sigI_obs", mRemark200["<I/SIGMA(I)> FOR SHELL"] },
			{ "pdbx_redundancy", mRemark200["DATA REDUNDANCY IN SHELL"] },
			{ "pdbx_ordinal", 1 },
			{ "pdbx_diffrn_id", 1 }
		});
		// clang-format on
	}
	else if (inRM200({ "HIGHEST RESOLUTION SHELL, RANGE LOW (A)", "COMPLETENESS FOR SHELL (%)",
				 "R MERGE FOR SHELL (I)", "R SYM FOR SHELL (I)", "<I/SIGMA(I)> FOR SHELL", "DATA REDUNDANCY IN SHELL" }))
	{
		if (cif::VERBOSE > 0)
			std::cerr << "Not writing reflns_shell record since d_res_high is missing\n";
	}
}

// Parse the REMARK 350 records (biomolecule descriptions and their BIOMT
// transformations) and store the result in the categories
// pdbx_struct_assembly, pdbx_struct_assembly_prop, pdbx_struct_oper_list
// and pdbx_struct_assembly_gen.
//
// The current record pointer mRec is moved to the first REMARK 350 record
// for the duration of this function and put back at the end.
//
// Throws std::runtime_error("Invalid REMARK 350") when the records do not
// have the expected layout.
// NOTE(review): when one of these exceptions is thrown, the assignment
// 'mRec = saved' at the end is not reached.
void PDBFileParser::ParseRemark350()
{
	auto saved = mRec;

	// The states of the line based parser below
	enum State
	{
		eStart, // looking for the first BIOMOLECULE line
		eInfo,  // collecting 'key: value' lines until an APPLY THE FOLLOWING TO CHAINS line
		eAnd,   // the previous chain list is continued on an AND CHAINS line
		eApply, // chain list complete, expecting the first BIOMT1 line
		eBioMT  // reading BIOMT lines, or the next BIOMOLECULE line
	} state = eStart;

	const std::regex
		// start of a new biomolecule, captures its number
		kRX1(R"(BIOMOLECULE: (\d+))"),
		// generic 'key: value' line, a trailing unit is not part of the value
		kRX2(R"(([^:]+): (.+?)(?: (ANGSTROM\*\*2|KCAL/MOL))?)"),
		// first line of a list of chains
		kRX8(R"(APPLY THE FOLLOWING TO CHAINS: (.+))"),
		// continuation line of a list of chains
		kRX9(R"(AND CHAINS: (.+))"),
		// one row of a transformation: row number (1-3), operator number,
		// three matrix values and one vector value
		kRX10(R"(BIOMT([123])\s+(\d+)\s+(-?\d+(?:\.\d+)?)\s+(-?\d+(?:\.\d+)?)\s+(-?\d+(?:\.\d+)?)\s+(-?\d+(?:\.\d+)?))");

	int biomolecule = 0, operID = 0;
	std::vector<std::string> operExpression;   // operator numbers for the current chain list
	std::map<std::string, std::string> values; // key/value lines seen for the current biomolecule
	std::vector<std::string> asymIdList;       // filled by MapChainID2AsymIDS for the current chain list
	std::smatch m;

	// The rows collected so far for the operator being read, a complete
	// operator has 9 values in mat and 3 in vec.
	std::vector<double> mat, vec;

	for (mRec = FindRecord("REMARK 350"); mRec != nullptr and mRec->is("REMARK 350"); GetNextRecord())
	{
		std::string line = vS(11);

		switch (state)
		{
			case eStart:
				// Everything up to the first BIOMOLECULE line is skipped
				if (regex_match(line, m, kRX1))
				{
					biomolecule = stoi(m[1].str());
					state = eInfo;
				}
				break;

			case eInfo:
				if (regex_match(line, m, kRX8))
				{
					state = eApply;

					std::string value = m[1].str();

					// NOTE(review): the empty() test below depends on cif::split
					// returning an empty token for a line ending in a separator,
					// confirm this is the case when the third argument is true.
					for (auto chain : cif::split<std::string>(value, ", ", true))
					{
						if (chain.empty()) // happens when we have a AND CHAIN line
						{
							state = eAnd;
							break;
						}

						if (chain.length() != 1)
							throw std::runtime_error("Invalid REMARK 350");

						MapChainID2AsymIDS(chain[0], asymIdList);
					}
				}
				else if (regex_match(line, m, kRX2))
					values[m[1].str()] = m[2].str();
				break;

			case eAnd:
				if (regex_match(line, m, kRX9))
				{
					state = eApply;

					std::string value = m[1].str();

					// NOTE(review): unlike in eInfo, the length of the chain
					// token is not checked here, only its first character is used.
					for (auto chain : cif::split<std::string>(value, ", ", true))
					{
						if (chain.empty()) // happens when we have another AND CHAIN line
						{
							state = eAnd;
							break;
						}

						MapChainID2AsymIDS(chain[0], asymIdList);
					}

					// This continue applies to the enclosing for loop: move on
					// to the next record without entering the eApply code.
					continue;
				}
				// Not an AND CHAINS line, handle the line as if we were in eApply
				// fall through

			case eApply:
				// The first row of an operator must be a BIOMT1 line, lines
				// that do not match are ignored.
				if (regex_match(line, m, kRX10))
				{
					int mt = stoi(m[1].str());
					if (mt != 1)
						throw std::runtime_error("Invalid REMARK 350");

					operID = stoi(m[2].str());
					operExpression.push_back(std::to_string(operID));

					mat.push_back(stod(m[3].str()));
					mat.push_back(stod(m[4].str()));
					mat.push_back(stod(m[5].str()));
					vec.push_back(stod(m[6].str()));
					state = eBioMT;
				}
				break;

			case eBioMT:
				if (regex_match(line, m, kRX10))
				{
					int mt = stoi(m[1].str());

					if (mt == 1)
					{
						// First row of the next operator for the same chains
						operID = stoi(m[2].str());
						operExpression.push_back(std::to_string(operID));
					}
					else if (operID != stoi(m[2].str()))
						throw std::runtime_error("Invalid REMARK 350"); // row belongs to another operator

					mat.push_back(stod(m[3].str()));
					mat.push_back(stod(m[4].str()));
					mat.push_back(stod(m[5].str()));
					vec.push_back(stod(m[6].str()));

					// The third row completes the operator
					if (mt == 3)
					{
						if (vec.size() != 3 or mat.size() != 9)
							throw std::runtime_error("Invalid REMARK 350");

						// The assembly itself is written when operator 1 is
						// complete, values is cleared afterwards.
						if (operID == 1)
						{
							// The author determined value is preferred over the software determined one
							std::string oligomer = values["AUTHOR DETERMINED BIOLOGICAL UNIT"];
							if (oligomer.empty())
								oligomer = values["SOFTWARE DETERMINED QUATERNARY STRUCTURE"];
							to_lower(oligomer);

							// Derive the number of units from the description, either
							// a number followed by -meric or one of the known prefixes.
							// The count remains zero for anything else.
							int count = 0;
							std::smatch m2;

							if (std::regex_match(oligomer, m2, std::regex(R"((\d+)-meric)")))
							{
								count = stoi(m2[1].str());
							}
							else if (cif::ends_with(oligomer, "meric"))
							{
								std::string cs = oligomer.substr(0, oligomer.length() - 5);
								if (cs == "mono")
									count = 1;
								else if (cs == "di")
									count = 2;
								else if (cs == "tri")
									count = 3;
								else if (cs == "tetra")
									count = 4;
								else if (cs == "hexa")
									count = 6;
								else if (cs == "octa")
									count = 8;
								else if (cs == "dodeca")
									count = 12;
							}

							// details records who defined the assembly: the author,
							// the software, or both. It stays empty if neither value is present.
							std::string details;
							if (values["AUTHOR DETERMINED BIOLOGICAL UNIT"].empty())
							{
								if (not values["SOFTWARE DETERMINED QUATERNARY STRUCTURE"].empty())
									details = "software_defined_assembly";
							}
							else if (values["SOFTWARE DETERMINED QUATERNARY STRUCTURE"].empty())
								details = "author_defined_assembly";
							else
								details = "author_and_software_defined_assembly";

							// clang-format off
							getCategory("pdbx_struct_assembly")->emplace({
								{ "id", biomolecule },
								{ "details", details },
								{ "method_details", values["SOFTWARE USED"] },
								{ "oligomeric_details", oligomer },
								{ "oligomeric_count", count > 0 ? std::to_string(count) : "" }
							});

							// Optional properties, each one is written only when a value was found
							auto cat = getCategory("pdbx_struct_assembly_prop");

							if (not values["TOTAL BURIED SURFACE AREA"].empty())
								cat->emplace({
									{ "biol_id", biomolecule },
									{ "type", "ABSA (A^2)" },
									{ "value", values["TOTAL BURIED SURFACE AREA"] }
								});

							if (not values["CHANGE IN SOLVENT FREE ENERGY"].empty())
								cat->emplace({
									{ "biol_id", biomolecule },
									{ "type", "MORE" },
									{ "value", values["CHANGE IN SOLVENT FREE ENERGY"] }
								});

							if (not values["SURFACE AREA OF THE COMPLEX"].empty())
								cat->emplace({
									{ "biol_id", biomolecule },
									{ "type", "SSA (A^2)" },
									{ "value", values["SURFACE AREA OF THE COMPLEX"] }
								});
							// clang-format on

							values.clear();
						}

						// An exact unit matrix combined with a zero vector is the identity operation
						std::string type = mat == std::vector<double>{ 1, 0, 0, 0, 1, 0, 0, 0, 1 } and vec == std::vector<double>{ 0, 0, 0 } ? "identity operation" : "crystal symmetry operation";

						// Each operator id is stored only once
						auto pdbx_struct_oper_list = getCategory("pdbx_struct_oper_list");
						if (not pdbx_struct_oper_list->contains(cif::key("id") == operID))
							getCategory("pdbx_struct_oper_list")->emplace({ // clang-format off
								{ "id", operID },
								{ "type", type },
								// { "name", "" },
							    // { "symmetryOperation", "" },
								{ "matrix[1][1]", cif::format("{:12.10f}", mat[0]) },
								{ "matrix[1][2]", cif::format("{:12.10f}", mat[1]) },
								{ "matrix[1][3]", cif::format("{:12.10f}", mat[2]) },
								{ "vector[1]", cif::format("{:12.10f}", vec[0]) },
								{ "matrix[2][1]", cif::format("{:12.10f}", mat[3]) },
								{ "matrix[2][2]", cif::format("{:12.10f}", mat[4]) },
								{ "matrix[2][3]", cif::format("{:12.10f}", mat[5]) },
								{ "vector[2]", cif::format("{:12.10f}", vec[1]) },
								{ "matrix[3][1]", cif::format("{:12.10f}", mat[6]) },
								{ "matrix[3][2]", cif::format("{:12.10f}", mat[7]) },
								{ "matrix[3][3]", cif::format("{:12.10f}", mat[8]) },
								{ "vector[3]", cif::format("{:12.10f}", vec[2]) }
							});
																			// clang-format on

						// Start with empty accumulators for the next operator
						mat.clear();
						vec.clear();
					}
				}
				else if (regex_match(line, m, kRX1))
				{
					// A new biomolecule starts, this is an error when the
					// previous operator is still incomplete.
					if (not(vec.empty() and mat.empty()))
						throw std::runtime_error("Invalid REMARK 350");

					// Write what was collected for the previous biomolecule
					// clang-format off
					getCategory("pdbx_struct_assembly_gen")->emplace({
						{ "assembly_id", biomolecule },
						{ "oper_expression", cif::join(operExpression, ",") },
						{ "asym_id_list", cif::join(asymIdList, ",") }
					});
					// clang-format on

					biomolecule = stoi(m[1].str());
					asymIdList.clear();
					operExpression.clear();

					state = eInfo;
				}
				break;
		}
	}

	// Write the data for the last biomolecule, if there is any
	if (not operExpression.empty())
	{
		// clang-format off
		getCategory("pdbx_struct_assembly_gen")->emplace({
			{ "assembly_id", biomolecule },
			{ "oper_expression", cif::join(operExpression, ",") },
			{ "asym_id_list", cif::join(asymIdList, ",") }
		});
		// clang-format on
	}

	// Continue parsing where we were before this function was called
	mRec = saved;
}

void PDBFileParser::ParsePrimaryStructure()
{
	// First locate the DBREF record. Might be missing
	DBREF cur = { mStructureID };

	while (cif::starts_with(mRec->mName, "DBREF"))
	{
		if (mRec->is("DBREF ")) //	 1 -  6       Record name   "DBREF "
		{
			cur.PDBIDCode = vS(8, 11);    //	 8 - 11       IDcode        idCode             ID code of this datablock.
			cur.chainID = vC(13);         //	13            Character     chainID            Chain  identifier.
			cur.seqBegin = vI(15, 18);    //	15 - 18       Integer       seqBegin           Initial sequence number of the
			                              //	                                               PDB sequence segment.
			cur.insertBegin = vC(19);     //	19            AChar         insertBegin        Initial  insertion code of the
			                              //	                                               PDB  sequence segment.
			cur.seqEnd = vI(21, 24);      //	21 - 24       Integer       seqEnd             Ending sequence number of the
			                              //	                                               PDB  sequence segment.
			cur.insertEnd = vC(25);       //	25            AChar         insertEnd          Ending insertion code of the
			                              //	                                               PDB  sequence segment.
			cur.database = vS(27, 32);    //	27 - 32       LString       database           Sequence database name.
			cur.dbAccession = vS(34, 41); //	34 - 41       LString       dbAccession        Sequence database accession code.
			cur.dbIdCode = vS(43, 54);    //	43 - 54       LString       dbIdCode           Sequence  database identification code.
			cur.dbSeqBegin = vI(56, 60);  //	56 - 60       Integer       dbseqBegin         Initial sequence number of the
			                              //	                                               database seqment.
			cur.dbinsBeg = vC(61);        //	61            AChar         idbnsBeg           Insertion code of initial residue of the
			                              //	                                               segment, if PDB is the reference.
			cur.dbSeqEnd = vI(63, 67);    //	63 - 67       Integer       dbseqEnd           Ending sequence number of the
			                              //	                                               database segment.
			cur.dbinsEnd = vC(68);        //	68            AChar         dbinsEnd           Insertion code of the ending residue of
			                              //	                                               the segment, if PDB is the reference.
			auto &chain = GetChainForID(cur.chainID);
			chain.mDbref = cur;
		}
		else if (mRec->is("DBREF1")) //	 1 -  6        Record name   "DBREF1"
		{
			cur.PDBIDCode = vS(8, 11); //	 8 - 11       IDcode        idCode             ID code of this datablock.
			cur.chainID = vC(13);      //	13             Character     chainID       Chain identifier.
			cur.seqBegin = vI(15, 18); //	15 - 18        Integer       seqBegin      Initial sequence number of the
			                           //	                                           PDB sequence segment, right justified.
			cur.insertBegin = vC(19);  //	19             AChar         insertBegin   Initial insertion code of the
			                           //	                                           PDB sequence segment.
			cur.seqEnd = vI(21, 24);   //	21 - 24        Integer       seqEnd        Ending sequence number of the
			                           //	                                           PDB sequence segment, right justified.
			cur.insertEnd = vC(25);    //	25             AChar         insertEnd     Ending insertion code of the
			                           //	                                           PDB sequence  segment.
			cur.database = vS(27, 32); //	27 - 32        LString       database      Sequence database name.
			cur.dbIdCode = vS(48, 67); //	48 - 67        LString       dbIdCode      Sequence database identification code,
		}
		else if (mRec->is("DBREF2"))   //	 1 -  6       Record name   "DBREF2"
		{                              //	 8 - 11       IDcode        idCode        ID code of this datablock.
			if (vC(13) != cur.chainID) //	13            Character     chainID       Chain identifier.
				throw std::runtime_error("Chain ID's for DBREF1/DBREF2 records do not match");
			cur.dbAccession = vS(19, 40); //	19 - 40       LString       dbAccession   Sequence database accession code,
			                              //	                                          left justified.
			cur.dbSeqBegin = vI(46, 55);  //	46 - 55       Integer       seqBegin      Initial sequence number of the
			                              //	                                          Database segment, right justified.
			cur.dbSeqEnd = vI(58, 67);    //	58 - 67       Integer       seqEnd        Ending sequence number of the
			                              //	                                          Database segment, right justified.
			auto &chain = GetChainForID(cur.chainID);
			chain.mDbref = cur;
		}

		GetNextRecord();
	}

	// update chains
	for (auto &chain : mChains)
	{
		chain.mNextSeqNum = chain.mDbref.seqBegin;
		chain.mNextDbSeqNum = chain.mDbref.dbSeqBegin;
	}

	while (mRec->is("SEQADV"))
	{ //	 1 -  6        Record name   "SEQADV"
		mSeqadvs.push_back({
			//	 8 - 11        IDcode        idCode        ID  code of this datablock.
			vS(13, 15), //	13 - 15        Residue name  resName       Name of the PDB residue in conflict.
			vC(17),     //	17             Character     chainID       PDB  chain identifier.
			vI(19, 22), //	19 - 22        Integer       seqNum        PDB  sequence number.
			vC(23),     //	23             AChar         iCode         PDB insertion code.
			vS(25, 28), //	25 - 28        LString       database
			vS(30, 38), //	30 - 38        LString       dbAccession   Sequence  database accession number.
			vS(40, 42), //	40 - 42        Residue name  dbRes         Sequence database residue name.
			vI(44, 48), //	44 - 48        Integer       dbSeq         Sequence database sequence number.
			vS(50, 70)  //	50 - 70        LString       conflict      Conflict comment.
		});

		GetNextRecord();
	}

	while (mRec->is("SEQRES"))             //	 1 -  6        Record name    "SEQRES"
	{                                      //	 8 - 10        Integer        serNum       Serial number of the SEQRES record for  the
		                                   //	                                           current  chain. Starts at 1 and increments
		                                   //	                                           by one  each line. Reset to 1 for each chain.
		char chainID = vC(12);             //	12             Character      chainID      Chain identifier. This may be any single
		                                   //	                                           legal  character, including a blank which is
		                                   //	                                           is  used if there is only one chain.
		int numRes = vI(14, 17);           //	14 - 17        Integer        numRes       Number of residues in the chain.
		                                   //	                                           This  value is repeated on every record.
		std::string monomers = vS(20, 70); //	20 - 22        Residue name   resName      Residue name.
		                                   //	 ...

		auto &chain = GetChainForID(chainID, numRes);

		for (auto monID : cif::split<std::string>(monomers, " ", true))
		{
			if (monID.empty())
				continue;

			chain.mSeqres.push_back({ monID, chain.mNextSeqNum++, ' ', chain.mNextDbSeqNum++ });

			InsertChemComp(monID);
		}

		GetNextRecord();
	}

	// First pass over MODRES, only store relevant information required in ConstructEntities
	while (mRec->is("MODRES"))            //	 1 -  6        Record name   "MODRES"
	{                                     //	 8 - 11        IDcode        idCode      ID code of this datablock.
		std::string resName = vS(13, 15); //	13 - 15        Residue name  resName     Residue name used in this datablock.
		                                  //		char chainID		= vC(17);			//	17             Character     chainID     Chain identifier.
		                                  //		int seqNum			= vI(19, 22);		//	19 - 22        Integer       seqNum      Sequence number.
		                                  //		char iCode			= vC(23);			//	23             AChar         iCode       Insertion code.
		std::string stdRes = vS(25, 27);  //	25 - 27        Residue name  stdRes      Standard residue name.
		                                  //		std::string comment		= vS(30, 70);	//	30 - 70        String        comment     Description of the residue modification.

		mMod2parent[resName] = stdRes;

		GetNextRecord();
	}
}

// Parse the records of the heterogen section: first all HET records, then
// any mix of HETNAM and HETSYN records and finally the FORMUL records.
// The information is stored in mHets, mHetnams, mHetsyns, mFormuls and
// mWaterHetID.
void PDBFileParser::ParseHeterogen()
{
	// Columns of a HET record:
	//
	//	 8 - 10   LString(3)  hetID        Het identifier, right-justified.
	//	13        Character   ChainID      Chain identifier.
	//	14 - 17   Integer     seqNum       Sequence number.
	//	18        AChar       iCode        Insertion code.
	//	21 - 25   Integer     numHetAtoms  Number of HETATM records for the group
	//	                                   present in the datablock.
	//	31 - 70   String      text         Text describing Het group.
	for (; mRec->is("HET   "); GetNextRecord())
	{
		std::string hetID = vS(8, 10);
		char chainID = vC(13);
		int seqNum = vI(14, 17);
		char iCode = vC(18);
		int numHetAtoms = vI(21, 25);
		std::string text = vS(31, 70);

		mHets.emplace_back(hetID, chainID, seqNum, iCode, numHetAtoms, text);
	}

	// HETNAM and HETSYN records have the same layout:
	//
	//	 9 - 10   Continuation   continuation  Allows concatenation of multiple records.
	//	12 - 14   LString(3)     hetID         Het identifier, right-justified.
	//	16 - 70   String/SList   text          Chemical name (HETNAM) or list of
	//	                                       synonyms (HETSYN).
	for (;;)
	{
		const bool isName = mRec->is("HETNAM");

		if (not isName and not mRec->is("HETSYN"))
			break;

		std::string hetID = vS(12, 14);
		std::string text = vS(16);

		if (isName)
		{
			mHetnams[hetID] = text;
			InsertChemComp(hetID);
		}
		else
			mHetsyns[hetID] = text;

		GetNextRecord();
	}

	// Columns of a FORMUL record:
	//
	//	 9 - 10   Integer     compNum       Component number.
	//	13 - 15   LString(3)  hetID         Het identifier.
	//	17 - 18   Integer     continuation  Continuation number.
	//	19        Character   asterisk      "*" for water.
	//	20 - 70   String      text          Chemical formula.
	for (; mRec->is("FORMUL"); GetNextRecord())
	{
		std::string hetID = vS(13, 15);
		char waterMark = vC(19);
		std::string formula = vS(20);

		mFormuls[hetID] = formula;

		// remember which het ID is used for water
		if (waterMark == '*')
			mWaterHetID = hetID;
	}
}

void PDBFileParser::ConstructEntities()
{
	// We parsed the Primary Structure and Heterogen sections, if available.
	// But if we didn't parse anything, we need to fake the data based on residues in ATOM records

	// First iterate all ATOM records and store the residues as found in these records
	int modelNr = 1;

	typedef std::map<std::tuple<char, int, char, char>, std::string> CompTypeMap;
	CompTypeMap residuesSeen; // used to validate PDB files...

	for (auto r = mData; r != nullptr; r = r->mNext)
	{
		if (r->is("MODEL "))
		{
			modelNr = r->vI(11, 14);
			if (modelNr != 1)
				break;
			continue;
		}

		if (r->is("ATOM  ") or r->is("HETATM"))  //	 1 -  6        Record name   "ATOM  "
		{                                        //	 ...
			std::string name = r->vS(13, 16);    //	13 - 16        Atom          name         Atom name.
			char altLoc = r->vC(17);             //	17             Character     altLoc       Alternate location indicator.
			std::string resName = r->vS(18, 20); //	18 - 20        Residue name  resName      Residue name.
			char chainID = r->vC(22);            //	22             Character     chainID      Chain identifier.
			int resSeq = r->vI(23, 26);          //	23 - 26        Integer       resSeq       Residue sequence number.
			char iCode = r->vC(27);              //	27             AChar         iCode        Code for insertion of residues.

			// first validate, too sad this is required...
			CompTypeMap::key_type k = std::make_tuple(chainID, resSeq, iCode, altLoc);
			if (residuesSeen.count(k) == 0)
				residuesSeen[k] = resName;
			else if (residuesSeen[k] != resName)
				throw std::runtime_error("inconsistent residue type for " + std::string{ chainID } + std::to_string(resSeq) + iCode + altLoc + "\n" +
										 "  (" + residuesSeen[k] + " != " + resName + ")");

			auto &chain = GetChainForID(chainID);

			PDBChain::AtomRes ar{ resName, resSeq, iCode };

			if ((chain.mResiduesSeen.empty() or chain.mResiduesSeen.back() != ar) and
				cif::compound_factory::instance().is_monomer(resName))
			{
				chain.mResiduesSeen.push_back(ar);
			}

			// now that we're iterating atoms anyway, clean up the mUnobs array
			mUnobs.erase(remove_if(mUnobs.begin(), mUnobs.end(), [=](UNOBS &a)
							 {
				bool result = false;

				if (modelNr == a.modelNr and
					resName == a.res and
					chainID == a.chain and
					resSeq == a.seq and
					iCode == a.iCode)
				{
					auto i = find(a.atoms.begin(), a.atoms.end(), name);
					if (i != a.atoms.end())
					{
						a.atoms.erase(i);
						result = a.atoms.empty();
					}
				}

				return result; }),
				mUnobs.end());

			continue;
		}

		if (r->is("TER   "))          //	 1 -  6 	   Record name	 "TER	"
		{                             //	 7 - 11 	   Integer		 serial 		 Serial number.
			                          //	18 - 20 	   Residue name  resName		 Residue name.
			char chainID = r->vC(22); //	22			   Character	 chainID		 Chain identifier.
			                          //	23 - 26 	   Integer		 resSeq 		 Residue sequence number.
			                          //	27			   AChar		 iCode			 Insertion code.
			auto &chain = GetChainForID(chainID);
			if (chain.mTerIndex == 0) // Is this the first TER record? (Refmac writes out multiple TER records...)
				chain.mTerIndex = static_cast<int>(chain.mResiduesSeen.size());
			continue;
		}
	}

	// prune completely empty chains?
	mChains.erase(remove_if(mChains.begin(), mChains.end(), [](auto &chain)
					  { return chain.mResiduesSeen.empty() and chain.mSeqres.empty(); }),
		mChains.end());

	for (auto &chain : mChains)
	{
		if (not(chain.mSeqres.empty() or chain.mResiduesSeen.empty()))
		{
			// seems safe to assume TER record is at the right location...
			// However, some files don't have them at all.
			// When mTerIndex == 0 this is most likely the case. Right?

			if (chain.mTerIndex > 0)
				chain.mResiduesSeen.erase(chain.mResiduesSeen.begin() + chain.mTerIndex, chain.mResiduesSeen.end());

			int lastResidueIndex = chain.AlignResToSeqRes();

			if (lastResidueIndex > 0 and lastResidueIndex + 1 < static_cast<int>(chain.mResiduesSeen.size()))
			{
				auto &r = chain.mResiduesSeen[lastResidueIndex + 1];

				if (cif::VERBOSE > 0)
				{
					std::cerr << "Detected residues that cannot be aligned to SEQRES\n"
							  << "First residue is " << chain.mDbref.chainID << ':' << r.mSeqNum << r.mIcode << '\n';
				}

				chain.mTerIndex = lastResidueIndex + 1;
			}
		}
		else
		{
			// So, we did not have a SEQRES for this chain. Try to reconstruct it.
			// Problem here is that TER records may be located incorrectly. So
			// first lets shift the ter index until it is past the last known
			// aminoacid or base.

			for (int ix = chain.mTerIndex; ix < static_cast<int>(chain.mResiduesSeen.size()); ++ix)
			{
				std::string resName = chain.mResiduesSeen[ix].mMonID;

				if (cif::compound_factory::instance().is_monomer(resName))
					chain.mTerIndex = ix + 1;

				InsertChemComp(resName);
			}

			// And now construct our 'SEQRES'...
			for (int ix = 0; ix < chain.mTerIndex; ++ix)
			{
				auto &ar = chain.mResiduesSeen[ix];
				chain.mSeqres.push_back({ ar.mMonID, ar.mSeqNum, ar.mIcode, ar.mSeqNum, true });
			}
		}
	}

	std::set<char> terminatedChains;
	std::map<char, int> residuePerChainCounter;

	for (auto r = mData; r != nullptr; r = r->mNext)
	{
		if (r->is("MODEL "))
		{
			modelNr = r->vI(11, 14);
			if (modelNr != 1)
				break;
			continue;
		}

		if (r->is("ATOM  ") or r->is("HETATM"))
		{ //	 1 -  6        Record name   "ATOM  "
			// int serial = r->vI(7, 11);			//	 7 - 11        Integer       serial       Atom  serial number.
			//	 ...
			char altLoc = vC(17);                //	17             Character     altLoc       Alternate location indicator.
			std::string resName = r->vS(18, 20); //	18 - 20        Residue name  resName      Residue name.
			char chainID = r->vC(22);            //	22             Character     chainID      Chain identifier.
			int resSeq = r->vI(23, 26);          //	23 - 26        Integer       resSeq       Residue sequence number.
			char iCode = r->vC(27);              //	27             AChar         iCode        Code for insertion of residues.

			auto &chain = GetChainForID(chainID);

			auto i = find(chain.mSeqres.begin(), chain.mSeqres.end(), PDBSeqRes{ resName, resSeq, iCode });

			// might be a hetero
			if (altLoc != ' ' and i == chain.mSeqres.end())
			{
				i = find_if(chain.mSeqres.begin(), chain.mSeqres.end(),
					[resSeq, iCode](const PDBSeqRes &r) -> bool
					{
						return r.mSeqNum == resSeq and r.mIcode == iCode;
					});
			}

			if (i != chain.mSeqres.end())
			{
				i->mSeen = true;
				if (i->mMonID != resName)
					i->mAlts.insert(resName);
			}
			else
			{
				auto &residues = chain.mHet;

				if (residues.empty() or residues.back().mSeqNum != resSeq)
				{
					i = lower_bound(residues.begin(), residues.end(),
						PDBSeqRes{ resName, resSeq, iCode },
						[=](const PDBSeqRes &r1, const PDBSeqRes &r2) -> bool
						{
							return r1.mSeqNum < r2.mSeqNum;
						});

					residues.insert(i, { resName, resSeq, iCode, resSeq, true });

					InsertChemComp(resName);
				}
			}

			int residueCount = (residuePerChainCounter[chainID] += 1);

			// There appears to be a program that writes out HETATM records as ATOM records....
			if (not cif::compound_factory::instance().is_monomer(resName) or
				terminatedChains.count(chainID) or
				(chain.mTerIndex > 0 and residueCount >= chain.mTerIndex))
			{
				if (isWater(resName))
					mWaterHetID = resName;

				auto h = find_if(mHets.begin(), mHets.end(), [=](const HET &het) -> bool
					{ return het.hetID == resName and het.chainID == chainID and
					         het.seqNum == resSeq and het.iCode == iCode; });

				if (h == mHets.end())
				{
					mHets.push_back({ resName, chainID, resSeq, iCode, 0 }); // double perhaps, but that does not care
					h = prev(mHets.end());
				}

				h->atoms.push_back(r);
			}

			continue;
		}

		if (r->is("TER   "))
		{
			char chainID = r->vC(22); //	22             Character     chainID      Chain identifier.
			terminatedChains.insert(chainID);
		}
	}

	// Create missing compounds
	for (auto &chain : mChains)
	{
		if (chain.mMolID != 0 or chain.mSeqres.empty())
			continue;

		// now this chain may contain the same residues as another one
		for (auto &other : mChains)
		{
			if (&other == &chain or other.mMolID == 0)
				continue;

			if (chain.SameSequence(other))
			{
				chain.mMolID = other.mMolID;
				break;
			}
		}

		if (chain.mMolID != 0)
			continue;

		auto &comp = GetOrCreateCompound(mNextMolID++);
		comp.mChains.insert(chain.mDbref.chainID);

		chain.mMolID = comp.mMolID;
	}

	std::set<std::string> structTitle, structDescription;

	// Create poly_scheme and write pdbx_poly_seq_scheme and create mapping table

	auto cat = getCategory("pdbx_poly_seq_scheme");
	int asymNr = 0;
	for (auto &chain : mChains)
	{
		std::string asymID = cif::cif_id_for_number(asymNr++);

		if (mMolID2EntityID.count(chain.mMolID) == 0)
			continue;

		std::string entityID = mMolID2EntityID[chain.mMolID];

		mAsymID2EntityID[asymID] = entityID;

		// clang-format off
		getCategory("struct_asym")->emplace({
			{ "id", asymID },
			{ "pdbx_blank_PDB_chainid_flag", chain.mDbref.chainID == ' ' ? "Y" : "N" },
			// pdbx_modified
			{ "entity_id", entityID },
			// details
		});
		// clang-format on

		int seqNr = 1;
		for (auto &res : chain.mSeqres)
		{
			mChainSeq2AsymSeq[std::make_tuple(chain.mDbref.chainID, res.mSeqNum, res.mIcode)] = std::make_tuple(asymID, seqNr, true);

			std::string seqID = std::to_string(seqNr);
			++seqNr;

			std::set<std::string> monIds = { res.mMonID };
			monIds.insert(res.mAlts.begin(), res.mAlts.end());

			for (std::string monID : monIds)
			{
				std::string authMonID, authSeqNum, authInsCode{ '.' };

				if (res.mSeen)
				{
					authMonID = monID;
					authSeqNum = std::to_string(res.mSeqNum);
					if (res.mIcode != ' ' and res.mIcode != 0)
						authInsCode = std::string{ res.mIcode };

					// clang-format off
					cat->emplace({
						{ "asym_id", asymID },
						{ "entity_id", mMolID2EntityID[chain.mMolID] },
						{ "seq_id", seqID },
						{ "mon_id", monID },
						{ "ndb_seq_num", seqID },
						{ "pdb_seq_num", res.mSeqNum },
						{ "auth_seq_num", authSeqNum },
						{ "pdb_mon_id", authMonID },
						{ "auth_mon_id", authMonID },
						{ "pdb_strand_id", std::string{ chain.mDbref.chainID } },
						{ "pdb_ins_code", authInsCode },
						{ "hetero", res.mAlts.empty() ? "n" : "y" }
					});
					// clang-format on
				}
				else
				{
					if (res.mIcode != ' ' and res.mIcode != 0)
						authInsCode = std::string{ res.mIcode } + "A";

					// clang-format off
					cat->emplace({
						{ "asym_id", asymID },
						{ "entity_id", mMolID2EntityID[chain.mMolID] },
						{ "seq_id", seqID },
						{ "mon_id", monID },
						{ "ndb_seq_num", seqID },
						{ "pdb_seq_num", res.mSeqNum },
						{ "auth_seq_num", "." },
						{ "pdb_mon_id", "." },
						{ "auth_mon_id", "." },
						{ "pdb_strand_id", std::string{ chain.mDbref.chainID } },
						{ "pdb_ins_code", authInsCode },
						{ "hetero", res.mAlts.empty() ? "n" : "y" }
					});
					// clang-format on
				}
			}
		}
	}

	// We have now created all compounds, write them out
	uint32_t structRefID = 0, structRefSeqAlignID = 0;

	for (auto &cmp : mCompounds)
	{
		++structRefID;

		std::string srcMethod;

		if (not cmp.mSource["SYNTHETIC"].empty())
		{
			srcMethod = "syn";

			// clang-format off
			getCategory("pdbx_entity_src_syn")->emplace({
				{ "entity_id", mMolID2EntityID[cmp.mMolID] },
				{ "pdbx_src_id", structRefID },
				{ "organism_scientific", cmp.mSource["ORGANISM_SCIENTIFIC"] },
				{ "ncbi_taxonomy_id", cmp.mSource["ORGANISM_TAXID"] },
			});
			// clang-format on
		}
		else if (cmp.mInfo["ENGINEERED"] == "YES" or
				 not cmp.mSource["EXPRESSION_SYSTEM"].empty())
		{
			srcMethod = "man";

			// clang-format off
			getCategory("entity_src_gen")->emplace({
				{ "entity_id", mMolID2EntityID[cmp.mMolID] },
				{ "pdbx_src_id", structRefID },
				{ "gene_src_common_name", cmp.mSource["ORGANISM_COMMON"] },
				{ "pdbx_gene_src_gene", cmp.mSource["GENE"] },
				{ "gene_src_strain", cmp.mSource["STRAIN"] },
				{ "gene_src_tissue", cmp.mSource["TISSUE"] },
				{ "gene_src_tissue_fraction", cmp.mSource["TISSUE_FRACTION"] },
				{ "pdbx_gene_src_cell_line", cmp.mSource["CELL_LINE"] },
				{ "pdbx_gene_src_organelle", cmp.mSource["ORGANELLE"] },
				{ "pdbx_gene_src_cell", cmp.mSource["CELL"] },
				{ "pdbx_gene_src_cellular_location", cmp.mSource["CELLULAR_LOCATION"] },
				{ "host_org_common_name", cmp.mSource["EXPRESSION_SYSTEM_COMMON"] },
				{ "pdbx_gene_src_scientific_name", cmp.mSource["ORGANISM_SCIENTIFIC"] },
				{ "pdbx_gene_src_ncbi_taxonomy_id", cmp.mSource["ORGANISM_TAXID"] },
				{ "pdbx_host_org_scientific_name", cmp.mSource["EXPRESSION_SYSTEM"] },
				{ "pdbx_host_org_ncbi_taxonomy_id", cmp.mSource["EXPRESSION_SYSTEM_TAXID"] },
				{ "pdbx_host_org_strain", cmp.mSource["EXPRESSION_SYSTEM_STRAIN"] },
				{ "pdbx_host_org_variant", cmp.mSource["EXPRESSION_SYSTEM_VARIANT"] },
				{ "pdbx_host_org_cell_line", cmp.mSource["EXPRESSION_SYSTEM_CELL_LINE"] },
				{ "pdbx_host_org_cellular_location", cmp.mSource["EXPRESSION_SYSTEM_CELLULAR_LOCATION"] },
				{ "pdbx_host_org_vector_type", cmp.mSource["EXPRESSION_SYSTEM_VECTOR_TYPE"] },
				{ "pdbx_host_org_vector", cmp.mSource["EXPRESSION_SYSTEM_VECTOR"] },
				{ "pdbx_host_org_gene", cmp.mSource["EXPRESSION_SYSTEM_GENE"] },
				{ "plasmid_name", cmp.mSource["EXPRESSION_SYSTEM_PLASMID"] },
				{ "pdbx_description", cmp.mSource["OTHER_DETAILS"] }
			});
			// clang-format on
		}
		else if (not cmp.mSource["ORGANISM_SCIENTIFIC"].empty())
		{
			srcMethod = "nat";

			// clang-format off
			getCategory("entity_src_nat")->emplace({
				{ "entity_id", mMolID2EntityID[cmp.mMolID] },
				{ "pdbx_src_id", structRefID },
				{ "common_name", cmp.mSource["ORGANISM_COMMON"] },
				{ "strain", cmp.mSource["STRAIN"] },
				{ "pdbx_secretion", cmp.mSource["SECRETION"] },
				{ "pdbx_organism_scientific", cmp.mSource["ORGANISM_SCIENTIFIC"] },
				{ "pdbx_ncbi_taxonomy_id", cmp.mSource["ORGANISM_TAXID"] },
				{ "pdbx_cellular_location", cmp.mSource["CELLULAR_LOCATION"] },
				{ "pdbx_plasmid_name", cmp.mSource["PLASMID"] },
				{ "pdbx_organ", cmp.mSource["ORGAN"] },
			});
			// clang-format on
		}

		// clang-format off
		getCategory("entity")->emplace({
			{ "id", mMolID2EntityID[cmp.mMolID] },
			{ "type", "polymer" },
			{ "src_method", srcMethod },
			{ "pdbx_description", cmp.mInfo["MOLECULE"] },
//			{ "pdbx_formula_weight", 		},
			{ "pdbx_number_of_molecules", cmp.mChains.size() },
			{ "details", cmp.mInfo["OTHER_DETAILS"] },
			{ "pdbx_mutation", cmp.mInfo["MUTATION"] },
			{ "pdbx_fragment", cmp.mInfo["FRAGMENT"] },
			{ "pdbx_ec", cmp.mInfo["EC"] }
		});
		// clang-format on

		if (not cmp.mInfo["SYNONYM"].empty())
		{
			// clang-format off
			getCategory("entity_name_com")->emplace({
				{ "entity_id", mMolID2EntityID[cmp.mMolID] },
				{ "name", cmp.mInfo["SYNONYM"] }
			});
			// clang-format on
		}

		std::string desc = cmp.mInfo["MOLECULE"];
		if (not cmp.mInfo["EC"].empty())
			desc += " (E.C." + cmp.mInfo["EC"] + ")";

		if (not cmp.mTitle.empty())
			structTitle.insert(cmp.mTitle);

		if (not desc.empty())
			structDescription.insert(desc);

		auto ci = find_if(mChains.begin(), mChains.end(),
			[cmp](PDBChain &c) -> bool
			{ return cmp.mChains.count(c.mDbref.chainID); });

		if (ci != mChains.end() and not ci->mDbref.dbIdCode.empty())
		{
			// clang-format off
			getCategory("struct_ref")->emplace({
				{ "id", structRefID },
				{ "entity_id", mMolID2EntityID[cmp.mMolID] },
				{ "db_name", ci->mDbref.database },
				{ "db_code", ci->mDbref.dbIdCode },
				{ "pdbx_db_accession", ci->mDbref.dbAccession },
//				{ "pdbx_align_begin", ci->mDbref.dbSeqBegin }
			});
			// clang-format on
		}

		bool nstdMonomer = false, nonstandardLinkage = false;
		bool mightBePolyPeptide = true, mightBeDNA = true;

		std::vector<std::string> chains;
		std::string seq, seqCan;

		// write out the chains for this compound
		for (auto &chain : mChains)
		{
			if (chain.mMolID != cmp.mMolID)
				continue;

			//			chain.mEntityID = cmp.mEntityID;

			++structRefSeqAlignID;
			DBREF &dbref = chain.mDbref;

			if (not dbref.database.empty())
			{
				auto insToStr = [](char i) -> std::string
				{
					return i == ' ' or not isprint(i) ? "" : std::string{ i };
				};

				auto &pdbxPolySeqScheme = *getCategory("pdbx_poly_seq_scheme");

				int seqAlignBeg = 0, seqAlignEnd = 0;

				try
				{
					seqAlignBeg = pdbxPolySeqScheme.find1<int>(key("pdb_strand_id") == std::string{ dbref.chainID } and
																   key("pdb_seq_num") == dbref.seqBegin and
																   (key("pdb_ins_code") == insToStr(dbref.insertBegin) or key("pdb_ins_code") == cif::null),
						"seq_id");

					seqAlignEnd = pdbxPolySeqScheme.find1<int>(key("pdb_strand_id") == std::string{ dbref.chainID } and
																   key("pdb_seq_num") == dbref.seqEnd and
																   (key("pdb_ins_code") == insToStr(dbref.insertEnd) or key("pdb_ins_code") == cif::null),
						"seq_id");
				}
				catch (...)
				{
				}

				// clang-format off
				getCategory("struct_ref_seq")->emplace({
					{ "align_id", structRefSeqAlignID },
					{ "ref_id", structRefID },
					{ "pdbx_PDB_id_code", dbref.PDBIDCode },
					{ "pdbx_strand_id", std::string{ chain.mDbref.chainID } },
					{ "seq_align_beg", seqAlignBeg },
					{ "pdbx_seq_align_beg_ins_code", insToStr(dbref.insertBegin) },
					{ "seq_align_end", seqAlignEnd },
					{ "pdbx_seq_align_end_ins_code", insToStr(dbref.insertEnd) },
					{ "pdbx_db_accession", dbref.dbAccession },
					{ "db_align_beg", dbref.dbSeqBegin },
					{ "pdbx_db_align_beg_ins_code", insToStr(dbref.dbinsBeg) },
					{ "db_align_end", dbref.dbSeqEnd },
					{ "pdbx_db_align_end_ins_code", insToStr(dbref.dbinsEnd) },
					{ "pdbx_auth_seq_align_beg", dbref.seqBegin },
					{ "pdbx_auth_seq_align_end", dbref.seqEnd }
				});
				// clang-format on

				// write the struct_ref_seq_dif
				for (auto &seqadv : mSeqadvs)
				{
					if (seqadv.chainID != chain.mDbref.chainID or seqadv.resName.empty())
						continue;

					std::string asym, seqNum;
					int labelSeq = -1;
					std::error_code ec;

					std::tie(asym, labelSeq, std::ignore) = MapResidue(seqadv.chainID, seqadv.seqNum, seqadv.iCode, ec);
					if (ec)
					{
						if (cif::VERBOSE > 0)
							std::cerr << "dropping unmatched SEQADV record\n";
						continue;
					}

					seqNum = std::to_string(labelSeq);

					// clang-format off
					getCategory("struct_ref_seq_dif")->emplace({
						{ "align_id", structRefSeqAlignID },
						{ "pdbx_PDB_id_code", dbref.PDBIDCode },
						{ "mon_id", seqadv.resName },
						{ "pdbx_pdb_strand_id", seqadv.chainID },
						{ "seq_num", seqNum },
						{ "pdbx_pdb_ins_code", seqadv.iCode == ' ' ? std::string{} : std::string{ seqadv.iCode } },
						{ "pdbx_seq_db_name", seqadv.database },
						{ "pdbx_seq_db_accession_code", seqadv.dbAccession },
						{ "db_mon_id", seqadv.dbRes },
						{ "pdbx_seq_db_seq_num", seqadv.dbSeq },
						{ "details", seqadv.conflict },
						{ "pdbx_auth_seq_num", seqadv.seqNum },
						{ "pdbx_ordinal", ++mPdbxDifOrdinal }
					});
					// clang-format on
				}
			}

			if (not chains.empty()) // not the first one for this molID
			{
				chains.push_back(std::string{ chain.mDbref.chainID });
				continue;
			}

			chains.push_back(std::string{ chain.mDbref.chainID });

			std::size_t seqLen = 0, seqCanLen = 0;

			for (auto &res : chain.mSeqres)
			{
				std::string letter, stdRes;

				if (mMod2parent.count(res.mMonID))
					stdRes = mMod2parent.at(res.mMonID);

				if (cif::compound_factory::kAAMap.count(res.mMonID))
				{
					letter = cif::compound_factory::kAAMap.at(res.mMonID);
					mightBeDNA = false;
				}
				else if (cif::compound_factory::kBaseMap.count(res.mMonID))
				{
					letter = cif::compound_factory::kBaseMap.at(res.mMonID);
					mightBePolyPeptide = false;
				}
				else
				{
					nstdMonomer = true;
					letter = '(' + res.mMonID + ')';

					// sja...
					auto compound = cif::compound_factory::instance().create(stdRes.empty() ? res.mMonID : stdRes);
					if (compound != nullptr and
						not iequals(compound->type(), "L-peptide linking") and
						not iequals(compound->type(), "RNA linking"))
					{
						nonstandardLinkage = true;
					}
				}

				if (seqLen + letter.length() > 80)
				{
					seq += '\n';
					seqLen = 0;
				}

				seq += letter;
				seqLen += letter.length();

				if (letter.length() > 1)
				{
					if (not stdRes.empty() and cif::compound_factory::kAAMap.count(stdRes))
						letter = cif::compound_factory::kAAMap.at(stdRes);
					else if (cif::compound_factory::kBaseMap.count(res.mMonID))
						letter = cif::compound_factory::kBaseMap.at(res.mMonID);
					else
						letter = 'X';
				}

				if (seqCanLen + letter.length() > 80)
				{
					seqCan += '\n';
					seqCanLen = 0;
				}
				seqCan += letter;
				seqCanLen += letter.length();
			}

			auto cat_ps = getCategory("entity_poly_seq");
			for (std::size_t i = 0; i < chain.mSeqres.size(); ++i)
			{
				auto &rs = chain.mSeqres[i];

				if (std::find(mChemComp.begin(), mChemComp.end(), rs.mMonID) == mChemComp.end())
					mChemComp.emplace_back(rs.mMonID);

				// clang-format off
				cat_ps->emplace({
					{ "entity_id", mMolID2EntityID[cmp.mMolID] },
					{ "num", i + 1 },
					{ "mon_id", rs.mMonID },
					{ "hetero", rs.mAlts.empty() ? "n" : "y" }
				});
				// clang-format on

				for (auto &a : rs.mAlts)
				{
					// clang-format off
					cat_ps->emplace({
						{ "entity_id", mMolID2EntityID[cmp.mMolID] },
						{ "num", i + 1 },
						{ "mon_id", a },
						{ "hetero", "y" }
					});
					// clang-format on
				}
			}
		}

		std::string type;
		if (mightBePolyPeptide and not mightBeDNA)
			type = "polypeptide(L)";
		else if (mightBeDNA and not mightBePolyPeptide)
			type = "polyribonucleotide";
		else
			type = "other";

		// clang-format off
		getCategory("entity_poly")->emplace({
			{ "entity_id", mMolID2EntityID[cmp.mMolID] },
			{ "pdbx_seq_one_letter_code", seq },
			{ "pdbx_seq_one_letter_code_can", seqCan },
			{ "nstd_monomer", (nstdMonomer ? "yes" : "no") },
			{ "pdbx_strand_id", cif::join(chains, ",") },
			{ "nstd_linkage", nonstandardLinkage ? "yes" : "no" },
			{ "type", type }
		});
		// clang-format on
	}

	if (not(structTitle.empty() and structDescription.empty()))
	{
		// clang-format off
		getCategory("struct")->emplace({
			{ "entry_id", mStructureID },
			{ "title", cif::join(structTitle, ", ") },
			{ "pdbx_descriptor", cif::join(structDescription, ", ") },
			{ "pdbx_model_type_details", mModelTypeDetails }
		});
		// clang-format on
	}

	// build sugar trees first
	// ConstructSugarTrees(asymNr);

	// done with the sugar, resume operation as before

	std::map<char, std::string> waterChains;
	std::map<std::tuple<std::string, std::string>, int> ndbSeqNum; // for nonpoly scheme
	std::map<std::string, int> entityAuthSeqNum;                   // for nonpoly scheme too

	for (std::size_t i = 0; i < mHets.size(); ++i)
	{
		auto &heti = mHets[i];

		if (not heti.asymID.empty())
			continue;

		if (heti.hetID == mWaterHetID or isWater(heti.hetID))
			continue;

		// See if this residue is part of SEQRES
		auto &chain = GetChainForID(heti.chainID);
		auto ih = find(chain.mSeqres.begin(), chain.mSeqres.end(), PDBSeqRes{ heti.hetID, heti.seqNum, heti.iCode });

		// If so, skip it, it is not an entity then
		if (ih != chain.mSeqres.end())
			continue;

		heti.asymID = cif::cif_id_for_number(asymNr++);
	}

	std::set<std::string> writtenAsyms;

	std::map<std::string, int> hetCount; // for pdbx_number_of_molecules
	for (auto &het : mHets)
		hetCount[het.hetID] += 1;

	for (auto &het : mHets)
	{
		std::string hetID = het.hetID;

		auto &chain = GetChainForID(het.chainID);

		// See if this residue is part of SEQRES
		auto i = find(chain.mSeqres.begin(), chain.mSeqres.end(), PDBSeqRes{ hetID, het.seqNum, het.iCode });

		// If so, skip it, it is not an entity then
		if (i != chain.mSeqres.end())
			continue;

		// See if we've already added it to the entities
		if (mHet2EntityID.count(hetID) == 0)
		{
			std::string entityID = std::to_string(mNextEntityNr++);
			mHet2EntityID[hetID] = entityID;

			if (hetID == mWaterHetID)
			{
				// clang-format off
				getCategory("entity")->emplace({
					{ "id", entityID },
					{ "type", "water" },
					{ "src_method", "nat" },
					{ "pdbx_description", "water" },
					{ "pdbx_number_of_molecules", hetCount[hetID] }
				});
				// clang-format on
			}
			else
			{
				if (mHetnams[hetID].empty())
				{
					auto compound = cif::compound_factory::instance().create(hetID);
					if (compound != nullptr)
						mHetnams[hetID] = compound->name();
				}

				// clang-format off
				getCategory("entity")->emplace({
					{ "id", entityID },
					{ "type", "non-polymer" },
					{ "src_method", "syn" },
					{ "pdbx_description", mHetnams[hetID] },
					{ "details", mHetsyns[hetID] },
					{ "pdbx_number_of_molecules", hetCount[hetID] }
				});
				// clang-format on
			}

			// write a pdbx_entity_nonpoly record
			std::string name = mHetnams[hetID];
			if (name.empty() and hetID == mWaterHetID)
				name = "water";

			// clang-format off
			getCategory("pdbx_entity_nonpoly")->emplace({
				{ "entity_id", entityID },
				{ "name", name },
				{ "comp_id", hetID }
			});
			// clang-format on
		}

		// create an asym for this het/chain combo, if needed

		std::string asymID = het.asymID;

		auto k = std::make_tuple(het.chainID, het.seqNum, het.iCode);
		if (mChainSeq2AsymSeq.count(k) == 0)
		{
			if (hetID == mWaterHetID or isWater(hetID))
			{
				if (waterChains.count(het.chainID) == 0)
				{
					asymID = cif::cif_id_for_number(asymNr++);
					waterChains[het.chainID] = asymID;
				}
				else
					asymID = waterChains[het.chainID];
			}
			else
				asymID = het.asymID;

			assert(asymID.empty() == false);

			mAsymID2EntityID[asymID] = mHet2EntityID[hetID];

			// NOTE, a nonpoly residue has no label_seq_id
			// but in pdbx_nonpoly_scheme there is such a number.
			// Since this number is not used anywhere else we
			// just use it here and do not store it in the table
			mChainSeq2AsymSeq[k] = std::make_tuple(asymID, 0, false);

			if (writtenAsyms.count(asymID) == 0)
			{
				writtenAsyms.insert(asymID);

				// clang-format off
				getCategory("struct_asym")->emplace({
					{ "id", asymID },
					{ "pdbx_blank_PDB_chainid_flag", het.chainID == ' ' ? "Y" : "N" },
					//					pdbx_modified
					{ "entity_id", mHet2EntityID[hetID] },
					//					details
				});

				// clang-format on
			}
		}

		int seqNr = ++ndbSeqNum[std::make_tuple(hetID, asymID)];
		int authSeqNr = ++entityAuthSeqNum[hetID];

		std::string iCode{ het.iCode };
		cif::trim(iCode);
		if (iCode.empty())
			iCode = { '.' };

		// clang-format off
		getCategory("pdbx_nonpoly_scheme")->emplace({
			{ "asym_id", asymID },
			{ "entity_id", mHet2EntityID[hetID] },
			{ "mon_id", hetID },
			{ "ndb_seq_num", seqNr },
			{ "pdb_seq_num", het.seqNum },
			{ "auth_seq_num", authSeqNr }, // Yes
			{ "pdb_mon_id", hetID },
			{ "auth_mon_id", hetID },
			{ "pdb_strand_id", std::string{ het.chainID } },
			{ "pdb_ins_code", iCode }
		});
		// clang-format on

		// mapping needed?
		mChainSeq2AsymSeq[std::make_tuple(het.chainID, het.seqNum, het.iCode)] = std::make_tuple(asymID, seqNr, false);
	}

	int modResID = 1;
	std::set<std::string> modResSet;
	for (auto rec = FindRecord("MODRES"); rec != nullptr and rec->is("MODRES");
		rec = rec->mNext)                      //	 1 -  6        Record name   "MODRES"
	{                                          //	 8 - 11        IDcode        idCode      ID code of this datablock.
		std::string resName = rec->vS(13, 15); //	13 - 15        Residue name  resName     Residue name used in this datablock.
		char chainID = rec->vC(17);            //	17             Character     chainID     Chain identifier.
		int seqNum = rec->vI(19, 22);          //	19 - 22        Integer       seqNum      Sequence number.
		char iCode = rec->vC(23);              //	23             AChar         iCode       Insertion code.
		std::string stdRes = rec->vS(25, 27);  //	25 - 27        Residue name  stdRes      Standard residue name.
		std::string comment = rec->vS(30, 70); //	30 - 70        String        comment     Description of the residue modification.

		std::string asymID;
		int seq;
		std::error_code ec;

		std::tie(asymID, seq, std::ignore) = MapResidue(chainID, seqNum, iCode, ec);
		if (ec) // no need to write a modres if it could not be found
		{
			if (cif::VERBOSE > 0)
				std::cerr << "dropping unmapped MODRES record\n";
			continue;
		}

		// clang-format off
		getCategory("pdbx_struct_mod_residue")->emplace({
			{ "id", modResID++ },
			{ "label_asym_id", asymID },
			{ "label_seq_id", seq },
			{ "label_comp_id", resName },
			{ "auth_asym_id", std::string(1, chainID) },
			{ "auth_seq_id", seqNum },
			{ "auth_comp_id", resName },
			{ "PDB_ins_code", iCode == ' ' ? "" : std::string{ iCode } },
			{ "parent_comp_id", stdRes },
			{ "details", comment }
		});
		// clang-format on

		modResSet.insert(resName);
	}

	//	// chem compounds

	for (auto cc : mChemComp)
	{
		auto compound = cif::compound_factory::instance().create(
			mMod2parent.count(cc) ? mMod2parent[cc] : cc);

		std::string name;
		std::string formula;
		std::string type;
		std::string nstd = ".";
		std::optional<float> formulaWeight;

		if (compound != nullptr)
		{
			name = compound->name();
			type = compound->type();

			if (iequals(type, "L-peptide linking") or iequals(type, "peptide linking"))
				nstd = "y";

			formula = compound->formula();
			formulaWeight = compound->formula_weight();
		}

		if (name.empty())
			name = mHetnams[cc];

		if (type.empty())
			type = "NON-POLYMER";

		if (formula.empty())
		{
			formula = mFormuls[cc];

			const std::regex rx(R"(\d+\((.+)\))");
			std::smatch m;
			if (std::regex_match(formula, m, rx))
				formula = m[1].str();
		}

		if (modResSet.count(cc))
			nstd = "n";

		// clang-format off
		getCategory("chem_comp")->emplace({
			{ "id", cc },
			{ "name", name },
			{ "formula", formula },
			{ "formula_weight", formulaWeight, 3 },
			{ "mon_nstd_flag", nstd },
			{ "type", type }
		});
		// clang-format on
	}

	getCategory("chem_comp")->reorder_by_index();

	// unobserved can now be written as well

	int idRes = 0, idAtom = 0;
	sort(mUnobs.begin(), mUnobs.end(), [](const UNOBS &a, const UNOBS &b) -> bool
		{
			 int d = a.modelNr - b.modelNr;
			 if (d == 0)
				 d = a.seq - b.seq;
			 return d < 0; });

	for (auto &unobs : mUnobs)
	{
		bool isPolymer = false;
		std::string asymID, compID = unobs.res;
		int seqNr = 0;
		std::error_code ec;

		std::tie(asymID, seqNr, isPolymer) = MapResidue(unobs.chain, unobs.seq, unobs.iCode, ec);
		if (ec)
		{
			if (cif::VERBOSE > 0)
				std::cerr << "error mapping unobserved residue\n";
			continue;
		}

		if (unobs.atoms.empty())
		{
			// clang-format off
			getCategory("pdbx_unobs_or_zero_occ_residues")->emplace({
				{ "id", std::to_string(++idRes) },
				{ "polymer_flag", isPolymer ? "Y" : "N" },
				{ "occupancy_flag", 1 },
				{ "PDB_model_num", unobs.modelNr ? unobs.modelNr : 1 },
				{ "auth_asym_id", std::string{ unobs.chain } },
				{ "auth_comp_id", unobs.res },
				{ "auth_seq_id", unobs.seq },
				{ "PDB_ins_code", unobs.iCode == ' ' ? "" : std::string{ unobs.iCode } },
				{ "label_asym_id", asymID },
				{ "label_comp_id", compID }, // TODO: change to correct comp_id
				{ "label_seq_id", seqNr > 0 ? std::to_string(seqNr) : "" }
			});
			// clang-format on
		}
		else
		{
			for (auto &atom : unobs.atoms)
			{
				// clang-format off
				getCategory("pdbx_unobs_or_zero_occ_atoms")->emplace({
					{ "id", std::to_string(++idAtom) },
					{ "polymer_flag", isPolymer ? "Y" : "N" },
					{ "occupancy_flag", 1 },
					{ "PDB_model_num", unobs.modelNr ? unobs.modelNr : 1 },
					{ "auth_asym_id", std::string{ unobs.chain } },
					{ "auth_comp_id", unobs.res },
					{ "auth_seq_id", unobs.seq },
					{ "PDB_ins_code", unobs.iCode == ' ' ? "" : std::string{ unobs.iCode } },
					{ "auth_atom_id", atom },
					{ "label_asym_id", asymID },
					{ "label_comp_id", compID }, // TODO: change to correct comp_id
					{ "label_seq_id", seqNr > 0 ? std::to_string(seqNr) : "" },
					{ "label_atom_id", atom }
				});
				// clang-format on
			}
		}
	}
}

/// Group glycosylation sugars found in the HET list into branched entities.
///
/// Repeatedly picks an unprocessed NAG/NDG HET group and, for each alternate
/// location of its C1 atom, asks FindLink whether that C1 is linked to an ASN
/// ND2 atom. When it is, the tree of sugars is collected by following, for
/// every sugar, the links FindLink reports from its O1..O6 atoms to the C1
/// atom of another residue. A tree of two or more sugars results in:
///   - a 'branched' entity with its pdbx_entity_branch, pdbx_entity_branch_list
///     and pdbx_entity_branch_link rows (only the first time a tree with this
///     entity name is encountered),
///   - a struct_asym row and one pdbx_branch_scheme row per sugar,
///   - an entry in mChainSeq2AsymSeq for every sugar in the tree.
/// HET groups that became part of a tree are flagged and erased from mHets
/// before returning.
///
/// @param asymNr  Counter used to generate asym IDs; incremented once for
///                every sugar tree that is created.
void PDBFileParser::ConstructSugarTrees(int &asymNr)
{
	for (;;)
	{
		// find a first NAG/NDG that is neither handled yet nor part of a tree
		auto si = std::find_if(mHets.begin(), mHets.end(), [](const HET &h)
			{ return (h.hetID == "NAG" or h.hetID == "NDG") and not(h.processed or h.branch); });
		if (si != mHets.end())
		{
			// flag it right away, this guarantees the outer loop terminates
			si->processed = true;

			// take the location of the C1 atom(s?)
			std::set<char> ci;

			for (auto a : si->atoms)
			{
				std::string name = a->vS(13, 16); //	13 - 16        Atom          name         Atom name.

				if (name != "C1")
					continue;

				ci.insert(a->vC(17)); //	17             Character     altLoc       Alternate location indicator.
			}

			if (ci.empty())
				continue;

			for (auto alt : ci)
			{
				ATOM_REF c1{ "C1", si->hetID, si->seqNum, si->chainID, si->iCode, alt };

				const auto &[asn, linked] = FindLink(c1, "ND2", "ASN");
				if (not linked)
					continue;

				// depth first walk over the sugars, starting at the one attached to the ASN
				std::stack<ATOM_REF> c1s;
				c1s.push(c1);

				SUGAR_TREE sugarTree;
				sugarTree.push_back({ c1 });

				// naive implementation
				while (not c1s.empty())
				{
					c1 = c1s.top();
					c1s.pop();

					for (auto o : { "O1", "O2", "O3", "O4", "O5", "O6" })
					{
						ATOM_REF leaving = c1;
						leaving.name = o;

						const auto &[nc1, linked_c1] = FindLink(leaving, "C1");
						if (linked_c1)
						{
							// o[1] - '0' is the number of the oxygen (1..6) the new sugar is attached to
							sugarTree.push_back({ nc1, o[1] - '0', c1 });
							c1s.push(nc1);
						}
					}
				}

				if (sugarTree.size() < 2) // not really a tree
					continue;

				auto branchName = sugarTree.entityName();
				auto entityID = mBranch2EntityID[branchName];

				// See if we've already added it to the entities
				if (entityID.empty())
				{
					entityID = std::to_string(mNextEntityNr++);
					mBranch2EntityID[branchName] = entityID;

					// clang-format off
					getCategory("entity")->emplace({
						{ "id", entityID },
						{ "type", "branched" },
						{ "src_method", "man" },
						{ "pdbx_description", branchName }
					});

					getCategory("pdbx_entity_branch")->emplace({
						{ "entity_id", entityID },
						{ "type", "oligosaccharide" }
					});
					// clang-format on

					int num = 0;
					std::map<ATOM_REF, int> branch_list; // sugar -> its number in the branch list

					for (auto &s : sugarTree)
					{
						// clang-format off
						getCategory("pdbx_entity_branch_list")->emplace({
							{ "entity_id", entityID },
							{ "comp_id", s.c1.resName },
							{ "num", ++num },
							{ "hetero", ci.size() == 1 ? "n" : "y" }
						});
						// clang-format on

						branch_list[s.c1] = num;
					}

					auto &branch_link = *getCategory("pdbx_entity_branch_link");

					for (auto &s : sugarTree)
					{
						// the first sugar in the tree was stored without a leaving oxygen, it has no parent
						if (s.leaving_o == 0)
							continue;

						// clang-format off
						branch_link.emplace({
							{ "link_id", branch_link.size() + 1 },
							{ "entity_id", entityID },
							{ "entity_branch_list_num_1", branch_list[s.c1] },
							{ "comp_id_1", s.c1.resName },
							{ "atom_id_1", s.c1.name },
							{ "leaving_atom_id_1", "O1" },
							{ "entity_branch_list_num_2", branch_list[s.next] },
							{ "comp_id_2", s.next.resName },
							{ "atom_id_2", "O" + std::to_string(s.leaving_o) },
							{ "leaving_atom_id_2", "HO" + std::to_string(s.leaving_o) },
							{ "value_order", "sing" } /// ??
						});
						// clang-format on
					}
				}

				mSugarEntities.insert(entityID);

				// create an asym for this sugar tree

				std::string asymID = cif::cif_id_for_number(asymNr++);

				mAsymID2EntityID[asymID] = entityID;

				// clang-format off
				getCategory("struct_asym")->emplace({
					{ "id", asymID },
					{ "pdbx_blank_PDB_chainid_flag", si->chainID == ' ' ? "Y" : "N" },
					{ "pdbx_modified", "N" },
					{ "entity_id", entityID }
				});
				// clang-format on

				std::string iCode{ si->iCode };
				cif::trim(iCode);
				if (iCode.empty())
					iCode = { '.' };

				int num = 0;
				for (const auto &s : sugarTree)
				{
					// auth_mon_id is the name of this sugar itself; s.next is the
					// sugar it is attached to and is not set for the first sugar
					// clang-format off
					getCategory("pdbx_branch_scheme")->emplace({
						{ "asym_id", asymID },
						{ "entity_id", entityID },
						{ "mon_id", s.c1.resName },
						{ "num", ++num },
						{ "pdb_asym_id", asymID },
						{ "pdb_mon_id", s.c1.resName },
						{ "pdb_seq_num", num },
						{ "auth_asym_id", std::string{ s.c1.chainID } },
						{ "auth_mon_id", s.c1.resName },
						{ "auth_seq_num", s.c1.resSeq },
						{ "hetero", ci.size() == 1 ? "n" : "y" }
					});
					// clang-format on

					auto k = std::make_tuple(s.c1.chainID, s.c1.resSeq, s.c1.iCode);
					assert(mChainSeq2AsymSeq.count(k) == 0);

					mChainSeq2AsymSeq[k] = std::make_tuple(asymID, num, false);

					// mark all hets as part of tree

					for (auto &h : mHets)
					{
						if (h.hetID == s.c1.resName and h.chainID == s.c1.chainID and h.seqNum == s.c1.resSeq and h.iCode == s.c1.iCode)
						{
							h.branch = true;
							break; // should be only one of course... right?
						}
					}
				}

				// one tree per starting sugar, do not try the remaining alternates
				break;
			}

			continue;
		}

		break;
	}

	// remove the branched HET's
	mHets.erase(std::remove_if(mHets.begin(), mHets.end(), [](auto &h)
					{ return h.branch; }),
		mHets.end());
}

/// Translate the HELIX and SHEET records into mmCIF categories.
///
/// HELIX records become struct_conf rows (and a single struct_conf_type row
/// for HELX_P), SHEET records become struct_sheet, struct_sheet_order,
/// struct_sheet_range and pdbx_struct_sheet_hbond rows. A record whose
/// residues cannot be mapped by MapResidue is reported on std::cerr when
/// cif::VERBOSE > 0 and is otherwise skipped.
void PDBFileParser::ParseSecondaryStructure()
{
	bool helixTypeNeeded = true;

	for (; mRec->is("HELIX "); GetNextRecord())
	{
		// Columns of a HELIX record:
		//	 1 -  6   Record name    "HELIX "
		//	 8 - 10   Integer        serNum        Serial number of the helix, starts at 1
		//	                                       and increases incrementally.
		//	12 - 14   LString(3)     helixID       Alphanumeric helix identifier.
		//	16 - 18   Residue name   initResName   Name of the initial residue.
		//	20        Character      initChainID   Chain containing this helix.
		//	22 - 25   Integer        initSeqNum    Sequence number of the initial residue.
		//	26        AChar          initICode     Insertion code of the initial residue.
		//	28 - 30   Residue name   endResName    Name of the terminal residue.
		//	32        Character      endChainID    Chain containing this helix.
		//	34 - 37   Integer        endSeqNum     Sequence number of the terminal residue.
		//	38        AChar          endICode      Insertion code of the terminal residue.
		//	39 - 40   Integer        helixClass    Helix class.
		//	41 - 70   String         comment       Comment about this helix.
		//	72 - 76   Integer        length        Length of this helix.

		std::error_code ec;
		std::string asymFirst, asymLast;
		int seqFirst = 0, seqLast = 0;

		std::tie(asymFirst, seqFirst, std::ignore) = MapResidue(vC(20), vI(22, 25), vC(26), ec);
		if (not ec)
			std::tie(asymLast, seqLast, std::ignore) = MapResidue(vC(32), vI(34, 37), vC(38), ec);

		if (ec)
		{
			if (cif::VERBOSE > 0)
				std::cerr << "Could not map residue for HELIX " << vI(8, 10) << '\n';
			continue;
		}

		auto helices = getCategory("struct_conf");

		// clang-format off
		helices->emplace({
			{ "conf_type_id", "HELX_P" },
			{ "id", "HELX_P" + std::to_string(vI(8, 10)) },
			{ "pdbx_PDB_helix_id", vS(12, 14) },
			{ "beg_label_comp_id", vS(16, 18) },
			{ "beg_label_asym_id", asymFirst },
			{ "beg_label_seq_id", seqFirst },
			{ "pdbx_beg_PDB_ins_code", vS(26, 26) },
			{ "end_label_comp_id", vS(28, 30) },
			{ "end_label_asym_id", asymLast },
			{ "end_label_seq_id", seqLast },
			{ "pdbx_end_PDB_ins_code", vS(38, 38) },

			{ "beg_auth_comp_id", vS(16, 18) },
			{ "beg_auth_asym_id", vS(20, 20) },
			{ "beg_auth_seq_id", vI(22, 25) },
			{ "end_auth_comp_id", vS(28, 30) },
			{ "end_auth_asym_id", vS(32, 32) },
			{ "end_auth_seq_id", vI(34, 37) },

			{ "pdbx_PDB_helix_class", vS(39, 40) },
			{ "details", vS(41, 70) },
			{ "pdbx_PDB_helix_length", vI(72, 76) }
		});
		// clang-format on

		// the helix type is written once, together with the first helix that was accepted
		if (helixTypeNeeded)
		{
			auto helixTypes = getCategory("struct_conf_type");
			helixTypes->emplace({ { "id", "HELX_P" } });
			helixTypeNeeded = false;
		}
	}

	std::set<std::string> knownSheets;
	int rangeID = 1;

	for (; mRec->is("SHEET "); GetNextRecord())
	{
		// Columns of a SHEET record:
		//	 1 -  6   Record name   "SHEET "
		//	 8 - 10   Integer       strand        Strand number, starts at 1 for each strand
		//	                                      within a sheet and increases by one.
		//	12 - 14   LString(3)    sheetID       Sheet identifier.
		//	15 - 16   Integer       numStrands    Number of strands in sheet.
		//	18 - 20   Residue name  initResName   Residue name of initial residue.
		//	22        Character     initChainID   Chain identifier of initial residue in strand.
		//	23 - 26   Integer       initSeqNum    Sequence number of initial residue in strand.
		//	27        AChar         initICode     Insertion code of initial residue in strand.
		//	29 - 31   Residue name  endResName    Residue name of terminal residue.
		//	33        Character     endChainID    Chain identifier of terminal residue.
		//	34 - 37   Integer       endSeqNum     Sequence number of terminal residue.
		//	38        AChar         endICode      Insertion code of terminal residue.
		//	39 - 40   Integer       sense         Sense of strand with respect to previous strand
		//	                                      in the sheet. 0 if first strand, 1 if parallel,
		//	                                      and -1 if anti-parallel.
		//	42 - 45   Atom          curAtom       Registration. Atom name in current strand.
		//	46 - 48   Residue name  curResName    Registration. Residue name in current strand.
		//	50        Character     curChainID    Registration. Chain identifier in current strand.
		//	51 - 54   Integer       curResSeq     Registration. Residue sequence number in
		//	                                      current strand.
		//	55        AChar         curICode      Registration. Insertion code in current strand.
		//	57 - 60   Atom          prevAtom      Registration. Atom name in previous strand.
		//	61 - 63   Residue name  prevResName   Registration. Residue name in previous strand.
		//	65        Character     prevChainID   Registration. Chain identifier in previous strand.
		//	66 - 69   Integer       prevResSeq    Registration. Residue sequence number in
		//	                                      previous strand.
		//	70        AChar         prevICode     Registration. Insertion code in previous strand.

		std::string sheetID = cif::trim_copy(vS(12, 14));

		// first strand of a sheet we have not seen before: restart the range numbering
		if (knownSheets.count(sheetID) == 0)
		{
			knownSheets.insert(sheetID);

			rangeID = 1;

			getCategory("struct_sheet")->emplace({
				{ "id", sheetID },
				{ "number_strands", vI(15, 16) },
			});
		}

		const int sense = vI(39, 40);
		const bool hasPrevious = sense != 0;

		if (hasPrevious)
		{
			// clang-format off
			getCategory("struct_sheet_order")->emplace({
				{ "sheet_id", sheetID },
				{ "range_id_1", rangeID },
				{ "range_id_2", rangeID + 1 },
				{ "sense", sense == -1 ? "anti-parallel" : "parallel" }
			});
			// clang-format on
		}

		std::error_code ec;
		std::string asymFirst, asymLast;
		int seqFirst = 0, seqLast = 0;

		std::tie(asymFirst, seqFirst, std::ignore) = MapResidue(vC(22), vI(23, 26), vC(27), ec);
		if (not ec)
			std::tie(asymLast, seqLast, std::ignore) = MapResidue(vC(33), vI(34, 37), vC(38), ec);

		if (ec)
		{
			if (cif::VERBOSE > 0)
				std::cerr << "Dropping SHEET record " << vI(8, 10) << '\n';
			continue;
		}

		// clang-format off
		getCategory("struct_sheet_range")->emplace({
			{ "sheet_id", sheetID },
			{ "id", vI(8, 10) },
			{ "beg_label_comp_id", vS(18, 20) },
			{ "beg_label_asym_id", asymFirst },
			{ "beg_label_seq_id", seqFirst },
			{ "pdbx_beg_PDB_ins_code", vS(27, 27) },
			{ "end_label_comp_id", vS(29, 31) },
			{ "end_label_asym_id", asymLast },
			{ "end_label_seq_id", seqLast },
			{ "pdbx_end_PDB_ins_code", vS(38, 38) },

			{ "beg_auth_comp_id", vS(18, 20) },
			{ "beg_auth_asym_id", vS(22, 22) },
			{ "beg_auth_seq_id", vI(23, 26) },
			{ "end_auth_comp_id", vS(29, 31) },
			{ "end_auth_asym_id", vS(33, 33) },
			{ "end_auth_seq_id", vI(34, 37) },
		});
		// clang-format on

		// nothing more to do for a strand without a predecessor
		if (not hasPrevious)
			continue;

		// registration info is only looked at when the record is long enough
		if (mRec->mVlen > 34)
		{
			std::string prevAsym, curAsym;
			int prevSeq = 0, curSeq = 0;

			// range 1 is the previous strand, range 2 the current one
			std::tie(prevAsym, prevSeq, std::ignore) = MapResidue(vC(65), vI(66, 69), vC(70), ec);
			if (not ec)
				std::tie(curAsym, curSeq, std::ignore) = MapResidue(vC(50), vI(51, 54), vC(55), ec);

			if (ec)
			{
				if (cif::VERBOSE > 0)
					std::cerr << "skipping unmatched pdbx_struct_sheet_hbond record\n";
			}
			else
			{
				// clang-format off
				getCategory("pdbx_struct_sheet_hbond")->emplace({
					{ "sheet_id", sheetID },
					{ "range_id_1", rangeID },
					{ "range_id_2", rangeID + 1 },
					{ "range_1_label_atom_id", vS(57, 60) },
					{ "range_1_label_comp_id", vS(61, 63) },
					{ "range_1_label_asym_id", prevAsym },
					{ "range_1_label_seq_id", prevSeq },
					{ "range_1_PDB_ins_code", vS(70, 70) },
					{ "range_1_auth_atom_id", vS(57, 60) },
					{ "range_1_auth_comp_id", vS(61, 63) },
					{ "range_1_auth_asym_id", vS(65, 65) },
					{ "range_1_auth_seq_id", vI(66, 69) },

					{ "range_2_label_atom_id", vS(42, 45) },
					{ "range_2_label_comp_id", vS(46, 48) },
					{ "range_2_label_asym_id", curAsym },
					{ "range_2_label_seq_id", curSeq },
					{ "range_2_PDB_ins_code", vS(55, 55) },
					{ "range_2_auth_atom_id", vS(42, 45) },
					{ "range_2_auth_comp_id", vS(46, 48) },
					{ "range_2_auth_asym_id", vS(50, 50) },
					{ "range_2_auth_seq_id", vI(51, 54) }
				});
				// clang-format on
			}
		}

		++rangeID;
	}
}

/// Tell whether atom @a atomID of the compound named @a resName is a metal.
///
/// The compound is obtained from cif::compound_factory. When no compound is
/// returned, or when anything throws while looking up the compound or the
/// atom, the answer is false.
static bool IsMetal(const std::string &resName, const std::string &atomID)
{
	try
	{
		if (auto compound = cif::compound_factory::instance().create(resName); compound != nullptr)
		{
			auto traits = cif::atom_type_traits(compound->get_atom_by_atom_id(atomID).type_symbol);
			return traits.is_metal();
		}
	}
	catch (...)
	{
		// best effort: a failed lookup is treated as 'not a metal'
	}

	return false;
}

/// Translate the SSBOND, LINK/LINKR and CISPEP records into mmCIF categories.
///
/// The three kinds of records are accepted in any order; parsing stops at the
/// first record that is none of them.
///   - SSBOND becomes one struct_conn row of type 'disulf' for every
///     combination of alternate locations found for the two SG atoms.
///   - LINK/LINKR becomes a struct_conn row of type 'metalc' when IsMetal
///     reports a metal for either atom, 'covale' otherwise.
///   - CISPEP becomes a struct_mon_prot_cis row.
/// Each connection type is added to struct_conn_type exactly once, when the
/// first record of that type is seen. Records whose residues cannot be mapped
/// by MapResidue, or whose symmetry operators are rejected by pdb2cifSymmetry,
/// are reported on std::cerr when cif::VERBOSE > 0 and are otherwise dropped.
void PDBFileParser::ParseConnectivtyAnnotation()
{
	int ssBondNr = 0;
	int linkNr = 0;
	bool firstDisulf = true, firstCovale = true, firstMetalc = true;

	// Aaargh... Coot writes the records in the wrong order...
	for (;; GetNextRecord())
	{
		if (mRec->is("SSBOND"))
		{
			// Use a flag here and not ssBondNr: that counter does not advance
			// when an SSBOND record is dropped, which would result in a second
			// 'disulf' row for the next SSBOND record.
			if (firstDisulf)
			{
				getCategory("struct_conn_type")->emplace({
					{ "id", "disulf" },
				});
				firstDisulf = false;
			}

			//	 1 -  6        Record name    "SSBOND"
			//	 8 - 10        Integer        serNum           Serial number.
			//	12 - 14        LString(3)     "CYS"            Residue name.
			//	16             Character      chainID1         Chain identifier.
			//	18 - 21        Integer        seqNum1          Residue sequence number.
			//	22             AChar          icode1           Insertion code.
			//	26 - 28        LString(3)     "CYS"            Residue name.
			//	30             Character      chainID2         Chain identifier.
			//	32 - 35        Integer        seqNum2          Residue sequence number.
			//	36             AChar          icode2           Insertion code.
			//	60 - 65        SymOP          sym1             Symmetry operator for residue 1.
			//	67 - 72        SymOP          sym2             Symmetry operator for residue 2.
			//	74 - 78        Real(5.2)      Length           Disulfide bond distance

			std::string p1Asym, p2Asym;
			int p1Seq = 0, p2Seq = 0;
			std::error_code ec;

			std::tie(p1Asym, p1Seq, std::ignore) = MapResidue(vC(16), vI(18, 21), vC(22), ec);
			if (not ec)
				std::tie(p2Asym, p2Seq, std::ignore) = MapResidue(vC(30), vI(32, 35), vC(36), ec);

			if (ec)
			{
				if (cif::VERBOSE > 0)
					std::cerr << "Dropping SSBOND " << vI(8, 10) << '\n';
				continue;
			}

			std::vector<char> alt1 = altLocsForAtom(vC(16), vI(18, 21), vC(22), "SG");
			std::vector<char> alt2 = altLocsForAtom(vC(30), vI(32, 35), vC(36), "SG");

			// a zero char stands for 'no alternate location', it makes sure
			// the loops below write at least one row
			if (alt1.empty())
				alt1.push_back(0);
			if (alt2.empty())
				alt2.push_back(0);

			std::string sym1, sym2;
			try
			{
				sym1 = pdb2cifSymmetry(vS(60, 65));
				sym2 = pdb2cifSymmetry(vS(67, 72));
			}
			catch (const std::exception &)
			{
				if (cif::VERBOSE > 0)
					std::cerr << "Dropping SSBOND " << vI(8, 10) << " due to invalid symmetry operation\n";
				continue;
			}

			for (auto a1 : alt1)
			{
				for (auto a2 : alt2)
				{
					// clang-format off
					getCategory("struct_conn")->emplace({
						{ "id", "disulf" + std::to_string(++ssBondNr) },
						{ "conn_type_id", "disulf" },

						{ "ptnr1_label_asym_id", p1Asym },
						{ "pdbx_ptnr1_label_alt_id", a1 ? std::string{ a1 } : std::string() },
						{ "ptnr1_label_comp_id", vS(12, 14) },
						{ "ptnr1_label_seq_id", p1Seq ? std::to_string(p1Seq) : "." },
						{ "ptnr1_label_atom_id", "SG" },
						{ "ptnr1_symmetry", sym1 },

						{ "ptnr2_label_asym_id", p2Asym },
						{ "pdbx_ptnr2_label_alt_id", a2 ? std::string{ a2 } : std::string() },
						{ "ptnr2_label_comp_id", vS(26, 28) },
						{ "ptnr2_label_seq_id", p2Seq ? std::to_string(p2Seq) : "." },
						{ "ptnr2_label_atom_id", "SG" },

						{ "ptnr1_auth_asym_id", vS(16, 16) },
						{ "ptnr1_auth_comp_id", vS(12, 14) },
						{ "ptnr1_auth_seq_id", vI(18, 21) },
						{ "ptnr2_auth_asym_id", vS(30, 30) },
						{ "ptnr2_auth_comp_id", vS(26, 28) },
						{ "ptnr2_auth_seq_id", vI(32, 35) },

						{ "ptnr2_symmetry", sym2 },

						{ "pdbx_dist_value", vS(74, 78) },
					});
					// clang-format on
				}
			}

			continue;
		}

		if (mRec->is("LINK  ") or mRec->is("LINKR "))
		{
			if (cif::VERBOSE > 0 and mRec->is("LINKR "))
				std::cerr << "Accepting non-standard LINKR record, but ignoring extra information\n";

			//	 1 -  6         Record name    "LINK  "
			std::string name1 = vS(13, 16);    //	13 - 16         Atom           name1           Atom name.
			                                   //	17              Character      altLoc1         Alternate location indicator.
			std::string resName1 = vS(18, 20); //	18 - 20         Residue name   resName1        Residue  name.
			                                   //	22              Character      chainID1        Chain identifier.
			                                   //	23 - 26         Integer        resSeq1         Residue sequence number.
			                                   //	27              AChar          iCode1          Insertion code.
			std::string name2 = vS(43, 46);    //	43 - 46         Atom           name2           Atom name.
			                                   //	47              Character      altLoc2         Alternate location indicator.
			std::string resName2 = vS(48, 50); //	48 - 50         Residue name   resName2        Residue name.
			                                   //	52              Character      chainID2        Chain identifier.
			                                   //	53 - 56         Integer        resSeq2         Residue sequence number.
			                                   //	57              AChar          iCode2          Insertion code.
			                                   //	60 - 65         SymOP          sym1            Symmetry operator atom 1.
			                                   //	67 - 72         SymOP          sym2            Symmetry operator atom 2.
			                                   //	74 - 78         Real(5.2)      Length          Link distance

			std::string type = "covale";
			if (IsMetal(resName1, name1) or IsMetal(resName2, name2))
				type = "metalc";

			if (type == "covale" and firstCovale)
			{
				getCategory("struct_conn_type")->emplace({
					{ "id", type },
				});
				firstCovale = false;
			}

			if (type == "metalc" and firstMetalc)
			{
				getCategory("struct_conn_type")->emplace({
					{ "id", type },
				});
				firstMetalc = false;
			}

			// the number is taken before the record is validated, numbers of
			// dropped records are therefore not reused
			++linkNr;

			std::string p1Asym, p2Asym;
			int p1Seq = 0, p2Seq = 0;
			bool isResseq1 = false, isResseq2 = false;
			std::error_code ec;

			std::tie(p1Asym, p1Seq, isResseq1) = MapResidue(vC(22), vI(23, 26), vC(27), ec);
			if (not ec)
				std::tie(p2Asym, p2Seq, isResseq2) = MapResidue(vC(52), vI(53, 56), vC(57), ec);

			if (ec)
			{
				if (cif::VERBOSE > 0)
					std::cerr << "Dropping LINK record at line " << mRec->mLineNr << '\n';
				continue;
			}

			std::string distance, ccp4LinkID;

			if (mRec->is("LINK  "))
			{
				distance = vS(74, 78);

				double d;
				auto r = cif::from_chars(distance.data(), distance.data() + distance.length(), d);
				if (static_cast<bool>(r.ec))
				{
					if (cif::VERBOSE > 0)
						std::cerr << "Distance value '" << distance << "' is not a valid float in LINK record\n";
					swap(ccp4LinkID, distance); // assume this is a ccp4_link_id... oh really?
				}
			}
			else                         // LINKR
				ccp4LinkID = vS(74, 78); // the link ID

			std::string sym1, sym2;
			try
			{
				sym1 = pdb2cifSymmetry(vS(60, 65));
				sym2 = pdb2cifSymmetry(vS(67, 72));
			}
			catch (const std::exception &)
			{
				if (cif::VERBOSE > 0)
					std::cerr << "Dropping LINK record at line " << mRec->mLineNr << " due to invalid symmetry operation\n";
				continue;
			}

			// clang-format off
			getCategory("struct_conn")->emplace({
				{ "id", type + std::to_string(linkNr) },
				{ "conn_type_id", type },

				// { "ccp4_link_id", ccp4LinkID },

				{ "ptnr1_label_asym_id", p1Asym },
				{ "ptnr1_label_comp_id", vS(18, 20) },
				{ "ptnr1_label_seq_id", (isResseq1 and p1Seq) ? std::to_string(p1Seq) : "." },
				{ "ptnr1_label_atom_id", vS(13, 16) },
				{ "pdbx_ptnr1_label_alt_id", vS(17, 17) },
				{ "pdbx_ptnr1_PDB_ins_code", vS(27, 27) },
				{ "pdbx_ptnr1_standard_comp_id", "" },
				{ "ptnr1_symmetry", sym1 },

				{ "ptnr2_label_asym_id", p2Asym },
				{ "ptnr2_label_comp_id", vS(48, 50) },
				{ "ptnr2_label_seq_id", (isResseq2 and p2Seq) ? std::to_string(p2Seq) : "." },
				{ "ptnr2_label_atom_id", vS(43, 46) },
				{ "pdbx_ptnr2_label_alt_id", vS(47, 47) },
				{ "pdbx_ptnr2_PDB_ins_code", vS(57, 57) },

				{ "ptnr1_auth_asym_id", vS(22, 22) },
				{ "ptnr1_auth_comp_id", vS(18, 20) },
				{ "ptnr1_auth_seq_id", vI(23, 26) },
				{ "ptnr2_auth_asym_id", vS(52, 52) },
				{ "ptnr2_auth_comp_id", vS(48, 50) },
				{ "ptnr2_auth_seq_id", vI(53, 56) },

				// { "ptnr1_auth_atom_id", vS(13, 16) },
			    // { "ptnr2_auth_atom_id", vS(43, 46) },

				{ "ptnr2_symmetry", sym2 },

				{ "pdbx_dist_value", distance }
			});
			// clang-format on

			continue;
		}

		if (mRec->is("CISPEP"))
		{
			//	 1 -  6       Record name   "CISPEP"
			int serNum = vI(8, 10);           //	 8 - 10       Integer       serNum        Record serial number.
			std::string pep1 = vS(12, 14);    //	12 - 14       LString(3)    pep1          Residue name.
			char chainID1 = vC(16);           //	16            Character     chainID1      Chain identifier.
			int seqNum1 = vI(18, 21);         //	18 - 21       Integer       seqNum1       Residue sequence number.
			char iCode1 = vC(22);             //	22            AChar         icode1        Insertion code.
			std::string pep2 = vS(26, 28);    //	26 - 28       LString(3)    pep2          Residue name.
			char chainID2 = vC(30);           //	30            Character     chainID2      Chain identifier.
			int seqNum2 = vI(32, 35);         //	32 - 35       Integer       seqNum2       Residue sequence number.
			char iCode2 = vC(36);             //	36            AChar         icode2        Insertion code.
			int modNum = vI(44, 46);          //	44 - 46       Integer       modNum        Identifies the specific model.
			std::string measure = vF(54, 59); //	54 - 59       Real(6.2)     measure       Angle measurement in degrees.

			// a model number of zero is written as model 1
			if (modNum == 0)
				modNum = 1;

			std::string lAsym1, lAsym2;
			int lResSeq1 = 0, lResSeq2 = 0;
			std::error_code ec;

			std::tie(lAsym1, lResSeq1, std::ignore) = MapResidue(chainID1, seqNum1, iCode1, ec);
			if (not ec)
				std::tie(lAsym2, lResSeq2, std::ignore) = MapResidue(chainID2, seqNum2, iCode2, ec);

			if (ec)
			{
				if (cif::VERBOSE > 0)
					std::cerr << "Dropping CISPEP record at line " << mRec->mLineNr << '\n';
				continue;
			}

			std::string iCode1str = iCode1 == ' ' ? std::string() : std::string{ iCode1 };
			std::string iCode2str = iCode2 == ' ' ? std::string() : std::string{ iCode2 };

			// clang-format off
			getCategory("struct_mon_prot_cis")->emplace({
				{ "pdbx_id", serNum },
				{ "label_comp_id", pep1 },
				{ "label_seq_id", lResSeq1 },
				{ "label_asym_id", lAsym1 },
				{ "label_alt_id", "." },
				{ "pdbx_PDB_ins_code", iCode1str },
				{ "auth_comp_id", pep1 },
				{ "auth_seq_id", seqNum1 },
				{ "auth_asym_id", std::string{ chainID1 } },
				{ "pdbx_label_comp_id_2", pep2 },
				{ "pdbx_label_seq_id_2", lResSeq2 },
				{ "pdbx_label_asym_id_2", lAsym2 },
				{ "pdbx_PDB_ins_code_2", iCode2str },
				{ "pdbx_auth_comp_id_2", pep2 },
				{ "pdbx_auth_seq_id_2", seqNum2 },
				{ "pdbx_auth_asym_id_2", std::string{ chainID2 } },
				{ "pdbx_PDB_model_num", modNum },
				{ "pdbx_omega_angle", measure }
			});
			// clang-format on

			continue;
		}

		break;
	}
}

/// Translate the SITE records into struct_site_gen rows.
///
/// Each SITE record is scanned for numRes residue slots of eleven columns
/// each, the first one starting at column 19. A slot whose residue name is
/// blank is ignored; a slot whose residue cannot be mapped by MapResidue is
/// reported on std::cerr when cif::VERBOSE > 0 and is skipped. The id of the
/// rows keeps counting over all SITE records.
void PDBFileParser::ParseMiscellaneousFeatures()
{
	int structSiteGenID = 1;

	while (mRec->is("SITE  "))
	{                                    //	 1 -  6        Record name   "SITE  "
		                                 //	 8 - 10        Integer       seqNum        Sequence number.
		std::string siteID = vS(12, 14); //	12 - 14        LString(3)    siteID        Site name.
		int numRes = vI(16, 17);         //	16 - 17        Integer       numRes        Number of residues that compose the site.

		auto cat = getCategory("struct_site_gen");

		// o is the first column of the residue slot that is being read
		for (int i = 0, o = 19; i < numRes; ++i, o += 11)
		{
			std::string resName = vS(o, o + 2); //	19 - 21        Residue name  resName1      Residue name for first residue that
			                                    //	                                           creates the site.

			// numRes is the size of the whole site while a single record has
			// room for only a few residues, the remainder is stored in
			// following SITE records. Slots without a residue name therefore
			// hold nothing and must not be offered to MapResidue.
			// NOTE(review): assumes vS returns an empty string for a blank or
			// missing field - TODO confirm
			if (resName.empty())
				continue;

			char chainID = vC(o + 4);           //	23             Character     chainID1      Chain identifier for first residue of site.
			int seq = vI(o + 5, o + 8);         //	24 - 27        Integer       seq1          Residue sequence number for first residue
			                                    //	                                           of the  site.
			char iCode = vC(o + 9);             //	28             AChar         iCode1        Insertion code for first residue of the site.

			int labelSeq = 0;
			std::string asym;
			bool isResseq = false;
			std::error_code ec;

			std::tie(asym, labelSeq, isResseq) = MapResidue(chainID, seq, iCode, ec);

			if (ec)
			{
				if (cif::VERBOSE > 0)
					std::cerr << "skipping struct_site_gen record\n";
				continue;
			}

			// clang-format off
			cat->emplace({
				{ "id", structSiteGenID++ },
				{ "site_id", siteID },
				{ "pdbx_num_res", numRes },
				{ "label_comp_id", resName },
				{ "label_asym_id", asym },
				{ "label_seq_id", (labelSeq > 0 and isResseq) ? std::to_string(labelSeq) : std::string(".") },
				{ "pdbx_auth_ins_code", iCode == ' ' ? "" : std::string{ iCode } },
				{ "auth_comp_id", resName },
				{ "auth_asym_id", std::string{ chainID } },
				{ "auth_seq_id", seq },
				{ "label_atom_id", "." },
				{ "label_alt_id", "." },
			});
			// clang-format on
		}

		GetNextRecord();
	}
}

/// Translate the CRYST1 record, when present, into a cell and a symmetry row.
///
/// The number of the space group is looked up with get_space_group_number;
/// whatever could not be determined because of an exception is written as an
/// empty value.
void PDBFileParser::ParseCrystallographic()
{
	if (not mRec->is("CRYST1"))
		return;

	Match("CRYST1", true);

	// Columns 56 - 66 (LString sGroup, the space group) are not part of the
	// cell, they are used for the symmetry category below.

	// clang-format off
	getCategory("cell")->emplace({
		{ "entry_id", mStructureID },  //	 1 -  6       Record name   "CRYST1"
		{ "length_a", vF(7, 15) },     //	 7 - 15       Real(9.3)     a              a (Angstroms).
		{ "length_b", vF(16, 24) },    //	16 - 24       Real(9.3)     b              b (Angstroms).
		{ "length_c", vF(25, 33) },    //	25 - 33       Real(9.3)     c              c (Angstroms).
		{ "angle_alpha", vF(34, 40) }, //	34 - 40       Real(7.2)     alpha          alpha (degrees).
		{ "angle_beta", vF(41, 47) },  //	41 - 47       Real(7.2)     beta           beta (degrees).
		{ "angle_gamma", vF(48, 54) }, //	48 - 54       Real(7.2)     gamma          gamma (degrees).
		{ "Z_PDB", vF(67, 70) }        //	67 - 70       Integer       z              Z value.
	});
	// clang-format on

	std::string hmName, tableNumber;
	try
	{
		hmName = vS(56, 66);
		tableNumber = std::to_string(get_space_group_number(hmName));
	}
	catch (...)
	{
		// keep what we have, the number stays empty when the lookup failed
	}

	// clang-format off
	getCategory("symmetry")->emplace({
		{ "entry_id", mStructureID },
		{ "space_group_name_H-M", hmName },
		{ "Int_Tables_number", tableNumber }
	});
	// clang-format on

	GetNextRecord();
}

/// Translate the ORIGXn, SCALEn and MTRIXn records.
///
/// The three ORIGX records become one database_PDB_matrix row, the three
/// SCALE records one atom_sites row and each group of three MTRIX records one
/// struct_ncs_oper row. Within a group the records are required, by means of
/// Match, to appear as n = 1, 2, 3.
void PDBFileParser::ParseCoordinateTransformation()
{
	std::string m[3][3], v[3];

	// Copy the matrix row and the vector element of the current record:
	//	11 - 20   Real(10.6)   first element of the row
	//	21 - 30   Real(10.6)   second element of the row
	//	31 - 40   Real(10.6)   third element of the row
	//	46 - 55   Real(10.5)   element of the vector
	auto readRow = [&](int row)
	{
		m[row][0] = vF(11, 20);
		m[row][1] = vF(21, 30);
		m[row][2] = vF(31, 40);
		v[row] = vF(46, 55);
	};

	if (cif::starts_with(mRec->mName, "ORIGX"))
	{
		for (int row = 0; row < 3; ++row)
		{
			Match("ORIGX" + std::to_string(row + 1), true); //	 1 -  6   Record name   "ORIGXn"   n=1, 2, or 3
			readRow(row);
			GetNextRecord();
		}

		// clang-format off
		getCategory("database_PDB_matrix")->emplace({
			{ "entry_id", mStructureID },
			{ "origx[1][1]", m[0][0] },
			{ "origx[1][2]", m[0][1] },
			{ "origx[1][3]", m[0][2] },
			{ "origx[2][1]", m[1][0] },
			{ "origx[2][2]", m[1][1] },
			{ "origx[2][3]", m[1][2] },
			{ "origx[3][1]", m[2][0] },
			{ "origx[3][2]", m[2][1] },
			{ "origx[3][3]", m[2][2] },
			{ "origx_vector[1]", v[0] },
			{ "origx_vector[2]", v[1] },
			{ "origx_vector[3]", v[2] },
		});
		// clang-format on
	}

	if (cif::starts_with(mRec->mName, "SCALE"))
	{
		for (int row = 0; row < 3; ++row)
		{
			Match("SCALE" + std::to_string(row + 1), true); //	 1 -  6   Record name   "SCALEn"   n=1, 2, or 3
			readRow(row);
			GetNextRecord();
		}

		// clang-format off
		getCategory("atom_sites")->emplace({
			{ "entry_id", mStructureID },
			{ "fract_transf_matrix[1][1]", m[0][0] },
			{ "fract_transf_matrix[1][2]", m[0][1] },
			{ "fract_transf_matrix[1][3]", m[0][2] },
			{ "fract_transf_matrix[2][1]", m[1][0] },
			{ "fract_transf_matrix[2][2]", m[1][1] },
			{ "fract_transf_matrix[2][3]", m[1][2] },
			{ "fract_transf_matrix[3][1]", m[2][0] },
			{ "fract_transf_matrix[3][2]", m[2][1] },
			{ "fract_transf_matrix[3][3]", m[2][2] },
			{ "fract_transf_vector[1]", v[0] },
			{ "fract_transf_vector[2]", v[1] },
			{ "fract_transf_vector[3]", v[2] },
		});
		// clang-format on
	}

	while (cif::starts_with(mRec->mName, "MTRIX1"))
	{
		// both are overwritten by each of the three records, the values of
		// the last one end up in the row
		int serial = 0;
		bool given = false;

		for (int row = 0; row < 3; ++row)
		{
			Match("MTRIX" + std::to_string(row + 1), true); //	 1 -  6   Record name   "MTRIXn"   n=1, 2, or 3
			serial = vI(8, 10);                             //	 8 - 10   Integer       serial     Serial number.
			readRow(row);

			//	60   Integer   iGiven   1 if coordinates for the representations which
			//	                        are approximately related by the transformations
			//	                        of the molecule are contained in the datablock.
			//	                        Otherwise, blank.
			given = vC(60) == '1';

			GetNextRecord();
		}

		// clang-format off
		getCategory("struct_ncs_oper")->emplace({
			{ "id", serial },
			{ "matrix[1][1]", m[0][0] },
			{ "matrix[1][2]", m[0][1] },
			{ "matrix[1][3]", m[0][2] },
			{ "matrix[2][1]", m[1][0] },
			{ "matrix[2][2]", m[1][1] },
			{ "matrix[2][3]", m[1][2] },
			{ "matrix[3][1]", m[2][0] },
			{ "matrix[3][2]", m[2][1] },
			{ "matrix[3][3]", m[2][2] },
			{ "vector[1]", v[0] },
			{ "vector[2]", v[1] },
			{ "vector[3]", v[2] },
			{ "code", given ? "given" : "" }
		});
		// clang-format on
	}
}

void PDBFileParser::ParseCoordinate(int modelNr)
{
	// oh oh, we have to sort our atom_site records by ascending asym_id
	// This routine used to be so trivial...

	typedef std::tuple<std::string, int, bool, PDBRecord *, PDBRecord *> atomRec;

	std::vector<atomRec> atoms;
	while (mRec->is("ATOM  ") or mRec->is("HETATM")) //	 1 -  6        Record name   "ATOM  "
	{
		char chainID = vC(22);   //	22             Character     chainID      Chain identifier.
		int resSeq = vI(23, 26); //	23 - 26        Integer       resSeq       Residue sequence number.
		char iCode = vC(27);

		std::string asymID;
		int seqID;
		bool isResseq;

		std::tie(asymID, seqID, isResseq) = MapResidue(chainID, resSeq, iCode);

		PDBRecord *atom = mRec;
		PDBRecord *anisou = nullptr;

		GetNextRecord();
		if (mRec->is("ANISOU"))
		{
			anisou = mRec;
			GetNextRecord();
		}

		atoms.emplace_back(asymID, seqID, isResseq, atom, anisou);

		/*if?... */ while (mRec->is("TER   "))
		{
			Match("TER   ", true);
			GetNextRecord();
		}
	}

	auto last = mRec;

	// use stable sort here
	auto rLess = [](const atomRec &a, const atomRec &b) -> bool
	{
		int d;

		std::string chainA = std::get<0>(a);
		std::string chainB = std::get<0>(b);

		if (chainA.length() != chainB.length())
			d = static_cast<int>(chainA.length() - chainB.length());
		else
			d = std::get<0>(a).compare(std::get<0>(b));

		if (d == 0)
			d = std::get<1>(a) - std::get<1>(b);
		return d < 0;
	};

	stable_sort(atoms.begin(), atoms.end(), rLess);

	// now reiterate the atoms to reorder alternates
	for (std::size_t i = 0; i + 1 < atoms.size(); ++i)
	{
		char altLoc = std::get<3>(atoms[i])->vC(17);

		if (altLoc == ' ' or altLoc == 0)
			continue;

		auto b = atoms.begin() + i;
		auto e = b;

		std::map<std::string, int> atomIndex; // index number of first occurrence of a atom name

		while (e != atoms.end() and rLess(*b, *e) == false)
		{
			std::string name = std::get<3>(*e)->vS(13, 16);

			if (atomIndex.count(name) == 0)
				atomIndex[name] = static_cast<int>(atomIndex.size() + 1);

			++e;
		}

		auto aLess = [&](atomRec &a, atomRec &b) -> bool
		{
			std::string na = std::get<3>(a)->vS(13, 16);
			std::string nb = std::get<3>(b)->vS(13, 16);

			int d = atomIndex[na] - atomIndex[nb];
			if (d == 0)
				d = std::get<3>(a)->vC(17) - std::get<3>(b)->vC(17);
			assert(d != 0);
			return d < 0;
		};

		sort(b, e, aLess);

		i += distance(b, e) - 1;
	}

	//	while (mRec->is("ATOM  ") or mRec->is("HETATM"))		//	 1 -  6        Record name   "ATOM  "
	for (auto &a : atoms)
	{
		std::string asymID;
		int seqID;
		bool isResseq;
		PDBRecord *atom;
		PDBRecord *anisou;
		std::tie(asymID, seqID, isResseq, atom, anisou) = a;

		mRec = atom;

		++mAtomID;

		std::string groupPDB = mRec->is("ATOM  ") ? "ATOM" : "HETATM";
		//		int serial = vI(7, 11);				//	 7 - 11        Integer       serial       Atom  serial number.
		std::string name = vS(13, 16);       //	13 - 16        Atom          name         Atom name.
		char altLoc = vC(17);                //	17             Character     altLoc       Alternate location indicator.
		std::string resName = vS(18, 20);    //	18 - 20        Residue name  resName      Residue name.
		char chainID = vC(22);               //	22             Character     chainID      Chain identifier.
		int resSeq = vI(23, 26);             //	23 - 26        Integer       resSeq       Residue sequence number.
		char iCode = vC(27);                 //	27             AChar         iCode        Code for insertion of residues.
		std::string x = vF(31, 38);          //	31 - 38        Real(8.3)     x            Orthogonal coordinates for X in Angstroms.
		std::string y = vF(39, 46);          //	39 - 46        Real(8.3)     y            Orthogonal coordinates for Y in Angstroms.
		std::string z = vF(47, 54);          //	47 - 54        Real(8.3)     z            Orthogonal coordinates for Z in Angstroms.
		std::string occupancy = vF(55, 60);  //	55 - 60        Real(6.2)     occupancy    Occupancy.
		std::string tempFactor = vF(61, 66); //	61 - 66        Real(6.2)     tempFactor   Temperature  factor.
		std::string element = vS(77, 78);    //	77 - 78        LString(2)    element      Element symbol, right-justified.
		std::string charge = vS(79, 80);     //	79 - 80        LString(2)    charge       Charge  on the atom.

		if (element.empty())
			throw std::runtime_error("Empty element column in PDB file at line " + std::to_string(mRec->mLineNr));

		std::string entityID = mAsymID2EntityID[asymID];

		charge = pdb2cifCharge(charge);

		// if (cif::compound_factory::instance().is_known_peptide(resName) or cif::compound_factory::instance().is_known_base(resName))
		if (resName == "UNK" or cif::compound_factory::kAAMap.count(resName) or cif::compound_factory::kBaseMap.count(resName))
		{
			if (groupPDB == "HETATM")
			{
				if (cif::VERBOSE > 0)
					std::cerr << "Changing atom from HETATM to ATOM at line " << mRec->mLineNr << '\n';
				groupPDB = "ATOM";
			}
		}
		else
		{
			if (groupPDB == "ATOM")
			{
				if (cif::VERBOSE > 0)
					std::cerr << "Changing atom from ATOM to HETATM at line " << mRec->mLineNr << '\n';
				groupPDB = "HETATM";
			}
		}

		// if the atom is part of a sugar, we need to replace the auth_seq_id/resSeq
		if (mSugarEntities.count(entityID))
		{
			using namespace cif::literals;

			auto &branch_scheme = *getCategory("pdbx_branch_scheme");
			resSeq = branch_scheme.find1<int>("asym_id"_key == asymID and "auth_seq_num"_key == resSeq, "pdb_seq_num");
		}

		// clang-format off
		getCategory("atom_site")->emplace({
			{ "group_PDB", groupPDB },
			{ "id", mAtomID },
			{ "type_symbol", element },
			{ "label_atom_id", name },
			{ "label_alt_id", altLoc != ' ' ? std::string{ altLoc } : "." },
			{ "label_comp_id", resName },
			{ "label_asym_id", asymID },
			{ "label_entity_id", entityID },
			{ "label_seq_id", (isResseq and seqID > 0) ? std::to_string(seqID) : "." },
			{ "pdbx_PDB_ins_code", iCode == ' ' ? "" : std::string{ iCode } },
			{ "Cartn_x", x },
			{ "Cartn_y", y },
			{ "Cartn_z", z },
			{ "occupancy", occupancy },
			{ "B_iso_or_equiv", tempFactor },
			{ "pdbx_formal_charge", charge },
			{ "auth_seq_id", resSeq },
			{ "auth_comp_id", resName },
			{ "auth_asym_id", std::string{ chainID } },
			{ "auth_atom_id", name },
			{ "pdbx_PDB_model_num", modelNr }
		});
		// clang-format on

		InsertAtomType(element);

		std::string check = vS(7, 11) + vS(77, 80);

		if (anisou != nullptr)
		{
			mRec = anisou;        //	 1 - 6        Record name   "ANISOU"
			int u11 = vI(29, 35); //	29 - 35       Integer       u[0][0]        U(1,1)
			int u22 = vI(36, 42); //	36 - 42       Integer       u[1][1]        U(2,2)
			int u33 = vI(43, 49); //	43 - 49       Integer       u[2][2]        U(3,3)
			int u12 = vI(50, 56); //	50 - 56       Integer       u[0][1]        U(1,2)
			int u13 = vI(57, 63); //	57 - 63       Integer       u[0][2]        U(1,3)
			int u23 = vI(64, 70); //	64 - 70       Integer       u[1][2]        U(2,3)

			if (vS(7, 11) + vS(77, 80) != check)
				throw std::runtime_error("ANISOU record should follow corresponding ATOM record");

			auto f = [](float f) -> std::string
			{
				return cif::format("{:6.4f}", f);
			};

			// clang-format off
			getCategory("atom_site_anisotrop")->emplace({
				{ "id", mAtomID },
				{ "type_symbol", element },
				{ "pdbx_label_atom_id", name },
				{ "pdbx_label_alt_id", altLoc != ' ' ? std::string{ altLoc } : "." },
				{ "pdbx_label_comp_id", resName },
				{ "pdbx_label_asym_id", asymID },
				{ "pdbx_label_seq_id", (isResseq and seqID > 0) ? std::to_string(seqID) : "." },
				{ "U[1][1]", f(u11 / 10000.f) },
				{ "U[2][2]", f(u22 / 10000.f) },
				{ "U[3][3]", f(u33 / 10000.f) },
				{ "U[1][2]", f(u12 / 10000.f) },
				{ "U[1][3]", f(u13 / 10000.f) },
				{ "U[2][3]", f(u23 / 10000.f) },
				{ "pdbx_auth_seq_id", resSeq },
				{ "pdbx_auth_comp_id", resName },
				{ "pdbx_auth_asym_id", std::string{ chainID } },
				{ "pdbx_auth_atom_id", name }
			});
			// clang-format on
		}
	}

	mRec = last;
}

// Steps over all consecutive CONECT records without interpreting them,
// leaving mRec at the first record that is not a CONECT record.
void PDBFileParser::ParseConnectivty()
{
	for (;;)
	{
		if (not mRec->is("CONECT"))
			break;

		GetNextRecord();
	}
}

// Handles the trailing bookkeeping records: a MASTER record is matched and
// consumed when present, after which the current record is matched against END.
void PDBFileParser::ParseBookkeeping()
{
	const bool hasMaster = mRec->is("MASTER");

	if (hasMaster)
	{
		Match("MASTER", false);
		GetNextRecord();
	}

	Match("END   ", false);
}

// Parses the PDB formatted data in is, section by section, into mDatablock
// and appends that datablock to result.
//
// Any std::exception raised while parsing is reported on std::cerr (unless
// cif::VERBOSE is negative) together with the line number of the current
// record, and is then rethrown.
void PDBFileParser::Parse(std::istream &is, cif::file &result)
{
	try
	{
		// load the dictionary when the datablock has no validator yet
		if (mDatablock.get_validator() == nullptr)
			mDatablock.load_dictionary();

		PreParseInput(is);

		// start at the first record
		mRec = mData;

		ParseTitle();

		ParseRemarks();
		ParsePrimaryStructure();
		ParseHeterogen();

		ConstructEntities();

		ParseRemark350();

		ParseSecondaryStructure();
		ParseConnectivtyAnnotation();
		ParseMiscellaneousFeatures();
		ParseCrystallographic();
		ParseCoordinateTransformation();

		uint32_t modelNr = 1;
		bool hasAtoms = false;

		// one pass per MODEL ... ENDMDL section, or per bare run of atoms
		for (;;)
		{
			const bool inModel = mRec->is("MODEL ");

			if (not(inModel or mRec->is("ATOM  ") or mRec->is("HETATM")))
				break;

			if (inModel)
			{
				modelNr = vI(11, 14);

				GetNextRecord();
			}

			if (not hasAtoms)
				hasAtoms = mRec->is("ATOM  ") or mRec->is("HETATM");

			ParseCoordinate(modelNr);

			if (inModel)
			{
				Match("ENDMDL", true);
				GetNextRecord();
			}
		}

		if (not hasAtoms)
			throw std::runtime_error("Either the PDB file has no atom records, or the field " + std::string(mRec->mName) + " is not at the correct location");

		for (auto symbol : mAtomTypes)
			getCategory("atom_type")->emplace({ { "symbol", symbol } });

		// in V5, atom_type is sorted
		getCategory("atom_type")->reorder_by_index();

		ParseConnectivty();
		ParseBookkeeping();

		// almost done, now fix some outstanding issues that could not be done before

		try
		{
			auto remark3 = FindRecord("REMARK   3");

			if (remark3 != nullptr and Remark3Parser::parse(mExpMethod, remark3, mDatablock))
			{
				// make sure the "exptl" category is created
				auto exptl = getCategory("exptl");
				if (exptl->empty())
				{
					exptl->emplace({ { "entry_id", mStructureID },
						{ "method", mExpMethod },
						{ "crystals_number", mRemark200["NUMBER OF CRYSTALS USED"] } });
				}
			}
		}
		catch (const std::exception &)
		{
			if (cif::VERBOSE >= 0)
				std::cerr << "Error parsing REMARK 3\n";
			throw;
		}
		//
		//		auto cat = getCategory("pdbx_refine_tls_group");
		//		for (Row r: *cat)
		//		{
		//			// add the mapped locations
		//
		//			try
		//			{
		//				std::string asymID;
		//				int resNum;
		//
		//				cif::tie(asymID, resNum) = r.get("beg_auth_asym_id", "beg_auth_seq_id");
		//
		//				r["beg_label_asym_id"] = asymID;
		//				r["beg_label_seq_id"] = resNum;
		//
		//				cif::tie(asymID, resNum) = r.get("end_auth_asym_id", "end_auth_seq_id");
		//
		//				r["end_label_asym_id"] = asymID;
		//				r["end_label_seq_id"] = resNum;
		//			}
		//			catch (const std::exception& ex)
		//			{
		//				continue;
		//			}
		//		}

		using namespace cif::literals;

		auto &atom_site = *getCategory("atom_site");

		// Fill in the distance for every struct_conn row that has none (zero or
		// null). A value of 1.0 is stored when it cannot be calculated.
		for (auto r : getCategory("struct_conn")->find("pdbx_dist_value"_key == 0 or "pdbx_dist_value"_key == cif::null))
		{
			const auto &[asym1, seq1, atom1, symm1, asym2, seq2, atom2, symm2] = r.get<std::string, std::string, std::string, std::string, std::string, std::string, std::string, std::string>(
				"ptnr1_label_asym_id", "ptnr1_label_seq_id", "ptnr1_label_atom_id", "ptnr1_symmetry",
				"ptnr2_label_asym_id", "ptnr2_label_seq_id", "ptnr2_label_atom_id", "ptnr2_symmetry");

			float dist = 1.0f;

			try
			{
				auto a1 = atom_site.find1("label_asym_id"_key == asym1 and "label_seq_id"_key == seq1 and "label_atom_id"_key == atom1);
				auto a2 = atom_site.find1("label_asym_id"_key == asym2 and "label_seq_id"_key == seq2 and "label_atom_id"_key == atom2);

				if (not a1 or not a2)
					throw std::runtime_error("cannot find atom");

				const auto &[x1, y1, z1] = a1.get<float, float, float>("cartn_x", "cartn_y", "cartn_z");
				const auto &[x2, y2, z2] = a2.get<float, float, float>("cartn_x", "cartn_y", "cartn_z");

				// only atoms without a symmetry operation (or the identity 1_555) can be used as is
				const bool identity1 = symm1.empty() or symm1 == "1_555";
				const bool identity2 = symm2.empty() or symm2 == "1_555";

				if (identity1 and identity2)
				{
					dist = std::sqrt(
						(x1 - x2) * (x1 - x2) +
						(y1 - y2) * (y1 - y2) +
						(z1 - z2) * (z1 - z2));
				}
				else if (cif::VERBOSE > 0)
					std::cerr << "Cannot calculate distance for link since one of the atoms is in another dimension\n";
			}
			catch (std::exception &ex)
			{
				if (cif::VERBOSE > 0)
					std::cerr << "Error finding atom for LINK distance calculation: " << ex.what() << '\n';
			}

			r["pdbx_dist_value"] = dist;
		}

		result.emplace_back(std::move(mDatablock));
	}
	catch (const std::exception &)
	{
		if (cif::VERBOSE >= 0)
		{
			std::cerr << "Error parsing PDB";
			if (mRec != nullptr)
				std::cerr << " at line " << mRec->mLineNr;
			std::cerr << '\n';
		}
		throw;
	}
}

// ----------------------------------------------------------------
// A BLAST-like alignment. Returns the index of the last aligned residue.

// matrix is m x n, addressing i,j is 0 <= i < m and 0 <= j < n
// element m i,j is mapped to [i * n + j] and thus storage is row major

// Minimal dense m x n matrix with row-major storage, element (i, j) lives at
// offset i * n + j.
//
// The storage is owned by a std::vector, and all size and offset arithmetic is
// done in std::size_t so that large dimensions cannot wrap around in 32 bits.
// Objects are neither default constructible nor copyable.
//
// Note: not usable with T = bool since std::vector<bool> cannot hand out a
// bool reference.
template <typename T>
class matrix
{
  public:
	using value_type = T;

	matrix() = delete;
	matrix(const matrix &) = delete;
	matrix &operator=(const matrix &) = delete;

	// Create a matrix of m rows and n columns with every element set to v
	matrix(uint32_t m, uint32_t n, T v = T())
		: m_m(m)
		, m_n(n)
		, m_data(static_cast<std::size_t>(m) * n, v)
	{
	}

	// Number of rows
	uint32_t dim_m() const { return m_m; }
	// Number of columns
	uint32_t dim_n() const { return m_n; }

	// Value of element (i, j), requires i < dim_m() and j < dim_n()
	value_type operator()(uint32_t i, uint32_t j) const
	{
		return m_data[offset(i, j)];
	}

	// Reference to element (i, j), requires i < dim_m() and j < dim_n()
	value_type &operator()(uint32_t i, uint32_t j)
	{
		return m_data[offset(i, j)];
	}

  private:
	// Row-major offset of element (i, j); the indices are checked by assert only
	std::size_t offset(uint32_t i, uint32_t j) const
	{
		assert(i < m_m);
		assert(j < m_n);
		return static_cast<std::size_t>(i) * m_n + j;
	}

	uint32_t m_m, m_n;
	std::vector<value_type> m_data;
};

// Aligns the residues seen in the ATOM records (mResiduesSeen) against the
// residues of the SEQRES records (mSeqres) of this chain and copies the
// sequence number and insertion code of each aligned ATOM residue onto its
// SEQRES counterpart. SEQRES residues that were not aligned get a number
// derived from a numbered neighbour. When an aligned pair disagrees on the
// monomer id, a warning is written and the SEQRES monomer id is overwritten.
//
// Returns the mResiduesSeen index of the highest scoring cell of the
// alignment matrix.
//
// Throws std::runtime_error when mSeqres or mResiduesSeen is empty, when the
// traceback reaches a residue from the ATOM records that has no SEQRES
// counterpart, or when no sequence number could be assigned.
int PDBFileParser::PDBChain::AlignResToSeqRes()
{
	// Use dynamic programming to align the found residues (in ATOM records) against
	// the residues in the SEQRES records in order to obtain the residue numbering.
	// sigh...

	auto &rx = mSeqres;
	auto &ry = mResiduesSeen;

	int dimX = static_cast<int>(mSeqres.size());
	if (dimX == 0)
		throw std::runtime_error(std::string("SEQRES for chain ") + mDbref.chainID + " is empty");

	int dimY = static_cast<int>(mResiduesSeen.size());
	if (dimY == 0)
		throw std::runtime_error(std::string("Number of residues in ATOM records for chain ") + mDbref.chainID + " is zero");

	// B holds the best score per cell, Ix and Iy the scores used for a gap
	// continuing along x respectively y, tb the traceback direction:
	// 0 = both residues aligned, 1 = step back in x only, -1 = step back in y only
	matrix<float> B(dimX, dimY), Ix(dimX, dimY), Iy(dimX, dimY);
	matrix<int8_t> tb(dimX, dimY);

	int x, y;

	const float
		kMatchReward = 5,
		kMismatchCost = -10,
		kGapOpen = 10, gapExtend = 0.1f;

	// highest score found so far and the cell it was found in
	float high = 0;
	int highX = 0, highY = 0;

	for (x = 0; x < dimX; ++x)
	{
		for (y = 0; y < dimY; ++y)
		{
			auto &a = rx[x];
			auto &b = ry[y];

			float Ix1 = x > 0 ? Ix(x - 1, y) : 0;
			float Iy1 = y > 0 ? Iy(x, y - 1) : 0;

			// score for alignment
			float M;
			if (a.mMonID == b.mMonID)
				M = kMatchReward;
			else
				M = kMismatchCost;

			// gap open cost is zero if the PDB ATOM records indicate that a gap
			// should be here.
			float gapOpen = kGapOpen;
			if (y == 0 or (y + 1 < dimY and ry[y + 1].mSeqNum > ry[y].mSeqNum + 1))
				gapOpen = 0;

			if (x > 0 and y > 0)
				M += B(x - 1, y - 1);

			// s is the score stored in B for this cell
			float s;
			if (M >= Ix1 and M >= Iy1)
			{
				tb(x, y) = 0;
				B(x, y) = s = M;

				// no gap open cost is charged in the last row/column
				Ix(x, y) = M - (x < dimX - 1 ? gapOpen : 0);
				Iy(x, y) = M - (y < dimY - 1 ? gapOpen : 0);
			}
			else if (Ix1 >= Iy1)
			{
				tb(x, y) = 1;
				B(x, y) = s = Ix1;

				Ix(x, y) = Ix1 - gapExtend;
				Iy(x, y) = M - (y < dimY - 1 ? gapOpen : 0);
				if (Iy(x, y) < Iy1 - gapExtend)
					Iy(x, y) = Iy1 - gapExtend;
			}
			else
			{
				tb(x, y) = -1;
				B(x, y) = s = Iy1;

				Ix(x, y) = M - (x < dimX - 1 ? gapOpen : 0);
				if (Ix(x, y) < Ix1 - gapExtend)
					Ix(x, y) = Ix1 - gapExtend;
				Iy(x, y) = Iy1 - gapExtend;
			}

			// strictly greater, so the first cell reaching the highest score wins
			if (/*(x == dimX - 1 or y == dimY - 1) and */ high < s)
			{
				high = s;
				highX = x;
				highY = y;
			}
		}
	}

	// marker for SEQRES residues that have no sequence number assigned (yet)
	const int kFlagSeqNr = std::numeric_limits<int>::min();

	// reset positions of seqres
	for (auto &sr : rx)
	{
		sr.mSeqNum = kFlagSeqNr;
		sr.mIcode = ' ';
	}

	// assign numbers
	x = highX;
	y = highY;

	// Writes the alignment, as found by following tb back from the highest
	// scoring cell, to std::cerr; "..." marks the side that has no residue.
	// C++ is getting closer to Pascal :-)
	auto printAlignment = [&tb, highX, highY, &rx, &ry, this]()
	{
		std::cerr << std::string(22, '-') << '\n'
				  << "Alignment for chain " << mDbref.chainID << '\n'
				  << '\n';
		std::vector<std::pair<std::string, std::string>> alignment;

		int x = highX;
		int y = highY;

		for (x = highX, y = highY; x >= 0 and y >= 0;)
		{
			switch (tb(x, y))
			{
				case -1:
					alignment.push_back(make_pair("...", ry[y].mMonID));
					--y;
					break;

				case 1:
					alignment.push_back(make_pair(rx[x].mMonID, "..."));
					--x;
					break;

				case 0:
					alignment.push_back(make_pair(rx[x].mMonID, ry[y].mMonID));
					--x;
					--y;
					break;
			}
		}

		// whatever is left on either side is unaligned
		while (x >= 0)
		{
			alignment.push_back(make_pair(rx[x].mMonID, "..."));
			--x;
		}

		while (y >= 0)
		{
			alignment.push_back(make_pair("...", ry[y].mMonID));
			--y;
		}

		// the pairs were collected back to front
		reverse(alignment.begin(), alignment.end());
		for (auto a : alignment)
			std::cerr << "  " << a.first << " -- " << a.second << '\n';

		std::cerr << '\n';
	};

	if (cif::VERBOSE > 1)
		printAlignment();

	try
	{
		// follow the traceback from the highest scoring cell to the start
		while (x >= 0 and y >= 0)
		{
			switch (tb(x, y))
			{
				case -1:
					throw std::runtime_error("A residue found in the ATOM records (" + ry[y].mMonID +
											 " @ " + std::string{ mDbref.chainID } + ":" + std::to_string(ry[y].mSeqNum) +
											 ((ry[y].mIcode == ' ' or ry[y].mIcode == 0) ? "" : std::string{ ry[y].mIcode }) +
											 ") was not found in the SEQRES records");
					break;

				case 1:
					if (cif::VERBOSE > 3)
						std::cerr << "Missing residue in ATOM records: " << rx[x].mMonID << " at " << rx[x].mSeqNum << '\n';

					--x;
					break;

				case 0:
					if (rx[x].mMonID != ry[y].mMonID)
					{
						std::cerr << "Warning, unaligned residues at " << x << "/" << y << "(" << rx[x].mMonID << '/' << ry[y].mMonID << ") SEQRES does not agree with ATOM records\n";
						rx[x].mMonID = ry[y].mMonID;
					}

					rx[x].mSeqNum = ry[y].mSeqNum;
					rx[x].mIcode = ry[y].mIcode;

					--x;
					--y;
			}
		}
	}
	catch (const std::exception &ex)
	{
		// at higher verbosity levels the alignment was already printed above
		if (cif::VERBOSE == 1)
			printAlignment();

		throw;
	}

	// assign numbers to the residues that don't have them yet
	// Going forward, a residue following a numbered one gets the next number;
	// the others are pushed on a stack to be numbered backwards afterwards.
	std::stack<int> unnumbered;
	for (x = 0; x < dimX; ++x)
	{
		if (rx[x].mSeqNum == kFlagSeqNr)
		{
			if (x > 0 and rx[x - 1].mSeqNum != kFlagSeqNr)
				rx[x].mSeqNum = rx[x - 1].mSeqNum + 1;
			else
				unnumbered.push(x);
		}
	}

	// highest index first, each residue gets the number of its successor minus one
	while (unnumbered.empty() == false)
	{
		x = unnumbered.top();
		if (x >= dimX - 1)
			throw std::runtime_error("Could not assign sequence numbers");
		rx[x].mSeqNum = rx[x + 1].mSeqNum - 1;
		unnumbered.pop();
	}

	return highY;
}

// Returns true when this chain and rhs have the same number of SEQRES
// residues and the monomer ids are equal at every position.
bool PDBFileParser::PDBChain::SameSequence(const PDBChain &rhs) const
{
	const std::size_t count = mSeqres.size();

	if (count != rhs.mSeqres.size())
		return false;

	for (std::size_t ix = 0; ix < count; ++ix)
	{
		if (not(mSeqres[ix].mMonID == rhs.mSeqres[ix].mMonID))
			return false;
	}

	return true;
}

// --------------------------------------------------------------------

void read_pdb_file(std::istream &pdbFile, cif::file &cifFile)
{
	PDBFileParser p;

	p.Parse(pdbFile, cifFile);

	if (cifFile.empty())
	{
		if (VERBOSE >= 0)
			std::cerr << "PDB is empty!\n";
	}
	else
	{
		cifFile.front().load_dictionary();
		if (cifFile.front().get_validator() == nullptr)
			cifFile.front().set_validator(&validator_factory::instance().get("mmcif_pdbx.dic"));

		if (not cifFile.is_valid() and cif::VERBOSE >= 0)
			std::cerr << "Resulting mmCIF file is not valid!\n";
	}
}

// --------------------------------------------------------------------

// Reads a structure from is, which may contain either PDB or mmCIF formatted
// data; the format is guessed from the first character of the stream.
//
// PDB data is converted with read_pdb_file followed by fixup_pdbx. mmCIF data
// is loaded as is and passed to reconstruct_pdbx when it has no audit_conform
// category or when no cif::mm::structure can be constructed from it.
//
// Returns the resulting file, which is empty when the stream has no buffer.
// When loading as mmCIF fails, a std::runtime_error is thrown with the
// original exception nested in it.
file read(std::istream &is)
{
	file result;

	if (auto *buffer = is.rdbuf(); buffer != nullptr)
	{
		// Peek at the first character. The value is kept as int_type since
		// std::isalpha and std::toupper require an argument that is either
		// EOF or representable as unsigned char; passing a plain char is
		// undefined for bytes >= 0x80 when char is signed.
		const auto first = buffer->sgetc();
		const bool atEnd = std::char_traits<char>::eq_int_type(first, std::char_traits<char>::eof());

		// All PDB files should always start with a HEADER line
		// and so the very first character in a valid PDB file
		// is 'H'. It is as simple as that.

		// Well, not quite, unfortunately... People insisted that
		// having only ATOM records also makes up a valid PDB file...
		// Since mmCIF files cannot validly start with a letter character
		// apart from the letter 'd', the test has changed into the following:

		if (not atEnd and std::isalpha(first) and std::toupper(first) != 'D')
		{
			read_pdb_file(is, result);
			fixup_pdbx(result);
		}
		else
		{
			try
			{
				result.load(is);
			}
			catch (const std::exception &)
			{
				std::throw_with_nested(std::runtime_error("Since the file did not start with a valid PDB HEADER line mmCIF was assumed, but that failed."));
			}

			if (not(result.empty() or result.front().empty()))
			{
				if (auto &db = result.front(); db.get("audit_conform") == nullptr)
					reconstruct_pdbx(result);
				else
				{
					try
					{
						// Try to see if we can create an mm::structure out of this data.
						// If that fails, we need to reconstruct a PDBx file out of it.

						cif::mm::structure s(result);
					}
					catch (const std::exception &)
					{
						reconstruct_pdbx(result);
					}
				}
			}
		}
	}

	// Must be a PDB like file, right?
	if (not result.empty())
	{
		auto &db = result.front();
		if (db.get_validator() == nullptr)
			db.set_validator(&validator_factory::instance().get("mmcif_pdbx.dic"));

		// only a valid datablock gets its audit_conform category filled in
		if (db.is_valid())
			db.get_validator()->fill_audit_conform(db["audit_conform"]);
	}

	return result;
}

// Opens file using a gzio::ifstream and reads its content using
// read(std::istream &).
//
// Any std::exception, including the one thrown when the file cannot be
// opened, is rethrown nested in a std::runtime_error naming the file.
file read(const std::filesystem::path &file)
{
	try
	{
		gzio::ifstream in(file);

		if (in.is_open())
			return read(in);

		throw std::runtime_error("Could not open file " + file.string() + " for input");
	}
	catch (const std::exception &)
	{
		throw_with_nested(std::runtime_error("Error reading file " + file.string()));
	}
}

} // namespace cif::pdb