Fix includes to contain <cstdint>

Merge branch 'develop' into trunk
update changelog, version bump
2026-06-04 22:14:24 +08:00 · 2023-06-08 13:15:43 +02:00 · 2023-06-08 10:12:03 +02:00 · 2023-06-08 10:10:49 +02:00 · 2023-06-07 19:11:20 +02:00 · 2023-06-07 14:07:27 +02:00
16 changed files with 531 additions and 360 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -25,7 +25,7 @@
 cmake_minimum_required(VERSION 3.16)

 # set the project name
-project(cifpp VERSION 5.0.9 LANGUAGES CXX)
+project(cifpp VERSION 5.1.0.1 LANGUAGES CXX)

 list(PREPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")

@@ -382,6 +382,16 @@ install(FILES
 	DESTINATION ${CIFPP_DATA_DIR}
 )

+if(CIFPP_CACHE_DIR) # test the variable by name: if(${VAR}) would evaluate the expanded path as a variable name, which is false
+	install(FILES
+		${PROJECT_SOURCE_DIR}/rsrc/mmcif_ddl.dic
+		${PROJECT_SOURCE_DIR}/rsrc/mmcif_pdbx.dic
+		${PROJECT_SOURCE_DIR}/rsrc/mmcif_ma.dic
+		${COMPONENTS_CIF}
+		DESTINATION ${CIFPP_CACHE_DIR}
+	)
+endif()
+
 set(CONFIG_TEMPLATE_FILE ${PROJECT_SOURCE_DIR}/cmake/cifppConfig.cmake.in)

 configure_package_config_file(
--- a/README.md
+++ b/README.md
@@ -3,18 +3,78 @@ libcifpp

 This library contains code to work with mmCIF and PDB files.

+Synopsis
+--------
+
+```c++
+// A simple program counting residues with an OXT atom
+
+#include <filesystem>
+#include <iostream>
+
+#include <cif++.hpp>
+
+namespace fs = std::filesystem;
+
+int main(int argc, char *argv[])
+{
+    if (argc != 2)
+        exit(1);
+
+    // Read file, can be PDB or mmCIF and can even be compressed with gzip.
+    cif::file file = cif::pdb::read(argv[1]);
+
+    if (file.empty())
+    {
+        std::cerr << "Empty file" << std::endl;
+        exit(1);
+    }
+
+    auto &db = file.front();
+    auto &atom_site = db["atom_site"];
+    auto n = atom_site.find(cif::key("label_atom_id") == "OXT").size();
+
+    std::cout << "File contains " << atom_site.size() << " atoms of which "
+              << n << (n == 1 ? " is" : " are") << " OXT" << std::endl
+              << "residues with an OXT are:" << std::endl;
+
+    for (const auto &[asym, comp, seqnr] :
+            atom_site.find<std::string, std::string, int>(
+                cif::key("label_atom_id") == "OXT",
+                "label_asym_id", "label_comp_id", "label_seq_id"))
+    {
+        std::cout << asym << ' ' << comp << ' ' << seqnr << std::endl;
+    }
+
+    return 0;
+}
+
+```
+
 Requirements
 ------------

 The code for this library was written in C++17. You therefore need a
-recent compiler to build it. For the development gcc 9.3 and clang 9.0
+recent compiler to build it. For the development gcc 9.4 and clang 9.0
 have been used as well as MSVC version 2019.

 Other requirements are:

 - [mrc](https://github.com/mhekkel/mrc), a resource compiler that
  allows including data files into the executable making them easier to
-  install. Strictly this is optional, but at the expense of functionality.
+  install. Strictly speaking this is optional, but at the expense of
+  functionality.
+- [libeigen](https://eigen.tuxfamily.org/index.php?title=Main_Page), a
+  library for, among other things, matrix calculations. It can usually be
+  installed using your package manager; in Debian/Ubuntu it is called
+  `libeigen3-dev`.
+- zlib, the development version of this library. On Debian/Ubuntu this
+  is the package `zlib1g-dev`.
+- [boost](https://www.boost.org). The boost libraries are only needed if
+  you want to build the testing code.
+
+When building using MS Visual Studio, you will also need [libzeep](https://github.com/mhekkel/libzeep)
+since MSVC does not yet provide a C++ template required by libcifpp.

 Building
 --------
@@ -26,7 +86,7 @@ On linux e.g. you would issue the following commands to build and install
 libcifpp in your `$HOME/.local` folder:

 ```bash
- git clone https://github.com/PDB-REDO/libcifpp.git
+ git clone https://github.com/PDB-REDO/libcifpp.git --recurse-submodules
 cd libcifpp
 cmake -S . -B build -DCMAKE_INSTALL_PREFIX=$HOME/.local -DCMAKE_BUILD_TYPE=Release
 cmake --build build
--- a/8
+++ b/8
@@ -1,3 +1,11 @@
+Version 5.1
+- New parser, optimised for speed
+- Fix in unique ID generator
+
+Version 5.0.10
+- Fix in progress_bar, was using too much CPU
+- Optimised mmCIF parser
+
 Version 5.0.9
 - Fix in dihedral angle calculations
 - Added create_water to model
--- a/examples/example.cpp
+++ b/examples/example.cpp
@@ -1,24 +1,32 @@
-#include <iostream>
 #include <filesystem>
+#include <iostream>

 #include <cif++.hpp>

 namespace fs = std::filesystem;

-int main()
+int main(int argc, char *argv[])
 {
-	cif::file file;
-	file.load("1cbs.cif.gz");
+	if (argc != 2)
+		exit(1);

-	auto& db = file.front();
+	cif::file file = cif::pdb::read(argv[1]);
+
+	if (file.empty())
+	{
+		std::cerr << "Empty file" << std::endl;
+		exit(1);
+	}
+
+	auto &db = file.front();
 	auto &atom_site = db["atom_site"];
 	auto n = atom_site.find(cif::key("label_atom_id") == "OXT").size();

 	std::cout << "File contains " << atom_site.size() << " atoms of which " << n << (n == 1 ? " is" : " are") << " OXT" << std::endl
-		<< "residues with an OXT are:" << std::endl;
-	
-	for (const auto& [asym, comp, seqnr]: atom_site.find<std::string,std::string,int>(
-			cif::key("label_atom_id") == "OXT", "label_asym_id", "label_comp_id", "label_seq_id"))
+			  << "residues with an OXT are:" << std::endl;
+
+	for (const auto &[asym, comp, seqnr] : atom_site.find<std::string, std::string, int>(
+			 cif::key("label_atom_id") == "OXT", "label_asym_id", "label_comp_id", "label_seq_id"))
 	{
 		std::cout << asym << ' ' << comp << ' ' << seqnr << std::endl;
 	}
--- a/include/cif++/dictionary_parser.hpp
+++ b/include/cif++/dictionary_parser.hpp
@@ -32,5 +32,6 @@ namespace cif
 {

 validator parse_dictionary(std::string_view name, std::istream &is);
+void extend_dictionary(validator &v, std::istream &is);

 } // namespace cif
--- a/include/cif++/parser.hpp
+++ b/include/cif++/parser.hpp
@@ -29,7 +29,6 @@
 #include "cif++/row.hpp"

 #include <map>
-#include <regex>

 namespace cif
 {
@@ -54,8 +53,6 @@ class sac_parser
  public:
 	using datablock_index = std::map<std::string, std::size_t>;

-	sac_parser(std::istream &is, bool init = true);
-
 	virtual ~sac_parser() = default;

 	enum CharTraitsMask : uint8_t
@@ -66,9 +63,14 @@ class sac_parser
 		kAnyPrintMask = 1 << 3
 	};

-	static bool is_white(int ch)
+	static constexpr bool is_space(int ch)
 	{
-		return std::isspace(ch) or ch == '#';
+		return ch == ' ' or ch == '\t' or ch == '\r' or ch == '\n';
+	}
+
+	static constexpr bool is_white(int ch)
+	{
+		return is_space(ch) or ch == '#';
 	}

 	static constexpr bool is_ordinary(int ch)
@@ -92,26 +94,7 @@ class sac_parser
 		       (ch >= 0x20 and ch <= 0x7f and (kCharTraitsTable[ch - 0x20] & kAnyPrintMask) != 0);
 	}

-	static bool is_unquoted_string(std::string_view text)
-	{
-		bool result = text.empty() or is_ordinary(text.front());
-
-		if (result)
-		{
-			for (auto ch : text)
-			{
-				if (is_non_blank(ch))
-					continue;
-				result = false;
-				break;
-			}
-		}
-
-		static const std::regex kReservedRx(R"(loop_|stop_|global_|data_\S+|save_\S+)", std::regex_constants::icase);
-
-		// but be careful it does not contain e.g. stop_
-		return result and not std::regex_match(text.begin(), text.end(), kReservedRx);
-	}
+	static bool is_unquoted_string(std::string_view text);

  protected:
 	static constexpr uint8_t kCharTraitsTable[128] = {
@@ -133,7 +116,8 @@ class sac_parser
 		DATA,
 		LOOP,
 		GLOBAL,
-		SAVE,
+		SAVE_,
+		SAVE_NAME,
 		STOP,
 		Tag,
 		Value
@@ -148,7 +132,8 @@ class sac_parser
 			case CIFToken::DATA: return "DATA";
 			case CIFToken::LOOP: return "LOOP";
 			case CIFToken::GLOBAL: return "GLOBAL";
-			case CIFToken::SAVE: return "SAVE";
+			case CIFToken::SAVE_: return "SAVE";
+			case CIFToken::SAVE_NAME: return "SAVE+name";
 			case CIFToken::STOP: return "STOP";
 			case CIFToken::Tag: return "Tag";
 			case CIFToken::Value: return "Value";
@@ -156,41 +141,13 @@ class sac_parser
 		}
 	}

-	enum class CIFValue
-	{
-		Int,
-		Float,
-		Numeric,
-		String,
-		TextField,
-		Inapplicable,
-		Unknown
-	};
-
-	static constexpr const char *get_value_name(CIFValue type)
-	{
-		switch (type)
-		{
-			case CIFValue::Int: return "Int";
-			case CIFValue::Float: return "Float";
-			case CIFValue::Numeric: return "Numeric";
-			case CIFValue::String: return "String";
-			case CIFValue::TextField: return "TextField";
-			case CIFValue::Inapplicable: return "Inapplicable";
-			case CIFValue::Unknown: return "Unknown";
-			default: return "Invalid type parameter";
-		}
-	}
-
-	// get_next_char takes a char from the buffer, or if it is empty
-	// from the istream. This function also does carriage/linefeed
-	// translation.
+	// get_next_char takes the next character from the istream.
+	// This function also does carriage/linefeed translation.
 	int get_next_char();

+	// Put the last read character back into the istream
 	void retract();

-	int restart(int start);
-
 	CIFToken get_next_token();

 	void match(CIFToken token);
@@ -205,6 +162,9 @@ class sac_parser
 	void parse_file();

  protected:
+
+	sac_parser(std::istream &is, bool init = true);
+
 	void parse_global();

 	void parse_datablock();
@@ -227,13 +187,14 @@ class sac_parser

 	// production methods, these are pure virtual here

-	virtual void produce_datablock(const std::string &name) = 0;
-	virtual void produce_category(const std::string &name) = 0;
+	virtual void produce_datablock(std::string_view name) = 0;
+	virtual void produce_category(std::string_view name) = 0;
 	virtual void produce_row() = 0;
-	virtual void produce_item(const std::string &category, const std::string &item, const std::string &value) = 0;
+	virtual void produce_item(std::string_view category, std::string_view item, std::string_view value) = 0;

  protected:
-	enum State
+
+	enum class State
 	{
 		Start,
 		White,
@@ -246,23 +207,21 @@ class sac_parser
 		UnquotedString,
 		Tag,
 		TextField,
-		Float = 100,
-		Int = 110,
-		Value = 300,
-		DATA,
-		SAVE
+		TextFieldNL,
+		Reserved,
+		Value
 	};

 	std::streambuf &m_source;

 	// Parser state
-	bool m_validate;
 	uint32_t m_line_nr;
 	bool m_bol;
 	CIFToken m_lookahead;
-	std::string m_token_value;
-	CIFValue mTokenType;
-	std::vector<int> m_buffer;	// retract buffer, used to be a stack<char>
+
+	// token buffer
+	std::vector<char> m_token_buffer;
+	std::string_view m_token_value;
 };

 // --------------------------------------------------------------------
@@ -276,13 +235,13 @@ class parser : public sac_parser
 	{
 	}

-	void produce_datablock(const std::string &name) override;
+	void produce_datablock(std::string_view name) override;

-	void produce_category(const std::string &name) override;
+	void produce_category(std::string_view name) override;

 	void produce_row() override;

-	void produce_item(const std::string &category, const std::string &item, const std::string &value) override;
+	void produce_item(std::string_view category, std::string_view item, std::string_view value) override;

  protected:
 	file &m_file;
--- a/include/cif++/point.hpp
+++ b/include/cif++/point.hpp
@@ -31,6 +31,7 @@
 #include <array>
 #include <cmath>
 #include <complex>
+#include <cstdint>
 #include <functional>
 #include <valarray>

--- a/include/cif++/validate.hpp
+++ b/include/cif++/validate.hpp
@@ -228,8 +228,9 @@ class validator_factory

 	const validator &operator[](std::string_view dictionary_name);

+	const validator &construct_validator(std::string_view name, std::istream &is);
+
  private:
-	void construct_validator(std::string_view name, std::istream &is);

 	// --------------------------------------------------------------------

--- a/src/category.cpp
+++ b/src/category.cpp
@@ -1227,23 +1227,37 @@ std::string category::get_unique_id(std::function<std::string(int)> generator)
 {
 	using namespace cif::literals;

-	std::string id_tag = "id";
-	if (m_cat_validator != nullptr and m_cat_validator->m_keys.size() == 1)
-		id_tag = m_cat_validator->m_keys.front();
-
 	// calling size() often is a waste of resources
 	if (m_last_unique_num == 0)
 		m_last_unique_num = static_cast<uint32_t>(size());

-	for (;;)
+	std::string result = generator(static_cast<int>(m_last_unique_num++));
+
+	std::string id_tag = "id";
+	if (m_cat_validator != nullptr and m_cat_validator->m_keys.size() == 1)
 	{
-		std::string result = generator(static_cast<int>(m_last_unique_num++));
-
-		if (exists(key(id_tag) == result))
-			continue;
-
-		return result;
+		if (m_index == nullptr and m_cat_validator != nullptr)
+			m_index = new category_index(this);
+		
+		for (;;)
+		{
+			if (m_index->find_by_value({{ id_tag, result }}) == nullptr)
+				break;
+			result = generator(static_cast<int>(m_last_unique_num++));
+		}
 	}
+	else
+	{
+		for (;;)
+		{
+			if (not exists(key(id_tag) == result))
+				break;
+			
+			result = generator(static_cast<int>(m_last_unique_num++));
+		}
+	}
+
+	return result;
 }

 void category::update_value(const std::vector<row_handle> &rows, std::string_view tag, std::string_view value)
--- a/src/dictionary_parser.cpp
+++ b/src/dictionary_parser.cpp
@@ -117,7 +117,7 @@ class dictionary_parser : public parser
 		if (not m_collected_item_types)
 			m_collected_item_types = collect_item_types();

-		std::string saveFrameName = m_token_value;
+		std::string saveFrameName { m_token_value };

 		if (saveFrameName.empty())
 			error("Invalid save frame, should contain more than just 'save_' here");
@@ -127,7 +127,7 @@ class dictionary_parser : public parser
 		datablock dict(m_token_value);
 		datablock::iterator cat = dict.end();

-		match(CIFToken::SAVE);
+		match(CIFToken::SAVE_NAME);
 		while (m_lookahead == CIFToken::LOOP or m_lookahead == CIFToken::Tag)
 		{
 			if (m_lookahead == CIFToken::LOOP)
@@ -183,7 +183,7 @@ class dictionary_parser : public parser
 			}
 		}

-		match(CIFToken::SAVE);
+		match(CIFToken::SAVE_);

 		if (isCategorySaveFrame)
 		{
@@ -481,4 +481,11 @@ validator parse_dictionary(std::string_view name, std::istream &is)
 	return result;
 }

-} // namespace cif
+void extend_dictionary(validator &v, std::istream &is) // Runs a dictionary_parser over the stream is, constructed on the existing validator v (presumably adding the parsed definitions to v -- confirm in dictionary_parser).
+{
+	file f; // local file object handed to the parser; discarded when this function returns
+	dictionary_parser p(v, is, f);
+	p.load_dictionary();
+}
+
+} // namespace cif
--- a/src/parser.cpp
+++ b/src/parser.cpp
@@ -32,7 +32,6 @@
 #include <cassert>
 #include <iostream>
 #include <map>
-#include <regex>
 #include <stack>

 namespace cif
@@ -40,13 +39,152 @@ namespace cif

 // --------------------------------------------------------------------

+class reserved_words_automaton // Character-at-a-time recogniser for the reserved words data_, global_, loop_, save_ and stop_; letters are matched regardless of case.
+{
+  public:
+	reserved_words_automaton() {}
+
+	enum move_result // Outcome of feeding a single character to move().
+	{
+		undefined, // no decision yet, more characters are needed
+		no_keyword, // the characters seen so far do not form a reserved word
+		data, // data_ plus at least one non-blank character, then a character that is not non-blank
+		global, // global_ directly followed by a character that is not non-blank
+		loop, // loop_ directly followed by a character that is not non-blank
+		save, // save_ directly followed by a character that is not non-blank
+		save_plus, // save_ plus at least one non-blank character, then a character that is not non-blank
+		stop // stop_ directly followed by a character that is not non-blank
+	};
+
+	constexpr bool finished() const
+	{
+		return m_state <= 0; // 0 = a decision was made, negative = a keyword prefix was read completely
+	}
+
+	constexpr bool matched() const
+	{
+		return m_state < 0; // only the negative states stand for a fully read keyword prefix
+	}
+
+	constexpr move_result move(int ch) // Feeds the next character; returns undefined until a decision can be made.
+	{
+		move_result result = undefined;
+
+		switch (m_state)
+		{
+			case 0: // a decision was already returned, every further call yields undefined
+				break;
+
+			case -1:		// data_ (needs at least one trailing non-blank character to count as a keyword)
+				if (sac_parser::is_non_blank(ch))
+					m_seen_trailing_chars = true;
+				else if (m_seen_trailing_chars)
+					result = data;
+				else
+					result = no_keyword;
+				break;
+
+			case -2:		// global_ (any trailing non-blank character makes it an ordinary value)
+				result = sac_parser::is_non_blank(ch) ? no_keyword : global;
+				break;
+
+			case -3:		// loop_ (any trailing non-blank character makes it an ordinary value)
+				result = sac_parser::is_non_blank(ch) ? no_keyword : loop;
+				break;
+
+			case -4:		// save_ (with trailing characters: save_plus, without: save)
+				if (sac_parser::is_non_blank(ch))
+					m_seen_trailing_chars = true;
+				else if (m_seen_trailing_chars)
+					result = save_plus;
+				else
+					result = save;
+				break;
+
+			case -5:		// stop_ (any trailing non-blank character makes it an ordinary value)
+				result = sac_parser::is_non_blank(ch) ? no_keyword : stop;
+				break;
+			
+			default: // positive state: still walking the s_dag table
+				assert(m_state > 0 and m_state < NODE_COUNT);
+
+				for (;;) // try the current node and then its next_nomatch alternatives
+				{
+					if (s_dag[m_state].ch == (ch & ~0x20)) // clearing bit 0x20 folds lower case onto the upper case letters in s_dag; NOTE(review): it also maps 0x7f onto '_'
+					{
+						m_state = s_dag[m_state].next_match;
+						break;
+					}
+
+					m_state = s_dag[m_state].next_nomatch;
+
+					if (m_state == 0) // no alternatives left
+					{
+						result = no_keyword;
+						break;
+					}
+				}
+				break;
+		}
+
+		if (result != undefined) // once a decision is returned the automaton goes to state 0
+			m_state = 0;
+
+		return result;
+	}
+
+  private:
+	static constexpr struct node // one entry of the keyword table
+	{
+		int16_t ch; // character expected at this node (upper case letter or '_')
+		int8_t next_match; // state to enter when ch matches; negative values are the keyword states handled in move()
+		int8_t next_nomatch; // alternative node to try when ch does not match, 0 = none
+	} s_dag[] = {
+		{ 0 }, // index 0 is not a real node, state 0 means done
+		{ 'D',  5, 2 }, // 1: start state, first letter alternatives D, G, L, S
+		{ 'G',  9, 3 },
+		{ 'L', 15, 4 },
+		{ 'S', 19, 0 },
+		{ 'A',  6, 0 }, // 5-8: rest of DATA_
+		{ 'T',  7, 0 },
+		{ 'A',  8, 0 },
+		{ '_', -1, 0 },
+		{ 'L', 10, 0 }, // 9-14: rest of GLOBAL_
+		{ 'O', 11, 0 },
+		{ 'B', 12, 0 },
+		{ 'A', 13, 0 },
+		{ 'L', 14, 0 },
+		{ '_', -2, 0 },
+		{ 'O', 16, 0}, // 15-18: rest of LOOP_
+		{ 'O', 17, 0 },
+		{ 'P', 18, 0 },
+		{ '_', -3, 0 },
+		{ 'A', 21, 20 }, // 19: second letter after S, either A (SAVE_) or T (STOP_)
+		{ 'T', 24, 0 },
+		{ 'V', 22, 0 }, // 21-23: rest of SAVE_
+		{ 'E', 23, 0 },
+		{ '_', -4, 0 },
+		{ 'O', 25, 0 }, // 24-26: rest of STOP_
+		{ 'P', 26, 0 },
+		{ '_', -5, 0 },
+	};
+
+	static constexpr int NODE_COUNT = sizeof(s_dag) / sizeof(node);
+
+	int m_state = 1; // positive: index into s_dag, 0: done, negative: keyword prefix read
+	bool m_seen_trailing_chars = false; // set when a non-blank character follows data_ or save_
+};
+
+// --------------------------------------------------------------------
+
 sac_parser::sac_parser(std::istream &is, bool init)
 	: m_source(*is.rdbuf())
 {
+	m_token_buffer.reserve(8192);
+
 	if (is.rdbuf() == nullptr)
 		throw std::runtime_error("Attempt to read from uninitialised stream");

-	m_validate = true;
 	m_line_nr = 1;
 	m_bol = true;

@@ -54,45 +192,54 @@ sac_parser::sac_parser(std::istream &is, bool init)
 		m_lookahead = get_next_token();
 }

+bool sac_parser::is_unquoted_string(std::string_view text) // Returns true when text is empty or starts with an ordinary character, consists of non-blank characters only and does not end in a reserved-word state.
+{
+	bool result = text.empty() or is_ordinary(text.front());
+	if (result)
+	{
+		reserved_words_automaton automaton;
+
+		for (char ch : text)
+		{
+			if (not is_non_blank(ch)) // a single character that is not non-blank rules the text out
+			{
+				result = false;
+				break;
+			}
+
+			automaton.move(ch); // return value not needed here, only the final state is inspected
+		}
+
+		if (automaton.matched()) // text ended inside a keyword state, e.g. loop_ or data_ followed by more characters; no terminating character is fed, so a bare data_ or save_ counts as well
+			result = false;
+	}
+
+	return result;
+}
+
 // get_next_char takes a char from the buffer, or if it is empty
 // from the istream. This function also does carriage/linefeed
 // translation.
 int sac_parser::get_next_char()
 {
-	int result = std::char_traits<char>::eof();
-
-	if (m_buffer.empty())
-		result = m_source.sbumpc();
-	else
-	{
-		result = m_buffer.back();
-		m_buffer.pop_back();
-	}
-
-	// very simple CR/LF translation into LF
-	if (result == '\r')
-	{
-		int lookahead = m_source.sbumpc();
-		if (lookahead != '\n')
-			m_buffer.push_back(lookahead);
-		result = '\n';
-	}
+	int result = m_source.sbumpc();

 	if (result == std::char_traits<char>::eof())
-		m_token_value.push_back(0);
+		m_token_buffer.push_back(0);
 	else
-		m_token_value.push_back(std::char_traits<char>::to_char_type(result));
-
-	if (result == '\n')
-		++m_line_nr;
-
-	if (VERBOSE >= 6)
 	{
-		std::cerr << "get_next_char => ";
-		if (iscntrl(result) or not isprint(result))
-			std::cerr << int(result) << std::endl;
-		else
-			std::cerr << char(result) << std::endl;
+		if (result == '\r')
+		{
+			if (m_source.sgetc() == '\n')
+				m_source.sbumpc();
+
+			++m_line_nr;
+			result = '\n';
+		}
+		else if (result == '\n')
+			++m_line_nr;
+		
+		m_token_buffer.push_back(std::char_traits<char>::to_char_type(result));
 	}

 	return result;
@@ -100,44 +247,22 @@ int sac_parser::get_next_char()

 void sac_parser::retract()
 {
-	assert(not m_token_value.empty());
+	assert(not m_token_buffer.empty());

-	char ch = m_token_value.back();
+	char ch = m_token_buffer.back();
 	if (ch == '\n')
 		--m_line_nr;

-	m_buffer.push_back(ch == 0 ? std::char_traits<char>::eof() : std::char_traits<char>::to_int_type(ch));
-	m_token_value.pop_back();
-}
-
-int sac_parser::restart(int start)
-{
-	int result = 0;
-
-	while (not m_token_value.empty())
-		retract();
-
-	switch (start)
+	if (ch != 0)
 	{
-		case State::Start:
-			result = State::Float;
-			break;
+		// since we always putback at most a single character,
+		// the test below should never fail.

-		case State::Float:
-			result = State::Int;
-			break;
-
-		case State::Int:
-			result = State::Value;
-			break;
-
-		default:
-			error("Invalid state in SacParser");
+		if (m_source.sputbackc(ch) == std::char_traits<char>::eof())
+			throw std::runtime_error("putback failure");
 	}

-	m_bol = false;
-
-	return result;
+	m_token_buffer.pop_back();
 }

 sac_parser::CIFToken sac_parser::get_next_token()
@@ -146,11 +271,13 @@ sac_parser::CIFToken sac_parser::get_next_token()

 	CIFToken result = CIFToken::Unknown;
 	int quoteChar = 0;
-	int state = State::Start, start = State::Start;
+	State state = State::Start;
 	m_bol = false;

-	m_token_value.clear();
-	mTokenType = CIFValue::Unknown;
+	m_token_buffer.clear();
+	m_token_value = {};
+
+	reserved_words_automaton dag;

 	while (result == CIFToken::Unknown)
 	{
@@ -174,23 +301,27 @@ sac_parser::CIFToken sac_parser::get_next_token()
 					state = State::Tag;
 				else if (ch == ';' and m_bol)
 					state = State::TextField;
+				else if (ch == '?')
+					state = State::QuestionMark;
 				else if (ch == '\'' or ch == '"')
 				{
 					quoteChar = ch;
 					state = State::QuotedString;
 				}
+				else if (dag.move(ch) == reserved_words_automaton::undefined)
+					state = State::Reserved;
 				else
-					state = start = restart(start);
+					state = State::Value;
 				break;

 			case State::White:
 				if (ch == kEOF)
 					result = CIFToken::Eof;
-				else if (not isspace(ch))
+				else if (not is_space(ch))
 				{
 					state = State::Start;
 					retract();
-					m_token_value.clear();
+					m_token_buffer.clear();
 				}
 				else
 					m_bol = (ch == '\n');
@@ -201,38 +332,40 @@ sac_parser::CIFToken sac_parser::get_next_token()
 				{
 					state = State::Start;
 					m_bol = true;
-					m_token_value.clear();
+					m_token_buffer.clear();
 				}
 				else if (ch == kEOF)
 					result = CIFToken::Eof;
 				else if (not is_any_print(ch))
 					error("invalid character in comment");
 				break;
+			
+			case State::QuestionMark:
+				if (not is_non_blank(ch))
+				{
+					retract();
+					result = CIFToken::Value;
+				}
+				else
+					state = State::Value;
+				break;

 			case State::TextField:
 				if (ch == '\n')
-					state = State::TextField + 1;
+					state = State::TextFieldNL;
 				else if (ch == kEOF)
 					error("unterminated textfield");
-				// else if (ch == '\\')
-				// 	state = State::Esc;
 				else if (not is_any_print(ch) and cif::VERBOSE > 2)
 					warning("invalid character in text field '" + std::string({static_cast<char>(ch)}) + "' (" + std::to_string((int)ch) + ")");
 				break;

-			// case State::Esc:
-			// 	if (ch == '\n')
-
-			// 	break;
-
-			case State::TextField + 1:
+			case State::TextFieldNL:
 				if (is_text_lead(ch) or ch == ' ' or ch == '\t')
 					state = State::TextField;
 				else if (ch == ';')
 				{
-					assert(m_token_value.length() >= 2);
-					m_token_value = m_token_value.substr(1, m_token_value.length() - 3);
-					mTokenType = CIFValue::TextField;
+					assert(m_token_buffer.size() >= 2);
+					m_token_value = std::string_view(m_token_buffer.data() + 1, m_token_buffer.size() - 3);
 					result = CIFToken::Value;
 				}
 				else if (ch == kEOF)
@@ -255,12 +388,10 @@ sac_parser::CIFToken sac_parser::get_next_token()
 				{
 					retract();
 					result = CIFToken::Value;
-					mTokenType = CIFValue::String;
-
-					if (m_token_value.length() < 2)
+					if (m_token_buffer.size() < 2)
 						error("Invalid quoted string token");

-					m_token_value = m_token_value.substr(1, m_token_value.length() - 2);
+					m_token_value = std::string_view(m_token_buffer.data() + 1, m_token_buffer.size() - 2);
 				}
 				else if (ch == quoteChar)
 					;
@@ -277,149 +408,68 @@ sac_parser::CIFToken sac_parser::get_next_token()
 				{
 					retract();
 					result = CIFToken::Tag;
+					m_token_value = std::string_view(m_token_buffer.data(), m_token_buffer.size());
 				}
 				break;

-			case State::Float:
-				if (ch == '+' or ch == '-')
+			case State::Reserved:
+				switch (dag.move(ch))
 				{
-					state = State::Float + 1;
+					case reserved_words_automaton::undefined:
+						break;
+
+					case reserved_words_automaton::no_keyword:
+						if (not is_non_blank(ch))
+						{
+							retract();
+							result = CIFToken::Value;
+							m_token_value = std::string_view(m_token_buffer.data(), m_token_buffer.size());
+						}
+						else
+							state = State::Value;
+						break;
+
+					case reserved_words_automaton::data:
+						retract();
+						m_token_value = std::string_view(m_token_buffer.data() + 5, m_token_buffer.size() - 5);
+						result = CIFToken::DATA;
+						break;
+
+					case reserved_words_automaton::global:
+						retract();
+						result = CIFToken::GLOBAL;
+						break;
+
+					case reserved_words_automaton::loop:
+						retract();
+						result = CIFToken::LOOP;
+						break;
+
+					case reserved_words_automaton::save:
+						retract();
+						result = CIFToken::SAVE_;
+						break;
+
+					case reserved_words_automaton::save_plus:
+						retract();
+						m_token_value = std::string_view(m_token_buffer.data() + 5, m_token_buffer.size() - 5);
+						result = CIFToken::SAVE_NAME;
+						break;
+
+					case reserved_words_automaton::stop:
+						retract();
+						result = CIFToken::STOP;
+						break;
 				}
-				else if (isdigit(ch))
-					state = State::Float + 1;
-				else
-					state = start = restart(start);
-				break;
-
-			case State::Float + 1:
-				//				if (ch == '(')	// numeric???
-				//					mState = State::NumericSuffix;
-				//				else
-				if (ch == '.')
-					state = State::Float + 2;
-				else if (tolower(ch) == 'e')
-					state = State::Float + 3;
-				else if (is_white(ch) or ch == kEOF)
-				{
-					retract();
-					result = CIFToken::Value;
-					mTokenType = CIFValue::Int;
-				}
-				else
-					state = start = restart(start);
-				break;
-
-			// parsed '.'
-			case State::Float + 2:
-				if (tolower(ch) == 'e')
-					state = State::Float + 3;
-				else if (is_white(ch) or ch == kEOF)
-				{
-					retract();
-					result = CIFToken::Value;
-					mTokenType = CIFValue::Float;
-				}
-				else
-					state = start = restart(start);
-				break;
-
-			// parsed 'e'
-			case State::Float + 3:
-				if (ch == '-' or ch == '+')
-					state = State::Float + 4;
-				else if (isdigit(ch))
-					state = State::Float + 5;
-				else
-					state = start = restart(start);
-				break;
-
-			case State::Float + 4:
-				if (isdigit(ch))
-					state = State::Float + 5;
-				else
-					state = start = restart(start);
-				break;
-
-			case State::Float + 5:
-				if (is_white(ch) or ch == kEOF)
-				{
-					retract();
-					result = CIFToken::Value;
-					mTokenType = CIFValue::Float;
-				}
-				else
-					state = start = restart(start);
-				break;
-
-			case State::Int:
-				if (isdigit(ch) or ch == '+' or ch == '-')
-					state = State::Int + 1;
-				else
-					state = start = restart(start);
-				break;
-
-			case State::Int + 1:
-				if (is_white(ch) or ch == kEOF)
-				{
-					retract();
-					result = CIFToken::Value;
-					mTokenType = CIFValue::Int;
-				}
-				else
-					state = start = restart(start);
 				break;

 			case State::Value:
-				if (ch == '_')
-				{
-					std::string s = to_lower_copy(m_token_value);
-
-					if (s == "data_")
-					{
-						state = State::DATA;
-						continue;
-					}
-					
-					if (s == "save_")
-					{
-						state = State::SAVE;
-						continue;
-					}
-				}
-
-				if (result == CIFToken::Unknown and not is_non_blank(ch))
-				{
-					retract();
-					result = CIFToken::Value;
-
-					if (m_token_value == ".")
-						mTokenType = CIFValue::Inapplicable;
-					else if (iequals(m_token_value, "global_"))
-						result = CIFToken::GLOBAL;
-					else if (iequals(m_token_value, "stop_"))
-						result = CIFToken::STOP;
-					else if (iequals(m_token_value, "loop_"))
-						result = CIFToken::LOOP;
-					else if (m_token_value == "?")
-					{
-						mTokenType = CIFValue::Unknown;
-						m_token_value.clear();
-					}
-				}
-				break;
-
-			case State::DATA:
-			case State::SAVE:
 				if (not is_non_blank(ch))
 				{
 					retract();
-
-					if (state == State::DATA)
-						result = CIFToken::DATA;
-					else
-						result = CIFToken::SAVE;
-
-					m_token_value.erase(m_token_value.begin(), m_token_value.begin() + 5);
+					result = CIFToken::Value;
+					m_token_value = std::string_view(m_token_buffer.data(), m_token_buffer.size());
+					break;
 				}
 				break;

@@ -433,8 +483,6 @@ sac_parser::CIFToken sac_parser::get_next_token()
 	if (VERBOSE >= 5)
 	{
 		std::cerr << get_token_name(result);
-		if (mTokenType != CIFValue::Unknown)
-			std::cerr << ' ' << get_value_name(mTokenType);
 		if (result != CIFToken::Eof)
 			std::cerr << " " << std::quoted(m_token_value);
 		std::cerr << std::endl;
@@ -506,7 +554,7 @@ bool sac_parser::parse_single_datablock(const std::string &datablock)
 				break;

 			case string_quote:
-				if (std::isspace(ch))
+				if (is_space(ch))
 					state = start;
 				else
 					state = string;
@@ -518,7 +566,7 @@ bool sac_parser::parse_single_datablock(const std::string &datablock)
 				break;

 			case data:
-				if (isspace(ch) and dblk[si] == 0)
+				if (is_space(ch) and dblk[si] == 0)
 					found = true;
 				else if (dblk[si++] != ch)
 					state = start;
@@ -596,7 +644,7 @@ sac_parser::datablock_index sac_parser::index_datablocks()
 				break;

 			case string_quote:
-				if (std::isspace(ch))
+				if (is_space(ch))
 					state = start;
 				else
 					state = string;
@@ -620,7 +668,7 @@ sac_parser::datablock_index sac_parser::index_datablocks()
 			case data_name:
 				if (is_non_blank(ch))
 					datablock.insert(datablock.end(), char(ch));
-				else if (isspace(ch))
+				else if (is_space(ch))
 				{
 					if (not datablock.empty())
 						index[datablock] = m_source.pubseekoff(0, std::ios_base::cur, std::ios_base::in);
@@ -696,7 +744,7 @@ void sac_parser::parse_datablock()
 	static const std::string kUnitializedCategory("<invalid>");
 	std::string cat = kUnitializedCategory;	// intial value acts as a guard for empty category names

-	while (m_lookahead == CIFToken::LOOP or m_lookahead == CIFToken::Tag or m_lookahead == CIFToken::SAVE)
+	while (m_lookahead == CIFToken::LOOP or m_lookahead == CIFToken::Tag or m_lookahead == CIFToken::SAVE_NAME)
 	{
 		switch (m_lookahead)
 		{
@@ -761,7 +809,7 @@ void sac_parser::parse_datablock()
 				break;
 			}

-			case CIFToken::SAVE:
+			case CIFToken::SAVE_NAME:
 				parse_save_frame();
 				break;

@@ -779,7 +827,7 @@ void sac_parser::parse_save_frame()

 // --------------------------------------------------------------------

-void parser::produce_datablock(const std::string &name)
+void parser::produce_datablock(std::string_view name)
 {
 	if (VERBOSE >= 4)
 		std::cerr << "producing data_" << name << std::endl;
@@ -788,7 +836,7 @@ void parser::produce_datablock(const std::string &name)
 	m_datablock = &(*iter);
 }

-void parser::produce_category(const std::string &name)
+void parser::produce_category(std::string_view name)
 {
 	if (VERBOSE >= 4)
 		std::cerr << "producing category " << name << std::endl;
@@ -810,7 +858,7 @@ void parser::produce_row()
 	// m_row.lineNr(m_line_nr);
 }

-void parser::produce_item(const std::string &category, const std::string &item, const std::string &value)
+void parser::produce_item(std::string_view category, std::string_view item, std::string_view value)
 {
 	if (VERBOSE >= 4)
 		std::cerr << "producing _" << category << '.' << item << " -> " << value << std::endl;
@@ -821,4 +869,4 @@ void parser::produce_item(const std::string &category, const std::string &item,
 	m_row[item] = m_token_value;
 }

-} // namespace cif
+} // namespace cif
--- a/src/text.cpp
+++ b/src/text.cpp
@@ -236,28 +236,19 @@ std::string cif_id_for_number(int number)
 {
 	std::string result;

-	if (number >= 26 * 26 * 26)
-		result = 'L' + std::to_string(number);
-	else
+	do
 	{
-		if (number >= 26 * 26)
-		{
-			int v = number / (26 * 26);
-			result += char('A' - 1 + v);
-			number %= (26 * 26);
-		}
+		int r = number % 26;
+		result += 'A' + r;

-		if (number >= 26)
-		{
-			int v = number / 26;
-			result += char('A' - 1 + v);
-			number %= 26;
-		}
-
-		result += char('A' + number);
+		number = (number - r) / 26 - 1;
 	}
+	while (number >= 0);
+
+	std::reverse(result.begin(), result.end());

 	assert(not result.empty());
+
 	return result;
 }

--- a/src/utilities.cpp
+++ b/src/utilities.cpp
@@ -40,7 +40,6 @@
 #include <iostream>
 #include <map>
 #include <mutex>
-#include <regex>
 #include <sstream>
 #include <thread>

@@ -161,6 +160,8 @@ struct progress_bar_impl
 	void print_progress();
 	void print_done();

+	using time_point = std::chrono::time_point<std::chrono::system_clock>;
+
 	int64_t m_max_value;
 	std::atomic<int64_t> m_consumed;
 	int64_t m_last_consumed = 0;
@@ -168,8 +169,8 @@ struct progress_bar_impl
 	std::string m_action, m_message;
 	std::mutex m_mutex;
 	std::thread m_thread;
-	std::chrono::time_point<std::chrono::system_clock>
-		m_start = std::chrono::system_clock::now();
+	time_point m_start = std::chrono::system_clock::now();
+	time_point m_last = std::chrono::system_clock::now();
 	bool m_stop = false;
 };

@@ -192,7 +193,9 @@ void progress_bar_impl::run()
 	{
 		while (not m_stop)
 		{
-			if (std::chrono::system_clock::now() - m_start < 2s)
+			auto now = std::chrono::system_clock::now();
+
+			if (now - m_start < 2s or now - m_last < 100ms)
 			{
 				std::this_thread::sleep_for(10ms);
 				continue;
@@ -206,6 +209,7 @@ void progress_bar_impl::run()
 			print_progress();

 			printedAny = true;
+			m_last = std::chrono::system_clock::now();
 		}
 	}
 	catch (...)
--- a/src/validate.cpp
+++ b/src/validate.cpp
@@ -491,9 +491,9 @@ const validator &validator_factory::operator[](std::string_view dictionary_name)
 	}
 }

-void validator_factory::construct_validator(std::string_view name, std::istream &is)
+const validator &validator_factory::construct_validator(std::string_view name, std::istream &is) // parses a dictionary from `is` and appends the result to m_validators
 {
-	m_validators.emplace_back(parse_dictionary(name, is));
+	return m_validators.emplace_back(parse_dictionary(name, is)); // returns whatever emplace_back yields; presumably m_validators is a std container, where that is a reference to the new element (C++17) - TODO confirm
 }

 } // namespace cif
--- a/test/io-test.cpp
+++ b/test/io-test.cpp
@@ -0,0 +1,39 @@
+#include <cif++.hpp>
+
+class dummy_parser : public cif::sac_parser // sac_parser whose produce_* callbacks all do nothing
+{
+  public:
+	explicit dummy_parser(std::istream &is) // explicit: a stream must not convert implicitly into a parser
+		: sac_parser(is)
+	{
+	}
+
+	void produce_datablock(std::string_view /*name*/) override // parameter names are commented out because they are unused
+	{
+	}
+
+	void produce_category(std::string_view /*name*/) override
+	{
+	}
+
+	void produce_row() override
+	{
+	}
+
+	void produce_item(std::string_view /*category*/, std::string_view /*item*/, std::string_view /*value*/) override
+	{
+	}
+};
+
+
+int main() // runs dummy_parser over one gzip-compressed mmCIF file
+{
+	cif::gzio::ifstream in("/srv/data/pdb/mmCIF/gl/8glv.cif.gz"); // NOTE(review): absolute, machine-specific path
+	if (not in) // do not parse a stream that already reports failure (presumably set when the file cannot be opened)
+		return 1;
+
+	dummy_parser parser(in);
+	parser.parse_file();
+	// cif::file f("/srv/data/pdb/mmCIF/gl/8glv.cif.gz");
+	return 0;
+}
--- a/test/unit-v2-test.cpp
+++ b/test/unit-v2-test.cpp
@@ -75,6 +75,30 @@ bool init_unit_test()

 // --------------------------------------------------------------------

+BOOST_AUTO_TEST_CASE(id_1) // checks cif::cif_id_for_number: fixed expectations plus uniqueness
+{
+	BOOST_TEST(cif::cif_id_for_number(0) == "A");
+	BOOST_TEST(cif::cif_id_for_number(25) == "Z");
+	BOOST_TEST(cif::cif_id_for_number(26) == "AA");
+	BOOST_TEST(cif::cif_id_for_number(26 + 1) == "AB");
+	// last two-letter ID and the first two three-letter IDs
+	BOOST_TEST(cif::cif_id_for_number(26 + 26 * 26 - 1) == "ZZ");
+	BOOST_TEST(cif::cif_id_for_number(26 + 26 * 26) == "AAA");
+	BOOST_TEST(cif::cif_id_for_number(26 + 26 * 26 + 1) == "AAB");
+	// every number in [0, 100000) must yield an ID that was not produced before
+	std::set<std::string> testset;
+
+	for (int i = 0; i < 100000; ++i)
+	{
+		std::string id = cif::cif_id_for_number(i);
+		BOOST_TEST(testset.count(id) == 0);
+		testset.insert(id);
+	}
+	BOOST_TEST(testset.size() == 100000);
+}
+
+// --------------------------------------------------------------------
+
 BOOST_AUTO_TEST_CASE(cc_1)
 {
 	std::tuple<std::string_view, float, char> tests[] = {
@@ -2357,8 +2381,6 @@ _test.text ??

 BOOST_AUTO_TEST_CASE(output_test_1)
 {
-	cif::VERBOSE = 5;
-
 	auto data1 = R"(
 data_Q
 loop_
@@ -2863,7 +2885,7 @@ save__cat_1.name

 	std::istream is_dict(&buffer);

-	auto validator = cif::parse_dictionary("test_dict.dic", is_dict);
+	auto &validator = cif::validator_factory::instance().construct_validator("test_dict.dic", is_dict);

 	cif::file f;
 	f.set_validator(&validator);
@@ -2901,8 +2923,6 @@ _cat_1.name
 	ss << f;

 	cif::file f2(ss);
-
-	f2.set_validator(&validator);
 	BOOST_ASSERT(f2.is_valid());

 	auto &audit_conform = f2.front()["audit_conform"];
Author	SHA1	Message	Date
Maarten L. Hekkelman	836aed6ea9	Fix includes to contain <cstdint>	2023-06-08 13:15:43 +02:00
Maarten L. Hekkelman	50df250415	Merge branch 'develop' into trunk	2023-06-08 10:12:03 +02:00
Maarten L. Hekkelman	2409fc5b7b	update changelog, version bump	2023-06-08 10:10:49 +02:00
Maarten L. Hekkelman	8a1184a24c	Fix cif_id_for_number	2023-06-07 19:11:20 +02:00
Maarten L. Hekkelman	d2fbc54765	New cache location	2023-06-07 14:07:27 +02:00
Maarten L. Hekkelman	1bcb26ba75	extend validator faster unique_id	2023-06-07 13:08:36 +02:00
Maarten L. Hekkelman	32f4749d84	faster cif parser	2023-06-07 11:19:35 +02:00
Maarten L. Hekkelman	da12be879a	progress_bar consuming too much time	2023-06-07 09:15:17 +02:00
Maarten L. Hekkelman	94a38ad4e8	Merge branch 'develop' of github.com:PDB-REDO/libcifpp into develop	2023-06-06 14:31:26 +02:00
Maarten L. Hekkelman	20ef79a172	for c++17, limited version of std::string_view	2023-06-06 14:30:11 +02:00
Maarten L. Hekkelman	92bf25476e	Speed improvements	2023-06-06 14:12:21 +02:00
Maarten L. Hekkelman	b55e074dd7	reserve some token buffer space	2023-06-06 09:33:31 +02:00
Maarten L. Hekkelman	7b654a837d	with reserved words automaton	2023-06-06 09:22:55 +02:00
Maarten L. Hekkelman	ae9d247d22	optimised the parser a bit	2023-06-05 13:43:31 +02:00
Maarten L. Hekkelman	16b7deafe8	Better is_unquoted_string test	2023-06-02 17:09:57 +02:00
Maarten L. Hekkelman	f2cfe28458	Update README	2023-05-31 15:56:50 +02:00
Maarten L. Hekkelman	2e8a52949e	Update example and README	2023-05-31 15:54:53 +02:00
Maarten L. Hekkelman	441e142767	Update readme	2023-05-31 15:42:54 +02:00