Compare commits

...

18 Commits

Author SHA1 Message Date
Maarten L. Hekkelman
836aed6ea9 Fix includes to contain <cstdint> 2023-06-08 13:15:43 +02:00
Maarten L. Hekkelman
50df250415 Merge branch 'develop' into trunk 2023-06-08 10:12:03 +02:00
Maarten L. Hekkelman
2409fc5b7b update changelog, version bump 2023-06-08 10:10:49 +02:00
Maarten L. Hekkelman
8a1184a24c Fix cif_id_for_number 2023-06-07 19:11:20 +02:00
Maarten L. Hekkelman
d2fbc54765 New cache location 2023-06-07 14:07:27 +02:00
Maarten L. Hekkelman
1bcb26ba75 extend validator
faster unique_id
2023-06-07 13:08:36 +02:00
Maarten L. Hekkelman
32f4749d84 faster cif parser 2023-06-07 11:19:35 +02:00
Maarten L. Hekkelman
da12be879a progress_bar consuming too much time 2023-06-07 09:15:17 +02:00
Maarten L. Hekkelman
94a38ad4e8 Merge branch 'develop' of github.com:PDB-REDO/libcifpp into develop 2023-06-06 14:31:26 +02:00
Maarten L. Hekkelman
20ef79a172 for c++17, limited version of std::string_view 2023-06-06 14:30:11 +02:00
Maarten L. Hekkelman
92bf25476e Speed improvements 2023-06-06 14:12:21 +02:00
Maarten L. Hekkelman
b55e074dd7 reserve some token buffer space 2023-06-06 09:33:31 +02:00
Maarten L. Hekkelman
7b654a837d with reserved words automaton 2023-06-06 09:22:55 +02:00
Maarten L. Hekkelman
ae9d247d22 optimised the parser a bit 2023-06-05 13:43:31 +02:00
Maarten L. Hekkelman
16b7deafe8 Better is_unquoted_string test 2023-06-02 17:09:57 +02:00
Maarten L. Hekkelman
f2cfe28458 Update README 2023-05-31 15:56:50 +02:00
Maarten L. Hekkelman
2e8a52949e Update example and README 2023-05-31 15:54:53 +02:00
Maarten L. Hekkelman
441e142767 Update readme 2023-05-31 15:42:54 +02:00
16 changed files with 531 additions and 360 deletions

View File

@@ -25,7 +25,7 @@
cmake_minimum_required(VERSION 3.16)
# set the project name
project(cifpp VERSION 5.0.9 LANGUAGES CXX)
project(cifpp VERSION 5.1.0.1 LANGUAGES CXX)
list(PREPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
@@ -382,6 +382,16 @@ install(FILES
DESTINATION ${CIFPP_DATA_DIR}
)
# Optionally install the dictionaries and the components file into the cache
# directory as well.
#
# The variable is tested by name on purpose: if(${CIFPP_CACHE_DIR}) would
# first expand to the path itself, and if() would then interpret that path
# as the name of a (non-existing) variable, making the condition always false.
if(CIFPP_CACHE_DIR)
	install(FILES
		${PROJECT_SOURCE_DIR}/rsrc/mmcif_ddl.dic
		${PROJECT_SOURCE_DIR}/rsrc/mmcif_pdbx.dic
		${PROJECT_SOURCE_DIR}/rsrc/mmcif_ma.dic
		${COMPONENTS_CIF}
		DESTINATION ${CIFPP_CACHE_DIR}
	)
endif()
set(CONFIG_TEMPLATE_FILE ${PROJECT_SOURCE_DIR}/cmake/cifppConfig.cmake.in)
configure_package_config_file(

View File

@@ -3,18 +3,78 @@ libcifpp
This library contains code to work with mmCIF and PDB files.
Synopsis
--------
```c++
// A simple program counting residues with an OXT atom
#include <filesystem>
#include <iostream>
#include <cif++.hpp>

namespace fs = std::filesystem;

int main(int argc, char *argv[])
{
	// Exactly one argument is expected: the file to read. Tell the user
	// what went wrong instead of silently exiting with an error status.
	if (argc != 2)
	{
		std::cerr << "Usage: " << (argc > 0 ? argv[0] : "example") << " <file>" << std::endl;
		return 1;
	}

	// Read file, can be PDB or mmCIF and can even be compressed with gzip.
	cif::file file = cif::pdb::read(argv[1]);
	if (file.empty())
	{
		std::cerr << "Empty file" << std::endl;
		return 1;
	}

	// Work on the first datablock and its atom_site category
	auto &db = file.front();
	auto &atom_site = db["atom_site"];

	// Number of atoms whose label_atom_id equals OXT
	auto n = atom_site.find(cif::key("label_atom_id") == "OXT").size();

	std::cout << "File contains " << atom_site.size() << " atoms of which "
			  << n << (n == 1 ? " is" : " are") << " OXT" << std::endl
			  << "residues with an OXT are:" << std::endl;

	// Request three typed fields for each matching row
	for (const auto &[asym, comp, seqnr] :
		atom_site.find<std::string, std::string, int>(
			cif::key("label_atom_id") == "OXT",
			"label_asym_id", "label_comp_id", "label_seq_id"))
	{
		std::cout << asym << ' ' << comp << ' ' << seqnr << std::endl;
	}

	return 0;
}
```
Requirements
------------
The code for this library was written in C++17. You therefore need a
recent compiler to build it. For the development gcc 9.3 and clang 9.0
recent compiler to build it. For the development gcc 9.4 and clang 9.0
have been used as well as MSVC version 2019.
Other requirements are:
- [mrc](https://github.com/mhekkel/mrc), a resource compiler that
allows including data files into the executable making them easier to
install. Strictly this is optional, but at the expense of functionality.
install. Strictly speaking this is optional, but at the expense of
functionality.
- [libeigen](https://eigen.tuxfamily.org/index.php?title=Main_Page), a
library for, among other things, matrix calculations. It can usually be
installed using your package manager; on Debian/Ubuntu the package is
called `libeigen3-dev`.
- zlib, the development version of this library. On Debian/Ubuntu this
is the package `zlib1g-dev`.
- [boost](https://www.boost.org). The boost libraries are only needed if
you want to build the testing code.
When building using MS Visual Studio, you will also need [libzeep](https://github.com/mhekkel/libzeep)
since MSVC does not yet provide a C++ template required by libcifpp.
Building
--------
@@ -26,7 +86,7 @@ On linux e.g. you would issue the following commands to build and install
libcifpp in your `$HOME/.local` folder:
```bash
git clone https://github.com/PDB-REDO/libcifpp.git
git clone https://github.com/PDB-REDO/libcifpp.git --recurse-submodules
cd libcifpp
cmake -S . -B build -DCMAKE_INSTALL_PREFIX=$HOME/.local -DCMAKE_BUILD_TYPE=Release
cmake --build build

View File

@@ -1,3 +1,11 @@
Version 5.1
- New parser, optimised for speed
- Fix in unique ID generator
Version 5.0.10
- Fix in progress_bar, was using too much CPU
- Optimised mmCIF parser
Version 5.0.9
- Fix in dihedral angle calculations
- Added create_water to model

View File

@@ -1,24 +1,32 @@
#include <iostream>
#include <filesystem>
#include <iostream>
#include <cif++.hpp>
namespace fs = std::filesystem;
int main()
int main(int argc, char *argv[])
{
cif::file file;
file.load("1cbs.cif.gz");
if (argc != 2)
exit(1);
auto& db = file.front();
cif::file file = cif::pdb::read(argv[1]);
if (file.empty())
{
std::cerr << "Empty file" << std::endl;
exit(1);
}
auto &db = file.front();
auto &atom_site = db["atom_site"];
auto n = atom_site.find(cif::key("label_atom_id") == "OXT").size();
std::cout << "File contains " << atom_site.size() << " atoms of which " << n << (n == 1 ? " is" : " are") << " OXT" << std::endl
<< "residues with an OXT are:" << std::endl;
for (const auto& [asym, comp, seqnr]: atom_site.find<std::string,std::string,int>(
cif::key("label_atom_id") == "OXT", "label_asym_id", "label_comp_id", "label_seq_id"))
<< "residues with an OXT are:" << std::endl;
for (const auto &[asym, comp, seqnr] : atom_site.find<std::string, std::string, int>(
cif::key("label_atom_id") == "OXT", "label_asym_id", "label_comp_id", "label_seq_id"))
{
std::cout << asym << ' ' << comp << ' ' << seqnr << std::endl;
}

View File

@@ -32,5 +32,6 @@ namespace cif
{
validator parse_dictionary(std::string_view name, std::istream &is);
void extend_dictionary(validator &v, std::istream &is);
} // namespace cif

View File

@@ -29,7 +29,6 @@
#include "cif++/row.hpp"
#include <map>
#include <regex>
namespace cif
{
@@ -54,8 +53,6 @@ class sac_parser
public:
using datablock_index = std::map<std::string, std::size_t>;
sac_parser(std::istream &is, bool init = true);
virtual ~sac_parser() = default;
enum CharTraitsMask : uint8_t
@@ -66,9 +63,14 @@ class sac_parser
kAnyPrintMask = 1 << 3
};
static bool is_white(int ch)
static constexpr bool is_space(int ch)
{
return std::isspace(ch) or ch == '#';
return ch == ' ' or ch == '\t' or ch == '\r' or ch == '\n';
}
static constexpr bool is_white(int ch)
{
return is_space(ch) or ch == '#';
}
static constexpr bool is_ordinary(int ch)
@@ -92,26 +94,7 @@ class sac_parser
(ch >= 0x20 and ch <= 0x7f and (kCharTraitsTable[ch - 0x20] & kAnyPrintMask) != 0);
}
static bool is_unquoted_string(std::string_view text)
{
bool result = text.empty() or is_ordinary(text.front());
if (result)
{
for (auto ch : text)
{
if (is_non_blank(ch))
continue;
result = false;
break;
}
}
static const std::regex kReservedRx(R"(loop_|stop_|global_|data_\S+|save_\S+)", std::regex_constants::icase);
// but be careful it does not contain e.g. stop_
return result and not std::regex_match(text.begin(), text.end(), kReservedRx);
}
static bool is_unquoted_string(std::string_view text);
protected:
static constexpr uint8_t kCharTraitsTable[128] = {
@@ -133,7 +116,8 @@ class sac_parser
DATA,
LOOP,
GLOBAL,
SAVE,
SAVE_,
SAVE_NAME,
STOP,
Tag,
Value
@@ -148,7 +132,8 @@ class sac_parser
case CIFToken::DATA: return "DATA";
case CIFToken::LOOP: return "LOOP";
case CIFToken::GLOBAL: return "GLOBAL";
case CIFToken::SAVE: return "SAVE";
case CIFToken::SAVE_: return "SAVE";
case CIFToken::SAVE_NAME: return "SAVE+name";
case CIFToken::STOP: return "STOP";
case CIFToken::Tag: return "Tag";
case CIFToken::Value: return "Value";
@@ -156,41 +141,13 @@ class sac_parser
}
}
enum class CIFValue
{
Int,
Float,
Numeric,
String,
TextField,
Inapplicable,
Unknown
};
static constexpr const char *get_value_name(CIFValue type)
{
switch (type)
{
case CIFValue::Int: return "Int";
case CIFValue::Float: return "Float";
case CIFValue::Numeric: return "Numeric";
case CIFValue::String: return "String";
case CIFValue::TextField: return "TextField";
case CIFValue::Inapplicable: return "Inapplicable";
case CIFValue::Unknown: return "Unknown";
default: return "Invalid type parameter";
}
}
// get_next_char takes a char from the buffer, or if it is empty
// from the istream. This function also does carriage/linefeed
// translation.
// get_next_char takes the next character from the istream.
// This function also does carriage/linefeed translation.
int get_next_char();
// Put the last read character back into the istream
void retract();
int restart(int start);
CIFToken get_next_token();
void match(CIFToken token);
@@ -205,6 +162,9 @@ class sac_parser
void parse_file();
protected:
sac_parser(std::istream &is, bool init = true);
void parse_global();
void parse_datablock();
@@ -227,13 +187,14 @@ class sac_parser
// production methods, these are pure virtual here
virtual void produce_datablock(const std::string &name) = 0;
virtual void produce_category(const std::string &name) = 0;
virtual void produce_datablock(std::string_view name) = 0;
virtual void produce_category(std::string_view name) = 0;
virtual void produce_row() = 0;
virtual void produce_item(const std::string &category, const std::string &item, const std::string &value) = 0;
virtual void produce_item(std::string_view category, std::string_view item, std::string_view value) = 0;
protected:
enum State
enum class State
{
Start,
White,
@@ -246,23 +207,21 @@ class sac_parser
UnquotedString,
Tag,
TextField,
Float = 100,
Int = 110,
Value = 300,
DATA,
SAVE
TextFieldNL,
Reserved,
Value
};
std::streambuf &m_source;
// Parser state
bool m_validate;
uint32_t m_line_nr;
bool m_bol;
CIFToken m_lookahead;
std::string m_token_value;
CIFValue mTokenType;
std::vector<int> m_buffer; // retract buffer, used to be a stack<char>
// token buffer
std::vector<char> m_token_buffer;
std::string_view m_token_value;
};
// --------------------------------------------------------------------
@@ -276,13 +235,13 @@ class parser : public sac_parser
{
}
void produce_datablock(const std::string &name) override;
void produce_datablock(std::string_view name) override;
void produce_category(const std::string &name) override;
void produce_category(std::string_view name) override;
void produce_row() override;
void produce_item(const std::string &category, const std::string &item, const std::string &value) override;
void produce_item(std::string_view category, std::string_view item, std::string_view value) override;
protected:
file &m_file;

View File

@@ -31,6 +31,7 @@
#include <array>
#include <cmath>
#include <complex>
#include <cstdint>
#include <functional>
#include <valarray>

View File

@@ -228,8 +228,9 @@ class validator_factory
const validator &operator[](std::string_view dictionary_name);
const validator &construct_validator(std::string_view name, std::istream &is);
private:
void construct_validator(std::string_view name, std::istream &is);
// --------------------------------------------------------------------

View File

@@ -1227,23 +1227,37 @@ std::string category::get_unique_id(std::function<std::string(int)> generator)
{
using namespace cif::literals;
std::string id_tag = "id";
if (m_cat_validator != nullptr and m_cat_validator->m_keys.size() == 1)
id_tag = m_cat_validator->m_keys.front();
// calling size() often is a waste of resources
if (m_last_unique_num == 0)
m_last_unique_num = static_cast<uint32_t>(size());
for (;;)
std::string result = generator(static_cast<int>(m_last_unique_num++));
std::string id_tag = "id";
if (m_cat_validator != nullptr and m_cat_validator->m_keys.size() == 1)
{
std::string result = generator(static_cast<int>(m_last_unique_num++));
if (exists(key(id_tag) == result))
continue;
return result;
if (m_index == nullptr and m_cat_validator != nullptr)
m_index = new category_index(this);
for (;;)
{
if (m_index->find_by_value({{ id_tag, result }}) == nullptr)
break;
result = generator(static_cast<int>(m_last_unique_num++));
}
}
else
{
for (;;)
{
if (not exists(key(id_tag) == result))
break;
result = generator(static_cast<int>(m_last_unique_num++));
}
}
return result;
}
void category::update_value(const std::vector<row_handle> &rows, std::string_view tag, std::string_view value)

View File

@@ -117,7 +117,7 @@ class dictionary_parser : public parser
if (not m_collected_item_types)
m_collected_item_types = collect_item_types();
std::string saveFrameName = m_token_value;
std::string saveFrameName { m_token_value };
if (saveFrameName.empty())
error("Invalid save frame, should contain more than just 'save_' here");
@@ -127,7 +127,7 @@ class dictionary_parser : public parser
datablock dict(m_token_value);
datablock::iterator cat = dict.end();
match(CIFToken::SAVE);
match(CIFToken::SAVE_NAME);
while (m_lookahead == CIFToken::LOOP or m_lookahead == CIFToken::Tag)
{
if (m_lookahead == CIFToken::LOOP)
@@ -183,7 +183,7 @@ class dictionary_parser : public parser
}
}
match(CIFToken::SAVE);
match(CIFToken::SAVE_);
if (isCategorySaveFrame)
{
@@ -481,4 +481,11 @@ validator parse_dictionary(std::string_view name, std::istream &is)
return result;
}
} // namespace cif
// Run a dictionary_parser over the stream `is`, handing it the existing
// validator `v`. The cif::file the parser requires is a throw-away local;
// nothing of it is returned to the caller.
// NOTE(review): presumably load_dictionary() adds the parsed definitions
// to `v` -- confirm against dictionary_parser.
void extend_dictionary(validator &v, std::istream &is)
{
	file scratch;

	dictionary_parser dict_parser(v, is, scratch);
	dict_parser.load_dictionary();
}
} // namespace cif

View File

@@ -32,7 +32,6 @@
#include <cassert>
#include <iostream>
#include <map>
#include <regex>
#include <stack>
namespace cif
@@ -40,13 +39,152 @@ namespace cif
// --------------------------------------------------------------------
// Recognises the reserved words data_, global_, loop_, save_ and stop_,
// case-insensitively, consuming one character at a time.
//
// The progress is kept in m_state:
//   > 0   index of the s_dag node to try for the next character
//   == 0  finished: the input was rejected, or move() already
//         returned a verdict
//   < 0   a complete keyword prefix (including the '_') has been seen:
//         -1 data_, -2 global_, -3 loop_, -4 save_, -5 stop_
class reserved_words_automaton
{
  public:
	reserved_words_automaton() {}

	// Verdicts returned by move()
	enum move_result
	{
		undefined,  // no verdict yet, feed more characters
		no_keyword, // the text is not a reserved word
		data,       // data_ followed by at least one non-blank character
		global,     // global_
		loop,       // loop_
		save,       // save_ on its own
		save_plus,  // save_ followed by at least one non-blank character
		stop        // stop_
	};

	// True when the automaton no longer walks the DAG: either a keyword
	// prefix was matched (negative state) or the state was reset to zero.
	constexpr bool finished() const
	{
		return m_state <= 0;
	}

	// True while the automaton sits in one of the negative keyword states,
	// i.e. the characters seen so far start with a complete reserved word
	// and move() has not yet returned a verdict.
	constexpr bool matched() const
	{
		return m_state < 0;
	}

	// Feed the next character (or EOF) to the automaton.
	//
	// Returns undefined as long as no decision can be made. As soon as a
	// decision is made the state is reset to zero, after which every
	// following call returns undefined.
	constexpr move_result move(int ch)
	{
		move_result result = undefined;

		switch (m_state)
		{
			case 0:
				break;

			case -1: // data_
				// data_ needs a name: at least one non-blank character
				if (sac_parser::is_non_blank(ch))
					m_seen_trailing_chars = true;
				else if (m_seen_trailing_chars)
					result = data;
				else
					result = no_keyword;
				break;

			case -2: // global_
				result = sac_parser::is_non_blank(ch) ? no_keyword : global;
				break;

			case -3: // loop_
				result = sac_parser::is_non_blank(ch) ? no_keyword : loop;
				break;

			case -4: // save_
				// save_ with a name and save_ on its own are both valid,
				// but they are reported as different results
				if (sac_parser::is_non_blank(ch))
					m_seen_trailing_chars = true;
				else if (m_seen_trailing_chars)
					result = save_plus;
				else
					result = save;
				break;

			case -5: // stop_
				result = sac_parser::is_non_blank(ch) ? no_keyword : stop;
				break;

			default:
				assert(m_state > 0 and m_state < NODE_COUNT);

				// Try the current node and, on a mismatch, its chain of
				// alternatives until one matches or the chain ends in 0.
				for (;;)
				{
					if (s_dag[m_state].ch == to_upper_ascii(ch))
					{
						m_state = s_dag[m_state].next_match;
						break;
					}

					m_state = s_dag[m_state].next_nomatch;
					if (m_state == 0)
					{
						result = no_keyword;
						break;
					}
				}
				break;
		}

		if (result != undefined)
			m_state = 0;

		return result;
	}

  private:
	// Fold ASCII lower case letters to upper case and leave everything
	// else alone. Simply clearing bit 0x20 would also turn DEL (0x7f)
	// into '_' (0x5f), making e.g. "data\x7f" pass for "data_".
	static constexpr int to_upper_ascii(int ch)
	{
		return (ch >= 'a' and ch <= 'z') ? ch - ('a' - 'A') : ch;
	}

	// A node matches one (upper case) character. next_match is the state
	// to enter when the character matches, next_nomatch the alternative
	// node to try when it does not; 0 means there is no alternative.
	static constexpr struct node
	{
		int16_t ch;
		int8_t next_match;
		int8_t next_nomatch;
	} s_dag[] = {
		{ 0 },           //  0: unused, state 0 means finished
		{ 'D', 5, 2 },   //  1: first character, D | G | L | S
		{ 'G', 9, 3 },   //  2
		{ 'L', 15, 4 },  //  3
		{ 'S', 19, 0 },  //  4
		{ 'A', 6, 0 },   //  5: d.ata_
		{ 'T', 7, 0 },   //  6
		{ 'A', 8, 0 },   //  7
		{ '_', -1, 0 },  //  8
		{ 'L', 10, 0 },  //  9: g.lobal_
		{ 'O', 11, 0 },  // 10
		{ 'B', 12, 0 },  // 11
		{ 'A', 13, 0 },  // 12
		{ 'L', 14, 0 },  // 13
		{ '_', -2, 0 },  // 14
		{ 'O', 16, 0 },  // 15: l.oop_
		{ 'O', 17, 0 },  // 16
		{ 'P', 18, 0 },  // 17
		{ '_', -3, 0 },  // 18
		{ 'A', 21, 20 }, // 19: s.ave_ | s.top_
		{ 'T', 24, 0 },  // 20
		{ 'V', 22, 0 },  // 21
		{ 'E', 23, 0 },  // 22
		{ '_', -4, 0 },  // 23
		{ 'O', 25, 0 },  // 24
		{ 'P', 26, 0 },  // 25
		{ '_', -5, 0 },  // 26
	};

	static constexpr int NODE_COUNT = sizeof(s_dag) / sizeof(node);

	int m_state = 1;                    // start at the first node of the DAG
	bool m_seen_trailing_chars = false; // non-blank seen after data_ / save_
};
// --------------------------------------------------------------------
sac_parser::sac_parser(std::istream &is, bool init)
: m_source(*is.rdbuf())
{
m_token_buffer.reserve(8192);
if (is.rdbuf() == nullptr)
throw std::runtime_error("Attempt to read from uninitialised stream");
m_validate = true;
m_line_nr = 1;
m_bol = true;
@@ -54,45 +192,54 @@ sac_parser::sac_parser(std::istream &is, bool init)
m_lookahead = get_next_token();
}
bool sac_parser::is_unquoted_string(std::string_view text)
{
bool result = text.empty() or is_ordinary(text.front());
if (result)
{
reserved_words_automaton automaton;
for (char ch : text)
{
if (not is_non_blank(ch))
{
result = false;
break;
}
automaton.move(ch);
}
if (automaton.matched())
result = false;
}
return result;
}
// get_next_char takes the next character from the istream.
// This function also does carriage/linefeed translation.
int sac_parser::get_next_char()
{
int result = std::char_traits<char>::eof();
if (m_buffer.empty())
result = m_source.sbumpc();
else
{
result = m_buffer.back();
m_buffer.pop_back();
}
// very simple CR/LF translation into LF
if (result == '\r')
{
int lookahead = m_source.sbumpc();
if (lookahead != '\n')
m_buffer.push_back(lookahead);
result = '\n';
}
int result = m_source.sbumpc();
if (result == std::char_traits<char>::eof())
m_token_value.push_back(0);
m_token_buffer.push_back(0);
else
m_token_value.push_back(std::char_traits<char>::to_char_type(result));
if (result == '\n')
++m_line_nr;
if (VERBOSE >= 6)
{
std::cerr << "get_next_char => ";
if (iscntrl(result) or not isprint(result))
std::cerr << int(result) << std::endl;
else
std::cerr << char(result) << std::endl;
if (result == '\r')
{
if (m_source.sgetc() == '\n')
m_source.sbumpc();
++m_line_nr;
result = '\n';
}
else if (result == '\n')
++m_line_nr;
m_token_buffer.push_back(std::char_traits<char>::to_char_type(result));
}
return result;
@@ -100,44 +247,22 @@ int sac_parser::get_next_char()
void sac_parser::retract()
{
assert(not m_token_value.empty());
assert(not m_token_buffer.empty());
char ch = m_token_value.back();
char ch = m_token_buffer.back();
if (ch == '\n')
--m_line_nr;
m_buffer.push_back(ch == 0 ? std::char_traits<char>::eof() : std::char_traits<char>::to_int_type(ch));
m_token_value.pop_back();
}
int sac_parser::restart(int start)
{
int result = 0;
while (not m_token_value.empty())
retract();
switch (start)
if (ch != 0)
{
case State::Start:
result = State::Float;
break;
// since we always putback at most a single character,
// the test below should never fail.
case State::Float:
result = State::Int;
break;
case State::Int:
result = State::Value;
break;
default:
error("Invalid state in SacParser");
if (m_source.sputbackc(ch) == std::char_traits<char>::eof())
throw std::runtime_error("putback failure");
}
m_bol = false;
return result;
m_token_buffer.pop_back();
}
sac_parser::CIFToken sac_parser::get_next_token()
@@ -146,11 +271,13 @@ sac_parser::CIFToken sac_parser::get_next_token()
CIFToken result = CIFToken::Unknown;
int quoteChar = 0;
int state = State::Start, start = State::Start;
State state = State::Start;
m_bol = false;
m_token_value.clear();
mTokenType = CIFValue::Unknown;
m_token_buffer.clear();
m_token_value = {};
reserved_words_automaton dag;
while (result == CIFToken::Unknown)
{
@@ -174,23 +301,27 @@ sac_parser::CIFToken sac_parser::get_next_token()
state = State::Tag;
else if (ch == ';' and m_bol)
state = State::TextField;
else if (ch == '?')
state = State::QuestionMark;
else if (ch == '\'' or ch == '"')
{
quoteChar = ch;
state = State::QuotedString;
}
else if (dag.move(ch) == reserved_words_automaton::undefined)
state = State::Reserved;
else
state = start = restart(start);
state = State::Value;
break;
case State::White:
if (ch == kEOF)
result = CIFToken::Eof;
else if (not isspace(ch))
else if (not is_space(ch))
{
state = State::Start;
retract();
m_token_value.clear();
m_token_buffer.clear();
}
else
m_bol = (ch == '\n');
@@ -201,38 +332,40 @@ sac_parser::CIFToken sac_parser::get_next_token()
{
state = State::Start;
m_bol = true;
m_token_value.clear();
m_token_buffer.clear();
}
else if (ch == kEOF)
result = CIFToken::Eof;
else if (not is_any_print(ch))
error("invalid character in comment");
break;
case State::QuestionMark:
if (not is_non_blank(ch))
{
retract();
result = CIFToken::Value;
}
else
state = State::Value;
break;
case State::TextField:
if (ch == '\n')
state = State::TextField + 1;
state = State::TextFieldNL;
else if (ch == kEOF)
error("unterminated textfield");
// else if (ch == '\\')
// state = State::Esc;
else if (not is_any_print(ch) and cif::VERBOSE > 2)
warning("invalid character in text field '" + std::string({static_cast<char>(ch)}) + "' (" + std::to_string((int)ch) + ")");
break;
// case State::Esc:
// if (ch == '\n')
// break;
case State::TextField + 1:
case State::TextFieldNL:
if (is_text_lead(ch) or ch == ' ' or ch == '\t')
state = State::TextField;
else if (ch == ';')
{
assert(m_token_value.length() >= 2);
m_token_value = m_token_value.substr(1, m_token_value.length() - 3);
mTokenType = CIFValue::TextField;
assert(m_token_buffer.size() >= 2);
m_token_value = std::string_view(m_token_buffer.data() + 1, m_token_buffer.size() - 3);
result = CIFToken::Value;
}
else if (ch == kEOF)
@@ -255,12 +388,10 @@ sac_parser::CIFToken sac_parser::get_next_token()
{
retract();
result = CIFToken::Value;
mTokenType = CIFValue::String;
if (m_token_value.length() < 2)
if (m_token_buffer.size() < 2)
error("Invalid quoted string token");
m_token_value = m_token_value.substr(1, m_token_value.length() - 2);
m_token_value = std::string_view(m_token_buffer.data() + 1, m_token_buffer.size() - 2);
}
else if (ch == quoteChar)
;
@@ -277,149 +408,68 @@ sac_parser::CIFToken sac_parser::get_next_token()
{
retract();
result = CIFToken::Tag;
m_token_value = std::string_view(m_token_buffer.data(), m_token_buffer.size());
}
break;
case State::Float:
if (ch == '+' or ch == '-')
case State::Reserved:
switch (dag.move(ch))
{
state = State::Float + 1;
case reserved_words_automaton::undefined:
break;
case reserved_words_automaton::no_keyword:
if (not is_non_blank(ch))
{
retract();
result = CIFToken::Value;
m_token_value = std::string_view(m_token_buffer.data(), m_token_buffer.size());
}
else
state = State::Value;
break;
case reserved_words_automaton::data:
retract();
m_token_value = std::string_view(m_token_buffer.data() + 5, m_token_buffer.size() - 5);
result = CIFToken::DATA;
break;
case reserved_words_automaton::global:
retract();
result = CIFToken::GLOBAL;
break;
case reserved_words_automaton::loop:
retract();
result = CIFToken::LOOP;
break;
case reserved_words_automaton::save:
retract();
result = CIFToken::SAVE_;
break;
case reserved_words_automaton::save_plus:
retract();
m_token_value = std::string_view(m_token_buffer.data() + 5, m_token_buffer.size() - 5);
result = CIFToken::SAVE_NAME;
break;
case reserved_words_automaton::stop:
retract();
result = CIFToken::STOP;
break;
}
else if (isdigit(ch))
state = State::Float + 1;
else
state = start = restart(start);
break;
case State::Float + 1:
// if (ch == '(') // numeric???
// mState = State::NumericSuffix;
// else
if (ch == '.')
state = State::Float + 2;
else if (tolower(ch) == 'e')
state = State::Float + 3;
else if (is_white(ch) or ch == kEOF)
{
retract();
result = CIFToken::Value;
mTokenType = CIFValue::Int;
}
else
state = start = restart(start);
break;
// parsed '.'
case State::Float + 2:
if (tolower(ch) == 'e')
state = State::Float + 3;
else if (is_white(ch) or ch == kEOF)
{
retract();
result = CIFToken::Value;
mTokenType = CIFValue::Float;
}
else
state = start = restart(start);
break;
// parsed 'e'
case State::Float + 3:
if (ch == '-' or ch == '+')
state = State::Float + 4;
else if (isdigit(ch))
state = State::Float + 5;
else
state = start = restart(start);
break;
case State::Float + 4:
if (isdigit(ch))
state = State::Float + 5;
else
state = start = restart(start);
break;
case State::Float + 5:
if (is_white(ch) or ch == kEOF)
{
retract();
result = CIFToken::Value;
mTokenType = CIFValue::Float;
}
else
state = start = restart(start);
break;
case State::Int:
if (isdigit(ch) or ch == '+' or ch == '-')
state = State::Int + 1;
else
state = start = restart(start);
break;
case State::Int + 1:
if (is_white(ch) or ch == kEOF)
{
retract();
result = CIFToken::Value;
mTokenType = CIFValue::Int;
}
else
state = start = restart(start);
break;
case State::Value:
if (ch == '_')
{
std::string s = to_lower_copy(m_token_value);
if (s == "data_")
{
state = State::DATA;
continue;
}
if (s == "save_")
{
state = State::SAVE;
continue;
}
}
if (result == CIFToken::Unknown and not is_non_blank(ch))
{
retract();
result = CIFToken::Value;
if (m_token_value == ".")
mTokenType = CIFValue::Inapplicable;
else if (iequals(m_token_value, "global_"))
result = CIFToken::GLOBAL;
else if (iequals(m_token_value, "stop_"))
result = CIFToken::STOP;
else if (iequals(m_token_value, "loop_"))
result = CIFToken::LOOP;
else if (m_token_value == "?")
{
mTokenType = CIFValue::Unknown;
m_token_value.clear();
}
}
break;
case State::DATA:
case State::SAVE:
if (not is_non_blank(ch))
{
retract();
if (state == State::DATA)
result = CIFToken::DATA;
else
result = CIFToken::SAVE;
m_token_value.erase(m_token_value.begin(), m_token_value.begin() + 5);
result = CIFToken::Value;
m_token_value = std::string_view(m_token_buffer.data(), m_token_buffer.size());
break;
}
break;
@@ -433,8 +483,6 @@ sac_parser::CIFToken sac_parser::get_next_token()
if (VERBOSE >= 5)
{
std::cerr << get_token_name(result);
if (mTokenType != CIFValue::Unknown)
std::cerr << ' ' << get_value_name(mTokenType);
if (result != CIFToken::Eof)
std::cerr << " " << std::quoted(m_token_value);
std::cerr << std::endl;
@@ -506,7 +554,7 @@ bool sac_parser::parse_single_datablock(const std::string &datablock)
break;
case string_quote:
if (std::isspace(ch))
if (is_space(ch))
state = start;
else
state = string;
@@ -518,7 +566,7 @@ bool sac_parser::parse_single_datablock(const std::string &datablock)
break;
case data:
if (isspace(ch) and dblk[si] == 0)
if (is_space(ch) and dblk[si] == 0)
found = true;
else if (dblk[si++] != ch)
state = start;
@@ -596,7 +644,7 @@ sac_parser::datablock_index sac_parser::index_datablocks()
break;
case string_quote:
if (std::isspace(ch))
if (is_space(ch))
state = start;
else
state = string;
@@ -620,7 +668,7 @@ sac_parser::datablock_index sac_parser::index_datablocks()
case data_name:
if (is_non_blank(ch))
datablock.insert(datablock.end(), char(ch));
else if (isspace(ch))
else if (is_space(ch))
{
if (not datablock.empty())
index[datablock] = m_source.pubseekoff(0, std::ios_base::cur, std::ios_base::in);
@@ -696,7 +744,7 @@ void sac_parser::parse_datablock()
static const std::string kUnitializedCategory("<invalid>");
std::string cat = kUnitializedCategory; // intial value acts as a guard for empty category names
while (m_lookahead == CIFToken::LOOP or m_lookahead == CIFToken::Tag or m_lookahead == CIFToken::SAVE)
while (m_lookahead == CIFToken::LOOP or m_lookahead == CIFToken::Tag or m_lookahead == CIFToken::SAVE_NAME)
{
switch (m_lookahead)
{
@@ -761,7 +809,7 @@ void sac_parser::parse_datablock()
break;
}
case CIFToken::SAVE:
case CIFToken::SAVE_NAME:
parse_save_frame();
break;
@@ -779,7 +827,7 @@ void sac_parser::parse_save_frame()
// --------------------------------------------------------------------
void parser::produce_datablock(const std::string &name)
void parser::produce_datablock(std::string_view name)
{
if (VERBOSE >= 4)
std::cerr << "producing data_" << name << std::endl;
@@ -788,7 +836,7 @@ void parser::produce_datablock(const std::string &name)
m_datablock = &(*iter);
}
void parser::produce_category(const std::string &name)
void parser::produce_category(std::string_view name)
{
if (VERBOSE >= 4)
std::cerr << "producing category " << name << std::endl;
@@ -810,7 +858,7 @@ void parser::produce_row()
// m_row.lineNr(m_line_nr);
}
void parser::produce_item(const std::string &category, const std::string &item, const std::string &value)
void parser::produce_item(std::string_view category, std::string_view item, std::string_view value)
{
if (VERBOSE >= 4)
std::cerr << "producing _" << category << '.' << item << " -> " << value << std::endl;
@@ -821,4 +869,4 @@ void parser::produce_item(const std::string &category, const std::string &item,
m_row[item] = m_token_value;
}
} // namespace cif
} // namespace cif

View File

@@ -236,28 +236,19 @@ std::string cif_id_for_number(int number)
{
std::string result;
if (number >= 26 * 26 * 26)
result = 'L' + std::to_string(number);
else
do
{
if (number >= 26 * 26)
{
int v = number / (26 * 26);
result += char('A' - 1 + v);
number %= (26 * 26);
}
int r = number % 26;
result += 'A' + r;
if (number >= 26)
{
int v = number / 26;
result += char('A' - 1 + v);
number %= 26;
}
result += char('A' + number);
number = (number - r) / 26 - 1;
}
while (number >= 0);
std::reverse(result.begin(), result.end());
assert(not result.empty());
return result;
}

View File

@@ -40,7 +40,6 @@
#include <iostream>
#include <map>
#include <mutex>
#include <regex>
#include <sstream>
#include <thread>
@@ -161,6 +160,8 @@ struct progress_bar_impl
void print_progress();
void print_done();
using time_point = std::chrono::time_point<std::chrono::system_clock>;
int64_t m_max_value;
std::atomic<int64_t> m_consumed;
int64_t m_last_consumed = 0;
@@ -168,8 +169,8 @@ struct progress_bar_impl
std::string m_action, m_message;
std::mutex m_mutex;
std::thread m_thread;
std::chrono::time_point<std::chrono::system_clock>
m_start = std::chrono::system_clock::now();
time_point m_start = std::chrono::system_clock::now();
time_point m_last = std::chrono::system_clock::now();
bool m_stop = false;
};
@@ -192,7 +193,9 @@ void progress_bar_impl::run()
{
while (not m_stop)
{
if (std::chrono::system_clock::now() - m_start < 2s)
auto now = std::chrono::system_clock::now();
if (now - m_start < 2s or now - m_last < 100ms)
{
std::this_thread::sleep_for(10ms);
continue;
@@ -206,6 +209,7 @@ void progress_bar_impl::run()
print_progress();
printedAny = true;
m_last = std::chrono::system_clock::now();
}
}
catch (...)

View File

@@ -491,9 +491,9 @@ const validator &validator_factory::operator[](std::string_view dictionary_name)
}
}
void validator_factory::construct_validator(std::string_view name, std::istream &is)
const validator &validator_factory::construct_validator(std::string_view name, std::istream &is)
{
m_validators.emplace_back(parse_dictionary(name, is));
return m_validators.emplace_back(parse_dictionary(name, is));
}
} // namespace cif

39
test/io-test.cpp Normal file
View File

@@ -0,0 +1,39 @@
#include <iostream>

#include <cif++.hpp>
// A sac_parser whose production callbacks all do nothing, so that running
// it exercises only the parsing done by the base class.
class dummy_parser : public cif::sac_parser
{
  public:
	dummy_parser(std::istream &is)
		: cif::sac_parser(is)
	{
	}

	// All four callbacks deliberately ignore their arguments

	void produce_datablock(std::string_view) override {}

	void produce_category(std::string_view) override {}

	void produce_row() override {}

	void produce_item(std::string_view, std::string_view, std::string_view) override {}
};
int main()
{
cif::gzio::ifstream in("/srv/data/pdb/mmCIF/gl/8glv.cif.gz");
dummy_parser parser(in);
parser.parse_file();
// cif::file f("/srv/data/pdb/mmCIF/gl/8glv.cif.gz");
return 0;
}

View File

@@ -75,6 +75,30 @@ bool init_unit_test()
// --------------------------------------------------------------------
// Checks cif::cif_id_for_number: a handful of fixed values around the
// points where the ID grows by one letter, followed by a uniqueness check
// over the first 100000 numbers.
BOOST_AUTO_TEST_CASE(id_1)
{
// single letter IDs: 0 -> A ... 25 -> Z
BOOST_TEST(cif::cif_id_for_number(0) == "A");
BOOST_TEST(cif::cif_id_for_number(25) == "Z");
// two letter IDs start right after Z ...
BOOST_TEST(cif::cif_id_for_number(26) == "AA");
BOOST_TEST(cif::cif_id_for_number(26 + 1) == "AB");
// ... and end at ZZ, after which the three letter IDs begin
BOOST_TEST(cif::cif_id_for_number(26 + 26 * 26 - 1) == "ZZ");
BOOST_TEST(cif::cif_id_for_number(26 + 26 * 26) == "AAA");
BOOST_TEST(cif::cif_id_for_number(26 + 26 * 26 + 1) == "AAB");
// no two numbers in [0, 100000) may yield the same ID
std::set<std::string> testset;
for (int i = 0; i < 100000; ++i)
{
std::string id = cif::cif_id_for_number(i);
BOOST_TEST(testset.count(id) == 0);
testset.insert(id);
}
BOOST_TEST(testset.size() == 100000);
}
// --------------------------------------------------------------------
BOOST_AUTO_TEST_CASE(cc_1)
{
std::tuple<std::string_view, float, char> tests[] = {
@@ -2357,8 +2381,6 @@ _test.text ??
BOOST_AUTO_TEST_CASE(output_test_1)
{
cif::VERBOSE = 5;
auto data1 = R"(
data_Q
loop_
@@ -2863,7 +2885,7 @@ save__cat_1.name
std::istream is_dict(&buffer);
auto validator = cif::parse_dictionary("test_dict.dic", is_dict);
auto &validator = cif::validator_factory::instance().construct_validator("test_dict.dic", is_dict);
cif::file f;
f.set_validator(&validator);
@@ -2901,8 +2923,6 @@ _cat_1.name
ss << f;
cif::file f2(ss);
f2.set_validator(&validator);
BOOST_ASSERT(f2.is_valid());
auto &audit_conform = f2.front()["audit_conform"];