Files
libcifpp/src/parser.cpp
2025-02-17 09:40:36 +01:00

879 lines
18 KiB
C++

/*-
* SPDX-License-Identifier: BSD-2-Clause
*
* Copyright (c) 2020 NKI/AVL, Netherlands Cancer Institute
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "cif++/utilities.hpp"
#include "cif++/forward_decl.hpp"
#include "cif++/parser.hpp"
#include "cif++/file.hpp"
#include <cassert>
#include <iostream>
#include <map>
#include <stack>
namespace cif
{
// --------------------------------------------------------------------
class reserved_words_automaton
{
public:
reserved_words_automaton() {}
enum move_result
{
undefined,
no_keyword,
data,
global,
loop,
save,
save_plus,
stop
};
constexpr bool finished() const
{
return m_state <= 0;
}
constexpr bool matched() const
{
return m_state < 0;
}
constexpr move_result move(int ch)
{
move_result result = undefined;
switch (m_state)
{
case 0:
break;
case -1: // data_
if (sac_parser::is_non_blank(ch))
m_seen_trailing_chars = true;
else if (m_seen_trailing_chars)
result = data;
else
result = no_keyword;
break;
case -2: // global_
result = sac_parser::is_non_blank(ch) ? no_keyword : global;
break;
case -3: // loop_
result = sac_parser::is_non_blank(ch) ? no_keyword : loop;
break;
case -4: // save_
if (sac_parser::is_non_blank(ch))
m_seen_trailing_chars = true;
else if (m_seen_trailing_chars)
result = save_plus;
else
result = save;
break;
case -5: // stop_
result = sac_parser::is_non_blank(ch) ? no_keyword : stop;
break;
default:
assert(m_state > 0 and m_state < NODE_COUNT);
for (;;)
{
if (s_dag[m_state].ch == (ch & ~0x20))
{
m_state = s_dag[m_state].next_match;
break;
}
m_state = s_dag[m_state].next_nomatch;
if (m_state == 0)
{
result = no_keyword;
break;
}
}
break;
}
if (result != undefined)
m_state = 0;
return result;
}
private:
static constexpr struct node
{
int16_t ch;
int8_t next_match;
int8_t next_nomatch;
} s_dag[] = {
{ 0 },
{ 'D', 5, 2 },
{ 'G', 9, 3 },
{ 'L', 15, 4 },
{ 'S', 19, 0 },
{ 'A', 6, 0 },
{ 'T', 7, 0 },
{ 'A', 8, 0 },
{ '_', -1, 0 },
{ 'L', 10, 0 },
{ 'O', 11, 0 },
{ 'B', 12, 0 },
{ 'A', 13, 0 },
{ 'L', 14, 0 },
{ '_', -2, 0 },
{ 'O', 16, 0},
{ 'O', 17, 0 },
{ 'P', 18, 0 },
{ '_', -3, 0 },
{ 'A', 21, 20 },
{ 'T', 24, 0 },
{ 'V', 22, 0 },
{ 'E', 23, 0 },
{ '_', -4, 0 },
{ 'O', 25, 0 },
{ 'P', 26, 0 },
{ '_', -5, 0 },
};
static constexpr int NODE_COUNT = sizeof(s_dag) / sizeof(node);
int m_state = 1;
bool m_seen_trailing_chars = false;
};
// --------------------------------------------------------------------
sac_parser::sac_parser(std::istream &is, bool init)
: m_source(*is.rdbuf())
{
m_token_buffer.reserve(8192);
if (is.rdbuf() == nullptr)
throw std::runtime_error("Attempt to read from uninitialised stream");
m_line_nr = 1;
m_bol = true;
if (init)
m_lookahead = get_next_token();
}
bool sac_parser::is_unquoted_string(std::string_view text)
{
bool result = text.empty() or is_ordinary(text.front());
if (result)
{
reserved_words_automaton automaton;
for (char ch : text)
{
if (not is_non_blank(ch))
{
result = false;
break;
}
automaton.move(ch);
}
if (automaton.matched())
result = false;
}
return result;
}
// get_next_char takes a char from the buffer, or if it is empty
// from the istream. This function also does carriage/linefeed
// translation.
int sac_parser::get_next_char()
{
int result = m_source.sbumpc();
if (result == std::char_traits<char>::eof())
m_token_buffer.push_back(0);
else
{
if (result == '\r')
{
if (m_source.sgetc() == '\n')
m_source.sbumpc();
++m_line_nr;
result = '\n';
}
else if (result == '\n')
++m_line_nr;
m_token_buffer.push_back(std::char_traits<char>::to_char_type(result));
}
return result;
}
void sac_parser::retract()
{
assert(not m_token_buffer.empty());
char ch = m_token_buffer.back();
if (ch == '\n')
--m_line_nr;
if (ch != 0)
{
// since we always putback at most a single character,
// the test below should never fail.
if (m_source.sputbackc(ch) == std::char_traits<char>::eof())
throw std::runtime_error("putback failure");
}
m_token_buffer.pop_back();
}
sac_parser::CIFToken sac_parser::get_next_token()
{
const auto kEOF = std::char_traits<char>::eof();
CIFToken result = CIFToken::UNKNOWN;
int quoteChar = 0;
State state = State::Start;
m_bol = false;
m_token_buffer.clear();
m_token_value = {};
reserved_words_automaton dag;
while (result == CIFToken::UNKNOWN)
{
auto ch = get_next_char();
switch (state)
{
case State::Start:
if (ch == kEOF)
result = CIFToken::END_OF_FILE;
else if (ch == '\n')
{
m_bol = true;
state = State::White;
}
else if (ch == ' ' or ch == '\t')
state = State::White;
else if (ch == '#')
state = State::Comment;
else if (ch == '_')
state = State::ItemName;
else if (ch == ';' and m_bol)
state = State::TextItem;
else if (ch == '?')
state = State::QuestionMark;
else if (ch == '\'' or ch == '"')
{
quoteChar = ch;
state = State::QuotedString;
}
else if (dag.move(ch) == reserved_words_automaton::undefined)
state = State::Reserved;
else
state = State::Value;
break;
case State::White:
if (ch == kEOF)
result = CIFToken::END_OF_FILE;
else if (not is_space(ch))
{
state = State::Start;
retract();
m_token_buffer.clear();
}
else
m_bol = (ch == '\n');
break;
case State::Comment:
if (ch == '\n')
{
state = State::Start;
m_bol = true;
m_token_buffer.clear();
}
else if (ch == kEOF)
result = CIFToken::END_OF_FILE;
else if (not is_any_print(ch))
error("invalid character in comment");
break;
case State::QuestionMark:
if (not is_non_blank(ch))
{
retract();
result = CIFToken::VALUE;
}
else
state = State::Value;
break;
case State::TextItem:
if (ch == '\n')
state = State::TextItemNL;
else if (ch == kEOF)
error("unterminated textfield");
else if (not is_any_print(ch) and cif::VERBOSE > 2)
warning("invalid character in text field '" + std::string({static_cast<char>(ch)}) + "' (" + std::to_string((int)ch) + ")");
break;
case State::TextItemNL:
if (is_text_lead(ch) or ch == ' ' or ch == '\t')
state = State::TextItem;
else if (ch == ';')
{
assert(m_token_buffer.size() >= 2);
m_token_value = std::string_view(m_token_buffer.data() + 1, m_token_buffer.size() - 3);
result = CIFToken::VALUE;
}
else if (ch == kEOF)
error("unterminated textfield");
else if (ch != '\n')
error("invalid character in text field");
break;
case State::QuotedString:
if (ch == kEOF)
error("unterminated quoted string");
else if (ch == quoteChar)
state = State::QuotedStringQuote;
else if (not is_any_print(ch) and cif::VERBOSE > 2)
warning("invalid character in quoted string: '" + std::string({static_cast<char>(ch)}) + "' (" + std::to_string((int)ch) + ")");
break;
case State::QuotedStringQuote:
if (is_white(ch))
{
retract();
result = CIFToken::VALUE;
if (m_token_buffer.size() < 2)
error("Invalid quoted string token");
m_token_value = std::string_view(m_token_buffer.data() + 1, m_token_buffer.size() - 2);
}
else if (ch == quoteChar)
;
else if (is_any_print(ch))
state = State::QuotedString;
else if (ch == kEOF)
error("unterminated quoted string");
else
error("invalid character in quoted string");
break;
case State::ItemName:
if (not is_non_blank(ch))
{
retract();
result = CIFToken::ITEM_NAME;
m_token_value = std::string_view(m_token_buffer.data(), m_token_buffer.size());
}
break;
case State::Reserved:
switch (dag.move(ch))
{
case reserved_words_automaton::undefined:
break;
case reserved_words_automaton::no_keyword:
if (not is_non_blank(ch))
{
retract();
result = CIFToken::VALUE;
m_token_value = std::string_view(m_token_buffer.data(), m_token_buffer.size());
}
else
state = State::Value;
break;
case reserved_words_automaton::data:
retract();
m_token_value = std::string_view(m_token_buffer.data() + 5, m_token_buffer.size() - 5);
result = CIFToken::DATA;
break;
case reserved_words_automaton::global:
retract();
result = CIFToken::GLOBAL;
break;
case reserved_words_automaton::loop:
retract();
result = CIFToken::LOOP;
break;
case reserved_words_automaton::save:
retract();
result = CIFToken::SAVE_;
break;
case reserved_words_automaton::save_plus:
retract();
m_token_value = std::string_view(m_token_buffer.data() + 5, m_token_buffer.size() - 5);
result = CIFToken::SAVE_NAME;
break;
case reserved_words_automaton::stop:
retract();
result = CIFToken::STOP;
break;
}
break;
case State::Value:
if (not is_non_blank(ch))
{
retract();
result = CIFToken::VALUE;
m_token_value = std::string_view(m_token_buffer.data(), m_token_buffer.size());
break;
}
break;
default:
assert(false);
error("Invalid state in get_next_token");
break;
}
}
if (VERBOSE >= 5)
{
std::cerr << get_token_name(result);
if (result != CIFToken::END_OF_FILE)
std::cerr << " " << std::quoted(m_token_value);
std::cerr << '\n';
}
return result;
}
void sac_parser::match(CIFToken token)
{
if (m_lookahead != token)
error(std::string("Unexpected token, expected ") + get_token_name(token) + " but found " + get_token_name(m_lookahead));
m_lookahead = get_next_token();
}
bool sac_parser::parse_single_datablock(const std::string &datablock)
{
// first locate the start, as fast as we can
enum
{
start,
comment,
string,
string_quote,
qstring,
data
} state = start;
int quote = 0;
bool bol = true;
std::string dblk = "data_" + datablock;
std::string::size_type si = 0;
bool found = false;
for (auto ch = m_source.sbumpc(); not found and ch != std::streambuf::traits_type::eof(); ch = m_source.sbumpc())
{
switch (state)
{
case start:
switch (ch)
{
case '#': state = comment; break;
case 'd':
case 'D':
state = data;
si = 1;
break;
case '\'':
case '"':
state = string;
quote = ch;
break;
case ';':
if (bol)
state = qstring;
break;
}
break;
case comment:
if (ch == '\n')
state = start;
break;
case string:
if (ch == quote)
state = string_quote;
break;
case string_quote:
if (is_space(ch))
state = start;
else
state = string;
break;
case qstring:
if (ch == ';' and bol)
state = start;
break;
case data:
if (is_space(ch) and dblk[si] == 0)
found = true;
else if (dblk[si++] != ch)
state = start;
break;
}
bol = (ch == '\n');
}
if (found)
{
produce_datablock(datablock);
m_lookahead = get_next_token();
parse_datablock();
}
return found;
}
sac_parser::datablock_index sac_parser::index_datablocks()
{
datablock_index index;
// first locate the start, as fast as we can
enum
{
start,
comment,
string,
string_quote,
qstring,
data,
data_name
} state = start;
int quote = 0;
bool bol = true;
const char dblk[] = "data_";
std::string::size_type si = 0;
std::string datablock;
// Seek to beginning of file
m_source.pubseekpos(0);
for (auto ch = m_source.sbumpc(); ch != std::streambuf::traits_type::eof(); ch = m_source.sbumpc())
{
switch (state)
{
case start:
switch (ch)
{
case '#': state = comment; break;
case 'd':
case 'D':
state = data;
si = 1;
break;
case '\'':
case '"':
state = string;
quote = ch;
break;
case ';':
if (bol)
state = qstring;
break;
}
break;
case comment:
if (ch == '\n')
state = start;
break;
case string:
if (ch == quote)
state = string_quote;
break;
case string_quote:
if (is_space(ch))
state = start;
else
state = string;
break;
case qstring:
if (ch == ';' and bol)
state = start;
break;
case data:
if (dblk[si] == 0 and is_non_blank(ch))
{
datablock = {static_cast<char>(ch)};
state = data_name;
}
else if (dblk[si++] != ch)
state = start;
break;
case data_name:
if (is_non_blank(ch))
datablock.insert(datablock.end(), (char)std::toupper(ch));
else if (is_space(ch))
{
if (not datablock.empty())
index[datablock] = m_source.pubseekoff(0, std::ios_base::cur, std::ios_base::in);
state = start;
}
else
state = start;
break;
}
bol = (ch == '\n');
}
return index;
}
bool sac_parser::parse_single_datablock(const std::string &datablock, const datablock_index &index)
{
bool result = false;
auto i = index.find(datablock);
if (i != index.end())
{
m_source.pubseekpos(i->second, std::ios_base::in);
produce_datablock(datablock);
m_lookahead = get_next_token();
parse_datablock();
result = true;
}
return result;
}
void sac_parser::parse_file()
{
while (m_lookahead != CIFToken::END_OF_FILE)
{
switch (m_lookahead)
{
case CIFToken::GLOBAL:
parse_global();
break;
case CIFToken::DATA:
produce_datablock(m_token_value);
match(CIFToken::DATA);
parse_datablock();
break;
default:
error("This file does not seem to be an mmCIF file");
break;
}
}
}
void sac_parser::parse_global()
{
match(CIFToken::GLOBAL);
while (m_lookahead == CIFToken::ITEM_NAME)
{
match(CIFToken::ITEM_NAME);
match(CIFToken::VALUE);
}
}
void sac_parser::parse_datablock()
{
static const std::string kUnitializedCategory("<invalid>");
std::string cat = kUnitializedCategory; // intial value acts as a guard for empty category names
while (m_lookahead == CIFToken::LOOP or m_lookahead == CIFToken::ITEM_NAME or m_lookahead == CIFToken::SAVE_NAME)
{
switch (m_lookahead)
{
case CIFToken::LOOP:
{
cat = kUnitializedCategory; // should start a new category
match(CIFToken::LOOP);
std::vector<std::string> item_names;
while (m_lookahead == CIFToken::ITEM_NAME)
{
std::string catName, itemName;
std::tie(catName, itemName) = split_item_name(m_token_value);
if (cat == kUnitializedCategory)
{
produce_category(catName);
cat = catName;
}
else if (not iequals(cat, catName))
error("inconsistent categories in loop_");
item_names.push_back(itemName);
match(CIFToken::ITEM_NAME);
}
while (m_lookahead == CIFToken::VALUE)
{
produce_row();
for (auto item_name : item_names)
{
produce_item(cat, item_name, m_token_value);
match(CIFToken::VALUE);
}
}
cat.clear();
break;
}
case CIFToken::ITEM_NAME:
{
std::string catName, itemName;
std::tie(catName, itemName) = split_item_name(m_token_value);
if (not iequals(cat, catName))
{
produce_category(catName);
cat = catName;
produce_row();
}
match(CIFToken::ITEM_NAME);
produce_item(cat, itemName, m_token_value);
match(CIFToken::VALUE);
break;
}
case CIFToken::SAVE_NAME:
parse_save_frame();
break;
default:
assert(false);
break;
}
}
}
void sac_parser::parse_save_frame()
{
error("A regular CIF file should not contain a save frame");
}
// --------------------------------------------------------------------
void parser::produce_datablock(std::string_view name)
{
if (VERBOSE >= 4)
std::cerr << "producing data_" << name << '\n';
const auto &[iter, ignore] = m_file.emplace(name);
m_datablock = &(*iter);
if (m_validator)
m_datablock->set_validator(m_validator);
}
void parser::produce_category(std::string_view name)
{
if (VERBOSE >= 4)
std::cerr << "producing category " << name << '\n';
const auto &[cat, ignore] = m_datablock->emplace(name);
m_category = &*cat;
}
void parser::produce_row()
{
if (VERBOSE >= 4 and m_category != nullptr)
std::cerr << "producing row for category " << m_category->name() << '\n';
if (m_category == nullptr)
error("inconsistent categories in loop_");
m_category->emplace({});
m_row = m_category->back();
// m_row.lineNr(m_line_nr);
}
void parser::produce_item(std::string_view category, std::string_view item, std::string_view value)
{
if (VERBOSE >= 4)
std::cerr << "producing _" << category << '.' << item << " -> " << value << '\n';
if (m_category == nullptr or not iequals(category, m_category->name()))
error("inconsistent categories in loop_");
m_row[item] = m_token_value;
}
} // namespace cif