mirror of
https://github.com/PDB-REDO/libcifpp.git
synced 2026-06-04 13:54:25 +08:00
Implement backslashed wrapping of long strings according to the cif 1.1 specification.
This commit is contained in:
@@ -280,6 +280,11 @@ class sac_parser
|
||||
ItemName,
|
||||
TextItem,
|
||||
TextItemNL,
|
||||
|
||||
TextItemBS,
|
||||
TextItemBS2,
|
||||
TextItemBSNL,
|
||||
|
||||
Reserved,
|
||||
Value
|
||||
};
|
||||
@@ -289,6 +294,7 @@ class sac_parser
|
||||
// Parser state
|
||||
uint32_t m_line_nr;
|
||||
bool m_bol;
|
||||
bool m_backslash_strings = false;
|
||||
CIFToken m_lookahead;
|
||||
|
||||
// token buffer
|
||||
|
||||
@@ -24,10 +24,11 @@
|
||||
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "cif++/utilities.hpp"
|
||||
#include "cif++/forward_decl.hpp"
|
||||
#include "cif++/parser.hpp"
|
||||
|
||||
#include "cif++/file.hpp"
|
||||
#include "cif++/forward_decl.hpp"
|
||||
#include "cif++/utilities.hpp"
|
||||
|
||||
#include <cassert>
|
||||
#include <iostream>
|
||||
@@ -58,12 +59,12 @@ class reserved_words_automaton
|
||||
|
||||
constexpr bool finished() const
|
||||
{
|
||||
return m_state <= 0;
|
||||
return m_state <= 0;
|
||||
}
|
||||
|
||||
constexpr bool matched() const
|
||||
{
|
||||
return m_state < 0;
|
||||
return m_state < 0;
|
||||
}
|
||||
|
||||
constexpr move_result move(int ch)
|
||||
@@ -75,7 +76,7 @@ class reserved_words_automaton
|
||||
case 0:
|
||||
break;
|
||||
|
||||
case -1: // data_
|
||||
case -1: // data_
|
||||
if (sac_parser::is_non_blank(ch))
|
||||
m_seen_trailing_chars = true;
|
||||
else if (m_seen_trailing_chars)
|
||||
@@ -84,15 +85,15 @@ class reserved_words_automaton
|
||||
result = no_keyword;
|
||||
break;
|
||||
|
||||
case -2: // global_
|
||||
case -2: // global_
|
||||
result = sac_parser::is_non_blank(ch) ? no_keyword : global;
|
||||
break;
|
||||
|
||||
case -3: // loop_
|
||||
case -3: // loop_
|
||||
result = sac_parser::is_non_blank(ch) ? no_keyword : loop;
|
||||
break;
|
||||
|
||||
case -4: // save_
|
||||
case -4: // save_
|
||||
if (sac_parser::is_non_blank(ch))
|
||||
m_seen_trailing_chars = true;
|
||||
else if (m_seen_trailing_chars)
|
||||
@@ -101,10 +102,10 @@ class reserved_words_automaton
|
||||
result = save;
|
||||
break;
|
||||
|
||||
case -5: // stop_
|
||||
case -5: // stop_
|
||||
result = sac_parser::is_non_blank(ch) ? no_keyword : stop;
|
||||
break;
|
||||
|
||||
|
||||
default:
|
||||
assert(m_state > 0 and m_state < NODE_COUNT);
|
||||
|
||||
@@ -141,13 +142,13 @@ class reserved_words_automaton
|
||||
int8_t next_nomatch;
|
||||
} s_dag[] = {
|
||||
{ 0 },
|
||||
{ 'D', 5, 2 },
|
||||
{ 'G', 9, 3 },
|
||||
{ 'D', 5, 2 },
|
||||
{ 'G', 9, 3 },
|
||||
{ 'L', 15, 4 },
|
||||
{ 'S', 19, 0 },
|
||||
{ 'A', 6, 0 },
|
||||
{ 'T', 7, 0 },
|
||||
{ 'A', 8, 0 },
|
||||
{ 'A', 6, 0 },
|
||||
{ 'T', 7, 0 },
|
||||
{ 'A', 8, 0 },
|
||||
{ '_', -1, 0 },
|
||||
{ 'L', 10, 0 },
|
||||
{ 'O', 11, 0 },
|
||||
@@ -155,7 +156,7 @@ class reserved_words_automaton
|
||||
{ 'A', 13, 0 },
|
||||
{ 'L', 14, 0 },
|
||||
{ '_', -2, 0 },
|
||||
{ 'O', 16, 0},
|
||||
{ 'O', 16, 0 },
|
||||
{ 'O', 17, 0 },
|
||||
{ 'P', 18, 0 },
|
||||
{ '_', -3, 0 },
|
||||
@@ -238,7 +239,7 @@ int sac_parser::get_next_char()
|
||||
}
|
||||
else if (result == '\n')
|
||||
++m_line_nr;
|
||||
|
||||
|
||||
m_token_buffer.push_back(std::char_traits<char>::to_char_type(result));
|
||||
}
|
||||
|
||||
@@ -300,7 +301,12 @@ sac_parser::CIFToken sac_parser::get_next_token()
|
||||
else if (ch == '_')
|
||||
state = State::ItemName;
|
||||
else if (ch == ';' and m_bol)
|
||||
state = State::TextItem;
|
||||
{
|
||||
if (m_backslash_strings)
|
||||
state = State::TextItemBS;
|
||||
else
|
||||
state = State::TextItem;
|
||||
}
|
||||
else if (ch == '?')
|
||||
state = State::QuestionMark;
|
||||
else if (ch == '\'' or ch == '"')
|
||||
@@ -326,12 +332,14 @@ sac_parser::CIFToken sac_parser::get_next_token()
|
||||
else
|
||||
m_bol = (ch == '\n');
|
||||
break;
|
||||
|
||||
|
||||
case State::Comment:
|
||||
if (ch == '\n')
|
||||
{
|
||||
state = State::Start;
|
||||
m_bol = true;
|
||||
if (m_token_buffer.size() == 3 and m_token_buffer == std::vector{ '#', '\\', '\n' })
|
||||
m_backslash_strings = true;
|
||||
m_token_buffer.clear();
|
||||
}
|
||||
else if (ch == kEOF)
|
||||
@@ -339,7 +347,7 @@ sac_parser::CIFToken sac_parser::get_next_token()
|
||||
else if (not is_any_print(ch))
|
||||
error("invalid character in comment");
|
||||
break;
|
||||
|
||||
|
||||
case State::QuestionMark:
|
||||
if (not is_non_blank(ch))
|
||||
{
|
||||
@@ -350,13 +358,52 @@ sac_parser::CIFToken sac_parser::get_next_token()
|
||||
state = State::Value;
|
||||
break;
|
||||
|
||||
case State::TextItemBS:
|
||||
if (ch == '\\')
|
||||
{
|
||||
state = State::TextItemBS2;
|
||||
break;
|
||||
}
|
||||
[[fallthrough]];
|
||||
|
||||
case State::TextItem:
|
||||
if (ch == '\n')
|
||||
state = State::TextItemNL;
|
||||
else if (ch == kEOF)
|
||||
error("unterminated textfield");
|
||||
else if (not is_any_print(ch) and cif::VERBOSE > 2)
|
||||
warning("invalid character in text field '" + std::string({static_cast<char>(ch)}) + "' (" + std::to_string((int)ch) + ")");
|
||||
warning("invalid character in text field '" + std::string({ static_cast<char>(ch) }) + "' (" + std::to_string((int)ch) + ")");
|
||||
break;
|
||||
|
||||
case State::TextItemBS2:
|
||||
if (ch == '\n')
|
||||
{
|
||||
if (m_token_buffer[m_token_buffer.size() - 2] == '\\')
|
||||
{
|
||||
m_token_buffer.pop_back();
|
||||
m_token_buffer.pop_back();
|
||||
}
|
||||
state = State::TextItemBSNL;
|
||||
}
|
||||
else if (ch == kEOF)
|
||||
error("unterminated textfield");
|
||||
else if (not is_any_print(ch) and cif::VERBOSE > 2)
|
||||
warning("invalid character in text field '" + std::string({ static_cast<char>(ch) }) + "' (" + std::to_string((int)ch) + ")");
|
||||
break;
|
||||
|
||||
case State::TextItemBSNL:
|
||||
if (is_text_lead(ch) or ch == ' ' or ch == '\t')
|
||||
state = State::TextItemBS;
|
||||
else if (ch == ';')
|
||||
{
|
||||
assert(m_token_buffer.size() >= 2);
|
||||
m_token_value = std::string_view(m_token_buffer.data() + 1, m_token_buffer.size() - 3);
|
||||
result = CIFToken::VALUE;
|
||||
}
|
||||
else if (ch == kEOF)
|
||||
error("unterminated textfield");
|
||||
else if (ch != '\n')
|
||||
error("invalid character in text field");
|
||||
break;
|
||||
|
||||
case State::TextItemNL:
|
||||
@@ -380,7 +427,7 @@ sac_parser::CIFToken sac_parser::get_next_token()
|
||||
else if (ch == quoteChar)
|
||||
state = State::QuotedStringQuote;
|
||||
else if (not is_any_print(ch) and cif::VERBOSE > 2)
|
||||
warning("invalid character in quoted string: '" + std::string({static_cast<char>(ch)}) + "' (" + std::to_string((int)ch) + ")");
|
||||
warning("invalid character in quoted string: '" + std::string({ static_cast<char>(ch) }) + "' (" + std::to_string((int)ch) + ")");
|
||||
break;
|
||||
|
||||
case State::QuotedStringQuote:
|
||||
@@ -661,7 +708,7 @@ sac_parser::datablock_index sac_parser::index_datablocks()
|
||||
case data:
|
||||
if (dblk[si] == 0 and is_non_blank(ch))
|
||||
{
|
||||
datablock = {static_cast<char>(ch)};
|
||||
datablock = { static_cast<char>(ch) };
|
||||
state = data_name;
|
||||
}
|
||||
else if (dblk[si++] != ch)
|
||||
@@ -745,7 +792,7 @@ void sac_parser::parse_global()
|
||||
void sac_parser::parse_datablock()
|
||||
{
|
||||
static const std::string kUnitializedCategory("<invalid>");
|
||||
std::string cat = kUnitializedCategory; // intial value acts as a guard for empty category names
|
||||
std::string cat = kUnitializedCategory; // intial value acts as a guard for empty category names
|
||||
|
||||
while (m_lookahead == CIFToken::LOOP or m_lookahead == CIFToken::ITEM_NAME or m_lookahead == CIFToken::SAVE_NAME)
|
||||
{
|
||||
|
||||
@@ -26,6 +26,7 @@
|
||||
|
||||
#include "test-main.hpp"
|
||||
|
||||
#include <catch2/catch_test_macros.hpp>
|
||||
#include <cif++.hpp>
|
||||
|
||||
#include <stdexcept>
|
||||
@@ -65,6 +66,55 @@ TEST_CASE("text_1")
|
||||
|
||||
// --------------------------------------------------------------------
|
||||
|
||||
TEST_CASE("text_2")
|
||||
{
|
||||
// Test based on https://www.iucr.org/resources/cif/spec/version1.1/semantics
|
||||
|
||||
// There is a problem with this specification though, the fourth example
|
||||
// as given on the website consists of three lines, the first being blank.
|
||||
|
||||
auto f = R"(
|
||||
#\
|
||||
data_X
|
||||
|
||||
# Here is another example of folding. The following three text fields would be equivalent:
|
||||
loop_
|
||||
_cat.f1
|
||||
;C:\foldername\filename
|
||||
;
|
||||
|
||||
;\
|
||||
C:\foldername\filename
|
||||
;
|
||||
|
||||
;\
|
||||
C:\foldername\file\
|
||||
name
|
||||
;
|
||||
|
||||
# but the next example would be a two-line value where the first line had the value "C:\foldername\file\" and the second had the value "name":
|
||||
|
||||
;
|
||||
C:\foldername\file\
|
||||
name
|
||||
;
|
||||
)"_cf;
|
||||
|
||||
auto &db = f.front();
|
||||
auto &cat = db["cat"];
|
||||
for (size_t ix = 0; std::string v : cat.rows<std::string>("f1"))
|
||||
{
|
||||
if (++ix == 4)
|
||||
CHECK(v == R"(
|
||||
C:\foldername\file\
|
||||
name)");
|
||||
else
|
||||
CHECK(v == R"(C:\foldername\filename)");
|
||||
}
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------
|
||||
|
||||
TEST_CASE("from_chars_1")
|
||||
{
|
||||
auto f = R"(data_TEST
|
||||
|
||||
Reference in New Issue
Block a user