Implement backslash line folding (wrapping) of long text fields according to the CIF 1.1 specification.

This commit is contained in:
Maarten L. Hekkelman
2025-12-31 16:08:17 +01:00
parent f19c6d078e
commit b9bcf07f84
3 changed files with 127 additions and 24 deletions

View File

@@ -280,6 +280,11 @@ class sac_parser
ItemName,
TextItem,
TextItemNL,
TextItemBS,
TextItemBS2,
TextItemBSNL,
Reserved,
Value
};
@@ -289,6 +294,7 @@ class sac_parser
// Parser state
uint32_t m_line_nr;
bool m_bol;
bool m_backslash_strings = false;
CIFToken m_lookahead;
// token buffer

View File

@@ -24,10 +24,11 @@
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "cif++/utilities.hpp"
#include "cif++/forward_decl.hpp"
#include "cif++/parser.hpp"
#include "cif++/file.hpp"
#include "cif++/forward_decl.hpp"
#include "cif++/utilities.hpp"
#include <cassert>
#include <iostream>
@@ -58,12 +59,12 @@ class reserved_words_automaton
constexpr bool finished() const
{
return m_state <= 0;
return m_state <= 0;
}
constexpr bool matched() const
{
return m_state < 0;
return m_state < 0;
}
constexpr move_result move(int ch)
@@ -75,7 +76,7 @@ class reserved_words_automaton
case 0:
break;
case -1: // data_
case -1: // data_
if (sac_parser::is_non_blank(ch))
m_seen_trailing_chars = true;
else if (m_seen_trailing_chars)
@@ -84,15 +85,15 @@ class reserved_words_automaton
result = no_keyword;
break;
case -2: // global_
case -2: // global_
result = sac_parser::is_non_blank(ch) ? no_keyword : global;
break;
case -3: // loop_
case -3: // loop_
result = sac_parser::is_non_blank(ch) ? no_keyword : loop;
break;
case -4: // save_
case -4: // save_
if (sac_parser::is_non_blank(ch))
m_seen_trailing_chars = true;
else if (m_seen_trailing_chars)
@@ -101,10 +102,10 @@ class reserved_words_automaton
result = save;
break;
case -5: // stop_
case -5: // stop_
result = sac_parser::is_non_blank(ch) ? no_keyword : stop;
break;
default:
assert(m_state > 0 and m_state < NODE_COUNT);
@@ -141,13 +142,13 @@ class reserved_words_automaton
int8_t next_nomatch;
} s_dag[] = {
{ 0 },
{ 'D', 5, 2 },
{ 'G', 9, 3 },
{ 'D', 5, 2 },
{ 'G', 9, 3 },
{ 'L', 15, 4 },
{ 'S', 19, 0 },
{ 'A', 6, 0 },
{ 'T', 7, 0 },
{ 'A', 8, 0 },
{ 'A', 6, 0 },
{ 'T', 7, 0 },
{ 'A', 8, 0 },
{ '_', -1, 0 },
{ 'L', 10, 0 },
{ 'O', 11, 0 },
@@ -155,7 +156,7 @@ class reserved_words_automaton
{ 'A', 13, 0 },
{ 'L', 14, 0 },
{ '_', -2, 0 },
{ 'O', 16, 0},
{ 'O', 16, 0 },
{ 'O', 17, 0 },
{ 'P', 18, 0 },
{ '_', -3, 0 },
@@ -238,7 +239,7 @@ int sac_parser::get_next_char()
}
else if (result == '\n')
++m_line_nr;
m_token_buffer.push_back(std::char_traits<char>::to_char_type(result));
}
@@ -300,7 +301,12 @@ sac_parser::CIFToken sac_parser::get_next_token()
else if (ch == '_')
state = State::ItemName;
else if (ch == ';' and m_bol)
state = State::TextItem;
{
if (m_backslash_strings)
state = State::TextItemBS;
else
state = State::TextItem;
}
else if (ch == '?')
state = State::QuestionMark;
else if (ch == '\'' or ch == '"')
@@ -326,12 +332,14 @@ sac_parser::CIFToken sac_parser::get_next_token()
else
m_bol = (ch == '\n');
break;
case State::Comment:
if (ch == '\n')
{
state = State::Start;
m_bol = true;
if (m_token_buffer.size() == 3 and m_token_buffer == std::vector{ '#', '\\', '\n' })
m_backslash_strings = true;
m_token_buffer.clear();
}
else if (ch == kEOF)
@@ -339,7 +347,7 @@ sac_parser::CIFToken sac_parser::get_next_token()
else if (not is_any_print(ch))
error("invalid character in comment");
break;
case State::QuestionMark:
if (not is_non_blank(ch))
{
@@ -350,13 +358,52 @@ sac_parser::CIFToken sac_parser::get_next_token()
state = State::Value;
break;
case State::TextItemBS:
if (ch == '\\')
{
state = State::TextItemBS2;
break;
}
[[fallthrough]];
case State::TextItem:
if (ch == '\n')
state = State::TextItemNL;
else if (ch == kEOF)
error("unterminated textfield");
else if (not is_any_print(ch) and cif::VERBOSE > 2)
warning("invalid character in text field '" + std::string({static_cast<char>(ch)}) + "' (" + std::to_string((int)ch) + ")");
warning("invalid character in text field '" + std::string({ static_cast<char>(ch) }) + "' (" + std::to_string((int)ch) + ")");
break;
case State::TextItemBS2:
if (ch == '\n')
{
if (m_token_buffer[m_token_buffer.size() - 2] == '\\')
{
m_token_buffer.pop_back();
m_token_buffer.pop_back();
}
state = State::TextItemBSNL;
}
else if (ch == kEOF)
error("unterminated textfield");
else if (not is_any_print(ch) and cif::VERBOSE > 2)
warning("invalid character in text field '" + std::string({ static_cast<char>(ch) }) + "' (" + std::to_string((int)ch) + ")");
break;
case State::TextItemBSNL:
if (is_text_lead(ch) or ch == ' ' or ch == '\t')
state = State::TextItemBS;
else if (ch == ';')
{
assert(m_token_buffer.size() >= 2);
m_token_value = std::string_view(m_token_buffer.data() + 1, m_token_buffer.size() - 3);
result = CIFToken::VALUE;
}
else if (ch == kEOF)
error("unterminated textfield");
else if (ch != '\n')
error("invalid character in text field");
break;
case State::TextItemNL:
@@ -380,7 +427,7 @@ sac_parser::CIFToken sac_parser::get_next_token()
else if (ch == quoteChar)
state = State::QuotedStringQuote;
else if (not is_any_print(ch) and cif::VERBOSE > 2)
warning("invalid character in quoted string: '" + std::string({static_cast<char>(ch)}) + "' (" + std::to_string((int)ch) + ")");
warning("invalid character in quoted string: '" + std::string({ static_cast<char>(ch) }) + "' (" + std::to_string((int)ch) + ")");
break;
case State::QuotedStringQuote:
@@ -661,7 +708,7 @@ sac_parser::datablock_index sac_parser::index_datablocks()
case data:
if (dblk[si] == 0 and is_non_blank(ch))
{
datablock = {static_cast<char>(ch)};
datablock = { static_cast<char>(ch) };
state = data_name;
}
else if (dblk[si++] != ch)
@@ -745,7 +792,7 @@ void sac_parser::parse_global()
void sac_parser::parse_datablock()
{
static const std::string kUnitializedCategory("<invalid>");
std::string cat = kUnitializedCategory; // intial value acts as a guard for empty category names
std::string cat = kUnitializedCategory; // intial value acts as a guard for empty category names
while (m_lookahead == CIFToken::LOOP or m_lookahead == CIFToken::ITEM_NAME or m_lookahead == CIFToken::SAVE_NAME)
{

View File

@@ -26,6 +26,7 @@
#include "test-main.hpp"
#include <catch2/catch_test_macros.hpp>
#include <cif++.hpp>
#include <stdexcept>
@@ -65,6 +66,55 @@ TEST_CASE("text_1")
// --------------------------------------------------------------------
TEST_CASE("text_2")
{
	// Line-folding examples taken from the CIF 1.1 semantics document:
	// https://www.iucr.org/resources/cif/spec/version1.1/semantics
	//
	// The data starts with a comment line consisting solely of a hash and a
	// backslash, which is what switches the parser into backslash folding mode.
	// The first three text fields must then all yield the same one-line value.
	//
	// There is a problem with the specification though: the fourth example,
	// as given on the website, consists of three lines, the first being blank.
	// That is the value this test expects for the fourth row.
	auto f = R"(
#\
data_X
# Here is another example of folding. The following three text fields would be equivalent:
loop_
_cat.f1
;C:\foldername\filename
;
;\
C:\foldername\filename
;
;\
C:\foldername\file\
name
;
# but the next example would be a two-line value where the first line had the value "C:\foldername\file\" and the second had the value "name":
;
C:\foldername\file\
name
;
)"_cf;

	auto &db = f.front();
	auto &cat = db["cat"];

	// The counter lives outside the loop so the number of rows can be verified
	// afterwards; otherwise the test would pass vacuously when the parser
	// returns too few rows and the unfolded fourth value is never compared.
	size_t ix = 0;
	for (std::string v : cat.rows<std::string>("f1"))
	{
		if (++ix == 4)
			CHECK(v == R"(
C:\foldername\file\
name)");
		else
			CHECK(v == R"(C:\foldername\filename)");
	}

	// Exactly four values were supplied for _cat.f1 in the loop_ above.
	CHECK(ix == 4);
}
// --------------------------------------------------------------------
TEST_CASE("from_chars_1")
{
auto f = R"(data_TEST