From b9bcf07f841f00a86c66776edfc99c512ff459df Mon Sep 17 00:00:00 2001 From: "Maarten L. Hekkelman" Date: Wed, 31 Dec 2025 16:08:17 +0100 Subject: [PATCH] Implement backslashed wrapping of long strings according to the cif 1.1 specification. --- include/cif++/parser.hpp | 6 +++ src/parser.cpp | 95 ++++++++++++++++++++++++++++++---------- test/unit-v2-test.cpp | 50 +++++++++++++++++++++ 3 files changed, 127 insertions(+), 24 deletions(-) diff --git a/include/cif++/parser.hpp b/include/cif++/parser.hpp index 08bc974..766ba49 100644 --- a/include/cif++/parser.hpp +++ b/include/cif++/parser.hpp @@ -280,6 +280,11 @@ class sac_parser ItemName, TextItem, TextItemNL, + + TextItemBS, + TextItemBS2, + TextItemBSNL, + Reserved, Value }; @@ -289,6 +294,7 @@ class sac_parser // Parser state uint32_t m_line_nr; bool m_bol; + bool m_backslash_strings = false; CIFToken m_lookahead; // token buffer diff --git a/src/parser.cpp b/src/parser.cpp index 75f986c..14277bc 100644 --- a/src/parser.cpp +++ b/src/parser.cpp @@ -24,10 +24,11 @@ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#include "cif++/utilities.hpp" -#include "cif++/forward_decl.hpp" #include "cif++/parser.hpp" + #include "cif++/file.hpp" +#include "cif++/forward_decl.hpp" +#include "cif++/utilities.hpp" #include #include @@ -58,12 +59,12 @@ class reserved_words_automaton constexpr bool finished() const { - return m_state <= 0; + return m_state <= 0; } constexpr bool matched() const { - return m_state < 0; + return m_state < 0; } constexpr move_result move(int ch) @@ -75,7 +76,7 @@ class reserved_words_automaton case 0: break; - case -1: // data_ + case -1: // data_ if (sac_parser::is_non_blank(ch)) m_seen_trailing_chars = true; else if (m_seen_trailing_chars) @@ -84,15 +85,15 @@ class reserved_words_automaton result = no_keyword; break; - case -2: // global_ + case -2: // global_ result = sac_parser::is_non_blank(ch) ? no_keyword : global; break; - case -3: // loop_ + case -3: // loop_ result = sac_parser::is_non_blank(ch) ? no_keyword : loop; break; - case -4: // save_ + case -4: // save_ if (sac_parser::is_non_blank(ch)) m_seen_trailing_chars = true; else if (m_seen_trailing_chars) @@ -101,10 +102,10 @@ class reserved_words_automaton result = save; break; - case -5: // stop_ + case -5: // stop_ result = sac_parser::is_non_blank(ch) ? no_keyword : stop; break; - + default: assert(m_state > 0 and m_state < NODE_COUNT); @@ -141,13 +142,13 @@ class reserved_words_automaton int8_t next_nomatch; } s_dag[] = { { 0 }, - { 'D', 5, 2 }, - { 'G', 9, 3 }, + { 'D', 5, 2 }, + { 'G', 9, 3 }, { 'L', 15, 4 }, { 'S', 19, 0 }, - { 'A', 6, 0 }, - { 'T', 7, 0 }, - { 'A', 8, 0 }, + { 'A', 6, 0 }, + { 'T', 7, 0 }, + { 'A', 8, 0 }, { '_', -1, 0 }, { 'L', 10, 0 }, { 'O', 11, 0 }, @@ -155,7 +156,7 @@ class reserved_words_automaton { 'A', 13, 0 }, { 'L', 14, 0 }, { '_', -2, 0 }, - { 'O', 16, 0}, + { 'O', 16, 0 }, { 'O', 17, 0 }, { 'P', 18, 0 }, { '_', -3, 0 }, @@ -238,7 +239,7 @@ int sac_parser::get_next_char() } else if (result == '\n') ++m_line_nr; - + m_token_buffer.push_back(std::char_traits::to_char_type(result)); } @@ -300,7 +301,12 @@ sac_parser::CIFToken sac_parser::get_next_token() else if (ch == '_') state = State::ItemName; else if (ch == ';' and m_bol) - state = State::TextItem; + { + if (m_backslash_strings) + state = State::TextItemBS; + else + state = State::TextItem; + } else if (ch == '?') state = State::QuestionMark; else if (ch == '\'' or ch == '"') @@ -326,12 +332,14 @@ sac_parser::CIFToken sac_parser::get_next_token() else m_bol = (ch == '\n'); break; - + case State::Comment: if (ch == '\n') { state = State::Start; m_bol = true; + if (m_token_buffer.size() == 3 and m_token_buffer == std::vector{ '#', '\\', '\n' }) + m_backslash_strings = true; m_token_buffer.clear(); } else if (ch == kEOF) @@ -339,7 +347,7 @@ sac_parser::CIFToken sac_parser::get_next_token() else if (not is_any_print(ch)) error("invalid character in comment"); break; - + case State::QuestionMark: if (not is_non_blank(ch)) { @@ -350,13 +358,52 @@ sac_parser::CIFToken sac_parser::get_next_token() state = State::Value; break; + case State::TextItemBS: + if (ch == '\\') + { + state = State::TextItemBS2; + break; + } + [[fallthrough]]; + case State::TextItem: if (ch == '\n') state = State::TextItemNL; else if (ch == kEOF) error("unterminated textfield"); else if (not is_any_print(ch) and cif::VERBOSE > 2) - warning("invalid character in text field '" + std::string({static_cast(ch)}) + "' (" + std::to_string((int)ch) + ")"); + warning("invalid character in text field '" + std::string({ static_cast(ch) }) + "' (" + std::to_string((int)ch) + ")"); + break; + + case State::TextItemBS2: + if (ch == '\n') + { + if (m_token_buffer[m_token_buffer.size() - 2] == '\\') + { + m_token_buffer.pop_back(); + m_token_buffer.pop_back(); + } + state = State::TextItemBSNL; + } + else if (ch == kEOF) + error("unterminated textfield"); + else if (not is_any_print(ch) and cif::VERBOSE > 2) + warning("invalid character in text field '" + std::string({ static_cast(ch) }) + "' (" + std::to_string((int)ch) + ")"); + break; + + case State::TextItemBSNL: + if (is_text_lead(ch) or ch == ' ' or ch == '\t') + state = State::TextItemBS; + else if (ch == ';') + { + assert(m_token_buffer.size() >= 2); + m_token_value = std::string_view(m_token_buffer.data() + 1, m_token_buffer.size() - 3); + result = CIFToken::VALUE; + } + else if (ch == kEOF) + error("unterminated textfield"); + else if (ch != '\n') + error("invalid character in text field"); break; case State::TextItemNL: @@ -380,7 +427,7 @@ sac_parser::CIFToken sac_parser::get_next_token() else if (ch == quoteChar) state = State::QuotedStringQuote; else if (not is_any_print(ch) and cif::VERBOSE > 2) - warning("invalid character in quoted string: '" + std::string({static_cast(ch)}) + "' (" + std::to_string((int)ch) + ")"); + warning("invalid character in quoted string: '" + std::string({ static_cast(ch) }) + "' (" + std::to_string((int)ch) + ")"); break; case State::QuotedStringQuote: @@ -661,7 +708,7 @@ sac_parser::datablock_index sac_parser::index_datablocks() case data: if (dblk[si] == 0 and is_non_blank(ch)) { - datablock = {static_cast(ch)}; + datablock = { static_cast(ch) }; state = data_name; } else if (dblk[si++] != ch) @@ -745,7 +792,7 @@ void sac_parser::parse_global() void sac_parser::parse_datablock() { static const std::string kUnitializedCategory(""); - std::string cat = kUnitializedCategory; // intial value acts as a guard for empty category names + std::string cat = kUnitializedCategory; // intial value acts as a guard for empty category names while (m_lookahead == CIFToken::LOOP or m_lookahead == CIFToken::ITEM_NAME or m_lookahead == CIFToken::SAVE_NAME) { diff --git a/test/unit-v2-test.cpp b/test/unit-v2-test.cpp index 79c3d86..9b506b4 100644 --- a/test/unit-v2-test.cpp +++ b/test/unit-v2-test.cpp @@ -26,6 +26,7 @@ #include "test-main.hpp" +#include #include #include @@ -65,6 +66,55 @@ TEST_CASE("text_1") // -------------------------------------------------------------------- +TEST_CASE("text_2") +{ + // Test based on https://www.iucr.org/resources/cif/spec/version1.1/semantics + + // There is a problem with this specification though, the fourth example + // as given on the website consists of three lines, the first being blank. + + auto f = R"( +#\ +data_X + +# Here is another example of folding. The following three text fields would be equivalent: +loop_ +_cat.f1 +;C:\foldername\filename +; + +;\ +C:\foldername\filename +; + +;\ +C:\foldername\file\ +name +; + +# but the next example would be a two-line value where the first line had the value "C:\foldername\file\" and the second had the value "name": + +; +C:\foldername\file\ +name +; + )"_cf; + + auto &db = f.front(); + auto &cat = db["cat"]; + for (size_t ix = 0; std::string v : cat.rows("f1")) + { + if (++ix == 4) + CHECK(v == R"( +C:\foldername\file\ +name)"); + else + CHECK(v == R"(C:\foldername\filename)"); + } +} + +// -------------------------------------------------------------------- + TEST_CASE("from_chars_1") { auto f = R"(data_TEST