Implement backslashed wrapping of long strings according to the cif 1.1 specification.

2026-06-04 13:54:25 +08:00 · 2025-12-31 16:08:17 +01:00
parent f19c6d078e
commit b9bcf07f84
3 changed files with 127 additions and 24 deletions
--- a/include/cif++/parser.hpp
+++ b/include/cif++/parser.hpp
@@ -280,6 +280,11 @@ class sac_parser
 		ItemName,
 		TextItem,
 		TextItemNL,
+
+		TextItemBS,
+		TextItemBS2,
+		TextItemBSNL,
+
 		Reserved,
 		Value
 	};
@@ -289,6 +294,7 @@ class sac_parser
 	// Parser state
 	uint32_t m_line_nr;
 	bool m_bol;
+	bool m_backslash_strings = false;
 	CIFToken m_lookahead;

 	// token buffer
--- a/src/parser.cpp
+++ b/src/parser.cpp
@@ -24,10 +24,11 @@
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

-#include "cif++/utilities.hpp"
-#include "cif++/forward_decl.hpp"
 #include "cif++/parser.hpp"
+
 #include "cif++/file.hpp"
+#include "cif++/forward_decl.hpp"
+#include "cif++/utilities.hpp"

 #include <cassert>
 #include <iostream>
@@ -58,12 +59,12 @@ class reserved_words_automaton

 	constexpr bool finished() const
 	{
-		return m_state <= 0; 
+		return m_state <= 0;
 	}

 	constexpr bool matched() const
 	{
-		return m_state < 0; 
+		return m_state < 0;
 	}

 	constexpr move_result move(int ch)
@@ -75,7 +76,7 @@ class reserved_words_automaton
 			case 0:
 				break;

-			case -1:		// data_
+			case -1: // data_
 				if (sac_parser::is_non_blank(ch))
 					m_seen_trailing_chars = true;
 				else if (m_seen_trailing_chars)
@@ -84,15 +85,15 @@ class reserved_words_automaton
 					result = no_keyword;
 				break;

-			case -2:		// global_
+			case -2: // global_
 				result = sac_parser::is_non_blank(ch) ? no_keyword : global;
 				break;

-			case -3:		// loop_
+			case -3: // loop_
 				result = sac_parser::is_non_blank(ch) ? no_keyword : loop;
 				break;

-			case -4:		// save_
+			case -4: // save_
 				if (sac_parser::is_non_blank(ch))
 					m_seen_trailing_chars = true;
 				else if (m_seen_trailing_chars)
@@ -101,10 +102,10 @@ class reserved_words_automaton
 					result = save;
 				break;

-			case -5:		// stop_
+			case -5: // stop_
 				result = sac_parser::is_non_blank(ch) ? no_keyword : stop;
 				break;
-			
+
 			default:
 				assert(m_state > 0 and m_state < NODE_COUNT);

@@ -141,13 +142,13 @@ class reserved_words_automaton
 		int8_t next_nomatch;
 	} s_dag[] = {
 		{ 0 },
-		{ 'D',  5, 2 },
-		{ 'G',  9, 3 },
+		{ 'D', 5, 2 },
+		{ 'G', 9, 3 },
 		{ 'L', 15, 4 },
 		{ 'S', 19, 0 },
-		{ 'A',  6, 0 },
-		{ 'T',  7, 0 },
-		{ 'A',  8, 0 },
+		{ 'A', 6, 0 },
+		{ 'T', 7, 0 },
+		{ 'A', 8, 0 },
 		{ '_', -1, 0 },
 		{ 'L', 10, 0 },
 		{ 'O', 11, 0 },
@@ -155,7 +156,7 @@ class reserved_words_automaton
 		{ 'A', 13, 0 },
 		{ 'L', 14, 0 },
 		{ '_', -2, 0 },
-		{ 'O', 16, 0},
+		{ 'O', 16, 0 },
 		{ 'O', 17, 0 },
 		{ 'P', 18, 0 },
 		{ '_', -3, 0 },
@@ -238,7 +239,7 @@ int sac_parser::get_next_char()
 		}
 		else if (result == '\n')
 			++m_line_nr;
-		
+
 		m_token_buffer.push_back(std::char_traits<char>::to_char_type(result));
 	}

@@ -300,7 +301,12 @@ sac_parser::CIFToken sac_parser::get_next_token()
 				else if (ch == '_')
 					state = State::ItemName;
 				else if (ch == ';' and m_bol)
-					state = State::TextItem;
+				{
+					if (m_backslash_strings)
+						state = State::TextItemBS;
+					else
+						state = State::TextItem;
+				}
 				else if (ch == '?')
 					state = State::QuestionMark;
 				else if (ch == '\'' or ch == '"')
@@ -326,12 +332,14 @@ sac_parser::CIFToken sac_parser::get_next_token()
 				else
 					m_bol = (ch == '\n');
 				break;
-			
+
 			case State::Comment:
 				if (ch == '\n')
 				{
 					state = State::Start;
 					m_bol = true;
+					if (m_token_buffer.size() == 3 and m_token_buffer == std::vector{ '#', '\\', '\n' })
+						m_backslash_strings = true;
 					m_token_buffer.clear();
 				}
 				else if (ch == kEOF)
@@ -339,7 +347,7 @@ sac_parser::CIFToken sac_parser::get_next_token()
 				else if (not is_any_print(ch))
 					error("invalid character in comment");
 				break;
-			
+
 			case State::QuestionMark:
 				if (not is_non_blank(ch))
 				{
@@ -350,13 +358,52 @@ sac_parser::CIFToken sac_parser::get_next_token()
 					state = State::Value;
 				break;

+			case State::TextItemBS:
+				if (ch == '\\')
+				{
+					state = State::TextItemBS2;
+					break;
+				}
+				[[fallthrough]];
+
 			case State::TextItem:
 				if (ch == '\n')
 					state = State::TextItemNL;
 				else if (ch == kEOF)
 					error("unterminated textfield");
 				else if (not is_any_print(ch) and cif::VERBOSE > 2)
-					warning("invalid character in text field '" + std::string({static_cast<char>(ch)}) + "' (" + std::to_string((int)ch) + ")");
+					warning("invalid character in text field '" + std::string({ static_cast<char>(ch) }) + "' (" + std::to_string((int)ch) + ")");
+				break;
+
+			case State::TextItemBS2:
+				if (ch == '\n')
+				{
+					if (m_token_buffer[m_token_buffer.size() - 2] == '\\')
+					{
+						m_token_buffer.pop_back();
+						m_token_buffer.pop_back();
+					}
+					state = State::TextItemBSNL;
+				}
+				else if (ch == kEOF)
+					error("unterminated textfield");
+				else if (not is_any_print(ch) and cif::VERBOSE > 2)
+					warning("invalid character in text field '" + std::string({ static_cast<char>(ch) }) + "' (" + std::to_string((int)ch) + ")");
+				break;
+
+			case State::TextItemBSNL:
+				if (is_text_lead(ch) or ch == ' ' or ch == '\t')
+					state = State::TextItemBS;
+				else if (ch == ';')
+				{
+					assert(m_token_buffer.size() >= 2);
+					m_token_value = std::string_view(m_token_buffer.data() + 1, m_token_buffer.size() - 3);
+					result = CIFToken::VALUE;
+				}
+				else if (ch == kEOF)
+					error("unterminated textfield");
+				else if (ch != '\n')
+					error("invalid character in text field");
 				break;

 			case State::TextItemNL:
@@ -380,7 +427,7 @@ sac_parser::CIFToken sac_parser::get_next_token()
 				else if (ch == quoteChar)
 					state = State::QuotedStringQuote;
 				else if (not is_any_print(ch) and cif::VERBOSE > 2)
-					warning("invalid character in quoted string: '" + std::string({static_cast<char>(ch)}) + "' (" + std::to_string((int)ch) + ")");
+					warning("invalid character in quoted string: '" + std::string({ static_cast<char>(ch) }) + "' (" + std::to_string((int)ch) + ")");
 				break;

 			case State::QuotedStringQuote:
@@ -661,7 +708,7 @@ sac_parser::datablock_index sac_parser::index_datablocks()
 			case data:
 				if (dblk[si] == 0 and is_non_blank(ch))
 				{
-					datablock = {static_cast<char>(ch)};
+					datablock = { static_cast<char>(ch) };
 					state = data_name;
 				}
 				else if (dblk[si++] != ch)
@@ -745,7 +792,7 @@ void sac_parser::parse_global()
 void sac_parser::parse_datablock()
 {
 	static const std::string kUnitializedCategory("<invalid>");
-	std::string cat = kUnitializedCategory;	// intial value acts as a guard for empty category names
+	std::string cat = kUnitializedCategory; // intial value acts as a guard for empty category names

 	while (m_lookahead == CIFToken::LOOP or m_lookahead == CIFToken::ITEM_NAME or m_lookahead == CIFToken::SAVE_NAME)
 	{
--- a/test/unit-v2-test.cpp
+++ b/test/unit-v2-test.cpp
@@ -26,6 +26,7 @@

 #include "test-main.hpp"

+#include <catch2/catch_test_macros.hpp>
 #include <cif++.hpp>

 #include <stdexcept>
@@ -65,6 +66,55 @@ TEST_CASE("text_1")

 // --------------------------------------------------------------------

+TEST_CASE("text_2")
+{
+	// Test based on https://www.iucr.org/resources/cif/spec/version1.1/semantics
+
+	// There is a problem with this specification though, the fourth example
+	// as given on the website consists of three lines, the first being blank.
+
+	auto f = R"(
+#\
+data_X
+
+# Here is another example of folding. The following three text fields would be equivalent: 
+loop_
+_cat.f1
+;C:\foldername\filename
+;
+
+;\
+C:\foldername\filename
+;
+
+;\
+C:\foldername\file\
+name
+;
+
+# but the next example would be a two-line value where the first line had the value "C:\foldername\file\" and the second had the value "name":
+
+;
+C:\foldername\file\
+name
+;
+	)"_cf;
+
+	auto &db = f.front();
+	auto &cat = db["cat"];
+	for (size_t ix = 0; std::string v : cat.rows<std::string>("f1"))
+	{
+		if (++ix == 4)
+			CHECK(v == R"(
+C:\foldername\file\
+name)");
+		else
+			CHECK(v == R"(C:\foldername\filename)");
+	}
+}
+
+// --------------------------------------------------------------------
+
 TEST_CASE("from_chars_1")
 {
 auto f = R"(data_TEST