Fix handling of tokens that contain quotes surrounded by spaces in CifDict::ToString

* `A "A" A` must be quoted as `'A "A" A'`, and `B 'B' B` must be quoted as `"B 'B' B"`.
* `C "C" 'C' C` must be quoted as `\n;C "C" 'C' C\n;`.
* Previously, `CifDict` didn't quote these correctly and e.g. `A "A" A` would become three tokens because of the bad early return on the first space in the token.

This makes `CifDict::ToString` ~15% faster as the trivial check allows us to skip the extra newline check for most tokens.

Reported in https://github.com/google-deepmind/alphafold3/issues/471.

PiperOrigin-RevId: 791587762
Change-Id: If6616263e570dff534633bd0302afd23471f946f
This commit is contained in:
Augustin Zidek
2025-08-06 02:04:51 -07:00
committed by Copybara-Service
parent f2edd59f87
commit 722437b4ff

View File

@@ -150,19 +150,39 @@ absl::StatusOr<std::vector<absl::string_view>> TokenizeInternal(
return tokens;
}
// Reports whether `value` can be emitted verbatim: it is non-empty and made up
// exclusively of ASCII alphanumerics, '.', '?' or '-', so it needs no quoting
// and is not multiline. An empty value is never trivial.
bool IsTrivialToken(const absl::string_view value) {
  if (value.empty()) {
    return false;
  }
  for (const char ch : value) {
    const bool is_plain_char =
        absl::ascii_isalnum(ch) || ch == '.' || ch == '?' || ch == '-';
    if (!is_plain_char) {
      // Anything outside the plain set needs the slower quoting analysis.
      return false;
    }
  }
  return true;
}
// Reports whether `value` has to be treated as a multiline token: it either
// contains a newline, or it contains both a single quote and a double quote.
bool IsMultiLineToken(const absl::string_view value) {
  bool seen_single_quote = false;
  bool seen_double_quote = false;
  for (const char ch : value) {
    switch (ch) {
      case '\n':
        // A newline settles the answer immediately.
        return true;
      case '\'':
        seen_single_quote = true;
        break;
      case '"':
        seen_double_quote = true;
        break;
      default:
        break;
    }
  }
  return seen_single_quote && seen_double_quote;
}
absl::string_view GetEscapeQuote(const absl::string_view value) {
// Empty values should not happen, but if so, they should be quoted.
if (value.empty()) {
return "\"";
}
// Shortcut for the most common cases where no quoting needed.
if (std::all_of(value.begin(), value.end(), [](char c) {
return absl::ascii_isalnum(c) || c == '.' || c == '?' || c == '-';
})) {
return "";
}
// The value must not start with one of these CIF keywords.
if (absl::StartsWithIgnoreCase(value, "data_") ||
absl::StartsWithIgnoreCase(value, "loop_") ||
@@ -179,14 +199,27 @@ absl::string_view GetEscapeQuote(const absl::string_view value) {
return "\"";
}
// No quotes or whitespace allowed inside.
// No quotes or whitespace allowed inside. Rare case when both double and
// single quotes are present is handled by IsMultiLineToken.
bool use_double_quote = true;
bool use_single_quote = true;
bool needs_quote = false;
for (const char c : value) {
if (c == '"') {
return "'";
} else if (c == '\'' || c == ' ' || c == '\t') {
return "\"";
if (c == ' ' || c == '\t') {
needs_quote = true;
} else if (c == '"') {
needs_quote = true;
use_double_quote = false;
} else if (c == '\'') {
needs_quote = true;
use_single_quote = false;
}
}
if (needs_quote && use_double_quote) {
return "\"";
} else if (needs_quote && use_single_quote) {
return "'";
}
return "";
}
@@ -254,7 +287,11 @@ class Column {
int max_value_length = 0;
for (size_t i = 0; i < values->size(); ++i) {
absl::string_view value = (*values)[i];
if (absl::StrContains(value, '\n')) {
if (IsTrivialToken(value)) {
// Shortcut for the most common cases where no quoting/multiline needed.
max_value_length = std::max<int>(max_value_length, value.size());
continue;
} else if (IsMultiLineToken(value)) {
values_with_newlines_.insert(i);
} else {
absl::string_view quote = GetEscapeQuote(value);