mirror of
https://github.com/google-deepmind/alphafold3.git
synced 2026-06-02 11:54:36 +08:00
Fix handling of tokens that contain quotes surrounded by spaces in CifDict::ToString
* `A "A" A` must be quoted as `'A "A" A'`, and `B 'B' B` must be quoted as `"B 'B' B"`. * `C "C" 'C' C` must be quoted as `\n;C "C" 'C' C\n;`. * Previously, `CifDict` didn't quote these correctly and e.g. `A "A" A` would become three tokens because of the bad early return on the first space in the token. This makes `CifDict::ToString` ~15% faster as the trivial check allows us to skip the extra newline check for most tokens. Reported in https://github.com/google-deepmind/alphafold3/issues/471. PiperOrigin-RevId: 791587762 Change-Id: If6616263e570dff534633bd0302afd23471f946f
This commit is contained in:
committed by
Copybara-Service
parent
f2edd59f87
commit
722437b4ff
@@ -150,19 +150,39 @@ absl::StatusOr<std::vector<absl::string_view>> TokenizeInternal(
|
||||
return tokens;
|
||||
}
|
||||
|
||||
// Reports whether `value` can be written as-is: it must be non-empty and made
// up only of ASCII alphanumerics, '.', '?' or '-'. Such a token needs no
// quoting and isn't multiline.
bool IsTrivialToken(const absl::string_view value) {
  if (value.empty()) {
    return false;
  }

  for (const char ch : value) {
    const bool is_plain =
        absl::ascii_isalnum(ch) || ch == '.' || ch == '?' || ch == '-';
    if (!is_plain) {
      // One character outside the allowed set is enough to reject the token.
      return false;
    }
  }
  return true;
}
|
||||
|
||||
// Reports whether `value` is a multiline token: it either contains a newline,
// or contains both a single quote and a double quote.
bool IsMultiLineToken(const absl::string_view value) {
  bool seen_single = false;
  bool seen_double = false;
  for (const char ch : value) {
    switch (ch) {
      case '\n':
        // A newline alone settles the answer; no need to scan further.
        return true;
      case '\'':
        seen_single = true;
        break;
      case '"':
        seen_double = true;
        break;
      default:
        break;
    }
  }
  return seen_single && seen_double;
}
|
||||
|
||||
absl::string_view GetEscapeQuote(const absl::string_view value) {
|
||||
// Empty values should not happen, but if so, they should be quoted.
|
||||
if (value.empty()) {
|
||||
return "\"";
|
||||
}
|
||||
|
||||
// Shortcut for the most common cases where no quoting needed.
|
||||
if (std::all_of(value.begin(), value.end(), [](char c) {
|
||||
return absl::ascii_isalnum(c) || c == '.' || c == '?' || c == '-';
|
||||
})) {
|
||||
return "";
|
||||
}
|
||||
|
||||
// The value must not start with one of these CIF keywords.
|
||||
if (absl::StartsWithIgnoreCase(value, "data_") ||
|
||||
absl::StartsWithIgnoreCase(value, "loop_") ||
|
||||
@@ -179,14 +199,27 @@ absl::string_view GetEscapeQuote(const absl::string_view value) {
|
||||
return "\"";
|
||||
}
|
||||
|
||||
// No quotes or whitespace allowed inside.
|
||||
// No quotes or whitespace allowed inside. Rare case when both double and
|
||||
// single quotes are present is handled by IsMultiLineToken.
|
||||
bool use_double_quote = true;
|
||||
bool use_single_quote = true;
|
||||
bool needs_quote = false;
|
||||
for (const char c : value) {
|
||||
if (c == '"') {
|
||||
return "'";
|
||||
} else if (c == '\'' || c == ' ' || c == '\t') {
|
||||
return "\"";
|
||||
if (c == ' ' || c == '\t') {
|
||||
needs_quote = true;
|
||||
} else if (c == '"') {
|
||||
needs_quote = true;
|
||||
use_double_quote = false;
|
||||
} else if (c == '\'') {
|
||||
needs_quote = true;
|
||||
use_single_quote = false;
|
||||
}
|
||||
}
|
||||
if (needs_quote && use_double_quote) {
|
||||
return "\"";
|
||||
} else if (needs_quote && use_single_quote) {
|
||||
return "'";
|
||||
}
|
||||
return "";
|
||||
}
|
||||
|
||||
@@ -254,7 +287,11 @@ class Column {
|
||||
int max_value_length = 0;
|
||||
for (size_t i = 0; i < values->size(); ++i) {
|
||||
absl::string_view value = (*values)[i];
|
||||
if (absl::StrContains(value, '\n')) {
|
||||
if (IsTrivialToken(value)) {
|
||||
// Shortcut for the most common cases where no quoting/multiline needed.
|
||||
max_value_length = std::max<int>(max_value_length, value.size());
|
||||
continue;
|
||||
} else if (IsMultiLineToken(value)) {
|
||||
values_with_newlines_.insert(i);
|
||||
} else {
|
||||
absl::string_view quote = GetEscapeQuote(value);
|
||||
|
||||
Reference in New Issue
Block a user