Mirror of https://github.com/google-deepmind/alphafold3.git
(synced 2026-06-02 11:54:36 +08:00)
Optimize CIF loop parsing by ~4% - remove modulo in the hot path
PiperOrigin-RevId: 903070425
Change-Id: Ibc7fc6735231504a4a9c5f11793854ce332d8c46
This commit is contained in:
Committed by: Copybara-Service
Parent: f6a5aecf6e
Commit: cfeeedd24d
@@ -384,13 +384,17 @@ absl::StatusOr<CifDict> CifDict::FromString(absl::string_view cif_string) {
|
||||
cif["data_"].emplace_back(first_token);
|
||||
|
||||
// Counters for CIF loop_ regions.
|
||||
int loop_token_index = 0;
|
||||
int num_loop_keys = 0;
|
||||
// Loops have usually O(10) columns but could have up to O(10^6) rows. It is
|
||||
// therefore wasteful to look up the cif vector where to add a loop value
|
||||
// since that means doing `columns * rows` map lookups. If we save pointers to
|
||||
// these loop column fields instead, we need only 1 cif lookup per column.
|
||||
std::vector<std::vector<std::string>*> loop_column_values;
|
||||
// Use a resetting column index instead of computing modulo on every token.
|
||||
// This is the hot path for _atom_site tables with millions of values.
|
||||
int token_column_index = 0;
|
||||
// Total number of loop values seen (for column size validation).
|
||||
int loop_token_count = 0;
|
||||
|
||||
// Skip the first element since we already processed it above.
|
||||
for (auto token_itr = tokens->begin() + 1; token_itr != tokens->end();
|
||||
@@ -399,23 +403,22 @@ absl::StatusOr<CifDict> CifDict::FromString(absl::string_view cif_string) {
|
||||
if (absl::EqualsIgnoreCase(token, "loop_")) {
|
||||
// A new loop started, check the previous loop and get rid of its data.
|
||||
absl::Status loop_status =
|
||||
CheckLoopColumnSizes(num_loop_keys, loop_token_index);
|
||||
CheckLoopColumnSizes(num_loop_keys, loop_token_count);
|
||||
if (!loop_status.ok()) {
|
||||
return loop_status;
|
||||
}
|
||||
loop_flag = true;
|
||||
loop_column_values.clear();
|
||||
loop_token_index = 0;
|
||||
token_column_index = 0;
|
||||
loop_token_count = 0;
|
||||
num_loop_keys = 0;
|
||||
continue;
|
||||
} else if (loop_flag) {
|
||||
// The second condition checks we are in the first column. Some mmCIF
|
||||
// files (e.g. 4q9r) have values in later columns starting with an
|
||||
// underscore and we don't want to read these as keys.
|
||||
int token_column_index =
|
||||
num_loop_keys == 0 ? 0 : loop_token_index % num_loop_keys;
|
||||
if (token_column_index == 0 && !token.empty() && token[0] == '_') {
|
||||
if (loop_token_index > 0) {
|
||||
if (loop_token_count > 0) {
|
||||
// We are out of the loop.
|
||||
loop_flag = false;
|
||||
} else {
|
||||
@@ -449,7 +452,11 @@ absl::StatusOr<CifDict> CifDict::FromString(absl::string_view cif_string) {
|
||||
" expected at most: ", loop_column_values.size()));
|
||||
}
|
||||
loop_column_values[token_column_index]->emplace_back(token);
|
||||
loop_token_index++;
|
||||
loop_token_count++;
|
||||
if (++token_column_index == num_loop_keys) {
|
||||
// We have completed a row, reset the column index for the next row.
|
||||
token_column_index = 0;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
}
|
||||
@@ -470,7 +477,7 @@ absl::StatusOr<CifDict> CifDict::FromString(absl::string_view cif_string) {
|
||||
}
|
||||
}
|
||||
absl::Status loop_status =
|
||||
CheckLoopColumnSizes(num_loop_keys, loop_token_index);
|
||||
CheckLoopColumnSizes(num_loop_keys, loop_token_count);
|
||||
if (!loop_status.ok()) {
|
||||
return loop_status;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user