Optimize CIF loop parsing by ~4% - remove modulo in the hot path

PiperOrigin-RevId: 903070425 Change-Id: Ibc7fc6735231504a4a9c5f11793854ce332d8c46
2026-06-02 11:54:36 +08:00 · 2026-04-21 01:36:47 -07:00
parent f6a5aecf6e
commit cfeeedd24d
1 changed files with 15 additions and 8 deletions
--- a/src/alphafold3/parsers/cpp/cif_dict_lib.cc
+++ b/src/alphafold3/parsers/cpp/cif_dict_lib.cc
@@ -384,13 +384,17 @@ absl::StatusOr<CifDict> CifDict::FromString(absl::string_view cif_string) {
  cif["data_"].emplace_back(first_token);

  // Counters for CIF loop_ regions.
-  int loop_token_index = 0;
  int num_loop_keys = 0;
  // Loops have usually O(10) columns but could have up to O(10^6) rows. It is
  // therefore wasteful to look up the cif vector where to add a loop value
  // since that means doing `columns * rows` map lookups. If we save pointers to
  // these loop column fields instead, we need only 1 cif lookup per column.
  std::vector<std::vector<std::string>*> loop_column_values;
+  // Use a resetting column index instead of computing modulo on every token.
+  // This is the hot path for _atom_site tables with millions of values.
+  int token_column_index = 0;
+  // Total number of loop values seen (for column size validation).
+  int loop_token_count = 0;

  // Skip the first element since we already processed it above.
  for (auto token_itr = tokens->begin() + 1; token_itr != tokens->end();
@@ -399,23 +403,22 @@ absl::StatusOr<CifDict> CifDict::FromString(absl::string_view cif_string) {
    if (absl::EqualsIgnoreCase(token, "loop_")) {
      // A new loop started, check the previous loop and get rid of its data.
      absl::Status loop_status =
-          CheckLoopColumnSizes(num_loop_keys, loop_token_index);
+          CheckLoopColumnSizes(num_loop_keys, loop_token_count);
      if (!loop_status.ok()) {
        return loop_status;
      }
      loop_flag = true;
      loop_column_values.clear();
-      loop_token_index = 0;
+      token_column_index = 0;
+      loop_token_count = 0;
      num_loop_keys = 0;
      continue;
    } else if (loop_flag) {
      // The second condition checks we are in the first column. Some mmCIF
      // files (e.g. 4q9r) have values in later columns starting with an
      // underscore and we don't want to read these as keys.
-      int token_column_index =
-          num_loop_keys == 0 ? 0 : loop_token_index % num_loop_keys;
      if (token_column_index == 0 && !token.empty() && token[0] == '_') {
-        if (loop_token_index > 0) {
+        if (loop_token_count > 0) {
          // We are out of the loop.
          loop_flag = false;
        } else {
@@ -449,7 +452,11 @@ absl::StatusOr<CifDict> CifDict::FromString(absl::string_view cif_string) {
                           " expected at most: ", loop_column_values.size()));
        }
        loop_column_values[token_column_index]->emplace_back(token);
-        loop_token_index++;
+        loop_token_count++;
+        if (++token_column_index == num_loop_keys) {
+          // We have completed a row, reset the column index for the next row.
+          token_column_index = 0;
+        }
        continue;
      }
    }
@@ -470,7 +477,7 @@ absl::StatusOr<CifDict> CifDict::FromString(absl::string_view cif_string) {
    }
  }
  absl::Status loop_status =
-      CheckLoopColumnSizes(num_loop_keys, loop_token_index);
+      CheckLoopColumnSizes(num_loop_keys, loop_token_count);
  if (!loop_status.ok()) {
    return loop_status;
  }