Add pLDDT and PAE files saving to AF runner, update Colab.

PAE methods are migrated from Colab notebook_utils.py to AFOS and included in run_alphafold.py to save a new file. Colab notebook is updated accordingly. pLDDT data is transformed into JSON and saved as well, as an additional output file. PiperOrigin-RevId: 525687298 Change-Id: If7f0bcf7d3b39901dae58a67958eaa5687645de2
2026-06-04 14:58:05 +08:00 · 2023-04-20 02:21:56 -07:00
parent 4d83e3fc08
commit 2819de4ddd
7 changed files with 167 additions and 37 deletions
--- a/alphafold/common/confidence.py
+++ b/alphafold/common/confidence.py
@@ -14,7 +14,9 @@

 """Functions for processing confidence metrics."""

+import json
 from typing import Dict, Optional, Tuple
+
 import numpy as np
 import scipy.special

@@ -36,6 +38,43 @@ def compute_plddt(logits: np.ndarray) -> np.ndarray:
  return predicted_lddt_ca * 100


+def _confidence_category(score: float) -> str:
+  """Maps a pLDDT score to disordered (D), low (L), medium (M) or high (H)."""
+  # Out-of-range values (including NaN) fail this check and are rejected.
+  if not 0 <= score <= 100:
+    raise ValueError(f'Invalid pLDDT score {score}')
+  if score < 50:
+    return 'D'
+  if score < 70:
+    return 'L'
+  if score < 90:
+    return 'M'
+  return 'H'
+
+
+def confidence_json(plddt: np.ndarray) -> str:
+  """Serializes per-residue pLDDT scores and categories to compact JSON.
+
+  Args:
+    plddt: Rank-1 array holding one pLDDT score per residue.
+
+  Returns:
+    JSON string with 1-based residue numbers, 2-decimal scores and categories.
+
+  Raises:
+    ValueError: If `plddt` has a rank different than 1.
+  """
+  if plddt.ndim != 1:
+    raise ValueError(f'The plddt array must be rank 1, got: {plddt.shape}.')
+
+  numbers = list(range(1, plddt.shape[0] + 1))
+  scores = [round(float(value), 2) for value in plddt]
+  categories = [_confidence_category(value) for value in plddt]
+  keys = ('residueNumber', 'confidenceScore', 'confidenceCategory')
+  payload = dict(zip(keys, (numbers, scores, categories)))
+  return json.dumps(payload, indent=None, separators=(',', ':'))
+
+
 def _calculate_bin_centers(breaks: np.ndarray):
  """Gets the bin centers from the bin edges.

@@ -108,6 +147,32 @@ def compute_predicted_aligned_error(
  }


+def pae_json(pae: np.ndarray, max_pae: float) -> str:
+  """Serializes a PAE matrix to the JSON layout used by the AFDB.
+
+  Unlike the AFDB, which returns integer values, the errors are emitted as
+  floats rounded to 1 decimal place.
+
+  Args:
+    pae: The n_res x n_res PAE array.
+    max_pae: The maximum possible PAE value.
+
+  Returns:
+    PAE output format as a JSON string.
+
+  Raises:
+    ValueError: If `pae` is not a square rank-2 array.
+  """
+  if not (pae.ndim == 2 and pae.shape[0] == pae.shape[1]):
+    raise ValueError(f'PAE must be a square matrix, got {pae.shape}')
+
+  # Cast to float64 first, then round every error to 1 decimal place.
+  errors = np.round(pae.astype(np.float64), decimals=1).tolist()
+  payload = dict(predicted_aligned_error=errors,
+                 max_predicted_aligned_error=max_pae)
+  return json.dumps([payload], indent=None, separators=(',', ':'))
+
+
 def predicted_tm_score(
    logits: np.ndarray,
    breaks: np.ndarray,
--- a/alphafold/common/confidence_test.py
+++ b/alphafold/common/confidence_test.py
@@ -0,0 +1,48 @@
+# Copyright 2023 DeepMind Technologies Limited
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Test confidence metrics."""
+
+
+from absl.testing import absltest
+from alphafold.common import confidence
+import numpy as np
+
+
+class ConfidenceTest(absltest.TestCase):
+  """Checks the JSON serialization of the PAE and pLDDT confidence outputs."""
+
+  def test_pae_json(self):
+    pae = np.array([[0.01, 13.12345], [20.0987, 0.0]])
+    pae_json = confidence.pae_json(pae=pae, max_pae=31.75)
+    self.assertEqual(
+        pae_json, '[{"predicted_aligned_error":[[0.0,13.1],[20.1,0.0]],'
+        '"max_predicted_aligned_error":31.75}]')
+
+  def test_confidence_json(self):
+    # Both scores are below 50, so both residues get category 'D'.
+    plddt = np.array([42, 42.42])
+
+    confidence_json = confidence.confidence_json(plddt=plddt)
+
+    self.assertEqual(
+        confidence_json,
+        ('{"residueNumber":[1,2],'
+         '"confidenceScore":[42.0,42.42],'
+         '"confidenceCategory":["D","D"]}'),
+    )
+
+
+if __name__ == '__main__':
+  absltest.main()  # Hand control to the absl test runner.
--- a/alphafold/notebooks/notebook_utils.py
+++ b/alphafold/notebooks/notebook_utils.py
@@ -13,7 +13,6 @@
 # limitations under the License.

 """Helper methods for the AlphaFold Colab notebook."""
-import json
 from typing import AbstractSet, Any, Mapping, Optional, Sequence

 from alphafold.common import residue_constants
@@ -143,31 +142,6 @@ def empty_placeholder_template_features(
  }


-def get_pae_json(pae: np.ndarray, max_pae: float) -> str:
-  """Returns the PAE in the same format as is used in the AFDB.
-
-  Note that the values are presented as floats to 1 decimal place,
-  whereas AFDB returns integer values.
-
-  Args:
-    pae: The n_res x n_res PAE array.
-    max_pae: The maximum possible PAE value.
-  Returns:
-    PAE output format as a JSON string.
-  """
-  # Check the PAE array is the correct shape.
-  if (pae.ndim != 2 or pae.shape[0] != pae.shape[1]):
-    raise ValueError(f'PAE must be a square matrix, got {pae.shape}')
-
-  # Round the predicted aligned errors to 1 decimal place.
-  rounded_errors = np.round(pae.astype(np.float64), decimals=1)
-  formatted_output = [{
-      'predicted_aligned_error': rounded_errors.tolist(),
-      'max_predicted_aligned_error': max_pae
-  }]
-  return json.dumps(formatted_output, indent=None, separators=(',', ':'))
-
-
 def check_cell_execution_order(
    cells_ran: AbstractSet[int], cell_number: int) -> None:
  """Check that the cell execution order is correct.
--- a/alphafold/notebooks/notebook_utils_test.py
+++ b/alphafold/notebooks/notebook_utils_test.py
@@ -184,13 +184,6 @@ class NotebookUtilsTest(parameterized.TestCase):
        [np.array([], dtype=templates.TEMPLATE_FEATURES[feat_name]).dtype
         for feat_name in template_features])

-  def test_get_pae_json(self):
-    pae = np.array([[0.01, 13.12345], [20.0987, 0.0]])
-    pae_json = notebook_utils.get_pae_json(pae=pae, max_pae=31.75)
-    self.assertEqual(
-        pae_json, '[{"predicted_aligned_error":[[0.0,13.1],[20.1,0.0]],'
-        '"max_predicted_aligned_error":31.75}]')
-
  def test_check_cell_execution_order_correct(self):
    notebook_utils.check_cell_execution_order({1, 2}, 3)

--- a/notebooks/AlphaFold.ipynb
+++ b/notebooks/AlphaFold.ipynb
@@ -374,6 +374,7 @@
        "from alphafold.data import pipeline_multimer\n",
        "from alphafold.data.tools import jackhmmer\n",
        "\n",
+        "from alphafold.common import confidence\n",
        "from alphafold.common import protein\n",
        "\n",
        "from alphafold.relax import relax\n",
@@ -786,7 +787,7 @@
        "pae_output_path = os.path.join(output_dir, 'predicted_aligned_error.json')\n",
        "if pae_outputs:\n",
        "  # Save predicted aligned error in the same format as the AF EMBL DB.\n",
-        "  pae_data = notebook_utils.get_pae_json(pae=pae, max_pae=max_pae.item())\n",
+        "  pae_data = confidence.pae_json(pae=pae, max_pae=max_pae.item())\n",
        "  with open(pae_output_path, 'w') as f:\n",
        "    f.write(pae_data)\n",
        "\n",
--- a/run_alphafold.py
+++ b/run_alphafold.py
@@ -22,11 +22,12 @@ import random
 import shutil
 import sys
 import time
-from typing import Any, Dict, Mapping, Union
+from typing import Any, Dict, Union

 from absl import app
 from absl import flags
 from absl import logging
+from alphafold.common import confidence
 from alphafold.common import protein
 from alphafold.common import residue_constants
 from alphafold.data import pipeline
@@ -171,6 +172,38 @@ def _jnp_to_np(output: Dict[str, Any]) -> Dict[str, Any]:
  return output


+def _save_confidence_json_file(
+    plddt: np.ndarray, output_dir: str, model_name: str
+) -> None:
+  """Writes the pLDDT confidence JSON of a model to `output_dir`.
+
+  The file is named `confidence_<model_name>.json`.
+  """
+  serialized = confidence.confidence_json(plddt)
+  output_path = os.path.join(output_dir, f'confidence_{model_name}.json')
+  with open(output_path, 'w') as f:
+    f.write(serialized)
+
+
+def _save_pae_json_file(
+    pae: np.ndarray, max_pae: float, output_dir: str, model_name: str
+) -> None:
+  """Serializes PAE data to JSON and writes it to `pae_<model_name>.json`.
+
+  Args:
+    pae: The n_res x n_res PAE array.
+    max_pae: The maximum possible PAE value.
+    output_dir: Directory to which files are saved.
+    model_name: Name of a model.
+  """
+  serialized = confidence.pae_json(pae, max_pae)
+  output_path = os.path.join(output_dir, f'pae_{model_name}.json')
+
+  # Text mode is sufficient: the payload is a plain JSON string.
+  with open(output_path, 'w') as f:
+    f.write(serialized)
+
+
 def predict_structure(
    fasta_path: str,
    fasta_name: str,
@@ -240,8 +273,17 @@ def predict_structure(
          model_name, fasta_name, t_diff)

    plddt = prediction_result['plddt']
+    _save_confidence_json_file(plddt, output_dir, model_name)
    ranking_confidences[model_name] = prediction_result['ranking_confidence']

+    if (
+        'predicted_aligned_error' in prediction_result
+        and 'max_predicted_aligned_error' in prediction_result
+    ):
+      pae = prediction_result['predicted_aligned_error']
+      max_pae = prediction_result['max_predicted_aligned_error']
+      _save_pae_json_file(pae, float(max_pae), output_dir, model_name)
+
    # Remove jax dependency from results.
    np_prediction_result = _jnp_to_np(dict(prediction_result))

--- a/run_alphafold_test.py
+++ b/run_alphafold_test.py
@@ -84,8 +84,15 @@ class RunAlphafoldTest(parameterized.TestCase):

    target_output_files = os.listdir(os.path.join(out_dir, 'test'))
    expected_files = [
-        'features.pkl', 'msas', 'ranked_0.pdb', 'ranking_debug.json',
-        'result_model1.pkl', 'timings.json', 'unrelaxed_model1.pdb',
+        'features.pkl',
+        'msas',
+        'ranked_0.pdb',
+        'ranking_debug.json',
+        'result_model1.pkl',
+        'timings.json',
+        'unrelaxed_model1.pdb',
+        'pae_model1.json',
+        'confidence_model1.json',
    ]
    if models_to_relax == run_alphafold.ModelsToRelax.ALL:
      expected_files.extend(['relaxed_model1.pdb', 'relax_metrics.json'])