Add pLDDT and PAE files saving to AF runner, update Colab.

PAE methods are migrated from Colab notebook_utils.py to AFOS and included in run_alphafold.py to save a new file. Colab notebook is updated accordingly. pLDDT data is also transformed into JSON and saved as an additional output file.

PiperOrigin-RevId: 525687298
Change-Id: If7f0bcf7d3b39901dae58a67958eaa5687645de2
This commit is contained in:
DeepMind
2023-04-20 02:21:56 -07:00
committed by Copybara-Service
parent 4d83e3fc08
commit 2819de4ddd
7 changed files with 167 additions and 37 deletions

View File

@@ -14,7 +14,9 @@
"""Functions for processing confidence metrics."""
import json
from typing import Dict, Optional, Tuple
import numpy as np
import scipy.special
@@ -36,6 +38,43 @@ def compute_plddt(logits: np.ndarray) -> np.ndarray:
return predicted_lddt_ca * 100
def _confidence_category(score: float) -> str:
"""Categorizes pLDDT into: disordered (D), low (L), medium (M), high (H)."""
if 0 <= score < 50:
return 'D'
if 50 <= score < 70:
return 'L'
elif 70 <= score < 90:
return 'M'
elif 90 <= score <= 100:
return 'H'
else:
raise ValueError(f'Invalid pLDDT score {score}')
def confidence_json(plddt: np.ndarray) -> str:
  """Serialises per-residue pLDDT scores and categories as compact JSON.

  The JSON object holds three parallel lists: `residueNumber` (1-based residue
  indices), `confidenceScore` (each score rounded to 2 decimal places) and
  `confidenceCategory` (the category `_confidence_category` assigns to the
  unrounded score).

  Args:
    plddt: Rank 1 array with one pLDDT value per residue.

  Returns:
    The JSON string, written without whitespace between tokens.

  Raises:
    ValueError: If `plddt` is not a rank 1 array.
  """
  if plddt.ndim != 1:
    raise ValueError(f'The plddt array must be rank 1, got: {plddt.shape}.')

  num_residues = plddt.shape[0]
  residue_numbers = list(range(1, num_residues + 1))

  rounded_scores = []
  for value in plddt:
    rounded_scores.append(round(float(value), 2))

  # Categories are derived from the raw scores, not from the rounded ones.
  categories = list(map(_confidence_category, plddt))

  payload = {
      'residueNumber': residue_numbers,
      'confidenceScore': rounded_scores,
      'confidenceCategory': categories,
  }
  return json.dumps(payload, separators=(',', ':'))
def _calculate_bin_centers(breaks: np.ndarray):
"""Gets the bin centers from the bin edges.
@@ -108,6 +147,32 @@ def compute_predicted_aligned_error(
}
def pae_json(pae: np.ndarray, max_pae: float) -> str:
  """Returns the PAE in the same format as is used in the AFDB.

  Note that the values are presented as floats to 1 decimal place, whereas AFDB
  returns integer values.

  Args:
    pae: The n_res x n_res PAE array.
    max_pae: The maximum possible PAE value. Besides a plain Python number, any
      object exposing `.item()` (e.g. a NumPy scalar or a 0-d array) is
      accepted and converted to the equivalent Python number.

  Returns:
    PAE output format as a JSON string.

  Raises:
    ValueError: If `pae` is not a square rank 2 array.
  """
  # Check the PAE array is the correct shape.
  if pae.ndim != 2 or pae.shape[0] != pae.shape[1]:
    raise ValueError(f'PAE must be a square matrix, got {pae.shape}')

  # json.dumps cannot serialise NumPy scalars such as np.float32, so unwrap
  # them here instead of requiring every caller to convert beforehand.
  if hasattr(max_pae, 'item'):
    max_pae = max_pae.item()

  # Round the predicted aligned errors to 1 decimal place.
  rounded_errors = np.round(pae.astype(np.float64), decimals=1)
  formatted_output = [{
      'predicted_aligned_error': rounded_errors.tolist(),
      'max_predicted_aligned_error': max_pae,
  }]
  return json.dumps(formatted_output, indent=None, separators=(',', ':'))
def predicted_tm_score(
logits: np.ndarray,
breaks: np.ndarray,

View File

@@ -0,0 +1,48 @@
# Copyright 2023 DeepMind Technologies Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Test confidence metrics."""
from absl.testing import absltest
from alphafold.common import confidence
import numpy as np
class ConfidenceTest(absltest.TestCase):
  """Tests for the JSON serialisation helpers of the `confidence` module."""

  def test_pae_json(self):
    # Values are expected to be rounded to 1 decimal place.
    pae = np.array([[0.01, 13.12345], [20.0987, 0.0]])
    pae_json = confidence.pae_json(pae=pae, max_pae=31.75)
    self.assertEqual(
        pae_json, '[{"predicted_aligned_error":[[0.0,13.1],[20.1,0.0]],'
        '"max_predicted_aligned_error":31.75}]')

  def test_pae_json_non_square_raises(self):
    with self.assertRaises(ValueError):
      confidence.pae_json(pae=np.zeros((2, 3)), max_pae=31.75)

  def test_confidence_json(self):
    plddt = np.array([42, 42.42])
    confidence_json = confidence.confidence_json(plddt=plddt)
    self.assertEqual(
        confidence_json,
        ('{"residueNumber":[1,2],'
         '"confidenceScore":[42.0,42.42],'
         '"confidenceCategory":["D","D"]}'),
    )

  def test_confidence_json_categories(self):
    # One score per category, placed on or next to the category boundaries.
    plddt = np.array([49.99, 50.0, 70.0, 90.0])
    confidence_json = confidence.confidence_json(plddt=plddt)
    self.assertEqual(
        confidence_json,
        ('{"residueNumber":[1,2,3,4],'
         '"confidenceScore":[49.99,50.0,70.0,90.0],'
         '"confidenceCategory":["D","L","M","H"]}'),
    )

  def test_confidence_json_wrong_rank_raises(self):
    with self.assertRaises(ValueError):
      confidence.confidence_json(plddt=np.zeros((2, 2)))
if __name__ == '__main__':
absltest.main()

View File

@@ -13,7 +13,6 @@
# limitations under the License.
"""Helper methods for the AlphaFold Colab notebook."""
import json
from typing import AbstractSet, Any, Mapping, Optional, Sequence
from alphafold.common import residue_constants
@@ -143,31 +142,6 @@ def empty_placeholder_template_features(
}
def get_pae_json(pae: np.ndarray, max_pae: float) -> str:
  """Encodes a PAE matrix as a compact JSON string in the AFDB format.

  The errors are written as floats rounded to 1 decimal place, whereas AFDB
  returns integer values.

  Args:
    pae: The n_res x n_res PAE array.
    max_pae: The maximum possible PAE value.

  Returns:
    A JSON string holding a single-element list whose entry contains the
    rounded PAE matrix and the maximum PAE value.

  Raises:
    ValueError: If `pae` is not a square rank 2 array.
  """
  is_square = pae.ndim == 2 and pae.shape[0] == pae.shape[1]
  if not is_square:
    raise ValueError(f'PAE must be a square matrix, got {pae.shape}')

  # Cast to float64 before rounding to 1 decimal place.
  errors_1dp = pae.astype(np.float64).round(1)
  entry = {
      'predicted_aligned_error': errors_1dp.tolist(),
      'max_predicted_aligned_error': max_pae,
  }
  return json.dumps([entry], separators=(',', ':'))
def check_cell_execution_order(
cells_ran: AbstractSet[int], cell_number: int) -> None:
"""Check that the cell execution order is correct.

View File

@@ -184,13 +184,6 @@ class NotebookUtilsTest(parameterized.TestCase):
[np.array([], dtype=templates.TEMPLATE_FEATURES[feat_name]).dtype
for feat_name in template_features])
def test_get_pae_json(self):
  """Checks the compact JSON produced by `get_pae_json` for a 2x2 matrix."""
  expected_json = (
      '[{"predicted_aligned_error":[[0.0,13.1],[20.1,0.0]],'
      '"max_predicted_aligned_error":31.75}]'
  )
  actual_json = notebook_utils.get_pae_json(
      pae=np.array([[0.01, 13.12345], [20.0987, 0.0]]), max_pae=31.75
  )
  self.assertEqual(actual_json, expected_json)
def test_check_cell_execution_order_correct(self):
notebook_utils.check_cell_execution_order({1, 2}, 3)

View File

@@ -374,6 +374,7 @@
"from alphafold.data import pipeline_multimer\n",
"from alphafold.data.tools import jackhmmer\n",
"\n",
"from alphafold.common import confidence\n",
"from alphafold.common import protein\n",
"\n",
"from alphafold.relax import relax\n",
@@ -786,7 +787,7 @@
"pae_output_path = os.path.join(output_dir, 'predicted_aligned_error.json')\n",
"if pae_outputs:\n",
" # Save predicted aligned error in the same format as the AF EMBL DB.\n",
" pae_data = notebook_utils.get_pae_json(pae=pae, max_pae=max_pae.item())\n",
" pae_data = confidence.pae_json(pae=pae, max_pae=max_pae.item())\n",
" with open(pae_output_path, 'w') as f:\n",
" f.write(pae_data)\n",
"\n",

View File

@@ -22,11 +22,12 @@ import random
import shutil
import sys
import time
from typing import Any, Dict, Mapping, Union
from typing import Any, Dict, Union
from absl import app
from absl import flags
from absl import logging
from alphafold.common import confidence
from alphafold.common import protein
from alphafold.common import residue_constants
from alphafold.data import pipeline
@@ -171,6 +172,38 @@ def _jnp_to_np(output: Dict[str, Any]) -> Dict[str, Any]:
return output
def _save_confidence_json_file(
    plddt: np.ndarray, output_dir: str, model_name: str
) -> None:
  """Writes the per-residue confidence JSON of one model to a file.

  The content is produced by `confidence.confidence_json` and is written to
  `confidence_<model_name>.json` inside `output_dir`.

  Args:
    plddt: The pLDDT array passed on to `confidence.confidence_json`.
    output_dir: Directory to which the file is saved.
    model_name: Name of a model; it becomes part of the file name.
  """
  json_content = confidence.confidence_json(plddt)

  file_name = f'confidence_{model_name}.json'
  with open(os.path.join(output_dir, file_name), 'w') as out_file:
    out_file.write(json_content)
def _save_pae_json_file(
    pae: np.ndarray, max_pae: float, output_dir: str, model_name: str
) -> None:
  """Writes the PAE JSON of one model to a file.

  The content is produced by `confidence.pae_json` and is written to
  `pae_<model_name>.json` inside `output_dir`. This function does not check
  whether PAE data is available; it expects to be given the PAE values.

  Args:
    pae: The n_res x n_res PAE array.
    max_pae: The maximum possible PAE value.
    output_dir: Directory to which files are saved.
    model_name: Name of a model; it becomes part of the file name.
  """
  json_content = confidence.pae_json(pae, max_pae)

  file_name = f'pae_{model_name}.json'
  with open(os.path.join(output_dir, file_name), 'w') as out_file:
    out_file.write(json_content)
def predict_structure(
fasta_path: str,
fasta_name: str,
@@ -240,8 +273,17 @@ def predict_structure(
model_name, fasta_name, t_diff)
plddt = prediction_result['plddt']
_save_confidence_json_file(plddt, output_dir, model_name)
ranking_confidences[model_name] = prediction_result['ranking_confidence']
if (
'predicted_aligned_error' in prediction_result
and 'max_predicted_aligned_error' in prediction_result
):
pae = prediction_result['predicted_aligned_error']
max_pae = prediction_result['max_predicted_aligned_error']
_save_pae_json_file(pae, float(max_pae), output_dir, model_name)
# Remove jax dependency from results.
np_prediction_result = _jnp_to_np(dict(prediction_result))

View File

@@ -84,8 +84,15 @@ class RunAlphafoldTest(parameterized.TestCase):
target_output_files = os.listdir(os.path.join(out_dir, 'test'))
expected_files = [
'features.pkl', 'msas', 'ranked_0.pdb', 'ranking_debug.json',
'result_model1.pkl', 'timings.json', 'unrelaxed_model1.pdb',
'features.pkl',
'msas',
'ranked_0.pdb',
'ranking_debug.json',
'result_model1.pkl',
'timings.json',
'unrelaxed_model1.pdb',
'pae_model1.json',
'confidence_model1.json',
]
if models_to_relax == run_alphafold.ModelsToRelax.ALL:
expected_files.extend(['relaxed_model1.pdb', 'relax_metrics.json'])