From 4292bd6c95b3ae34cf00d4e5bc09156ef4f4eb21 Mon Sep 17 00:00:00 2001 From: Boris Fomitchev Date: Fri, 14 Nov 2025 16:05:10 -0800 Subject: [PATCH] Addressing code review comments: test skip, docs Signed-off-by: Boris Fomitchev --- docs/source/Inference.md | 26 +++++++++++++++++++++++++- tests/compare_utils.py | 3 +++ tests/test_cuequivariance.py | 3 ++- 3 files changed, 30 insertions(+), 2 deletions(-) diff --git a/docs/source/Inference.md b/docs/source/Inference.md index 1e40f59..1f7678f 100644 --- a/docs/source/Inference.md +++ b/docs/source/Inference.md @@ -143,14 +143,38 @@ Some commonly used command line flags are here. A full list of flags can be view ### Advanced Options for Increasing Efficiency -#### Speeding up inference +#### Turning on TF32 (TensorFloat-32) precision on compatible hardware + +When running on latest NVIDIA GPUs, starting from Ampere, you can enable TF32 precision to get about 1.3x performance boost. +TF32 uses 1 sign bit, 8 exponent bits (like FP32), and 10 mantissa (significand) bits (like FP16), packed into a 32-bit word. +It was found generally safe to use OF2 with TF32 instead of full FP32. To enable it globally in Torch: + +``` +torch.backends.cuda.matmul.allow_tf32 = True # Enable TF32 for matrix multiplications +torch.backends.cudnn.allow_tf32 = True # Enable TF32 for convolutions +``` +Make sure NVIDIA_TF32_OVERRIDE environment variable is either not defined or set to 1. + +#### Applying lower BF16 precision to EvoformerStack and ExtraMSAStack + +BF16 occupies 16 bits: 1 sign bit, 8 exponent bits (same as FP32), and 7 mantissa (fraction) bits. Its dynamic range is equivalent to FP32, but BF16 can only represent numbers with about three decimal digits of precision. +It was found generally safe to apply BF16 precision cast to EvoformerStack and ExtraMSAStack. This allows to achieve ~1.5x speedup compared to TF32 inferenceof the whole model. +To apply BF16, use '--precision=bf16' argument. '--precision=fp16' is also supported, but not recommended due to numerical instability. + +#### Speeding up inference with custom attention and multiplicative update kernels The **DeepSpeed DS4Sci_EvoformerAttention kernel** is a memory-efficient attention kernel developed as part of a collaboration between OpenFold and the DeepSpeed4Science initiative. If your system supports deepseed, using deepspeed generally leads an inference speedup of 2 - 3x without significant additional memory use. You may specify this option by selecting the `--use_deepspeed_inference` argument. +OF2 supports the CUEquivariance [triangle_multiplicative_update](https://docs.nvidia.com/cuda/cuequivariance/api/generated/cuequivariance_torch.triangle_multiplicative_update.html) and [triangle_attention](https://docs.nvidia.com/cuda/cuequivariance/api/generated/cuequivariance_torch.triangle_attention.html) kernels which can speed up inference/training of the model 1.2 to 1.5 on top of DeepSpeed and even more for sequences with > 1000 residues. To enable, pass '--use_cuequivariance_attention' and '--use_cuequivariance_multiplicative_update' arguments to run_pretrained_openfold.py. +CUEquivariance does fall back to DeepSpeed on shapes it does not efficiently support, so enable both for best effect. + If DeepSpeed is unavailable for your system, you may also try using [FlashAttention](https://github.com/HazyResearch/flash-attention) by adding `globals.use_flash = True` to the `--experiment_config_json`. Note that FlashAttention appears to work best for sequences with < 1000 residues. +#### Speeding up inference with TensorRT +Alternatively (or together with CUEquivariance), you can try applying [TensorRT](https://developer.nvidia.com/tensorrt) to key modules. OF2 comes with built-in TensorRT lazy compilation support for EvoformerStack. To enable, pass '--trt_mode-run', '--trt_engine_dir', '--trt_max_sequence_len', '--trt_num_profiles' and '--trt_optimization_level' arguments to run_pretrained_openfold.py. + #### Large-scale batch inference For large-scale batch inference, we offer an optional tracing mode, which massively improves runtimes at the cost of a lengthy model compilation process. To enable it, add `--trace_model` to the inference command. diff --git a/tests/compare_utils.py b/tests/compare_utils.py index 326f5e2..4ef6412 100644 --- a/tests/compare_utils.py +++ b/tests/compare_utils.py @@ -27,6 +27,9 @@ def skip_unless_ds4s_installed(): "deepspeed.ops.deepspeed4science") is not None return unittest.skipUnless(ds4s_is_installed, "Requires DeepSpeed with version ≥ 0.10.4") +def skip_unless_cueq_installed(): + cueq_is_installed = importlib.util.find_spec("cuequivariance_torch") is not None + return unittest.skipUnless(cueq_is_installed, "Requires cuEquivariance") def skip_unless_flash_attn_installed(): fa_is_installed = importlib.util.find_spec("flash_attn") is not None diff --git a/tests/test_cuequivariance.py b/tests/test_cuequivariance.py index 3ab3425..ba6cfdc 100644 --- a/tests/test_cuequivariance.py +++ b/tests/test_cuequivariance.py @@ -36,7 +36,7 @@ import tests.compare_utils as compare_utils from tests.data_utils import random_template_feats, random_attention_inputs - +@compare_utils.skip_unless_cueq_installed() class TestCuEquivarianceKernel(unittest.TestCase): def test_compare_template_stack(self): @@ -133,6 +133,7 @@ class TestCuEquivarianceKernel(unittest.TestCase): # https://github.com/aqlaboratory/openfold/issues/532 with torch.no_grad(), torch.cuda.amp.autocast(dtype=torch.float32): model = compare_utils.get_global_pretrained_openfold() + model.globals.use_deepspeed_evo_attention = False model.globals.use_cuequivariance_attention = False model.globals.use_cuequivariance_multiplicative_update = False out_repro = model(batch)