feat: update ligand templating, bump atomworks version

2026-06-04 13:24:22 +08:00 · 2025-09-17 21:41:57 -07:00
parent b3e9466b52
commit 2e218aafc2
10 changed files with 6290 additions and 5703 deletions
--- a/docs/rf3/examples/8cdz.cif
+++ b/docs/rf3/examples/8cdz.cif
--- a/docs/rf3/examples/8cdz_templating_ligand.json
+++ b/docs/rf3/examples/8cdz_templating_ligand.json
@@ -1,12 +0,0 @@
-{
-    "name": "8cdz_templating_ligand",
-    "components": [
-        {
-            "path": "docs/rf3/examples/8cdz.cif"
-        }
-    ],
-    "ground_truth_conformer_selection": ["E"],
-    "msa_paths": {
-        "A": "docs/rf3/examples/msas/8cdz_A.a3m.gz"
-    }
-}
--- a/docs/rf3/examples/9dfn.cif
+++ b/docs/rf3/examples/9dfn.cif
--- a/docs/rf3/examples/9dfn_template_ligand_and_protein.json
+++ b/docs/rf3/examples/9dfn_template_ligand_and_protein.json
@@ -0,0 +1,13 @@
+[
+    {
+        "name": "9dfn_template_ligand_and_protein",
+        "components": [
+            {
+                "path": "docs/rf3/examples/9dfn.cif"
+            }
+        ],
+        "template_selection": ["A", "C", "D"],
+        "ground_truth_conformer_selection": ["C", "D"]
+
+    }
+]
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -45,7 +45,7 @@ dependencies = [
    "beartype>=0.18.0,<1",

    # ... dataloading
-    "atomworks==1.0.0",
+    "atomworks==1.0.2"
 ]


--- a/src/modelhub/data/paired_msa.py
+++ b/src/modelhub/data/paired_msa.py
@@ -5,10 +5,8 @@ from pathlib import Path
 from typing import Any

 import numpy as np
-from biotite.structure import AtomArray, concatenate
-
+from atomworks.common import exists
 from atomworks.enums import ChainType
-from atomworks.ml.common import exists
 from atomworks.ml.datasets import logger
 from atomworks.ml.datasets.datasets import StructuralDatasetWrapper
 from atomworks.ml.datasets.parsers import (
@@ -23,6 +21,7 @@ from atomworks.ml.transforms._checks import (
 from atomworks.ml.transforms.base import Transform, TransformedDict
 from atomworks.ml.transforms.msa._msa_loading_utils import load_msa_data_from_path
 from atomworks.ml.utils.rng import capture_rng_states
+from biotite.structure import AtomArray, concatenate


 # input data wrapper that allows multiple input files separated by ':'
--- a/src/modelhub/data/pipelines.py
+++ b/src/modelhub/data/pipelines.py
@@ -2,8 +2,6 @@ from os import PathLike
 from pathlib import Path

 import numpy as np
-from omegaconf import DictConfig
-
 from atomworks.common import exists
 from atomworks.constants import (
    AF3_EXCLUDED_LIGANDS,
@@ -99,6 +97,8 @@ from atomworks.ml.transforms.msa.msa import (
 from atomworks.ml.transforms.random_atomize_residues import RandomAtomizeResidues
 from atomworks.ml.transforms.rdkit_utils import GetRDKitChiralCenters
 from atomworks.ml.transforms.symmetry import FindAutomorphismsWithNetworkX
+from omegaconf import DictConfig
+
 from modelhub.data.extra_xforms import CheckForNaNsInInputs
 from modelhub.data.pipeline_utils import (
    annotate_post_crop_hash,
@@ -272,7 +272,7 @@ def build_af3_transform_pipeline(
        TrainingRoute(
            SetOccToZeroOnBfactor(b_factor_min, b_factor_max),
        ),
-        RemoveUnresolvedPNUnits(),
+        TrainingRoute(RemoveUnresolvedPNUnits()),
        RemovePolymersWithTooFewResolvedResidues(min_residues=4),
        MaskPolymerResiduesWithUnresolvedFrameAtoms(),
        ConditionalRoute(
--- a/src/modelhub/inference_engines/README.md
+++ b/src/modelhub/inference_engines/README.md
@@ -67,7 +67,7 @@ For this example, the pTM in the `metrics.csv` should be `>0.8` (even without an

 RF3 supports `.a3m` and `.fasta` files as input MSA formats; `.a3m` is recommended. We do not at the moment support pre-paired MSAs (we will pair on-the-fly) or on-the-fly MSA computation, but both are on the roadmap. Please raise an issue if these limitations are critical for your project and we can prioritize accordingly.

-📝 **Example JSON configuration** (working example found at `docs/rf3/examples/3en2_from_json_with_msa.json`):
+📝 **Example JSON configuration** (full example found at `docs/rf3/examples/3en2_from_json_with_msa.json`):

 ```json
 {
@@ -93,7 +93,7 @@ rf3 fold inputs='docs/rf3/examples/3en2_from_json_with_msa.json'
 If performing inference from a prepared `.cif` file, MSAs can also be specified directly as a category within the raw CIF data.
 We will automatically extract the correct MSA paths during parsing.

-📝 **Example CIF header** (working example found at `docs/rf3/examples/3en2_from_file.cif`):
+📝 **Example CIF header** (full example found at `docs/rf3/examples/3en2_from_file.cif`):
 ```cif
 data_3EN2
 #
@@ -112,7 +112,7 @@ rf3 fold inputs='docs/rf3/3en2_from_file.cif
 > Without an MSA and using default settings, the above examples will trigger "early stopping." This means that if the model determines early on that a correct prediction is unlikely, it will stop computation and only output a `metrics.csv` and `.score` file to save compute resources. You can adjust this behavior using the `early_stopping_plddt_threshold` argument (see below). In our group, we find this argument can save wasted compute on erroneous inputs.

 > [!TIP]
-> To ensure that a provided MSA is loaded correctly, you may use the `raise_if_missing_msa_for_protein_of_length_n` command-line argument. For example, `rf3 fold inputs='docs/rf3/examples/3en2_from_json_with_msa.json raise_if_missing_msa_for_protein_of_length_n=10` would raise an error if there were any proteins >=10 residues without compatible MSAs.
+> To ensure that a provided MSA is loaded correctly, you may use the `raise_if_missing_msa_for_protein_of_length_n` command-line argument. For example, `rf3 fold inputs='docs/rf3/examples/3en2_from_json_with_msa.json' raise_if_missing_msa_for_protein_of_length_n=10` would raise an error if there were any proteins >=10 residues without compatible MSAs.

 > [!TIP]
 > For non-canonical amino acids, most MSA generation algorithms substitute `X` (unknown residue)! Ensure your MSAs adhere to this convention.
@@ -138,7 +138,7 @@ We will automatically distribute predictions across GPU's if running in a multi-

 ### 1️⃣ **Single JSON with Multiple Examples**

-📝 **Example JSON configuration** (working example found at `docs/rf3/examples/multiple_example_from_json.json`)
+📝 **Example JSON configuration** (full example found at `docs/rf3/examples/multiple_example_from_json.json`)

 ```json
 [
@@ -221,7 +221,7 @@ For convenience, we also support a `json` API analogous to that implemented by A
 > [!TIP]
 > **Performance Tip**: For small molecules, a general rule-of-thumb is that performance is best when using `CCD` codes directly, followed by `cif`/`sdf` files, and finally SMILES.

-📝 **Example JSON configuration with arbitrary biomolecules** (working example found at `docs/rf3/examples/7o1r_from_json.json`):
+📝 **Example JSON configuration with arbitrary biomolecules** (full example found at `docs/rf3/examples/7o1r_from_json.json`):
 ```json
 [
    {
@@ -292,7 +292,7 @@ Such `.cif` files complete with appropriate bonds can be composed with AtomWorks

 If you would prefer to use the JSON API, bonds can be explicitly given using PyMol-like strings of the form `chain_id/res_name/res_id/atom_name`. You will need to know the specific chain ID, residue name, residue ID, and atom name between the relevant pairs of atoms to unambiguously specify the bond.

-📝 **Example JSON configuration with covalent modifcations** (working example found at `docs/rf3/examples/7o1r_from_json.json`):
+📝 **Example JSON configuration with covalent modifications** (full example found at `docs/rf3/examples/7o1r_from_json.json`):
 ```json
 [
    {
@@ -448,7 +448,7 @@ RF3 uses AtomWorks' flexible `AtomSelectionStack` query syntax for specifying st

 It is often helpful to template one or multiple polymer chains while allowing the other chain(s) to fold unconstrained. We demonstrate with an nanobody-antigen use case below how to apply templates.

-📝 **Example JSON configuration templating the antigen and the nanobody framework** (working example found at `docs/rf3/examples/7xli_template_antigen_and_framework.json`):
+📝 **Example JSON configuration templating the antigen and the nanobody framework** (full example found at `docs/rf3/examples/7xli_template_antigen_and_framework.json`):
 ```json
 [
    {
@@ -479,30 +479,37 @@ You may also specify templating directly via the CLI using `template_selection="

 #### Templating a Small Molecule

-We find that enforcing a particular small molecule conformation has various applications within fixed-ligand protein docking, enzyme activity filtering, and other biologically relevant tasks. RF3 natively enables encouraging a particular small molecule conformations via the ground truth reference conformer track. For the moment, such an approach is only effective if we want to template the *entire* small molecule. Partial templating of small molecules is still possible via the `template_selection` API described earlier rather than the `ground_truth_conformer_selection` track.
+We find that enforcing a particular small molecule conformation has various applications within fixed-ligand protein docking, enzyme activity filtering, and other biologically relevant tasks. RF3 natively supports encouraging a particular small molecule conformation via both the ground truth reference conformer track and the template selection track.

-📝 **Example JSON configuration templating a small molecule** (working example found at `docs/rf3/examples/8cdz_templating_ligand.json`):
+For the moment, the ground truth conformer track is only effective if we want to template the *entire* small molecule. Partial templating of small molecules is still possible via the `template_selection` approach. We encourage exploration of both templating techniques to find what combination(s) are most effective for a given problem. Below we provide both, which represents the strongest possible conditioning.
+
+📝 **Example JSON configuration templating a small molecule and the corresponding protein** (full example found at `docs/rf3/examples/9dfn_template_ligand_and_protein.json`):
 ```json
-{
-    "name": "8cdz_templating_ligand",
-    "components": [
-        {
-            "path": "docs/rf3/examples/8cdz.cif"
-        }
-    ],
-    "ground_truth_conformer_selection": ["E"]
-}
+[
+    {
+        "name": "9dfn_template_ligand_and_protein",
+        "components": [
+            {
+                "path": "docs/rf3/examples/9dfn.cif"
+            }
+        ],
+        "template_selection": ["A", "C", "D"],
+        "ground_truth_conformer_selection": ["C", "D"]
+
+    }
+]
 ```

+> [!NOTE]
+> We template the protein above to avoid providing an MSA.
+
 🚀 **Run the example:**

 ```bash
-rf3 fold inputs='docs/rf3/examples/8cdz_templating_ligand.json'
+rf3 fold inputs='docs/rf3/examples/9dfn_template_ligand_and_protein.json'
 ```

-You may also specify the ground truth conformer selection directly via the CLI, e.g., using `ground_truth_conformer_selection="[E]`
-
-*Content coming soon...*
+You may also specify the ground truth conformer selection directly via the CLI, e.g., using `ground_truth_conformer_selection="[C,D]"`

 #### Templating an Interface

--- a/src/modelhub/inference_engines/rf3.py
+++ b/src/modelhub/inference_engines/rf3.py
@@ -5,11 +5,11 @@ from pathlib import Path
 import hydra
 import pandas as pd
 import torch
+from atomworks.io import parse
+from atomworks.io.transforms.categories import category_to_dict
 from lightning.fabric import seed_everything
 from omegaconf import OmegaConf

-from atomworks.io import parse
-from atomworks.io.transforms.categories import category_to_dict
 from modelhub.inference_engines.base import InferenceEngine
 from modelhub.model.RF3 import ShouldEarlyStopFn
 from modelhub.utils.datasets import (
@@ -342,17 +342,28 @@ class RF3InferenceEngine(InferenceEngine):
                else out["asym_unit"][0]
            )

-            # ... extract temlate information from the CIF file
-            templating_from_cif = category_to_dict(out["cif_block"], "templating")
+            # ... extract template information from the CIF file, if present
+            template_selection_from_CIF = (
+                category_to_dict(out["cif_block"], "template_selection")
+                if "cif_block" in out
+                else {}
+            )
+            ground_truth_conformer_selection_from_CIF = (
+                category_to_dict(out["cif_block"], "ground_truth_conformer_selection")
+                if "cif_block" in out
+                else {}
+            )

            # First, apply the template selection from the CIF file
            atom_array = apply_conformer_and_template_selections(
                atom_array,
                template_selection=list(
-                    templating_from_cif.get("template_selection", [])
+                    template_selection_from_CIF.get("template_selection", [])
                ),
                ground_truth_conformer_selection=list(
-                    templating_from_cif.get("ground_truth_conformer_selection", [])
+                    ground_truth_conformer_selection_from_CIF.get(
+                        "ground_truth_conformer_selection", []
+                    )
                ),
            )

--- a/src/modelhub/utils/inference.py
+++ b/src/modelhub/utils/inference.py
@@ -6,8 +6,6 @@ from pathlib import Path
 from typing import Iterable

 import numpy as np
-from biotite.structure import AtomArray
-
 from atomworks.common import as_list
 from atomworks.enums import GroundTruthConformerPolicy
 from atomworks.io.tools.inference import (
@@ -16,6 +14,8 @@ from atomworks.io.tools.inference import (
 )
 from atomworks.io.utils.io_utils import to_cif_file
 from atomworks.io.utils.selection import AtomSelectionStack
+from biotite.structure import AtomArray
+
 from modelhub.utils.io import (
    CIF_LIKE_EXTENSIONS,
    DICTIONARY_LIKE_EXTENSIONS,
@@ -62,13 +62,17 @@ def _spoof_cif_from_dictionary(item: dict, temp_dir: PathLike) -> Path:
            msa_paths_by_chain_id[chain_id] = msa_path

    extra_categories = {}
-    if item.get("template_selection") or item.get("ground_truth_conformer_selection"):
-        extra_categories["templating"] = {
+    if item.get("template_selection"):
+        extra_categories["template_selection"] = {
            "template_selection": item.get("template_selection"),
+        }
+    if item.get("ground_truth_conformer_selection"):
+        extra_categories["ground_truth_conformer_selection"] = {
            "ground_truth_conformer_selection": item.get(
                "ground_truth_conformer_selection"
            ),
        }
+
    if msa_paths_by_chain_id:
        extra_categories["msa_paths_by_chain_id"] = msa_paths_by_chain_id