feat: update ligand templating, bump atomworks version

This commit is contained in:
ncorley
2025-09-17 21:41:57 -07:00
parent b3e9466b52
commit 2e218aafc2
10 changed files with 6290 additions and 5703 deletions

File diff suppressed because it is too large Load Diff

View File

@@ -1,12 +0,0 @@
{
"name": "8cdz_templating_ligand",
"components": [
{
"path": "docs/rf3/examples/8cdz.cif"
}
],
"ground_truth_conformer_selection": ["E"],
"msa_paths": {
"A": "docs/rf3/examples/msas/8cdz_A.a3m.gz"
}
}

6218
docs/rf3/examples/9dfn.cif Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,13 @@
[
{
"name": "9dfn_template_ligand_and_protein",
"components": [
{
"path": "docs/rf3/examples/9dfn.cif"
}
],
"template_selection": ["A", "C", "D"],
"ground_truth_conformer_selection": ["C", "D"]
}
]

View File

@@ -45,7 +45,7 @@ dependencies = [
"beartype>=0.18.0,<1",
# ... dataloading
"atomworks==1.0.0",
"atomworks==1.0.2"
]

View File

@@ -5,10 +5,8 @@ from pathlib import Path
from typing import Any
import numpy as np
from biotite.structure import AtomArray, concatenate
from atomworks.common import exists
from atomworks.enums import ChainType
from atomworks.ml.common import exists
from atomworks.ml.datasets import logger
from atomworks.ml.datasets.datasets import StructuralDatasetWrapper
from atomworks.ml.datasets.parsers import (
@@ -23,6 +21,7 @@ from atomworks.ml.transforms._checks import (
from atomworks.ml.transforms.base import Transform, TransformedDict
from atomworks.ml.transforms.msa._msa_loading_utils import load_msa_data_from_path
from atomworks.ml.utils.rng import capture_rng_states
from biotite.structure import AtomArray, concatenate
# input data wrapper that allows multiple input files separated by ':'

View File

@@ -2,8 +2,6 @@ from os import PathLike
from pathlib import Path
import numpy as np
from omegaconf import DictConfig
from atomworks.common import exists
from atomworks.constants import (
AF3_EXCLUDED_LIGANDS,
@@ -99,6 +97,8 @@ from atomworks.ml.transforms.msa.msa import (
from atomworks.ml.transforms.random_atomize_residues import RandomAtomizeResidues
from atomworks.ml.transforms.rdkit_utils import GetRDKitChiralCenters
from atomworks.ml.transforms.symmetry import FindAutomorphismsWithNetworkX
from omegaconf import DictConfig
from modelhub.data.extra_xforms import CheckForNaNsInInputs
from modelhub.data.pipeline_utils import (
annotate_post_crop_hash,
@@ -272,7 +272,7 @@ def build_af3_transform_pipeline(
TrainingRoute(
SetOccToZeroOnBfactor(b_factor_min, b_factor_max),
),
RemoveUnresolvedPNUnits(),
TrainingRoute(RemoveUnresolvedPNUnits()),
RemovePolymersWithTooFewResolvedResidues(min_residues=4),
MaskPolymerResiduesWithUnresolvedFrameAtoms(),
ConditionalRoute(

View File

@@ -67,7 +67,7 @@ For this example, the pTM in the `metrics.csv` should be `>0.8` (even without an
RF3 supports `.a3m` and `.fasta` files as input MSA formats; `.a3m` is recommended. We do not at the moment support pre-paired MSAs (we will pair on-the-fly) or on-the-fly MSA computation, but both are on the roadmap. Please raise an issue if these limitations are critical for your project and we can prioritize accordingly.
📝 **Example JSON configuration** (working example found at `docs/rf3/examples/3en2_from_json_with_msa.json`):
📝 **Example JSON configuration** (full example found at `docs/rf3/examples/3en2_from_json_with_msa.json`):
```json
{
@@ -93,7 +93,7 @@ rf3 fold inputs='docs/rf3/examples/3en2_from_json_with_msa.json'
If performing inference from a prepared `.cif` file, MSAs can also be specified directly as a category within the raw CIF data.
We will automatically extract the correct MSA paths during parsing.
📝 **Example CIF header** (working example found at `docs/rf3/examples/3en2_from_file.cif`):
📝 **Example CIF header** (full example found at `docs/rf3/examples/3en2_from_file.cif`):
```cif
data_3EN2
#
@@ -112,7 +112,7 @@ rf3 fold inputs='docs/rf3/3en2_from_file.cif
> Without an MSA and using default settings, the above examples will trigger "early stopping." This means that if the model determines early on that a correct prediction is unlikely, it will stop computation and only output a `metrics.csv` and `.score` file to save compute resources. You can adjust this behavior using the `early_stopping_plddt_threshold` argument (see below). In our group, we find this argument can save wasted compute on erroneous inputs.
> [!TIP]
> To ensure that a provided MSA is loaded correctly, you may use the `raise_if_missing_msa_for_protein_of_length_n` command-line argument. For example, `rf3 fold inputs='docs/rf3/examples/3en2_from_json_with_msa.json raise_if_missing_msa_for_protein_of_length_n=10` would raise an error if there were any proteins >=10 residues without compatible MSAs.
> To ensure that a provided MSA is loaded correctly, you may use the `raise_if_missing_msa_for_protein_of_length_n` command-line argument. For example, `rf3 fold inputs='docs/rf3/examples/3en2_from_json_with_msa.json' raise_if_missing_msa_for_protein_of_length_n=10` would raise an error if there were any proteins >=10 residues without compatible MSAs.
> [!TIP]
> For non-canonical amino acids, most MSA generation algorithms substitute `X` (unknown residue)! Ensure your MSAs adhere to this convention.
@@ -138,7 +138,7 @@ We will automatically distribute predictions across GPU's if running in a multi-
### 1⃣ **Single JSON with Multiple Examples**
📝 **Example JSON configuration** (working example found at `docs/rf3/examples/multiple_example_from_json.json`)
📝 **Example JSON configuration** (full example found at `docs/rf3/examples/multiple_example_from_json.json`)
```json
[
@@ -221,7 +221,7 @@ For convenience, we also support a `json` API analogous to that implemented by A
> [!TIP]
> **Performance Tip**: For small molecules, a general rule-of-thumb is that performance is best when using `CCD` codes directly, followed by `cif`/`sdf` files, and finally SMILES.
📝 **Example JSON configuration with arbitrary biomolecules** (working example found at `docs/rf3/examples/7o1r_from_json.json`):
📝 **Example JSON configuration with arbitrary biomolecules** (full example found at `docs/rf3/examples/7o1r_from_json.json`):
```json
[
{
@@ -292,7 +292,7 @@ Such `.cif` files complete with appropriate bonds can be composed with AtomWorks
If you would prefer to use the JSON API, bonds can be explicitly given using PyMol-like strings of the form `chain_id/res_name/res_id/atom_name`. You will need to know the specific chain ID, residue name, residue ID, and atom name between the relevant pairs of atoms to unambiguously specify the bond.
📝 **Example JSON configuration with covalent modifcations** (working example found at `docs/rf3/examples/7o1r_from_json.json`):
📝 **Example JSON configuration with covalent modifications** (full example found at `docs/rf3/examples/7o1r_from_json.json`):
```json
[
{
@@ -448,7 +448,7 @@ RF3 uses AtomWorks' flexible `AtomSelectionStack` query syntax for specifying st
It is often helpful to template one or multiple polymer chains while allowing the other chain(s) to fold unconstrained. We demonstrate how to apply templates with a nanobody-antigen use case below.
📝 **Example JSON configuration templating the antigen and the nanobody framework** (working example found at `docs/rf3/examples/7xli_template_antigen_and_framework.json`):
📝 **Example JSON configuration templating the antigen and the nanobody framework** (full example found at `docs/rf3/examples/7xli_template_antigen_and_framework.json`):
```json
[
{
@@ -479,30 +479,37 @@ You may also specify templating directly via the CLI using `template_selection="
#### Templating a Small Molecule
We find that enforcing a particular small molecule conformation has various applications within fixed-ligand protein docking, enzyme activity filtering, and other biologically relevant tasks. RF3 natively enables encouraging a particular small molecule conformations via the ground truth reference conformer track. For the moment, such an approach is only effective if we want to template the *entire* small molecule. Partial templating of small molecules is still possible via the `template_selection` API described earlier rather than the `ground_truth_conformer_selection` track.
We find that enforcing a particular small molecule conformation has various applications within fixed-ligand protein docking, enzyme activity filtering, and other biologically relevant tasks. RF3 natively supports encouraging a particular small molecule conformation via both the ground truth reference conformer track and the template selection track.
📝 **Example JSON configuration templating a small molecule** (working example found at `docs/rf3/examples/8cdz_templating_ligand.json`):
For the moment, the ground truth conformer track is only effective if we want to template the *entire* small molecule. Partial templating of small molecules is still possible via the `template_selection` approach. We encourage exploration of both templating techniques to find what combination(s) are most effective for a given problem. Below we provide both, which represents the strongest possible conditioning.
📝 **Example JSON configuration templating a small molecule and the corresponding protein** (full example found at `docs/rf3/examples/9dfn_template_ligand_and_protein.json`):
```json
{
"name": "8cdz_templating_ligand",
"components": [
{
"path": "docs/rf3/examples/8cdz.cif"
}
],
"ground_truth_conformer_selection": ["E"]
}
[
{
"name": "9dfn_template_ligand_and_protein",
"components": [
{
"path": "docs/rf3/examples/9dfn.cif"
}
],
"template_selection": ["A", "C", "D"],
"ground_truth_conformer_selection": ["C", "D"]
}
]
```
> [!NOTE]
> We template the protein above to avoid providing an MSA.
🚀 **Run the example:**
```bash
rf3 fold inputs='docs/rf3/examples/9dfn_template_ligand_and_protein.json'
```
You may also specify the ground truth conformer selection directly via the CLI, e.g., using `ground_truth_conformer_selection="[E]`
*Content coming soon...*
You may also specify the ground truth conformer selection directly via the CLI, e.g., using `ground_truth_conformer_selection="[C,D]"`
#### Templating an Interface

View File

@@ -5,11 +5,11 @@ from pathlib import Path
import hydra
import pandas as pd
import torch
from atomworks.io import parse
from atomworks.io.transforms.categories import category_to_dict
from lightning.fabric import seed_everything
from omegaconf import OmegaConf
from atomworks.io import parse
from atomworks.io.transforms.categories import category_to_dict
from modelhub.inference_engines.base import InferenceEngine
from modelhub.model.RF3 import ShouldEarlyStopFn
from modelhub.utils.datasets import (
@@ -342,17 +342,28 @@ class RF3InferenceEngine(InferenceEngine):
else out["asym_unit"][0]
)
# ... extract temlate information from the CIF file
templating_from_cif = category_to_dict(out["cif_block"], "templating")
# ... extract template information from the CIF file, if present
template_selection_from_CIF = (
category_to_dict(out["cif_block"], "template_selection")
if "cif_block" in out
else {}
)
ground_truth_conformer_selection_from_CIF = (
category_to_dict(out["cif_block"], "ground_truth_conformer_selection")
if "cif_block" in out
else {}
)
# First, apply the template selection from the CIF file
atom_array = apply_conformer_and_template_selections(
atom_array,
template_selection=list(
templating_from_cif.get("template_selection", [])
template_selection_from_CIF.get("template_selection", [])
),
ground_truth_conformer_selection=list(
templating_from_cif.get("ground_truth_conformer_selection", [])
ground_truth_conformer_selection_from_CIF.get(
"ground_truth_conformer_selection", []
)
),
)

View File

@@ -6,8 +6,6 @@ from pathlib import Path
from typing import Iterable
import numpy as np
from biotite.structure import AtomArray
from atomworks.common import as_list
from atomworks.enums import GroundTruthConformerPolicy
from atomworks.io.tools.inference import (
@@ -16,6 +14,8 @@ from atomworks.io.tools.inference import (
)
from atomworks.io.utils.io_utils import to_cif_file
from atomworks.io.utils.selection import AtomSelectionStack
from biotite.structure import AtomArray
from modelhub.utils.io import (
CIF_LIKE_EXTENSIONS,
DICTIONARY_LIKE_EXTENSIONS,
@@ -62,13 +62,17 @@ def _spoof_cif_from_dictionary(item: dict, temp_dir: PathLike) -> Path:
msa_paths_by_chain_id[chain_id] = msa_path
extra_categories = {}
if item.get("template_selection") or item.get("ground_truth_conformer_selection"):
extra_categories["templating"] = {
if item.get("template_selection"):
extra_categories["template_selection"] = {
"template_selection": item.get("template_selection"),
}
if item.get("ground_truth_conformer_selection"):
extra_categories["ground_truth_conformer_selection"] = {
"ground_truth_conformer_selection": item.get(
"ground_truth_conformer_selection"
),
}
if msa_paths_by_chain_id:
extra_categories["msa_paths_by_chain_id"] = msa_paths_by_chain_id