first first commit

This commit is contained in:
HannesStark
2025-10-26 20:27:38 +00:00
commit ff9964a539
221 changed files with 384768 additions and 0 deletions

212
.gitignore vendored Executable file
View File

@@ -0,0 +1,212 @@
# Project-specific ignores (local experiment outputs, caches, editor settings).
# NOTE(review): the next line ignores a single source file — confirm this is intentional.
scripts/bindcraft/utils.py
.tmp_wandb
# Ignore all notebooks except the two explicitly re-included below.
# NOTE(review): a bare `similarity.ipynb` pattern at the end of this file comes later
# and therefore re-ignores that notebook; only filter.ipynb stays tracked.
*.ipynb
!filter.ipynb
!similarity.ipynb
cache
results
workdir
workbench
.vscode/
wandb
tmp
slurm_out
small_data
.idea
*.pkl
*.out
.DS_Store
# Development files
# NOTE(review): .vscode, workdir, results, workbench, wandb and tmp are already
# listed above; the repeats are harmless but redundant.
.vscode
outputs/
workdir
results
notebooks/
*.ckpt
samples/
workbench
workdir_fold
wandb
tmp
debug.txt
*.pt
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# ruff
.ruff_cache/
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
# Scratch dir
scratch
# Checkpoint tensors (*.pt), datasets, logs and other generated artifacts
# (no yaml patterns are listed here; *.pt also appears earlier in this file)
*.pt
data/PepPC/
data/PepPC*
*.err
*.csv
data/test_set
visualization_data_designed_only
visualization_results_designed_only
# NOTE(review): this pattern comes after the `!similarity.ipynb` exception near the
# top of this file. In gitignore the last matching pattern wins, so the notebook is
# ignored again — confirm which behavior is intended.
similarity.ipynb

63
Dockerfile Normal file
View File

@@ -0,0 +1,63 @@
# syntax=docker/dockerfile:1
# Image for the BoltzGen CLI: CUDA 12.2 / cuDNN 8 development base on Ubuntu 22.04,
# BoltzGen installed in editable mode from the build context, run as a non-root user.
FROM nvidia/cuda:12.2.2-cudnn8-devel-ubuntu22.04
# Build/runtime environment:
#   - PIP_EXTRA_INDEX_URL adds the PyTorch cu121 wheel index as an extra pip index.
#   - HF_HOME=/cache is where model weights are stored; the README's `docker run`
#     example bind-mounts a host directory at /cache.
ENV DEBIAN_FRONTEND=noninteractive \
PIP_NO_CACHE_DIR=1 \
PYTHONDONTWRITEBYTECODE=1 \
PYTHONUNBUFFERED=1 \
CUDA_HOME=/usr/local/cuda \
PIP_EXTRA_INDEX_URL=https://download.pytorch.org/whl/cu121 \
HF_HOME=/cache
# System packages: Python 3.10 toolchain plus native build dependencies.
# The apt lists are removed in the same layer to keep the image smaller.
# NOTE(review): the README asks for "python >=3.11" while this image installs
# python3.10 — confirm the package's supported Python range.
RUN apt-get update && \
apt-get install -y --no-install-recommends \
python3.10 \
python3.10-dev \
python3-pip \
python3-venv \
python3-wheel \
build-essential \
git \
cmake \
pkg-config \
libffi-dev \
libssl-dev \
libxml2-dev \
libxslt-dev \
libgl1 \
libhdf5-dev \
libboost-all-dev \
&& rm -rf /var/lib/apt/lists/*
# Make `python` and `python3` resolve to python3.10, then upgrade the packaging tools.
RUN update-alternatives --install /usr/bin/python python /usr/bin/python3.10 1 && \
update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1 && \
python -m pip install --upgrade pip setuptools setuptools_scm wheel
# Copy the whole build context and install it in editable mode, so the installed
# package points at /app.
WORKDIR /app
COPY . /app
RUN pip install --no-cache-dir -e /app
# Optional: bake model weights into the image at build time
# (`--build-arg DOWNLOAD_WEIGHTS=true`); otherwise only the cache directory is created.
# NOTE(review): HF_TOKEN is passed as a build arg; Docker documents that build-arg
# values can be recovered from the image history. Consider a BuildKit secret mount
# if a private token is ever used here.
ARG DOWNLOAD_WEIGHTS=false
ARG HF_TOKEN=""
RUN mkdir -p "${HF_HOME}" && \
if [ "${DOWNLOAD_WEIGHTS}" = "true" ]; then \
HF_TOKEN="${HF_TOKEN}" boltzgen download --models-cache-dir "${HF_HOME}" --force-download --show-paths; \
fi
# Unprivileged runtime user; UID/GID are overridable build args.
ARG USERNAME=boltzgen
ARG USER_UID=1000
ARG USER_GID=1000
RUN groupadd --gid ${USER_GID} ${USERNAME} && \
useradd --uid ${USER_UID} --gid ${USER_GID} --create-home --shell /bin/bash ${USERNAME}
# Hand the cache directory (and any baked-in weights) to the runtime user.
# The `mkdir -p` repeats the one in the weights step above and is a no-op here.
RUN mkdir -p "${HF_HOME}" && chown -R ${USER_UID}:${USER_GID} "${HF_HOME}"
USER ${USERNAME}
WORKDIR /workspace
# `docker run <image> <args>` runs `boltzgen <args>`; with no args it prints the CLI help.
ENTRYPOINT ["boltzgen"]
CMD ["--help"]

21
LICENSE Normal file
View File

@@ -0,0 +1,21 @@
MIT License
Copyright (c) 2025 Hannes Stärk
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

577
README.md Executable file
View File

@@ -0,0 +1,577 @@
<div align="center">
<div>&nbsp;</div>
<img src="assets/boltzgen.png" alt="BoltzGen logo" width="60%">
[Paper](https://hannes-stark.com/assets/boltzgen.pdf) |
[Slack](https://boltz.bio/join-slack) <br> <br>
![alt text](assets/cover.png)
</div>
# Installation
In an environment with python >=3.11:
```bash
pip install boltzgen
```
<details>
<summary style="font-size: 1.3em; font-weight: 600;">
Click for detailed installation instructions
</summary>
### 1 - Install Miniconda
Choose the installer for your operating system, download it, and follow the on-screen prompts:
* **Windows:** <https://www.anaconda.com/docs/getting-started/miniconda/install#windows-installation>
* **macOS / Linux:** <https://www.anaconda.com/docs/getting-started/miniconda/install#macos-linux-installation>
After installation, **open a terminal / command prompt** (you may need to search for “Anaconda Prompt” on Windows).
### 2 - Create a Miniconda Python environment
Run the command below in a terminal to create a fresh environment called `bg` with Python 3.12:
```bash
conda create -n bg python=3.12
```
### 3 - Activate the environment (do this every time you work with BoltzGen)
```bash
conda activate bg
```
> If you open a **new** terminal session later, you must run `conda activate bg` again before using BoltzGen.
### 4 - Install BoltzGen from source
Download the BoltzGen repository, change directory into the boltzgen directory, and install BoltzGen from source:
```bash
git clone https://github.com/HannesStark/boltzgen
cd boltzgen
pip install -e .
```
</details>
<details>
<summary style="font-size: 1.3em; font-weight: 600;">
Click for optional Docker instructions if you prefer Docker
</summary>
To build and run the docker image:
```bash
# Build
docker build -t boltzgen .
# Run an example
mkdir -p workdir # output
mkdir -p cache # where models will be downloaded to
docker run --rm --gpus all -v "$(realpath workdir)":/workdir -v "$(realpath cache)":/cache -v "$(realpath example)":/example \
boltzgen run /example/vanilla_protein/1g13prot.yaml --output /workdir/test \
--protocol protein-anything \
--num_designs 2
```
In the example above, the model weights are downloaded the first time the image is run. To bake the weights into the image at build time, run:
```bash
docker build -t boltzgen:weights --build-arg DOWNLOAD_WEIGHTS=true .
```
</details>
<br>
# Running BoltzGen
![alt text](assets/fig1.png)
`boltzgen run` takes a [design specification](#how-to-make-a-design-specification-yaml) `.yaml` and produces a set of ranked designs.\
⚠️ It downloads models (~6GB) to `~/.cache`. This can be changed by passing `--cache YOUR_PATH` or by setting `$HF_HOME`.\
⚠️ If your run is ever interrupted, you can restart it with `--reuse`. No progress is lost.
```bash
boltzgen run example/vanilla_protein/1g13prot.yaml \
--output workbench/test_run \
--protocol protein-anything \
--num_designs 10 \
--budget 2
# --num_designs is the number of intermediate designs. In practice you will want between 10,000 - 60,000
# --budget is how many designs should be in the final diversity optimized set
```
All command line args are explained in ["All Command Line Arguments"](#all-command-line-arguments).\
**Step-by-step guide for making your designs:**
1. Make your `.yaml` file that specifies your target and what you want to design. We provide many examples in
`example` such as `example/vanilla_peptide_with_target_binding_site/beetletert.yaml`. Details in
["How to make a design specification .yaml"](#how-to-make-a-design-specification-yaml).
2. Check whether your design specification is as intended.
1. Run `boltzgen check example/vanilla_peptide_with_target_binding_site/beetletert.yaml`.
2. Visualize the resulting mmcif file in a protein structure viewer (e.g. PyMOL, Chimera, or online: https://molstar.org/viewer/).
3. Your viewer should show the binding site in a different color than the rest of the target.
3. Run the `boltzgen run ...` command as above on your `.yaml` file.
4. Your filtered, ranked set of designs will be in `--output`. <img src="assets/fig_seconds_per_design.png" alt="Seconds per design" align="right" width="35%">
5. You likely want to rerun the filtering step with different settings (takes ~15 sec). Use
`boltzgen run --steps filtering --output ...` or the Jupyter notebook `filter.ipynb` which is often more convenient.
Detailed explanation in ["Rerunning the Filtering"](#rerunning-the-filtering-recommended).
**How many designs to generate?** \
More is better. The "minimum" depends on your target.
BoltzGen should be run on a GPU. On the right you can see the time required for each step in the pipeline for a single design on an A100 GPU.
We suggest first running with e.g. `--num_designs 50`, checking that everything behaves as desired, and then increasing `--num_designs` to between 10,000 - 60,000.
## Pipeline output
When the pipeline completes your output directory will have:
- `config/`, `steps.yaml`: configuration files.
- `intermediate_designs/`: output of design step
- `/*.cif` and `/*.npz`: CIF and NPZ (metadata files) for the designed proteins and targets before inverse folding
- `intermediate_designs_inverse_folded/`: output of inverse folding, folding, and analysis steps
- `/*.cif` and `/*.npz` : CIF and NPZ for designed proteins and targets after inverse folding. *Note: For designed residues, only the backbone atoms will have coordinates (sidechain coordinates will be 0,0,0).*
- `/refold_cif`: refolded complex structures (target and binder). This is the primary input to the analysis and filtering steps.
- `/refold_design_cif`: refolded binder structures, without target.
- `/aggregate_metrics_analyze.csv`, `/per_target_metrics_analyze.csv` — outputs of the analysis step.
- `final_ranked_designs/` : outputs of the filtering step
- `/intermediate_ranked_<N>_designs/` — top-N quality designs. CIFs are copied from `refold_cif` above.
- `/final_<budget>_designs/` — quality + diversity set. CIFs copied from `refold_cif/`.
- `/all_designs_metrics.csv` — metrics for all designs considered by filtering.
- `/final_designs_metrics_<budget>.csv` — metrics for the selected final set.
- `/results_overview.pdf` — plots
# Protocols
| Protocol (design-target) | Appropriate for | Major config differences |
|--------------------------|---------------------------------------------------------------------------|------------------------|
| protein-anything | Design proteins to bind proteins or peptides | Includes `design folding` step. |
| peptide-anything | Design peptides (including helicons, cyclic peptides) to bind proteins | No Cys are generated in inverse folding. No `design folding` step. Don't compute largest hydrophobic patch. |
| protein-small_molecule | Design proteins to bind small molecules | Includes binding affinity prediction. Includes `design folding` step. |
| nanobody-anything | Design nanobodies (single-domain antibodies) | No Cys are generated in inverse folding. No `design folding` step. Don't compute largest hydrophobic patch. |
All configuration parameters can be overridden using the `--config` option; see `boltzgen run --help` or the [All command line arguments](#all-command-line-arguments) section below for details.
# How to make a design specification .yaml
A more detailed explanation of how our <code>.yaml</code> design specification files work is in <a href="example/README.md" target="_blank">example/README.md</a>. Below is an example based explanation, which is sufficient for most tasks.
**IMPORTANT:** ⚠️ All residue indices are specified **starting at 1** and we use the canonical mmcif residue index `label_seq_id`, and **not** the `auth_seq_id` author residue index!
You can check the indexing in your mmcif file by opening it in https://molstar.org/viewer/, hovering over a residue, and checking the index on the bottom right. You will see something like this where **41 is the index we use, the auth id 22 is incorrect**:
![](assets/label_seq_id.png)
After you constructed your `.yaml` file we recommend that you run the `check` command on it:
1. Run `boltzgen check example/vanilla_peptide_with_target_binding_site/beetletert.yaml`.
2. Visualize the resulting mmcif file in a protein structure viewer (e.g. PyMOL, Chimera, or online: https://molstar.org/viewer/).
3. Your viewer should show the binding site in a different color than the rest of the target.
## Example based explanation:
We provide many example `.yaml` files in the `example/` directory, including:
- `example/design_spec_showcasing_all_functionalities.yaml`
- `example/vanilla_peptide_with_target_binding_site/beetletert.yaml`
- `example/peptide_against_specific_site_on_ragc/rragc.yaml`
- `example/nanobody_against_penguinpox/penguinpox.yaml`
- `example/denovo_zinc_finger_against_dna/zinc_finger.yaml`
- `example/protein_binding_small_molecule/chorismite.yaml`
Small example of a protein design against a target protein without binding site specified:
```yaml
entities:
# Designed protein with between 80 and 140 residues
# (The length is randomly sampled)
- protein:
id: B
sequence: 80..140
# The target is extracted from a .cif file
- file:
# file references are relative to the location of the .yaml file
path: 6m1u.cif # .pdb files also work
# Which chain in the .cif file to use as target (uses all chains if unspecified)
include:
- chain:
id: A
```
**IMPORTANT:** ⚠️ File references inside a yaml file (e.g. to cif files) are interpreted relative to the directory of the yaml file.
Example highlighting many (not all) functionalities:
```yaml
entities:
# Specification of the target which is extracted from a .cif file
- file:
path: 8r3a.cif # .pdb files also work
# Which chain and residues in the .cif file to use as target (uses all chains if unspecified)
include:
- chain:
id: A
res_index: 2..50,55.. # residues between 2 and 50 and anything larger than 55
- chain:
id: B
# Which regions of the target the design should or should NOT
# bind to (this can be left unspecified, then we just bind anywhere)
binding_types:
- chain:
id: A
binding: 5..7,13
- chain:
id: B
not_binding: "all"
# Which regions of the target should have their structure specified.
# By default, everything is visibility 1 which means that the structure is specified.
# If the visibility is 0, then the structure is not specified.
structure_groups:
- group:
visibility: 1
id: A
res_index: 10..13
- group:
# The relative positioning of things in structure group 2
# is not specified w.r.t. things in structure group 1
visibility: 2
id: B
# Overwrite the previous visibility setting and set it to 0 for res_index 13
- group:
visibility: 0
id: A
res_index: 13
# Optionally you can say that some residues in a loaded .cif file should also be redesigned.
design:
- chain:
id: A
res_index: 14..19
# For designed regions you can say what secondary structure they should have
secondary_structure:
- chain:
id: A
loop: 14
helix: 15..17
sheet: 19
# Specify a NON-designed protein chain
- protein:
id: X
sequence: AAVTTTTPPP
# Specify a designed protein chain
# Numbers specify what is being designed
- protein:
id: G
# random number between 15 and 20 of designed residues (inclusive)
sequence: 15..20AAAAAAVTTTT18PPP
# A designed helicon
# (see the constraints below that connect the peptide with the WHL ligand)
- protein:
id: R
# Random number of design residues between 3 and 5,
# then a Cysteine, then 6 design residues, then ...
sequence: 3..5C6C3
- ligand:
id: Q
ccd: WHL
# A designed peptide with 17 residues
- protein:
id: H
sequence: 17
# specification for a designed peptide with two Cys and a disulfide bond (see constraints)
- protein:
id: S
sequence: 10..14C6C3
constraints:
# specify connections as if the minimum possible number of residues was sampled
- bond:
atom1: [R, 4, SG] # connection for a helicon between small molecule and designed peptide
atom2: [Q, 1, CK]
- bond:
atom1: [R, 11, SG] # connection for a helicon between small molecule and designed peptide
atom2: [Q, 1, CH]
- bond:
atom1: [S, 11, SG] # connection for a disulfide bond between Cys and Cys in designed peptide
atom2: [S, 18, SG]
```
# Running only specific pipeline steps
You can run only specific parts of the pipeline using the `--steps` flag:
**Run only the design and inverse_folding steps:**
```bash
boltzgen run example/cyclotide/3ivq.yaml \
--output workbench/partial-run \
--protocol peptide-anything \
--steps design inverse_folding \
--num_designs 2
```
**Available steps:**
- `design` - Generate `--num_designs` candidates using the diffusion model based on your design specification
- `inverse_folding` - Redesign sequences from the previous step using our inverse folding model
- `folding` - Re-fold the designed binders with their targets using Boltz-2 model
- `design_folding` - Re-fold the designed binders alone without target (disabled for peptide and nanobody binders)
- `affinity` - Predict binding affinity between designed proteins and their target small molecules using Boltz-2 (for design of small molecule binders only)
- `analysis` - Analyze the folded structures using various metrics to assess design quality
- `filtering` - Filter and rank designs based on analysis results to select the best candidates
# Rerunning the filtering (recommended)
After you generate designs, you will probably want to rerun the filtering step (which runs very fast) several times to tune your criteria for selecting good ones.
You can run the filtering step either using the `boltzgen` command or
using a [jupyter notebook](filter.ipynb) that we provide. In most cases the notebook is more convenient. If you'd prefer to use the command-line, here is an example of re-running the filters without the notebook.
First, suppose we initially generated some designs with default filtering options:
```bash
boltzgen run example/binding_disordered_peptides/tpp4.yaml \
--output workbench/tpp4 \
--protocol protein-anything \
--num_designs 20
```
After this runs we see that only a few designs passed our filters. We might now adjust the filters by running:
```bash
boltzgen run example/binding_disordered_peptides/tpp4.yaml \
--output workbench/tpp4 \
--protocol protein-anything \
--steps filtering \
--refolding_rmsd_threshold 3.0 \
--filter_biased=false \
--additional_filters 'ALA_fraction<0.3' 'filter_rmsd_design<2.5' \
--metrics_override plip_hbonds_refolded=4 \
--alpha 0.2
```
# All command line arguments
## `boltzgen run`
The `boltzgen run` command executes the BoltzGen binder design pipeline. Here are all available options:
### Design Specification
- `design_spec` - Path(s) to design specification YAML file(s), or a directory containing prepared configs
### General Configuration
- `--protocol {protein-anything,peptide-anything,protein-small_molecule,nanobody-anything}` - Protocol to use for the design. This determines default settings and in some cases what steps are run. Default: protein-anything. See [Protocols](#protocols) section for details.
- `--output OUTPUT` - Output directory for pipeline results
- `--config CONFIG [CONFIG ...]` - Override pipeline step configuration, in format `<step_name> <arg1>=<value1> <arg2>=<value2> ...` (example: `--config folding num_workers=4 trainer.devices=4`). Can be used multiple times.
- `--devices DEVICES` - Number of devices to use. Default is all devices available.
- `--num_workers NUM_WORKERS` - Number of DataLoader worker processes.
- `--config_dir CONFIG_DIR` - Path to the directory of default config files. Default: the `config` directory of your BoltzGen checkout (the exact path printed by `boltzgen run --help` is machine-specific).
- `--use_kernels {auto,true,false}` - Whether to use kernels. One of 'auto', 'true', or 'false'. Default: auto. If 'auto', will use kernels if the device capability is >= 8.
- `--moldir MOLDIR` - Path to the moldir. Default: `huggingface:boltzgen/inference-data:mols.zip`
### Design
- `--num_designs NUM_DESIGNS` - Number of total designs to generate. This commonly would be something like 10,000. After generating 10,000 designs we then filter down to `--budget` many designs in the filter step
- `--diffusion_batch_size DIFFUSION_BATCH_SIZE` - Number of diffusion samples to generate per trunk run. If not specified, this defaults to 1 if `--num_designs` is less than 100, and 10 otherwise. Note that for design tasks that randomly sample the binder length (or use randomness in other ways), all designs generated in the same batch will share the same length. Having a large diffusion batch size compared to the total number of designs to generate will therefore not evenly sample the possible lengths.
- `--design_checkpoints DESIGN_CHECKPOINTS [DESIGN_CHECKPOINTS ...]` - Path to the boltzgen checkpoint(s). One or more checkpoints are supported. Just specifying an individual path here will work. Each will be used for an equal fraction of the designs. By default, two checkpoints are used. Default: `['huggingface:boltzgen/boltzgen1_diverse:boltzgen1_diverse.ckpt', 'huggingface:boltzgen/boltzgen1_adherence:boltzgen1_adherence.ckpt']`
- `--step_scale STEP_SCALE` - Fixed step scale to use (e.g. 1.8). Default is to use a schedule
- `--noise_scale NOISE_SCALE` - Fixed noise scale to use (e.g. 0.98). Default is to use a schedule
### Inverse Folding
- `--skip_inverse_folding` - Skip inverse folding step
- `--inverse_fold_num_sequences INVERSE_FOLD_NUM_SEQUENCES` - Number of sequences per backbone to generate in the inverse fold step. Default: 1
- `--inverse_fold_checkpoint INVERSE_FOLD_CHECKPOINT` - Path or huggingface repo and filename for the inverse fold checkpoint. Default: `huggingface:boltzgen/boltzgen1_ifold:boltzgen1_ifold.ckpt`
- `--inverse_fold_avoid INVERSE_FOLD_AVOID` - Disallowed residues as a string of one letter amino acid codes, e.g. 'KEC'. This is implemented at the inverse fold step, so it only affects results if inverse folding is enabled. Default: none for protein design, 'C' for peptide and nanobody design. Pass an empty list if you want cysteines to be generated when using a nanobody or peptide protocol.
- `--only_inverse_fold` - Skip design step and only run inverse folding. Requires a fully specified structure.
### Folding and Affinity Prediction
- `--folding_checkpoint FOLDING_CHECKPOINT` - Path to the folding checkpoint. Default: `huggingface:boltzgen/boltz2_conf_final:boltz2_conf_final.ckpt`
- `--affinity_checkpoint AFFINITY_CHECKPOINT` - Path to the affinity predictor checkpoint. Default: `huggingface:boltzgen/boltz2_affinity:boltz2_aff.ckpt`
### Filtering
- `--budget BUDGET` - How many designs should be in the final diversity optimized set. This is used in the filtering step.
- `--alpha ALPHA` - Trade-off for sequence diversity selection: 0.0=quality-only, 1.0=diversity-only. Default is 0.01 (peptide-anything protocol) or 0.001 (other protocols).
- `--filter_biased {true,false}` - Remove amino-acid composition outliers (default caps on ALA/GLY/GLU/LEU/VAL). Default: true.
- `--metrics_override METRICS_OVERRIDE [METRICS_OVERRIDE ...]` - Per-metric inverse-importance weights for ranking. Format: `metric_name=weight` (e.g., `plip_hbonds_refolded=4 delta_sasa_refolded=2`). A larger value down-weights that metric's rank. Use `metric_name=none` to remove a metric.
- `--additional_filters ADDITIONAL_FILTERS [ADDITIONAL_FILTERS ...]` - Extra hard filters. Format: `feature>threshold` or `feature<threshold` (e.g., `'design_ALA>0.3' 'design_GLY<0.2'`). Use '>' if higher is better, '<' if lower is better. Make sure to single-quote the strings so your shell doesn't get confused by < and > characters.
- `--size_buckets SIZE_BUCKETS [SIZE_BUCKETS ...]` - Optional constraint for maximum number of designs in size ranges. Format: `min-max:count` (e.g., `10-20:5 20-30:10 30-40:5`).
- `--refolding_rmsd_threshold REFOLDING_RMSD_THRESHOLD` - Threshold used for RMSD-based filters (lower is better).
### Execution Options
- `--reuse` - Reuse existing results across all steps. Generate only as many new designs as are needed to achieve the specified total number of designs.
- `--no_subprocess` - Run each step in the main process. Will cause issues when devices >1.
- `--steps {design,inverse_folding,design_folding,folding,affinity,analysis,filtering} [{design,inverse_folding,design_folding,folding,affinity,analysis,filtering} ...]` - Run only the specified pipeline steps (default: run all steps). See the [Running only specific pipeline steps](#running-only-specific-pipeline-steps) section for details.
### Model and Data Download Options
- `--force_download` - Force a (re)-download of models and data.
- `--models_token MODELS_TOKEN` - Secret token to use for our models hosting service (Hugging Face). Not usually required; a default token is built in.
- `--cache CACHE` - Directory where downloaded models will be stored. Default: `~/.cache`
## `boltzgen download`
The `boltzgen download` command downloads model weights and data artifacts needed for BoltzGen. In most cases you don't need to use `boltzgen download`, since `boltzgen run` will download what is needed automatically.
Downloaded weights and datasets are stored in `~/.cache` by default but this can be changed by specifying `--cache`.
### Example
```bash
boltzgen download all # downloads all models
boltzgen download inverse-fold # downloads only the inverse folding model
```
### Usage
```bash
boltzgen download [-h] [--force_download] [--models_token MODELS_TOKEN] [--cache CACHE] {affinity,design-adherence,design-diverse,folding,inverse-fold,moldir,all} [{affinity,design-adherence,design-diverse,folding,inverse-fold,moldir,all} ...]
```
### Positional arguments
- `{affinity,design-adherence,design-diverse,folding,inverse-fold,moldir,all}` - Subset of artifacts to download, or 'all' to download all artifacts.
### Options
- `--force_download` - Force a (re)-download of models and data.
- `--models_token MODELS_TOKEN` - Secret token to use for our models hosting service. Not usually required.
- `--cache CACHE` - Directory where downloaded models will be stored. Default: `~/.cache`
## `boltzgen configure`
For more control over your design process, you can separate the configuration generation from execution:
### Example
```bash
boltzgen configure example/cyclotide/3ivq.yaml \
--output workbench/test-peptide-protein \
--protocol peptide-anything \
--num_designs 2
```
This creates configuration files in `workbench/test-peptide-protein/` without running the actual design pipeline. You can edit these files if needed and then run `boltzgen execute workbench/test-peptide-protein` to run the workflow.
The options that `boltzgen configure` takes are a subset of the `boltzgen run` options so we don't list them all again here. Try `boltzgen configure --help` if you need help.
## `boltzgen execute`
The `boltzgen execute` command executes a pre-configured pipeline from a directory of config files generated by the `boltzgen configure` command.
### Usage
```bash
boltzgen execute [-h] [--reuse] [--no_subprocess] [--steps {design,inverse_folding,design_folding,folding,affinity,analysis,filtering} [{design,inverse_folding,design_folding,folding,affinity,analysis,filtering} ...]] output
```
### Positional Arguments
- `output` - Directory containing pre-configured pipeline files (generated by 'configure' command)
### Execution Options
- `--reuse` - Reuse existing results across all steps. Generate only as many new designs as are needed to achieve the specified total number of designs.
- `--no_subprocess` - Run each step in the main process. Will cause issues when devices >1.
- `--steps {design,inverse_folding,design_folding,folding,affinity,analysis,filtering} [{design,inverse_folding,design_folding,folding,affinity,analysis,filtering} ...]` - Run only the specified pipeline steps (default: run all steps)
# Training BoltzGen models
Install in dev mode which will install additional packages like `wandb`.
```bash
git clone https://github.com/HannesStark/boltzgen
cd boltzgen
pip install -e ".[dev]"
```
### 1 Download training data and checkpoints
```bash
# Choose any location; this is default in yaml files
mkdir -p training_data
cd training_data
# ─ Targets ─
wget -O targets.zip "https://huggingface.co/datasets/boltzgen/boltzgen1_train/resolve/main/targets.zip?download=true"
unzip targets.zip # → training_data/targets/
# ─ MSAs ─
wget -O msa.zip "https://huggingface.co/datasets/boltzgen/boltzgen1_train/resolve/main/msa.zip?download=true"
unzip msa.zip # → training_data/msa/
# ─ Small-molecule dictionary ─
wget -O mols.zip "https://huggingface.co/datasets/boltzgen/inference-data/resolve/main/mols.zip?download=true"
unzip mols.zip # → training_data/mols/
# ─ Folding checkpoint ─
wget -O boltz2_fold.ckpt "https://huggingface.co/boltzgen/boltzgen-1/resolve/main/boltz2_conf_final.ckpt?download=true"
# ─────────── (optional) pretrained structure-only ckpt ───────────
# Needed ONLY if you want to resume from a structure-trained model.
wget -O boltzgen1_structuretrained_small.ckpt \
"https://huggingface.co/boltzgen/boltzgen-1/resolve/main/boltzgen1_structuretrained_small.ckpt?download=true"
```
Resulting layout
```
training_data/
├─ targets/ (used for target_dir in yaml)
├─ msa/ (used for msa_dir in yaml)
├─ mols/ (used for moldir in yaml)
├─ boltz2_fold.ckpt (used for folding_checkpoint in yaml)
└─ boltzgen1_structuretrained_small.ckpt (used for pretrained in yaml)
```
The directory `training_data` is the default location referenced in the
example YAML configuration files. If you place the data elsewhere, be sure to update those paths accordingly.
### 2 Training YAML files
Below is a quick reference for the three training configurations and how to launch them once paths are set:
| Config file | Purpose | Example command |
|-------------|---------|-----------------|
| `config/train/boltzgen_small.yaml` | Train the **small** BoltzGen model (recommended for development, 8 GPUs, gradient accumulation 16) | `python main.py config/train/boltzgen_small.yaml name=boltzgen_small` |
| `config/train/boltzgen.yaml` | Train the **large** BoltzGen model | `python main.py config/train/boltzgen.yaml name=boltzgen_large` |
| `config/train/inverse_folding.yaml` | Train the **inverse-folding** model only | `python main.py config/train/inverse_folding.yaml name=boltzgen_if` |
If you store the data somewhere other than `./training_data`, search and replace that path **in all three YAML files**. Typical keys you may need to update are `target_dir`, `msa_dir`, `moldir`, `pretrained`, `folding_checkpoint`, `monomer_target_dir`, and `ligand_target_dir`.
Example places:
```yaml
data:
datasets:
- target_dir: ./training_data/targets
msa_dir: ./training_data/msa
moldir: ./training_data/mols
pretrained: ./training_data/boltzgen1_structuretrained_small.ckpt
folding_checkpoint: ./training_data/boltz2_fold.ckpt
```
### 3 Train the models (example commands)
Small model on 8 GPUs with gradient accumulation 16 (recommended dev setup):
```bash
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
python main.py config/train/boltzgen_small.yaml \
name=boltzgen_small
```
Large model:
```bash
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
python main.py config/train/boltzgen.yaml \
name=boltzgen_large
```
Inverse-folding model:
```bash
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
python main.py config/train/inverse_folding.yaml \
name=boltzgen_if
```
Note: the large model currently expects additional distillation datasets (to be released). You can still explore its hyper-parameters and train solely on PDB data by adjusting the paths.
**Optionally resuming from a checkpoint**
```bash
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
python main.py config/train/boltzgen_small.yaml \
pretrained=./training_data/boltzgen1_structuretrained_small.ckpt \
name=boltzgen_small_pretrained
```

BIN
assets/boltzgen.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 58 KiB

BIN
assets/cover.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 284 KiB

BIN
assets/fig1.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 995 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 87 KiB

BIN
assets/label_seq_id.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 21 KiB

53
config/affinity.yaml Normal file
View File

@@ -0,0 +1,53 @@
_target_: boltzgen.task.predict.predict.Predict
debug: false
data:
_target_: boltzgen.task.predict.data_from_generated.FromGeneratedDataModule
cfg:
_target_: boltzgen.task.predict.data_from_generated.DataConfig
tokenizer:
_target_: boltzgen.data.tokenize.tokenizer.Tokenizer
atomize_modified_residues: false
featurizer:
_target_: boltzgen.data.feature.featurizer.Featurizer
suffix: .cif
suffix_metadata: .npz
suffix_native: _native.cif
samples_per_target: 1000000000000000
num_targets: 10000000000000
moldir: null
batch_size: 1
num_workers: 4
pin_memory: true
design_dir: null
return_native: false
compute_affinity: true
target_templates: false
fail_if_no_designs: true
keys_dict_out: []
writer:
_target_: boltzgen.task.predict.writer.AffinityWriter
design_dir: ${data.design_dir}
trainer:
accelerator: gpu
logger: false
devices: 1
precision: bf16-mixed
name: affinity_in_eval_affinity
output: null
checkpoint: null
matmul_precision: null
recycling_steps: 3
sampling_steps: 200
diffusion_samples: 5
override:
validators: null

64
config/analysis.yaml Normal file
View File

@@ -0,0 +1,64 @@
_target_: boltzgen.task.analyze.analyze.Analyze
name: analyze
design_dir: null
debug: false
num_processes: 32
# Common metrics to compute
affinity_metrics: false
backbone_fold_metrics: true
noncovalents_original: true
noncovalents_refolded: true
delta_sasa_original: true
delta_sasa_refolded: true
largest_hydrophobic: false
largest_hydrophobic_refolded: true
run_clustering: false
# Liability analysis
liability_analysis: true
liability_modality: peptide
liability_peptide_type: linear
# Uncommon metrics
diversity_original: false
diversity_refolded: false
diversity_per_target_original: false
diversity_per_target_refolded: false
novelty_original: false
novelty_refolded: false
novelty_per_target_original: false
novelty_per_target_refolded: false
ss_conditioning_metrics: false
sequence_recovery: false
native: false # This is only required for evaluations when we want to compute sequence recovery
compute_lddts: false # This is time intensive to compute
data:
_target_: boltzgen.task.predict.data_from_generated.FromGeneratedDataModule
cfg:
_target_: boltzgen.task.predict.data_from_generated.DataConfig
tokenizer:
_target_: boltzgen.data.tokenize.tokenizer.Tokenizer
atomize_modified_residues: false
featurizer:
_target_: boltzgen.data.feature.featurizer.Featurizer
suffix: .cif
suffix_metadata: .npz
suffix_native: _native.cif
samples_per_target: 1000000000000000
num_targets: 10000000000000
moldir: null
batch_size: 1
num_workers: 4
pin_memory: true
disulfide_prob: 1.0
disulfide_on: true
design_dir: ${design_dir}
target_templates: false
return_native: ${native}
fail_if_no_designs: true

99
config/design.yaml Normal file
View File

@@ -0,0 +1,99 @@
_target_: boltzgen.task.predict.predict.Predict
data:
_target_: boltzgen.task.predict.data_from_yaml.FromYamlDataModule
cfg:
_target_: boltzgen.task.predict.data_from_yaml.DataConfig
tokenizer:
_target_: boltzgen.data.tokenize.tokenizer.Tokenizer
atomize_modified_residues: false
featurizer:
_target_: boltzgen.data.feature.featurizer.Featurizer
moldir: null
yaml_path: null
output_dir: ${output}
diffusion_samples: ${diffusion_samples}
# Design
backbone_only: false
atom14: true
atom37: false
disulfide_prob: 1.0
disulfide_on: true
batch_size: 1
num_workers: 4
pin_memory: true
writer:
_target_: boltzgen.task.predict.writer.DesignWriter
output_dir: ${output}
res_atoms_only: false
atom14: ${data.cfg.atom14}
atom37: ${data.cfg.atom37}
backbone_only: ${data.cfg.backbone_only}
write_native: false
trainer:
accelerator: gpu
devices: 1
precision: bf16-mixed
name: null
output: null
checkpoint: null
matmul_precision: high
recycling_steps: 3
sampling_steps: 500
diffusion_samples: 1
compile_pairformer: false
compile_structure: false
override:
masker_args:
mask: true
mask_backbone: false
validators: null
step_scale_schedule:
- step_scale: 1.8
period: 0.25
- step_scale: 2.0
period: 0.25
- step_scale: 1.8
period: 0.25
- step_scale: 2.0
period: 0.25
noise_scale_schedule:
- noise_scale: 0.95
period: 0.25
- noise_scale: 0.88
period: 0.25
- noise_scale: 0.95
period: 0.25
- noise_scale: 0.88
period: 0.25
diffusion_process_args:
sigma_min: 0.0004 # min noise level
sigma_max: 160.0 # max noise level
sigma_data: 16.0 # standard deviation of data distribution
rho: 7 # controls the sampling schedule
P_mean: -1.2 # mean of log-normal distribution from which noise is drawn for training
P_std: 1.5 # standard deviation of log-normal distribution from which noise is drawn for training
gamma_0: 0.8
gamma_min: 1.0
noise_scale: null
step_scale: null
mse_rotational_alignment: true
coordinate_augmentation: true
alignment_reverse_diff: true
synchronize_sigmas: false
sampling_schedule: "dilated"
time_dilation: 2.667
time_dilation_start: 0.6
time_dilation_end: 0.8

16
config/filtering.yaml Normal file
View File

@@ -0,0 +1,16 @@
_target_: boltzgen.task.filter.filter.Filter
budget: 30
top_budget: 10
use_affinity: false # This changes the filtering metrics to metrics more amenable to small molecule binder design
filter_cysteine: false # [different from peptide-protein] This filters out all designs that have designed cysteines in them (prespecified cysteines in the design are not counted)
from_inverse_folded: false # This makes it so that we use the backbone refolding RMSD instead of the all-atom RMSD
filter_bindingsite: false # This filters out everything that does not have a residue within 4 Å of a binding site residue
modality: "peptide" # peptide or antibody
peptide_type: "linear" # linear or cyclic
alpha: 0.001 # 0 = quality-only, 1 = diversity-only
random_state: 0
metrics_override: null # overrides metrics; None values delete keys
num_liability_plots: 0
plot_seq_logos: false # Make sequence logo diagrams of the designed sequences

54
config/fold.yaml Normal file
View File

@@ -0,0 +1,54 @@
_target_: boltzgen.task.predict.predict.Predict
debug: false
data:
_target_: boltzgen.task.predict.data_from_generated.FromGeneratedDataModule
cfg:
_target_: boltzgen.task.predict.data_from_generated.DataConfig
tokenizer:
_target_: boltzgen.data.tokenize.tokenizer.Tokenizer
atomize_modified_residues: false
featurizer:
_target_: boltzgen.data.feature.featurizer.Featurizer
suffix: .cif
suffix_metadata: .npz
suffix_native: _native.cif
samples_per_target: 1000000000000000
num_targets: 10000000000000
moldir: null
batch_size: 1
num_workers: 4
pin_memory: true
disulfide_prob: 1.0
disulfide_on: true
design_dir: null
target_templates: true
return_native: false
fail_if_no_designs: true
output_dir: null
keys_dict_out: ["min_interaction_pae", "min_design_to_target_pae", "interaction_pae", "ligand_iptm", "protein_iptm", "iptm", "design_iptm", "design_iiptm", "design_to_target_iptm", "design_ptm", "target_ptm", "ptm"]
writer:
_target_: boltzgen.task.predict.writer.FoldingWriter
design_dir: ${data.design_dir}
trainer:
accelerator: gpu
logger: false
devices: 1
precision: bf16-mixed
name: null
output: null
checkpoint: null
matmul_precision: null
recycling_steps: 3
sampling_steps: 200
diffusion_samples: 5
override:
validators: null

98
config/inverse_fold.yaml Normal file
View File

@@ -0,0 +1,98 @@
_target_: boltzgen.task.predict.predict.Predict
data:
_target_: boltzgen.task.predict.data_from_generated.FromGeneratedDataModule
cfg:
_target_: boltzgen.task.predict.data_from_generated.DataConfig
tokenizer:
_target_: boltzgen.data.tokenize.tokenizer.Tokenizer
atomize_modified_residues: false
featurizer:
_target_: boltzgen.data.feature.featurizer.Featurizer
suffix: .cif
suffix_metadata: .npz
suffix_native: _native.cif
moldir: null
samples_per_target: 1000000000
# Design
design: true
backbone_only: true
atom14: false
max_seqs: 1
inverse_fold: true
batch_size: 1
num_workers: 4
pin_memory: true
num_targets: 1000000000
design_mask_override: null
fail_if_no_designs: true
design_dir: null
output_dir: ${output}
writer:
_target_: boltzgen.task.predict.writer.DesignWriter
output_dir: ${output}
res_atoms_only: false
atom14: ${data.cfg.atom14}
inverse_fold: ${data.cfg.inverse_fold}
write_native: false
trainer:
accelerator: gpu
devices: 1
precision: 32 # bf16-mixed
name: if_full
output: null
checkpoint: null
matmul_precision: null
recycling_steps: 3
sampling_steps: 200
diffusion_samples: 1
override:
masker_args:
mask: true
mask_backbone: false
validators: null
diffusion_process_args:
sigma_min: 0.0004 # min noise level
sigma_max: 160.0 # max noise level
sigma_data: 16.0 # standard deviation of data distribution
rho: 7 # controls the sampling schedule
P_mean: -1.2 # mean of log-normal distribution from which noise is drawn for training
P_std: 1.5 # standard deviation of log-normal distribution from which noise is drawn for training
gamma_0: 0.8
gamma_min: 1.0
noise_scale: 1.0
step_scale: 1.0
mse_rotational_alignment: true
coordinate_augmentation: true
alignment_reverse_diff: true
synchronize_sigmas: false
inverse_fold_args:
atom_s: 128
atom_z: 16
token_s: 384
token_z: 128
node_dim: 128
pair_dim: 128
hidden_dim: 128
dropout: 0.1
softmax_dropout: 0.2
num_encoder_layers: 6
transformation_scale_factor: 1.0
inverse_fold_noise: 0.2
topk: 30
num_heads: 4
num_decoder_layers: 3
autoregressive: true
enable_input_embedder: true
inverse_fold_restriction: []

View File

@@ -0,0 +1,89 @@
_target_: boltzgen.task.predict.predict.Predict
data:
_target_: boltzgen.task.predict.data_from_yaml.FromYamlDataModule
cfg:
_target_: boltzgen.task.predict.data_from_yaml.DataConfig
tokenizer:
_target_: boltzgen.data.tokenize.tokenizer.Tokenizer
atomize_modified_residues: false
featurizer:
_target_: boltzgen.data.feature.featurizer.Featurizer
moldir: null
yaml_path: null
# Design
backbone_only: true
atom14: false
atom37: false
disulfide_prob: 1.0
disulfide_on: true
batch_size: 1
num_workers: 4
pin_memory: true
writer:
_target_: boltzgen.task.predict.writer.DesignWriter
output_dir: ${output}
res_atoms_only: false
atom14: ${data.cfg.atom14}
inverse_fold: true
write_native: false
trainer:
accelerator: gpu
devices: 1
precision: 32
name: inverse_fold_only
output: null
checkpoint: null
matmul_precision: null
recycling_steps: 3
sampling_steps: 200
diffusion_samples: 1
override:
masker_args:
mask: true
mask_backbone: false
inverse_fold: true
validators: null
diffusion_process_args:
sigma_min: 0.0004 # min noise level
sigma_max: 160.0 # max noise level
sigma_data: 16.0 # standard deviation of data distribution
rho: 7 # controls the sampling schedule
P_mean: -1.2 # mean of log-normal distribution from which noise is drawn for training
P_std: 1.5 # standard deviation of log-normal distribution from which noise is drawn for training
gamma_0: 0.8
gamma_min: 1.0
noise_scale: 1.0
step_scale: 1.0
mse_rotational_alignment: true
coordinate_augmentation: true
alignment_reverse_diff: true
synchronize_sigmas: false
inverse_fold_args:
atom_s: 128
atom_z: 16
token_s: 384
token_z: 128
node_dim: 128
pair_dim: 128
hidden_dim: 128
dropout: 0.1
softmax_dropout: 0.2
num_encoder_layers: 6
transformation_scale_factor: 1.0
inverse_fold_noise: 0.2
topk: 30
num_heads: 4
num_decoder_layers: 3
autoregressive: true
enable_input_embedder: true
inverse_fold_restriction: []

579
config/train/boltzgen.yaml Executable file
View File

@@ -0,0 +1,579 @@
_target_: boltzgen.task.train.train.Training
trainer:
accelerator: gpu
devices: 8
precision: bf16-mixed
gradient_clip_val: 10.0
accumulate_grad_batches: 1
max_epochs: -1
num_sanity_val_steps: 3
log_every_n_steps: 1
wandb:
group: boltzgen
project: boltzgen
entity: yourwandb
name: a_big_run_resume3
slurm: true
output: workdir
strict_loading: false
resume: null
pretrained: null
debug: false
save_every_n_train_steps: 2500
disable_checkpoint: false
matmul_precision: null
save_top_k: -1
data:
datasets:
- _target_: boltzgen.task.train.data.DatasetConfig
target_dir: ./training_data/targets
msa_dir: ./training_data/msa
prob: 0.6
filters:
- _target_: boltzgen.data.filter.dynamic.size.SizeFilter
min_chains: 1
max_chains: 300
- _target_: boltzgen.data.filter.dynamic.date.DateFilter
date: "2023-06-01"
ref: released
- _target_: boltzgen.data.filter.dynamic.resolution.ResolutionFilter
resolution: 9.0
sampler:
_target_: boltzgen.data.sample.cluster.ClusterSampler
cropper:
_target_: boltzgen.data.crop.multimer.MultimerCropper
neighborhood_sizes: [ 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40 ]
split: ./data/pdb_sequences/boltz2/validation_ids_boltz2_all.txt
symmetry_correction: false
val_group: "RCSB"
# AFDB Distillation Data
- _target_: boltzgen.task.train.data.DatasetConfig
manifest_path: ./training_data/afdb/afdb_manifest_foldseek_c75_confidence.json
target_dir: ./training_data/afdb/targets
msa_dir: ./training_data/afdb/msa
prob: 0.3
filters:
- _target_: boltzgen.data.filter.dynamic.size.SizeFilter
min_chains: 1
max_chains: 300
- _target_: boltzgen.data.filter.dynamic.confidence.ConfidenceFilter
composition_op: "AND"
metrics: ["confidence_score"]
compare_ops: ["greater"]
thresholds: [70]
sampler:
_target_: boltzgen.data.sample.cluster.ClusterSampler
cropper:
_target_: boltzgen.data.crop.multimer.MultimerCropper
neighborhood_sizes: [ 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40 ]
symmetry_correction: true
override_method: "AFDB"
override_bfactor: true
# Protein-Ligand Distillation Data
- _target_: boltzgen.task.train.data.DatasetConfig
target_dir: ./training_data/protein_ligand/targets
msa_dir: ./training_data/protein_ligand/msa
moldir: ./training_data/protein_ligand/mols
prob: 0.03
filters:
- _target_: boltzgen.data.filter.dynamic.size.SizeFilter
min_chains: 1
max_chains: 300
- _target_: boltzgen.data.filter.dynamic.confidence.ConfidenceFilter
composition_op: "AND"
metrics: ["complex_ipde", "complex_pde", "iptm"]
compare_ops: ["lesser", "lesser", "greater"]
thresholds: [1.5, 1.5, 0.9]
sampler:
_target_: boltzgen.data.sample.cluster.ClusterSampler
beta_chain: 0.05
cropper:
_target_: boltzgen.data.crop.multimer.MultimerCropper
neighborhood_sizes: [ 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40 ]
symmetry_correction: true
override_method: "BOLTZ-1"
# RNA Distillation Data
- _target_: boltzgen.task.train.data.DatasetConfig
target_dir: ./training_data/rna/targets
msa_dir: ./training_data/rna/msa
prob: 0.04
filters:
- _target_: boltzgen.data.filter.dynamic.size.SizeFilter
min_chains: 1
max_chains: 300
- _target_: boltzgen.data.filter.dynamic.confidence.ConfidenceFilter
composition_op: "OR"
metrics: ["complex_pde"]
compare_ops: ["lesser"]
thresholds: [2.0]
sampler:
_target_: boltzgen.data.sample.cluster.ClusterSampler
cropper:
_target_: boltzgen.data.crop.multimer.MultimerCropper
neighborhood_sizes: [ 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40 ]
symmetry_correction: true
override_method: "BOLTZ-1"
# Protein-DNA Distillation Data
- _target_: boltzgen.task.train.data.DatasetConfig
target_dir: ./training_data/protein_dna/targets
msa_dir: ./training_data/protein_dna/msa
prob: 0.03
filters:
- _target_: boltzgen.data.filter.dynamic.size.SizeFilter
min_chains: 1
max_chains: 300
- _target_: boltzgen.data.filter.dynamic.confidence.ConfidenceFilter
composition_op: "AND"
metrics: ["complex_ipde", "complex_pde", "iptm"]
compare_ops: ["lesser", "lesser", "greater"]
thresholds: [1.0, 2.0, 0.7]
sampler:
_target_: boltzgen.data.sample.cluster.ClusterSampler
beta_chain: 0.05
cropper:
_target_: boltzgen.data.crop.multimer.MultimerCropper
neighborhood_sizes: [ 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40 ]
symmetry_correction: true
override_method: "BOLTZ-1"
tokenizer:
_target_: boltzgen.data.tokenize.tokenizer.Tokenizer
atomize_modified_residues: false
featurizer:
_target_: boltzgen.data.feature.featurizer.Featurizer
moldir: ./training_data/mols
max_tokens: 512
max_atoms: 5120
max_seqs: 4096
pad_to_max_tokens: true
pad_to_max_atoms: true
pad_to_max_seqs: true
samples_per_epoch: 100000
batch_size: 1
num_workers: 2
random_seed: 42
pin_memory: false
overfit: null
return_train_symmetries: false
return_val_symmetries: false
atoms_per_window_queries: 32
min_dist: 2.0
max_dist: 22.0
num_bins: 64
single_sequence_prop_training: 0.1
msa_sampling_training: true
# Design
design: true
backbone_only: false
atom14: true
atom37: false
selector:
_target_: boltzgen.data.select.protein.ProteinSelector
design_neighborhood_sizes: [2, 4, 6,8,10,12,14,16,18]
substructure_neighborhood_sizes: [2,4,6,8,10,12,24]
structure_condition_prob: 0.4
distance_noise_std: 1
run_selection: true
specify_binding_sites: true
ss_condition_prob: 0.1
select_all: false
# Design datasets
monomer_split: data/pdb_sequences/val_monomers_boltzgen_min50_max220.txt
monomer_target_dir: ./training_data/targets
monomer_target_structure_condition: true
monomer_seq_len: 100
ligand_split: data/pdb_sequences/val_ccd_pdb_pairs_boltzgen.txt
ligand_target_dir: ./training_data/targets
ligand_seq_len: 100
model:
_target_: boltzgen.model.models.boltz.Boltz
atom_s: 128
atom_z: 16
token_s: 384
token_z: 128
num_bins: 64
atom_feature_dim: 388
atoms_per_window_queries: 32
atoms_per_window_keys: 128
use_miniformer: false
ema: true
ema_decay: 0.999
exclude_ions_from_lddt: true
num_val_datasets: 1 # New
ignore_ckpt_shape_mismatch: false # New
aggregate_distogram: true # New
bond_type_feature: true
predict_bfactor: true
checkpoint_diffusion_conditioning: true
use_kernels: true
validators:
- _target_: boltzgen.model.validation.design.DesignValidator
val_names: ["RCSB"]
confidence_prediction: ${model.confidence_prediction}
atom14: ${data.atom14}
atom37: ${data.atom37}
masker_args:
mask: true
mask_backbone: false
mask_disto: true
embedder_args:
atom_encoder_depth: 3
atom_encoder_heads: 4
add_mol_type_feat: true
add_method_conditioning: true
add_modified_flag: true
add_cyclic_flag: true
add_design_mask_flag: true
add_binding_specification: true
add_ss_specification: true
freeze_template_weights: true
use_templates: true
template_args:
template_dim: 64
template_blocks: 2
activation_checkpointing: false
use_token_distances: true
token_distance_args:
token_distance_dim: 64
token_distance_blocks: 2
use_token_distance_feats: true
distance_gaussian_dim: 32
activation_checkpointing: true
msa_args:
msa_s: 64
msa_blocks: 4
msa_dropout: 0.15
z_dropout: 0.25
miniformer_blocks: false
pairwise_head_width: 32
pairwise_num_heads: 4
use_paired_feature: true
activation_checkpointing: true
pairformer_args:
num_blocks: 64
num_heads: 16
dropout: 0.25
post_layer_norm: false
activation_checkpointing: true
score_model_args:
sigma_data: 16
dim_fourier: 256
atom_encoder_depth: 3
atom_encoder_heads: 4
# token level args
token_layers: 1
token_transformer_depth: 24
token_transformer_heads: 16
diffusion_pairformer_args:
num_blocks: 0
num_heads: 2
dropout: 0
use_s_to_z: false
atom_decoder_depth: 3
atom_decoder_heads: 4
conditioning_transition_layers: 2
transformer_post_ln: false
activation_checkpointing: true
confidence_prediction: false
structure_prediction_training: true
training_args:
recycling_steps: 3
sampling_steps: 20
diffusion_multiplicity: 32
diffusion_samples: 1
confidence_loss_weight: 1e-4
diffusion_loss_weight: 4.0
distogram_loss_weight: 3e-2
bfactor_loss_weight: 1e-3
adam_beta_1: 0.9
adam_beta_2: 0.95
adam_eps: 0.00000001
lr_scheduler: af3
base_lr: 0.0
max_lr: 0.0005
lr_warmup_no_steps: 1000
lr_start_decay_after_n_steps: 50000
lr_decay_every_n_steps: 50000
lr_decay_factor: 0.95
weight_decay: 0.003
weight_decay_exclude: true
validation_args:
recycling_steps: 3
sampling_steps: 200
diffusion_samples: 1
symmetry_correction: false
diffusion_process_args:
sigma_min: 0.0004 # min noise level
sigma_max: 160.0 # max noise level
sigma_data: 16.0 # standard deviation of data distribution
rho: 7 # controls the sampling schedule
P_mean: -1.2 # mean of log-normal distribution from which noise is drawn for training
P_std: 1.5 # standard deviation of log-normal distribution from which noise is drawn for training
gamma_0: 0.8
gamma_min: 1.0
noise_scale: 1.0
step_scale: 1.0
mse_rotational_alignment: true
coordinate_augmentation: true
alignment_reverse_diff: true
synchronize_sigmas: false
diffusion_loss_args:
add_smooth_lddt_loss: true
add_bond_loss: false
nucleotide_loss_weight: 5.0
ligand_loss_weight: 10.0
refolding_validator:
_target_: boltzgen.model.validation.refolding.RefoldingValidator
val_names: ["RCSB"]
step_scale: 1.5
noise_scale: 0.75
atom14: ${data.atom14}
atom37: ${data.atom37}
val_monomer: ${data.monomer_split}
val_ligand: ${data.ligand_split}
analyze_task:
_target_: boltzgen.task.analyze.analyze.Analyze
name: ${name}
debug: ${debug}
design_dir: null
num_processes: 1
# Common metrics to compute
affinity_metrics: false
allatom_fold_metrics: true
backbone_fold_metrics: true
noncovalents_original: false
noncovalents_refolded: false
delta_sasa_original: false
delta_sasa_refolded: false
largest_hydrophobic: false
largest_hydrophobic_refolded: false
run_clustering: false
# Liability analysis
liability_analysis: false
liability_modality: peptide
liability_peptide_type: linear
# Uncommon metrics
diversity_original: true
diversity_refolded: true
diversity_per_target_original: false
diversity_per_target_refolded: false
novelty_original: false
novelty_refolded: false
novelty_per_target_original: false
novelty_per_target_refolded: false
wandb: null
data:
_target_: boltzgen.task.predict.data_from_generated.FromGeneratedDataModule
cfg:
_target_: boltzgen.task.predict.data_from_generated.DataConfig
tokenizer:
_target_: boltzgen.data.tokenize.tokenizer.Tokenizer
atomize_modified_residues: false
featurizer:
_target_: boltzgen.data.feature.featurizer.Featurizer
suffix: .cif
suffix_metadata: .npz
suffix_native: _native.cif
samples_per_target: 1
num_targets: 100000000
moldir: ./training_data/mols
batch_size: 1
num_workers: 4
pin_memory: true
return_native: true
predict_task: null
folding_checkpoint: ./training_data/boltz2_fold.ckpt
folding_args:
recycling_steps: 3
sampling_steps: 200
diffusion_samples: 1
folding_model_args:
atom_s: 128
atom_z: 16
token_s: 384
token_z: 128
num_bins: 64
atom_feature_dim: 388
atoms_per_window_queries: 32
atoms_per_window_keys: 128
compile_pairformer: false
compile_templates: false
compile_msa: false
use_miniformer: false
ema: true
ema_decay: 0.999
exclude_ions_from_lddt: true
num_val_datasets: 4
ignore_ckpt_shape_mismatch: false
aggregate_distogram: true
bond_type_feature: true
conditioning_cutoff_min: 4.0
conditioning_cutoff_max: 20.0
use_templates: true
predict_bfactor: true
checkpoint_diffusion_conditioning: false
use_kernels: true
validators: null
embedder_args:
atom_encoder_depth: 3
atom_encoder_heads: 4
add_mol_type_feat: true
add_method_conditioning: true
add_modified_flag: true
add_cyclic_flag: true
msa_args:
msa_s: 64
msa_blocks: 4
msa_dropout: 0.15
z_dropout: 0.25
miniformer_blocks: false
pairwise_head_width: 32
pairwise_num_heads: 4
use_paired_feature: true
activation_checkpointing: false
template_args:
template_dim: 64
template_blocks: 2
activation_checkpointing: false
pairformer_args:
num_blocks: 64
num_heads: 16
dropout: 0.25
post_layer_norm: false
activation_checkpointing: false
score_model_args:
sigma_data: 16
dim_fourier: 256
atom_encoder_depth: 3
atom_encoder_heads: 4
token_transformer_depth: 24
token_transformer_heads: 16
atom_decoder_depth: 3
atom_decoder_heads: 4
conditioning_transition_layers: 2
transformer_post_ln: false
activation_checkpointing: false
confidence_prediction: false
affinity_prediction: false
structure_prediction_training: true
affinity_model_args:
num_dist_bins: 64
max_dist: 22
no_trunk_feats: false
add_s_to_z_prod: false
add_s_input_to_s: false
confidence_args:
num_plddt_bins: 50
num_pde_bins: 64
num_pae_bins: 64
training_args:
recycling_steps: 3
sampling_steps: 20
diffusion_multiplicity: 48
diffusion_samples: 1
affinity_loss_weight: 3e-3
confidence_loss_weight: 1e-4
diffusion_loss_weight: 4.0
distogram_loss_weight: 3e-2
bfactor_loss_weight: 1e-3
adam_beta_1: 0.9
adam_beta_2: 0.95
adam_eps: 0.00000001
lr_scheduler: af3
base_lr: 0.0
max_lr: 0.001
lr_warmup_no_steps: 1000
lr_start_decay_after_n_steps: 50000
lr_decay_every_n_steps: 50000
lr_decay_factor: 0.95
weight_decay: 0.003
weight_decay_exclude: true
validation_args:
recycling_steps: 3
sampling_steps: 200
diffusion_samples: 5
symmetry_correction: false
diffusion_process_args:
sigma_min: 0.0004 # min noise level
sigma_max: 160.0 # max noise level
sigma_data: 16.0 # standard deviation of data distribution
rho: 7 # controls the sampling schedule
P_mean: -1.2 # mean of log-normal distribution from which noise is drawn for training
P_std: 1.5 # standard deviation of log-normal distribution from which noise is drawn for training
gamma_0: 0.8
gamma_min: 1.0
noise_scale: 1.0
step_scale: 1.0
mse_rotational_alignment: true
coordinate_augmentation: true
alignment_reverse_diff: true
synchronize_sigmas: false
diffusion_loss_args:
add_smooth_lddt_loss: true
add_bond_loss: false
nucleotide_loss_weight: 5.0
ligand_loss_weight: 10.0

View File

@@ -0,0 +1,346 @@
_target_: boltzgen.task.train.train.Training
trainer:
accelerator: gpu
devices: 8
precision: bf16-mixed
gradient_clip_val: 10.0
accumulate_grad_batches: 16
max_epochs: -1
num_sanity_val_steps: 3
log_every_n_steps: 1
wandb:
group: boltzgen
project: boltzgen
entity: yourwandb
name: small
output: workdir
strict_loading: false
resume: null
pretrained: ./training_data/boltzgen1_structuretrained_small.ckpt
debug: false
save_every_n_train_steps: 2500
disable_checkpoint: false
matmul_precision: null
save_top_k: -1
# ddp_timeout_seconds: 1000
data:
datasets:
- _target_: boltzgen.task.train.data.DatasetConfig
target_dir: ./training_data/targets
msa_dir: ./training_data/msa
prob: 1
filters:
- _target_: boltzgen.data.filter.dynamic.size.SizeFilter
min_chains: 1
max_chains: 300
- _target_: boltzgen.data.filter.dynamic.date.DateFilter
date: "2023-06-01"
ref: released
- _target_: boltzgen.data.filter.dynamic.resolution.ResolutionFilter
resolution: 9.0
sampler:
_target_: boltzgen.data.sample.cluster.ClusterSampler
cropper:
_target_: boltzgen.data.crop.multimer.MultimerCropper
neighborhood_sizes: [ 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40 ]
split: ./data/pdb_sequences/boltz2/validation_ids_boltz2_all.txt
symmetry_correction: false
val_group: "RCSB"
tokenizer:
_target_: boltzgen.data.tokenize.tokenizer.Tokenizer
atomize_modified_residues: false
featurizer:
_target_: boltzgen.data.feature.featurizer.Featurizer
moldir: ./training_data/mols
max_tokens: 256
max_atoms: 2048
max_seqs: 1024
pad_to_max_tokens: true
pad_to_max_atoms: true
pad_to_max_seqs: true
samples_per_epoch: 100000
batch_size: 1
num_workers: 4
random_seed: 42
pin_memory: true
overfit: null
return_train_symmetries: false
return_val_symmetries: false
atoms_per_window_queries: 32
min_dist: 2.0
max_dist: 22.0
num_bins: 64
single_sequence_prop_training: 0.1
msa_sampling_training: true
# Design
design: true
backbone_only: false
atom14: true
atom37: false
selector:
_target_: boltzgen.data.select.protein.ProteinSelector
design_neighborhood_sizes: [2, 4, 6,8,10,12,14,16,18]
substructure_neighborhood_sizes: [2,4,6,8,10,12,24]
structure_condition_prob: 0.4
distance_noise_std: 1
run_selection: true
specify_binding_sites: true
ss_condition_prob: 0.1
select_all: false
chain_reindexing: false
# Design datasets
monomer_split: data/pdb_sequences/val_monomers_boltzgen_min50_max220.txt
monomer_target_dir: ./training_data/targets
monomer_target_structure_condition: true
monomer_seq_len: 100
ligand_split: data/pdb_sequences/val_ccd_pdb_pairs_boltzgen.txt
ligand_target_dir: ./training_data/targets
ligand_seq_len: 100
model:
_target_: boltzgen.model.models.boltz.Boltz
atom_s: 128
atom_z: 16
token_s: 384
token_z: 128
num_bins: 64
atom_feature_dim: 388
atoms_per_window_queries: 32
atoms_per_window_keys: 128
use_miniformer: true
ema: true
ema_decay: 0.999
exclude_ions_from_lddt: true
num_val_datasets: 1 # New
ignore_ckpt_shape_mismatch: false # New
aggregate_distogram: true # New
bond_type_feature: true
predict_bfactor: true
predict_res_type: false
checkpoint_diffusion_conditioning: false
use_kernels: true
validators:
- _target_: boltzgen.model.validation.design.DesignValidator
val_names: ["RCSB"]
confidence_prediction: ${model.confidence_prediction}
backbone_only: ${data.backbone_only}
atom14: ${data.atom14}
atom37: ${data.atom37}
masker_args:
mask: true
mask_backbone: false
mask_disto: true
embedder_args:
atom_encoder_depth: 3
atom_encoder_heads: 4
add_mol_type_feat: true
add_method_conditioning: true
add_modified_flag: true
add_cyclic_flag: true
add_design_mask_flag: true
add_binding_specification: true
add_ss_specification: true
freeze_template_weights: true
use_templates: true
template_args:
template_dim: 64
template_blocks: 2
miniformer_blocks: true
activation_checkpointing: false
use_token_distances: true
token_distance_args:
token_distance_dim: 64
token_distance_blocks: 2
use_token_distance_feats: true
distance_gaussian_dim: 32
msa_args:
msa_s: 64
msa_blocks: 3
msa_dropout: 0.15
z_dropout: 0.25
miniformer_blocks: true
pairwise_head_width: 32
pairwise_num_heads: 4
use_paired_feature: true
activation_checkpointing: false
pairformer_args:
num_blocks: 12
num_heads: 16
dropout: 0.25
post_layer_norm: false
activation_checkpointing: false
score_model_args:
sigma_data: 16
dim_fourier: 256
atom_encoder_depth: 3
atom_encoder_heads: 4
# token level args
token_layers: 1
token_transformer_depth: 8
token_transformer_heads: 16
diffusion_pairformer_args:
num_blocks: 0
num_heads: 2
dropout: 0
use_s_to_z: false
atom_decoder_depth: 3
atom_decoder_heads: 4
conditioning_transition_layers: 2
transformer_post_ln: false
activation_checkpointing: false
confidence_prediction: false
structure_prediction_training: true
training_args:
recycling_steps: 3
sampling_steps: 20
diffusion_multiplicity: 12
diffusion_samples: 1
confidence_loss_weight: 1e-4
diffusion_loss_weight: 4.0
distogram_loss_weight: 3e-2
bfactor_loss_weight: 1e-3
res_type_loss_weight: 3e-2
adam_beta_1: 0.9
adam_beta_2: 0.95
adam_eps: 0.00000001
lr_scheduler: af3
base_lr: 0.0
max_lr: 0.0018
lr_warmup_no_steps: 1000
lr_start_decay_after_n_steps: 50000
lr_decay_every_n_steps: 50000
lr_decay_factor: 0.95
weight_decay: 0.003
weight_decay_exclude: true
validation_args:
recycling_steps: 3
sampling_steps: 200
diffusion_samples: 1
symmetry_correction: false
diffusion_process_args:
sigma_min: 0.0004 # min noise level
sigma_max: 160.0 # max noise level
sigma_data: 16.0 # standard deviation of data distribution
rho: 7 # controls the sampling schedule
P_mean: -1.2 # mean of log-normal distribution from which noise is drawn for training
P_std: 1.5 # standard deviation of log-normal distribution from which noise is drawn for training
gamma_0: 0.8
gamma_min: 1.0
noise_scale: 1.0
step_scale: 1.0
mse_rotational_alignment: true
coordinate_augmentation: true
alignment_reverse_diff: true
synchronize_sigmas: false
diffusion_loss_args:
add_smooth_lddt_loss: true
add_bond_loss: false
nucleotide_loss_weight: 5.0
ligand_loss_weight: 10.0
refolding_validator:
_target_: boltzgen.model.validation.refolding.RefoldingValidator
val_names: ["RCSB"]
step_scale: 1.5
noise_scale: 0.75
atom14: ${data.atom14}
atom37: ${data.atom37}
backbone_only: ${data.backbone_only}
val_monomer: ${data.monomer_split}
val_ligand: ${data.ligand_split}
analyze_task:
_target_: boltzgen.task.analyze.analyze.Analyze
name: ${name}
debug: ${debug}
design_dir: null
num_processes: 1
# Common metrics to compute
affinity_metrics: false
allatom_fold_metrics: true
backbone_fold_metrics: true
noncovalents_original: false
noncovalents_refolded: false
delta_sasa_original: false
delta_sasa_refolded: false
largest_hydrophobic: false
largest_hydrophobic_refolded: false
run_clustering: false
# Liability analysis
liability_analysis: false
liability_modality: peptide
liability_peptide_type: linear
# Uncommon metrics
diversity_original: true
diversity_refolded: true
diversity_per_target_original: false
diversity_per_target_refolded: false
novelty_original: false
novelty_refolded: false
novelty_per_target_original: false
novelty_per_target_refolded: false
wandb: null
data:
_target_: boltzgen.task.predict.data_from_generated.FromGeneratedDataModule
cfg:
_target_: boltzgen.task.predict.data_from_generated.DataConfig
tokenizer:
_target_: boltzgen.data.tokenize.tokenizer.Tokenizer
atomize_modified_residues: false
featurizer:
_target_: boltzgen.data.feature.featurizer.Featurizer
suffix: .cif
suffix_metadata: .npz
suffix_native: _native.cif
samples_per_target: 1
num_targets: 100000000
moldir: ./training_data/mols
batch_size: 1
num_workers: 4
pin_memory: true
target_templates: true
return_native: true
folding_checkpoint: ./training_data/boltz2_fold.ckpt
folding_args:
recycling_steps: 3
sampling_steps: 200
diffusion_samples: 1
folding_model_args:
validators: null

376
config/train/inverse_folding.yaml Executable file
View File

@@ -0,0 +1,376 @@
_target_: boltzgen.task.train.train.Training
trainer:
accelerator: cuda
devices: 4
precision: 32
gradient_clip_val: 10.0
accumulate_grad_batches: 1
max_epochs: 5
num_sanity_val_steps: 1
log_every_n_steps: 1
wandb:
group: boltzgen
project: boltzgen
entity: yourwandb
name: if_lr_scheduler
output: workdir
strict_loading: false
resume: null
debug: false
save_every_n_train_steps: 2500
disable_checkpoint: false
matmul_precision: null
save_top_k: -1
data:
datasets:
- _target_: boltzgen.task.train.data.DatasetConfig
target_dir: ./training_data/targets
msa_dir: ./training_data/msa
prob: 1
filters:
- _target_: boltzgen.data.filter.dynamic.size.SizeFilter
min_chains: 1
max_chains: 300
- _target_: boltzgen.data.filter.dynamic.date.DateFilter
date: "2023-06-01"
ref: released
- _target_: boltzgen.data.filter.dynamic.resolution.ResolutionFilter
resolution: 9.0
- _target_: boltzgen.data.filter.dynamic.min_protein_residues.MinProteinResiduesFilter
min_residues: 5
- _target_: boltzgen.data.filter.dynamic.pdb_id_txtfile.FilterIDFromTXT
paths:
- data/exclude_ids/fibril.txt
- data/exclude_ids/transmembrane.txt
sampler:
_target_: boltzgen.data.sample.cluster.ClusterSampler
cropper:
_target_: boltzgen.data.crop.multimer.MultimerCropper
neighborhood_sizes: [ 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40 ]
split: ./data/pdb_sequences/boltz2/validation_ids_boltz2_all.txt
symmetry_correction: false
val_group: "RCSB"
tokenizer:
_target_: boltzgen.data.tokenize.tokenizer.Tokenizer
atomize_modified_residues: false
featurizer:
_target_: boltzgen.data.feature.featurizer.Featurizer
moldir: ./training_data/mols
max_tokens: 1024
max_atoms: 8192
max_seqs: 1
pad_to_max_tokens: true
pad_to_max_atoms: true
pad_to_max_seqs: true
samples_per_epoch: 600000
batch_size: 2
num_workers: 32
random_seed: 42
pin_memory: true
overfit: null
return_train_symmetries: false
return_val_symmetries: false
compute_frames: false
atoms_per_window_queries: 32
min_dist: 2.0
max_dist: 22.0
num_bins: 64
single_sequence_prop_training: 0.05
msa_sampling_training: true
# Design
design: true
backbone_only: true
atom14: false
atom37: false
inverse_fold: ${model.inverse_fold}
use_msa: false
selector:
_target_: boltzgen.data.select.protein.ProteinSelector
design_neighborhood_sizes: [2, 4, 6,8,10,12,14,16,18]
substructure_neighborhood_sizes: [2,4,6,8,10,12,24]
structure_condition_prob: 0.5
distance_noise_std: 1
run_selection: true
specify_binding_sites: false
ss_condition_prob: 0
select_all: true
complete_structure_mask: true
# Design datasets
monomer_split: data/pdb_sequences/val_monomers_boltzgen_min50_max220.txt
monomer_target_dir: ./training_data/targets
monomer_target_structure_condition: true
monomer_seq_len: 100
ligand_split: null
ligand_target_dir: ./training_data/targets
ligand_seq_len: 100
model:
_target_: boltzgen.model.models.boltz.Boltz
atom_s: 128
atom_z: 16
token_s: 384
token_z: ${model.inverse_fold_args.pair_dim}
num_bins: 64
atom_feature_dim: 388
atoms_per_window_queries: 32
atoms_per_window_keys: 128
use_miniformer: true
ema: true
ema_decay: 0.999
exclude_ions_from_lddt: true
num_val_datasets: 1 # New
ignore_ckpt_shape_mismatch: false # New
aggregate_distogram: true # New
bond_type_feature: true
predict_bfactor: true
predict_res_type: true
checkpoint_diffusion_conditioning: false
inverse_fold: true
inverse_fold_args:
atom_s: ${model.atom_s}
atom_z: ${model.atom_z}
token_s: ${model.token_s}
token_z: ${model.token_z}
node_dim: 128
pair_dim: 128
hidden_dim: 128
dropout: 0.1
softmax_dropout: 0.2
num_encoder_layers: 6
num_decoder_layers: 3
autoregressive: true
transformation_scale_factor: 1.0
inverse_fold_noise: 0.2
topk: 30
num_heads: 4
enable_input_embedder: True
sampling_temperature: -1.0
validators:
- _target_: boltzgen.model.validation.design.DesignValidator
val_names: ["RCSB"]
confidence_prediction: ${model.confidence_prediction}
atom14: ${data.atom14}
atom37: ${data.atom37}
backbone_only: ${data.backbone_only}
inverse_fold: ${model.inverse_fold}
masker_args:
mask: true
mask_backbone: false
mask_disto: false
embedder_args:
atom_encoder_depth: 1
atom_encoder_heads: 4
add_mol_type_feat: true
add_method_conditioning: true
add_modified_flag: true
add_cyclic_flag: true
add_design_mask_flag: false
add_binding_specification: false
add_ss_specification: false
use_token_distances: false
token_distance_args:
token_distance_dim: ${model.inverse_fold_args.pair_dim}
token_distance_blocks: 0
use_token_distance_feats: true
distance_gaussian_dim: 32
disable_token_distance_transition: true
use_relative_position_encoding: true
# MSA module is not used in inverse folding
msa_args:
msa_s: 2
msa_blocks: 0
msa_dropout: 0
z_dropout: 0
miniformer_blocks: true
pairwise_head_width: 2
pairwise_num_heads: 1
use_paired_feature: true
activation_checkpointing: false
pairformer_args:
num_blocks: 2
num_heads: 16
dropout: 0.25
post_layer_norm: false
activation_checkpointing: false
score_model_args:
sigma_data: 16
dim_fourier: 256
atom_encoder_depth: 3
atom_encoder_heads: 4
# token level args
token_layers: 1
token_transformer_depth: 3
token_transformer_heads: 16
diffusion_pairformer_args:
num_blocks: 0
num_heads: 2
dropout: 0
use_s_to_z: false
atom_decoder_depth: 3
atom_decoder_heads: 4
conditioning_transition_layers: 2
transformer_post_ln: false
activation_checkpointing: false
confidence_prediction: false
affinity_prediction: false
structure_prediction_training: true
affinity_model_args:
num_dist_bins: 64
max_dist: 22
no_trunk_feats: false
add_s_to_z_prod: false
add_s_input_to_s: false
confidence_args:
num_plddt_bins: 50
num_pde_bins: 64
num_pae_bins: 64
training_args:
recycling_steps: 0
sampling_steps: 20
diffusion_multiplicity: 2
diffusion_samples: 1
affinity_loss_weight: 3e-3
confidence_loss_weight: 1e-4
diffusion_loss_weight: 4.0
distogram_loss_weight: 3e-2
bfactor_loss_weight: 1e-3
res_type_loss_weight: 1
adam_beta_1: 0.9
adam_beta_2: 0.95
adam_eps: 0.00000001
lr_scheduler: onecycle
base_lr: 0.0
max_lr: 0.001
weight_decay: 0.003
weight_decay_exclude: true
validation_args:
recycling_steps: 0
sampling_steps: 200
diffusion_samples: 1
symmetry_correction: false
diffusion_process_args:
sigma_min: 0.0004 # min noise level
sigma_max: 160.0 # max noise level
sigma_data: 16.0 # standard deviation of data distribution
rho: 7 # controls the sampling schedule
P_mean: -1.2 # mean of log-normal distribution from which noise is drawn for training
P_std: 1.5 # standard deviation of log-normal distribution from which noise is drawn for training
gamma_0: 0.8
gamma_min: 1.0
noise_scale: 1.0
step_scale: 1.0
mse_rotational_alignment: true
coordinate_augmentation: true
alignment_reverse_diff: true
synchronize_sigmas: false
diffusion_loss_args:
add_smooth_lddt_loss: true
add_bond_loss: false
nucleotide_loss_weight: 5.0
ligand_loss_weight: 10.0
refolding_validator:
_target_: boltzgen.model.validation.refolding.RefoldingValidator
val_names: ["RCSB"]
step_scale: 1.5
noise_scale: 0.75
atom14: ${data.atom14}
atom37: ${data.atom37}
val_monomer: ${data.monomer_split}
val_ligand: ${data.ligand_split}
inverse_fold: ${model.inverse_fold}
analyze_task:
_target_: boltzgen.task.analyze.analyze.Analyze
name: ${name}
debug: ${debug}
design_dir: null
num_processes: 1
# Common metrics to compute
affinity_metrics: false
allatom_fold_metrics: true
backbone_fold_metrics: true
noncovalents_original: false
noncovalents_refolded: false
delta_sasa_original: false
delta_sasa_refolded: false
largest_hydrophobic: false
largest_hydrophobic_refolded: false
run_clustering: false
# Liability analysis
liability_analysis: false
liability_modality: peptide
liability_peptide_type: linear
# Uncommon metrics
diversity_original: true
diversity_refolded: true
diversity_per_target_original: false
diversity_per_target_refolded: false
novelty_original: false
novelty_refolded: false
novelty_per_target_original: false
novelty_per_target_refolded: false
wandb: null
data:
_target_: boltzgen.task.predict.data_from_generated.FromGeneratedDataModule
cfg:
_target_: boltzgen.task.predict.data_from_generated.DataConfig
tokenizer:
_target_: boltzgen.data.tokenize.tokenizer.Tokenizer
atomize_modified_residues: false
featurizer:
_target_: boltzgen.data.feature.featurizer.Featurizer
suffix: .cif
suffix_metadata: .npz
suffix_native: _native.cif
samples_per_target: 1
num_targets: 100000000
moldir: ./training_data/mols
batch_size: 1
num_workers: 1
pin_memory: false
target_templates: true
return_native: true
folding_checkpoint: ./training_data/boltz2_fold.ckpt
folding_args:
recycling_steps: 3
sampling_steps: 200
diffusion_samples: 1
folding_model_args:
validators: null

6552
example/7rpz.cif Normal file

File diff suppressed because it is too large Load Diff

75434
example/8r3a.cif Normal file

File diff suppressed because it is too large Load Diff

429
example/README.md Normal file
View File

@@ -0,0 +1,429 @@
# How to make a design specification .yaml
**IMPORTANT:** ⚠️ All residue indices are specified **starting at 1**, and we use the canonical mmCIF residue index `label_seq_id`, **not** the author residue index `auth_seq_id`!
You can check the indexing in your mmcif file by opening it in https://molstar.org/viewer/, hovering over a residue, and checking the index on the bottom right. You will see something like this where **41 is the index we use, the auth id 22 is incorrect**:
![](../assets/label_seq_id.png)
After you have constructed your `.yaml` file, we recommend that you run the `check` command on it:
1. Run `boltzgen check example/vanilla_peptide_with_target_binding_site/beetletert.yaml`.
2. Visualize the resulting mmcif file in a protein structure viewer (e.g. PyMOL, Chimera, or online: https://molstar.org/viewer/).
3. Your viewer should show the binding site in a different color than the rest of the target.
# Example based explanation:
We provide many example `.yaml` files in the `example/` directory, including:
- [design_spec_showcasing_all_functionalities.yaml](design_spec_showcasing_all_functionalities.yaml)
- [vanilla_peptide_with_target_binding_site/beetletert.yaml](vanilla_peptide_with_target_binding_site/beetletert.yaml)
- [peptide_against_specific_site_on_ragc/rragc.yaml](peptide_against_specific_site_on_ragc/rragc.yaml)
- [nanobody_against_penguinpox/penguinpox.yaml](nanobody_against_penguinpox/penguinpox.yaml)
- [denovo_zinc_finger_against_dna/zinc_finger.yaml](denovo_zinc_finger_against_dna/zinc_finger.yaml)
- [protein_binding_small_molecule/chorismite.yaml](protein_binding_small_molecule/chorismite.yaml)
Small example of a protein design against a target protein without a binding site specified:
```yaml
entities:
# Designed protein with between 80 and 140 residues
# (The length is randomly sampled)
- protein:
id: B
sequence: 80..140
# The target is extracted from a .cif file
- file:
path: hard_targets/6m1u.cif
# Which chain in the .cif file to use as target (uses all chains if unspecified)
include:
- chain:
id: A
```
**IMPORTANT:** ⚠️ File references inside a yaml file (e.g. to cif files) are interpreted relative to the directory of the yaml file.
Example highlighting many (not all) functionalities:
```yaml
entities:
# Specification of the target which is extracted from a .cif file
- file:
path: 8r3a.cif
# Which chain and residues in the .cif file to use as target (uses all chains if unspecified)
include:
- chain:
id: A
res_index: 2..50,55.. # residues between 2 and 50 and anything larger than 55
- chain:
id: B
# Which regions of the target the design should or should NOT
# bind to (this can be left unspecified, then we just bind anywhere)
binding_types:
- chain:
id: A
binding: 5..7,13
- chain:
id: B
not_binding: "all"
# Which regions of the target should have their structure specified.
# By default, everything is visibility 1 which means that the structure is specified.
# If the visibility is 0, then the structure is not specified.
structure_groups:
- group:
visibility: 1
id: A
res_index: 10..13
- group:
# The relative positioning of things in structure group 2
# is not specified w.r.t to things in structure group 1
visibility: 2
id: B
# Overwrite the previous visibility setting and set it to 0 for res_index 13
- group:
visibility: 0
id: A
res_index: 13
# Optionally you can say that some residues in a loaded .cif file should also be redesigned.
design:
- chain:
id: A
res_index: 14..19
# For designed regions you can say what secondary structure they should have
secondary_structure:
- chain:
id: A
loop: 14
helix: 15..17
sheet: 19
# Specify a NON-designed protein chain
- protein:
id: X
sequence: AAVTTTTPPP
# Specify a designed protein chain
# Numbers specify what is being designed
- protein:
id: G
# random number between 15 and 20 of designed residues (inclusive)
sequence: 15..20AAAAAAVTTTT18PPP
# A designed helicon
# (see the constraints below that connect the peptide with the WHL ligand)
- protein:
id: R
# Random number of design residues between 3 and 5,
# then a Cysteine, then 6 design residues, then ...
sequence: 3..5C6C3
- ligand:
id: Q
ccd: WHL
# A designed peptide with 17 residues
- protein:
id: H
sequence: 17
# specification for a designed peptide with two Cys and a disulfide bond (see constraints)
- protein:
id: S
sequence: 10..14C6C3
constraints:
# specify connections as if the minimum possible number of residues was sampled
- bond:
atom1: [R, 4, SG] # connection for a helicon between small molecule and designed peptide
atom2: [Q, 1, CK]
- bond:
atom1: [R, 11, SG] # connection for a helicon between small molecule and designed peptide
atom2: [Q, 1, CH]
- bond:
atom1: [S, 11, SG] # connection for a disulfide bond between Cys and Cys in designed peptide
atom2: [S, 18, SG]
```
# Detailed Explanation
```yaml
entities:
# Define proteins, ligands, and structure files
- protein: ...
- ligand: ...
- file: ...
constraints:
# Define bonds and total length constraints
- bond: ...
- total_len: ...
```
### Entities Section
The `entities` section defines all the components of your design:
#### Protein Sequences
Define custom protein sequences with design flexibility:
```yaml
entities:
- protein:
id: G # Unique identifier
sequence: 15..20AAAAAAVTTTT18PPP # Mix of fixed residues and design regions
binding_types: uuuuBBBuNNNuBuu # Binding specifications (optional)
secondary_structure: HHHLLLEEE # Secondary structure constraints for designed regions (optional)
```
**Sequence notation:**
- `15..20` - Design between 15-20 residues (inclusive)
- `AAAA` - Fixed amino acid sequence
- `18` - Design exactly 18 residues
- `3..5C6C3` - Variable design residues, then fixed Cys, then more design
**Binding types:**
- `B` - Binding residue
- `N` - Non-binding residue
- `u` - Unspecified (default)
- Can specify as string: `uuuuBBBuNNNuBuu`
- Or as ranges:
```yaml
binding_types:
binding: 5..7,13 # Residues 5-7 and 13 are binding
not_binding: 9..11 # Residues 9-11 are non-binding
```
#### Ligands
Define small molecule ligands using CCD codes or SMILES:
```yaml
# Using Chemical Component Dictionary (CCD) code
entities:
- ligand:
id: [E, F] # specify list of IDs to copy the entity
ccd: WHL
binding_types: B
# Using SMILES string
entities:
- ligand:
id: Q
smiles: 'N[C@@H](Cc1ccc(O)cc1)C(=O)O'
binding_types: B
```
#### Structure Files
Include existing protein structures from PDB/mmCIF files:
```yaml
entities:
- file:
path: 7rpz.cif
# Include specific chains
include:
- chain:
id: A
- chain:
id: B
# Include by proximity
include_proximity:
- chain:
id: A
res_index: 10..16
radius: 35
# Exclude specific regions
exclude:
- chain:
id: A
res_index: ..5 # Exclude residues 1-5
# Reset residue numbering
reset_res_index:
- chain:
id: A
```
# Advanced Options
**Design regions:** Specify which residues to redesign
```yaml
entities:
...
- file:
...
design:
- chain:
id: A
res_index: ..4,20..27 # Redesign residues 1-4 and 20-27
```
**Secondary structure constraints:**
```yaml
entities:
...
- protein:
...
secondary_structure:
- chain:
id: A
loop: 1 # Residue 1 should be loop
helix: 2..3 # Residues 2-3 should be helix
sheet: 4 # Residue 4 should be sheet
```
**Structure visibility groups:**
```yaml
entities:
...
- file:
...
structure_groups:
- group:
visibility: 1 # 0 = structure not specified; 1 = structure specified (default); 2 = separate group whose positioning relative to group 1 is not specified
id: A
res_index: 10..16
```
**Design insertions:**
```yaml
entities:
...
- file:
...
design_insertions:
- insertion:
id: A
res_index: 20 # Insert after residue 20
num_residues: 2..9 # Insert 2-9 residues
secondary_structure: HELIX # UNSPECIFIED, LOOP, HELIX, or SHEET
```
**Binding type specifications:**
```yaml
entities:
...
- protein:
...
binding_types:
- chain:
id: A
binding: 5..7,13
- chain:
id: B
not_binding: "all"
```
### Constraints Section
Define structural constraints between components:
#### Bond Constraints
Create covalent bonds between specific atoms:
```yaml
constraints:
- bond:
atom1: [R, 4, SG] # [chain_id, residue_number, atom_name]
atom2: [Q, 1, CK] # Connect sulfur of Cys-4 in chain R to atom CK in ligand Q
```
Below is a list of the keys that can appear in a design specification YAML file, with an explanation of each.
***
### Top-Level Keys
* `entities`: The main list containing all molecular components of the system, such as proteins, ligands, or imported files.
* `constraints`: A list of rules or conditions to apply to the system, like specific bonds between entities or total length restrictions.
---
### Entity Types (Keys within the `entities` list)
* `protein`: Defines a protein entity.
* `ligand`: Defines a small molecule ligand.
* `file`: Specifies an external structure file (e.g., a `.cif` file) to import parts of the system from.
---
### Keys for `protein` Entities
* `id`: A unique identifier for the protein chain (e.g., 'A', 'G').
* `sequence`: Defines the amino acid sequence of the protein. This can include numbers to specify lengths of residues to be designed.
* `secondary_structure`: Specifies the secondary structure of the protein.
* `binding_types`: Defines which residues are involved in binding. Can be a string or a more detailed dictionary.
* `cyclic`: A boolean (`true` or `false`) indicating if the protein is cyclic.
---
### Keys for `ligand` Entities
* `id`: A unique identifier for the ligand. Can be a single ID or a list of IDs.
* `ccd`: The Chemical Component Dictionary ID for the ligand (e.g., 'SAH').
* `smiles`: The SMILES string representing the ligand's chemical structure.
* `binding_types`: Specifies binding information, often a simple character like 'B' for binding.
---
### Keys for `file` Entities
* `path`: The file path to the structure file to be included (e.g., '7rpz.cif'). Paths are interpreted relative to the directory of the YAML file.
* `msa`: A global flag for Multiple Sequence Alignment for the chains in the file. Can be overwritten by individual chain settings.
* `include`: Specifies which parts of the file to include. Can be the string `"all"` or a list of chains.
* `exclude`: Specifies which parts of an included file to exclude.
* `fuse`: Specifies a chain to which subsequent protein entities will be fused.
* `include_proximity`: Includes residues from the file that are within a certain distance of a specified chain.
* `binding_types`: Defines binding interactions for specific chains within the file.
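* `structure_groups`: Defines which residues have their structure specified (visibility `1` or higher) or left unspecified (visibility `0`). Residues in groups with different visibility values are not positioned relative to each other. Can be the string `"all"` or a list of groups.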
* `design`: Specifies which residues in the included chains are designable.
* `secondary_structure`: Defines the secondary structure for specific residues within included chains.
* `design_insertions`: Specifies where to insert new designable residues.
---
### Keys for `constraints`
* `bond`: Defines a covalent bond to be formed between two specified atoms in the system.
* `atom1`: The first atom in the bond.
* `atom2`: The second atom in the bond.
* `total_len`: Constrains the total length of the polymeric system.
* `min`: The minimum allowed total length.
* `max`: The maximum allowed total length.
---
### Nested Keys (found within multiple entity types)
* `chain`: A sub-dictionary used in `include`, `exclude`, `binding_types`, `design`, and `secondary_structure` to specify a particular protein chain.
* `id`: The identifier of the chain.
* `msa`: A specific MSA setting for this chain, overriding the global `msa` flag.
* `res_index`: Specifies a range or list of residue indices.
* `radius`: Used in `include_proximity` to define a distance in Angstroms.
* `binding`: Specifies residues that are part of a binding site.
* `not_binding`: Specifies residues that are not part of a binding site.
* `loop`, `helix`, `sheet`: Used in `secondary_structure` to define the structure of specific residues.
* `group`: Used in `structure_groups` to define a residue group.
* `id`: The identifier of the chain or `"all"`.
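* `visibility`: `0` means the structure of the group is not specified; `1` (the default) means it is specified. The relative positioning between groups with different values (e.g., `1` and `2`) is not specified.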
* `res_index`: The residues included in this group.
* `insertion`: Used in `design_insertions`.
* `id`: The chain ID where the insertion occurs.
* `res_index`: The residue index after which the insertion is made.
* `num_residues`: The number or range of residues to be inserted.
* `secondary_structure`: The desired secondary structure for the inserted residues (e.g., `HELIX`).

View File

@@ -0,0 +1,7 @@
entities:
- protein:
id: A
sequence: 120..140
- protein:
id: B
sequence: GGGILPWKWPWWPWRRGGG

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,40 @@
entities:
- protein:
id: B
sequence: 40..80
- file:
path: hoxd13.cif
include:
- chain:
id: A
res_index: ..71
structure_groups:
- group:
visibility: 0
id: "all"
binding_types:
- chain:
id: A
binding: 57..71
- protein:
id: C
fuse: A
sequence: AAAAAAAA
binding_types: BBBBBBBB
- file:
path: hoxd13.cif
fuse: A
include:
- chain:
id: A
res_index: 72..
structure_groups:
- group:
visibility: 0
id: "all"
- group:
visibility: 1
id: A
res_index: 281..

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,20 @@
entities:
- protein:
id: G
sequence: 40..80
- file:
path: npm1.cif
include:
- chain:
id: A
binding_types:
- chain:
id: A
binding: 123..240
not_binding: 1..122
structure_groups:
- group:
visibility: 1
id: A
res_index: 12..118, 243..291

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,14 @@
entities:
- protein:
id: G
sequence: 40..80
- file:
path: nup98.cif
include:
- chain:
id: A
res_index: 1..400
structure_groups:
- group:
visibility: 0
id: "all"

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,33 @@
entities:
- file:
path: 9d3d.cif
include:
- chain:
id: A
- chain:
id: B
- chain:
id: C
include_proximity:
- chain:
id: G
res_index: 106..118
radius: 30
binding_types:
- chain:
id: A
binding: 91,128,131
- chain:
id: B
binding: 91,128,131
- chain:
id: C
binding: 91,128,131
- protein:
id: E
sequence: 8..18
cyclic: True

4842
example/cyclotide/3ivq.cif Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,25 @@
# Design specification: a cyclic designed chain B with six fixed cysteines,
# designed against chain A of 3ivq.cif.
entities:
- protein:
id: B
# Numbers are counts of designed residues and each C is a fixed cysteine, so the
# cysteines sit at positions 4, 13, 20, 26, 30 and 32 of the 34-residue chain.
sequence: 3C8C6C5C3C1C2
cyclic: true
- file:
# Path is interpreted relative to the directory of this YAML file.
path: 3ivq.cif
include:
- chain:
id: A
structure_groups: "all"
constraints:
# Three SG-SG bonds that pair the six cysteines of chain B: 4-26, 13-30 and 20-32.
- bond:
atom1: [B, 4, SG]
atom2: [B, 26, SG]
- bond:
atom1: [B, 13, SG]
atom2: [B, 30, SG]
- bond:
atom1: [B, 20, SG]
atom2: [B, 32, SG]

4856
example/cyclotide/5wrd.cif Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,25 @@
# Design specification: a cyclic designed chain B with six fixed cysteines,
# designed against chain A of 5wrd.cif.
entities:
- protein:
id: B
# Numbers are counts of designed residues and each C is a fixed cysteine, so the
# cysteines sit at positions 4, 13, 20, 26, 30 and 32 of the 34-residue chain.
sequence: 3C8C6C5C3C1C2
cyclic: true
- file:
# Path is interpreted relative to the directory of this YAML file.
path: 5wrd.cif
include:
- chain:
id: A
structure_groups: "all"
constraints:
# Three SG-SG bonds that pair the six cysteines of chain B: 4-26, 13-30 and 20-32.
- bond:
atom1: [B, 4, SG]
atom2: [B, 26, SG]
- bond:
atom1: [B, 13, SG]
atom2: [B, 30, SG]
- bond:
atom1: [B, 20, SG]
atom2: [B, 32, SG]

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,19 @@
entities:
- protein:
id: B
sequence: 8..16
cyclic: true
- file:
path: 8jjs.cif
include:
- chain:
id: A
- chain:
id: C
binding_types:
- chain:
id: A
binding: 12,14,61,63,73,76,77,83,101,104,108

View File

@@ -0,0 +1,12 @@
entities:
- protein:
id: G
sequence: 40..120
- file:
path: zf.cif
include:
- chain:
id: C1
- chain:
id: B1

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,30 @@
entities:
- file:
path: zf.cif
include: "all"
exclude:
- chain:
id: A1
res_index: ..10,63..69,185..
design_insertions:
- insertion:
id: A1
res_index: 63
num_residues: 3..8
structure_groups:
- group:
visibility: 0
id: "all"
design:
- chain:
id: A1
res_index: 11..184
not_design:
- chain:
id: A1
res_index: 11..20,29,33,39..48,57,61,72..81,90,94,100..109,118,122,129..138,147,151,157..166,175,179
reset_res_index:
- chain:
id: A1

View File

@@ -0,0 +1,184 @@
entities:
- protein:
id: G
sequence: 15..20AAAAAAVTTTT18PPP # range between 15 and 20 inclusive on both sides
- protein:
id: R
sequence: 3..5C6C3 # Random number of design residues between 3 and 5, then a Cysteine, then 6 design residues, then ...
- ligand:
id: Q
ccd: WHL
- protein:
id: H
sequence: 17
secondary_structure: # No secondary structure specified, defaults
- file:
path: 7rpz.cif
include:
- chain:
id: A
- chain:
id: B
include_proximity:
- chain:
id: A
res_index: 10..16
radius: 35
binding_types:
- chain:
id: A
binding: 5..7,13
- chain:
id: B
not_binding: "all"
structure_groups:
- group:
visibility: 1
id: A
res_index: 10..16
- group:
visibility: 2
id: B
- group:
visibility: 0
id: A
res_index: 13
design:
- chain:
id: A
res_index: ..4,20..27
secondary_structure:
- chain:
id: A
loop: 1
helix: 2..3
sheet: 4
design_insertions:
- insertion:
id: A
res_index: 20 # The 20th residue will be a designed one (starting to count from 1)
num_residues: 2..9
secondary_structure: HELIX # One of UNSPECIFIED (default), LOOP, HELIX, SHEET.
- protein:
id: A
sequence: AAAAAAAAAAAAAAAAAAAAAAAA
binding_types: uuuuBBBuNNNuBuu # the missing specifications will be 'u' by default
- file:
path: 7rpz.cif
fuse: A
include:
- chain:
id: A
res_index: ..5
- protein:
id: B
sequence: AAAAAAAAAAAAAAAAAAAAAAAA
binding_types:
binding: 5..7,13
not_binding: 9..11
- ligand:
id: [C, D]
ccd: SAH
- ligand:
id: [E, F]
smiles: 'N[C@@H](Cc1ccc(O)cc1)C(=O)O'
binding_types: B
- file:
path: 7rpz.cif
include: "all"
exclude:
- chain:
id: A
res_index: ..5
structure_groups:
- group:
visibility: 1
id: "all"
- group:
visibility: 0
id: A
res_index: 10..16
- file:
path: 8r3a.cif
include:
- chain:
id: A
- chain:
id: B
binding_types:
- chain:
id: A
binding: 5..7,13
- chain:
id: B
not_binding: "all"
structure_groups:
- group:
visibility: 1
id: A
res_index: 10..13
- group:
visibility: 2
id: B
- group:
visibility: 0
id: A
res_index: 13
design:
- chain:
id: A
res_index: 14..19
secondary_structure:
- chain:
id: A
loop: 14
helix: 15..17
sheet: 19
- protein:
id: S
sequence: 10C6C3
- protein:
id: T
sequence: C10C6C3C
cyclic: true
constraints:
# specify connections as if the minimum possible number of residues was sampled
- bond:
atom1: [R, 4, SG] # connection for a helicon
atom2: [Q, 1, CK]
- bond:
atom1: [R, 11, SG]
atom2: [Q, 1, CH]
- bond:
atom1: [S, 11, SG] # connection for a disulfide bond
atom2: [S, 18, SG]
- bond:
atom1: [T, 12, SG]
atom2: [T, 19, SG]
- total_len:
min: 10
max: 20

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,26 @@
entities:
- protein:
id: B
sequence: 1C11..16C1
secondary_structure:
sheet: 1,3..11
- file:
path: 7nre.cif
include:
- chain:
id: A
res_index: 24..
binding_types:
- chain:
id: A
binding: 26..31,381,408
constraints:
- bond:
atom1: [B, 2, SG]
atom2: [B, 14, SG]

View File

@@ -0,0 +1,34 @@
entities:
- protein:
id: B
sequence: 1C11..16C1
secondary_structure:
sheet: 1,3..11
- file:
path: 7nre.cif
include:
- chain:
id: A
res_index: 24..
binding_types:
- chain:
id: A
binding: 26..31,381,408
include_proximity:
- chain:
id: A
res_index: 26..31,381,408
radius: 28
constraints:
- bond:
atom1: [B, 2, SG]
atom2: [B, 14, SG]

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,24 @@
# Design specification: a designed chain B with four fixed cysteines against
# chain A of 8wtw.cif, with residue 24 of chain A marked as binding.
entities:
- protein:
id: B
# Variable-length designed segments (1..3) separated by fixed cysteines.
# With every variable segment at its minimum length (1), the cysteines sit at
# positions 2, 3, 8 and 10.
sequence: 1..3CC4C1..3C1..3
- file:
# Path is interpreted relative to the directory of this YAML file.
path: 8wtw.cif
include:
- chain:
id: A
binding_types:
- chain:
id: A
binding: 24
constraints:
# Bond indices are given as if the minimum possible number of residues was
# sampled for each variable segment; the bonds pair the cysteines 2-10 and 3-8.
- bond:
atom1: [B, 2, SG]
atom2: [B, 10, SG]
- bond:
atom1: [B, 3, SG]
atom2: [B, 8, SG]

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,15 @@
entities:
- file:
path: 1g13.cif
include:
- chain:
id: A
- file:
path:
- ../nanobody_scaffolds/7eow.yaml
- ../nanobody_scaffolds/7xl0.yaml
- ../nanobody_scaffolds/8coh.yaml
- ../nanobody_scaffolds/8z8v.yaml

View File

@@ -0,0 +1,11 @@
entities:
- protein:
id: C
sequence: 80..140
- file:
path: 1g13.cif
include:
- chain:
id: A

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,15 @@
entities:
- file:
path: 1jqd.cif
include:
- chain:
id: A
- file:
path:
- ../nanobody_scaffolds/7eow.yaml
- ../nanobody_scaffolds/7xl0.yaml
- ../nanobody_scaffolds/8coh.yaml
- ../nanobody_scaffolds/8z8v.yaml

View File

@@ -0,0 +1,11 @@
entities:
- protein:
id: C
sequence: 80..140
- file:
path: 1jqd.cif
include:
- chain:
id: A

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,15 @@
entities:
- file:
path: 1nb0.cif
include:
- chain:
id: A
- file:
path:
- example/nanobody_scaffolds/7eow.yaml
- ../nanobody_scaffolds/7xl0.yaml
- ../nanobody_scaffolds/8coh.yaml
- ../nanobody_scaffolds/8z8v.yaml

View File

@@ -0,0 +1,11 @@
entities:
- protein:
id: C
sequence: 80..140
- file:
path: 1nb0.cif
include:
- chain:
id: A

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,15 @@
# Design specification combining chain A of 1nb0.cif with a list of nanobody
# scaffold specifications.
# File references are interpreted relative to the directory of this YAML file,
# so the scaffold specs are addressed via ../nanobody_scaffolds/ (the first
# entry previously used a repository-root-relative path, unlike its siblings).
entities:
  # Target structure: only chain A is included.
  - file:
      path: 1nb0.cif
      include:
        - chain:
            id: A
  # Scaffold specifications, given as a list of YAML files.
  - file:
      path:
        - ../nanobody_scaffolds/7eow.yaml
        - ../nanobody_scaffolds/7xl0.yaml
        - ../nanobody_scaffolds/8coh.yaml
        - ../nanobody_scaffolds/8z8v.yaml

View File

@@ -0,0 +1,11 @@
entities:
- protein:
id: C
sequence: 80..140
- file:
path: 2a1x.cif
include:
- chain:
id: A

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,15 @@
entities:
- file:
path: 2pny.cif
include:
- chain:
id: A
- file:
path:
- ../nanobody_scaffolds/7eow.yaml
- ../nanobody_scaffolds/7xl0.yaml
- ../nanobody_scaffolds/8coh.yaml
- ../nanobody_scaffolds/8z8v.yaml

View File

@@ -0,0 +1,11 @@
entities:
- protein:
id: C
sequence: 80..140
- file:
path: 2pny.cif
include:
- chain:
id: A

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,15 @@
entities:
- file:
path: 3apu.cif
include:
- chain:
id: A
- file:
path:
- ../nanobody_scaffolds/7eow.yaml
- ../nanobody_scaffolds/7xl0.yaml
- ../nanobody_scaffolds/8coh.yaml
- ../nanobody_scaffolds/8z8v.yaml

View File

@@ -0,0 +1,11 @@
entities:
- protein:
id: C
sequence: 80..140
- file:
path: 3apu.cif
include:
- chain:
id: A

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,15 @@
entities:
- file:
path: 3ch4.cif
include:
- chain:
id: A
- file:
path:
- ../nanobody_scaffolds/7eow.yaml
- ../nanobody_scaffolds/7xl0.yaml
- ../nanobody_scaffolds/8coh.yaml
- ../nanobody_scaffolds/8z8v.yaml

View File

@@ -0,0 +1,11 @@
entities:
- protein:
id: C
sequence: 80..140
- file:
path: 3ch4.cif
include:
- chain:
id: A

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,15 @@
entities:
- file:
path: 3qkg.cif
include:
- chain:
id: A
- file:
path:
- ../nanobody_scaffolds/7eow.yaml
- ../nanobody_scaffolds/7xl0.yaml
- ../nanobody_scaffolds/8coh.yaml
- ../nanobody_scaffolds/8z8v.yaml

View File

@@ -0,0 +1,11 @@
entities:
- protein:
id: C
sequence: 80..140
- file:
path: 3qkg.cif
include:
- chain:
id: A

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,15 @@
entities:
- file:
path: 6m1u.cif
include:
- chain:
id: A
- file:
path:
- ../nanobody_scaffolds/7eow.yaml
- ../nanobody_scaffolds/7xl0.yaml
- ../nanobody_scaffolds/8coh.yaml
- ../nanobody_scaffolds/8z8v.yaml

View File

@@ -0,0 +1,11 @@
entities:
- protein:
id: C
sequence: 80..140
- file:
path: 6m1u.cif
include:
- chain:
id: A

10556
example/hard_targets/7aah.cif Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,15 @@
entities:
- file:
path: 7aah.cif
include:
- chain:
id: A
- file:
path:
- ../nanobody_scaffolds/7eow.yaml
- ../nanobody_scaffolds/7xl0.yaml
- ../nanobody_scaffolds/8coh.yaml
- ../nanobody_scaffolds/8z8v.yaml

View File

@@ -0,0 +1,12 @@
entities:
- protein:
id: C
sequence: 80..140
- file:
path: 7aah.cif
include:
- chain:
id: A

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,35 @@
entities:
- protein:
id: G
sequence: 3..7C6C3..7
- ligand:
id: F
ccd: WHL
- file:
path: 3mrp.cif
include:
- chain:
id: A # MHC
- chain:
id: C # Peptide
- chain:
id: B # other part of MHC
binding_types:
- chain:
id: C
binding: "all"
structure_groups: "all"
constraints:
- bond:
atom1: [G, 4, SG] # residue indices are specified as if every variable-length segment took its minimum possible length (here 3, so the first Cys is residue 4)
atom2: [F, 1, CK]
- bond:
atom1: [G, 11, SG]
atom2: [F, 1, CH]

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,13 @@
entities:
- file:
path: 9bkq-assembly2.cif # penguinpox target
include:
- chain:
id: B
- file:
path:
- ../nanobody_scaffolds/7eow.yaml
- ../nanobody_scaffolds/7xl0.yaml
- ../nanobody_scaffolds/8coh.yaml
- ../nanobody_scaffolds/8z8v.yaml

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,59 @@
path: 7eow.cif
include:
- chain:
id: B
design:
- chain:
id: B
res_index: 26..34,52..59,98..118
structure_groups:
- group:
id: B
visibility: 2
- group:
id: B
visibility: 0
res_index: 26..34,52..59,98..118
# Flexible lengths for CDR 1
exclude:
- chain:
id: B
res_index: 26..28 # take out 3
design_insertions:
- insertion:
id: B
res_index: 26 # The res_index'th residue will be a designed one (starting to count from 1)
num_residues: 1..5
# Flexible lengths for CDR 2
exclude:
- chain:
id: B
res_index: 52..54 # take out 3
design_insertions:
- insertion:
id: B
res_index: 52 # The res_index'th residue will be a designed one (starting to count from 1)
num_residues: 1..5
# Flexible lengths for CDR 3
exclude:
- chain:
id: B
res_index: 98..104 # take out 7
design_insertions:
- insertion:
id: B
res_index: 98 # The res_index'th residue will be a designed one (starting to count from 1)
num_residues: 1..14
# reindex the residue index which is used in the positional encoding
reset_res_index:
- chain:
id: B

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,59 @@
path: 7xl0.cif
include:
- chain:
id: A
design:
- chain:
id: A
res_index: 26..33,51..57,97..110
structure_groups:
- group:
id: A
visibility: 2
- group:
id: A
visibility: 0
res_index: 26..33,51..57,97..110
# Flexible lengths for CDR 1
exclude:
- chain:
id: A
res_index: 26..28 # take out 3
design_insertions:
- insertion:
id: A
res_index: 26 # The res_index'th residue will be a designed one (starting to count from 1)
num_residues: 1..5
# Flexible lengths for CDR 2
exclude:
- chain:
id: A
res_index: 51..53 # take out 3
design_insertions:
- insertion:
id: A
res_index: 51 # The res_index'th residue will be a designed one (starting to count from 1)
num_residues: 1..5
# Flexible lengths for CDR 3
exclude:
- chain:
id: A
res_index: 97..102 # take out 6
design_insertions:
- insertion:
id: A
res_index: 97 # The res_index'th residue will be a designed one (starting to count from 1)
num_residues: 1..12
# reindex the residue index which is used in the positional encoding
reset_res_index:
- chain:
id: A

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,60 @@
path: 8coh.cif
include:
- chain:
id: A
res_index: ..126
design:
- chain:
id: A
res_index: 26..33,51..58,97..115
structure_groups:
- group:
id: A
visibility: 2
- group:
id: A
visibility: 0
res_index: 26..33,51..58,97..115
# Flexible lengths for CDR 1
exclude:
- chain:
id: A
res_index: 26..28 # take out 3
design_insertions:
- insertion:
id: A
res_index: 26 # The res_index'th residue will be a designed one (starting to count from 1)
num_residues: 1..5
# Flexible lengths for CDR 2
exclude:
- chain:
id: A
res_index: 51..53 # take out 3
design_insertions:
- insertion:
id: A
res_index: 51 # The res_index'th residue will be a designed one (starting to count from 1)
num_residues: 1..5
# Flexible lengths for CDR 3
exclude:
- chain:
id: A
res_index: 97..103 # take out 7
design_insertions:
- insertion:
id: A
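res_index: 98 # The res_index'th residue will be a designed one (starting to count from 1). NOTE(review): the CDR 3 exclude for this scaffold starts at 97, whereas the other scaffolds insert at the first excluded index - confirm that 98 is intended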
num_residues: 1..14
# reindex the residue index which is used in the positional encoding
reset_res_index:
- chain:
id: A

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,60 @@
path: 8z8v.cif
include:
- chain:
id: B
design:
- chain:
id: B
res_index: 26..33,51..58,98..108
structure_groups:
- group:
id: B
visibility: 2
- group:
id: B
visibility: 0
res_index: 26..33,51..58,98..108
# Flexible lengths for CDR 1
exclude:
- chain:
id: B
res_index: 26..28 # take out 3
design_insertions:
- insertion:
id: B
res_index: 26 # The res_index'th residue will be a designed one (starting to count from 1)
num_residues: 1..5
# Flexible lengths for CDR 2
exclude:
- chain:
id: B
res_index: 51..53 # take out 3
design_insertions:
- insertion:
id: B
res_index: 51 # The res_index'th residue will be a designed one (starting to count from 1)
num_residues: 1..5
# Flexible lengths for CDR 3
exclude:
- chain:
id: B
res_index: 98..100 # take out 3
design_insertions:
- insertion:
id: B
res_index: 98 # The res_index'th residue will be a designed one (starting to count from 1)
num_residues: 1..12
# reindex the residue index which is used in the positional encoding
reset_res_index:
- chain:
id: B

View File

@@ -0,0 +1,24 @@
entities:
- protein:
id: B
sequence: 12..20
- file:
path: cryptochrome4_european_robin_bird_boltz_prediction.cif
include:
- chain:
id: A
structure_groups:
- group:
visibility: 1
id: A
- group:
visibility: 0
id: A
res_index: 494..
binding_types:
- chain:
id: A
binding: 494..507

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,16 @@
entities:
- protein:
id: P
sequence: 5..20
- file:
path: 6wj3.cif
include:
- chain:
id: G
binding_types:
- chain:
id: G
binding: 190,193,194,258,259,262,263,205,214,215,216,217,218,219,220,221,222,232,236,239,278,279,280,281,282,283,284,285,286,240,245,246,249,250,253,254,256,257,261,262

View File

@@ -0,0 +1,16 @@
entities:
- protein:
id: P
sequence: 10..22
- file:
path: 6wj3.cif
include:
- chain:
id: G
binding_types:
- chain:
id: G
binding: 190,193,194,258,259,262,263,205,214,215,216,217,218,219,220,221,222

View File

@@ -0,0 +1,16 @@
entities:
- protein:
id: P
sequence: 10..22
- file:
path: 6wj3.cif
include:
- chain:
id: G
binding_types:
- chain:
id: G
binding: 232,236,239,278,279,280,281,282,283,284,285,286,240,245,246,249,250,253,254,256,257,261,262

View File

@@ -0,0 +1,7 @@
entities:
- protein:
id: A
sequence: 140..180
- ligand:
id: B
smiles: "C1CNC[C@@H]1OC2=C(C=C(C=C2NC(=O)C3=CC(=NC=N3)C(=O)NC4=CC(=CC(=C4O[C@@H]5CCNC5)NC(=O)CCCCN=C(N)N)C(F)(F)F)C(F)(F)F)NC(=O)CCCCN=C(N)N"

View File

@@ -0,0 +1,7 @@
entities:
- protein:
id: A
sequence: 140..180
- ligand:
id: B
ccd: TSA

View File

@@ -0,0 +1,7 @@
entities:
- protein:
id: A
sequence: 140..180
- ligand:
id: B
ccd: RPB

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,22 @@
entities:
- protein:
id: C
sequence: 8..18
cyclic: true
- file:
path: 1mk5.cif
include:
- chain:
id: A
structure_groups:
- group:
id: A
visibility: 1
- group:
id: A
visibility: 0
res_index: 32..42

View File

@@ -0,0 +1,24 @@
entities:
- protein:
id: C
sequence: 1..5C6C1..5
- file:
path: 1mk5.cif
include:
- chain:
id: A
structure_groups:
- group:
id: A
visibility: 1
- group:
id: A
visibility: 0
res_index: 32..42
constraints:
- bond:
atom1: [C, 2, SG]
atom2: [C, 9, SG]

Some files were not shown because too many files have changed in this diff Show More