first first commit

2026-06-04 11:54:23 +08:00 · 2025-10-26 20:27:38 +00:00
commit ff9964a539
221 changed files with 384768 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,212 @@
+# Individual project files
+scripts/bindcraft/utils.py
+debug.txt
+.DS_Store
+
+# Notebooks: ignore every notebook, with the exceptions negated below
+*.ipynb
+!filter.ipynb
+!similarity.ipynb
+notebooks/
+
+# Editor / IDE settings
+.vscode
+.idea
+
+# Working directories, caches and run outputs
+cache
+outputs/
+results
+samples/
+slurm_out
+small_data
+tmp
+.tmp_wandb
+wandb
+workbench
+workdir
+workdir_fold
+
+# Checkpoints, tensors, pickles and job logs
+*.ckpt
+*.out
+*.pkl
+*.pt
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# ruff
+.ruff_cache/
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+
+# Scratch dir
+scratch
+
+# pt files
+*.pt
+
+# Project data sets and generated analysis artifacts.
+# `data/PepPC*` also matches `data/PepPC/`, so the first entry is redundant.
+data/PepPC/
+data/PepPC*
+*.err
+*.csv
+data/test_set
+visualization_data_designed_only
+visualization_results_designed_only
+# NOTE(review): this re-ignores similarity.ipynb and therefore overrides the
+# earlier `!similarity.ipynb` exception (the last matching pattern wins) -
+# confirm which of the two is intended.
+similarity.ipynb
--- a/63
+++ b/63
@@ -0,0 +1,63 @@
+# syntax=docker/dockerfile:1
+
+# CUDA 12.2 / cuDNN 8 development image on Ubuntu 22.04.
+FROM nvidia/cuda:12.2.2-cudnn8-devel-ubuntu22.04
+
+# Build-time only: stops apt/debconf from prompting during the RUN steps below
+# without leaking the setting into the environment of running containers.
+ARG DEBIAN_FRONTEND=noninteractive
+
+# Environment shared by the build steps and the running container:
+#   PIP_NO_CACHE_DIR        - pip does not keep a download cache in the image
+#   PYTHONDONTWRITEBYTECODE - Python does not write .pyc files
+#   PYTHONUNBUFFERED        - stdout/stderr are not buffered
+#   CUDA_HOME               - location of the CUDA toolkit in this base image
+#   PIP_EXTRA_INDEX_URL     - additional pip index serving the cu121 PyTorch wheels
+#   HF_HOME                 - cache directory; also used below as the download target
+ENV PIP_NO_CACHE_DIR=1 \
+    PYTHONDONTWRITEBYTECODE=1 \
+    PYTHONUNBUFFERED=1 \
+    CUDA_HOME=/usr/local/cuda \
+    PIP_EXTRA_INDEX_URL=https://download.pytorch.org/whl/cu121 \
+    HF_HOME=/cache
+
+# System packages: Python 3.10 with headers and pip, a C/C++ toolchain, and
+# the development libraries listed below. The apt package index is removed in
+# the same layer so it does not end up in the image.
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends \
+        build-essential \
+        cmake \
+        git \
+        libboost-all-dev \
+        libffi-dev \
+        libgl1 \
+        libhdf5-dev \
+        libssl-dev \
+        libxml2-dev \
+        libxslt-dev \
+        pkg-config \
+        python3-pip \
+        python3-venv \
+        python3-wheel \
+        python3.10 \
+        python3.10-dev \
+    && rm -rf /var/lib/apt/lists/*
+
+# Register Python 3.10 as the `python3` and `python` alternatives, then bring
+# the packaging tools up to date before the project itself is installed.
+# NOTE(review): the README asks for Python >=3.11 while this image provides
+# 3.10 - confirm against the project's declared Python requirement.
+RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1 \
+    && update-alternatives --install /usr/bin/python python /usr/bin/python3.10 1 \
+    && python -m pip install --upgrade \
+        pip \
+        setuptools \
+        setuptools_scm \
+        wheel
+
+# Copy the whole build context to /app and install it in editable mode, so the
+# installed package refers to the sources under /app.
+WORKDIR /app
+
+COPY . .
+
+RUN pip install --no-cache-dir -e .
+
+# Unprivileged runtime account; name and IDs can be overridden with --build-arg.
+ARG USERNAME=boltzgen
+ARG USER_UID=1000
+ARG USER_GID=1000
+
+RUN groupadd --gid ${USER_GID} ${USERNAME} && \
+    useradd --uid ${USER_UID} --gid ${USER_GID} --create-home --shell /bin/bash ${USERNAME}
+
+# Optionally bake the model weights into the image:
+#   docker build --build-arg DOWNLOAD_WEIGHTS=true .
+# WARNING: build args are recorded in the image history, so a token passed via
+# HF_TOKEN is visible to anyone who can inspect the image.
+ARG DOWNLOAD_WEIGHTS=false
+ARG HF_TOKEN=""
+
+# Create the cache directory, optionally fill it, and hand it to the runtime
+# user in ONE layer: a chown in a later layer would store a second copy of
+# every downloaded file in the image.
+# The invocation follows the documented CLI usage:
+#   boltzgen download [--force_download] [--models_token TOKEN] [--cache CACHE] {...,all}
+# --models_token is only passed when HF_TOKEN is non-empty, so the CLI default
+# stays in effect otherwise.
+RUN mkdir -p "${HF_HOME}" && \
+    if [ "${DOWNLOAD_WEIGHTS}" = "true" ]; then \
+        boltzgen download all \
+            --cache "${HF_HOME}" \
+            --force_download \
+            ${HF_TOKEN:+--models_token "${HF_TOKEN}"}; \
+    fi && \
+    chown -R ${USER_UID}:${USER_GID} "${HF_HOME}"
+
+# Everything from here on, including the container process, runs as the
+# unprivileged user created above instead of root.
+USER ${USERNAME}
+# Default working directory of the container process.
+WORKDIR /workspace
+
+# `boltzgen` is the container's executable; arguments passed to `docker run`
+# after the image name replace the default `--help`.
+ENTRYPOINT ["boltzgen"]
+CMD ["--help"]
--- a/21
+++ b/21
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2025 Hannes Stärk
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
--- a/README.md
+++ b/README.md
@@ -0,0 +1,577 @@
+
+<div align="center">
+  <div>&nbsp;</div>
+  <img src="assets/boltzgen.png" alt="BoltzGen logo" width="60%">
+
+[Paper](https://hannes-stark.com/assets/boltzgen.pdf) | 
+[Slack](https://boltz.bio/join-slack) <br> <br>
+ ![alt text](assets/cover.png)
+</div>
+
+# Installation
+In an environment with python >=3.11:
+```bash
+pip install boltzgen
+```
+
+<details>
+  <summary style="font-size: 1.3em; font-weight: 600;">
+    Click for detailed installation instructions
+  </summary>
+
+### 1 - Install Miniconda
+
+Choose the installer for your operating system, download it, and follow the on-screen prompts:
+
+* **Windows:** <https://www.anaconda.com/docs/getting-started/miniconda/install#windows-installation>
+* **macOS / Linux:** <https://www.anaconda.com/docs/getting-started/miniconda/install#macos-linux-installation>
+
+After installation, **open a terminal / command prompt** (you may need to search for “Anaconda Prompt” on Windows).
+
+### 2 - Create a Miniconda Python environment
+
+Run the command below in a terminal to create a fresh environment called `bg` with Python 3.12:
+
+```bash
+conda create -n bg python=3.12
+```
+
+### 3 - Activate the environment (do this every time you work with BoltzGen)
+
+```bash
+conda activate bg
+```
+
+> If you open a **new** terminal session later, you must run `conda activate bg` again before using BoltzGen.
+
+### 4 - Install BoltzGen from source
+
+Download the BoltzGen repository, change directory into the boltzgen directory, and install BoltzGen from source:
+
+```bash
+git clone https://github.com/HannesStark/boltzgen
+cd boltzgen
+pip install -e .
+```
+</details>
+
+<details>
+  <summary style="font-size: 1.3em; font-weight: 600;">
+    Click for optional Docker instructions if you prefer Docker
+  </summary>
+
+To build and run the docker image:
+
+```bash
+# Build
+docker build -t boltzgen .
+
+# Run an example
+mkdir -p workdir  # output
+mkdir -p cache    # where models will be downloaded to
+docker run --rm --gpus all -v "$(realpath workdir)":/workdir -v "$(realpath cache)":/cache -v "$(realpath example)":/example \
+  boltzgen run /example/vanilla_protein/1g13prot.yaml --output /workdir/test \
+  --protocol protein-anything \
+  --num_designs 2
+```
+
+In the example above, the model weights are downloaded the first time the image is run. To bake the weights into the image at build time, run:
+
+```bash
+docker build -t boltzgen:weights --build-arg DOWNLOAD_WEIGHTS=true .
+```
+</details>
+<br>
+
+
+# Running BoltzGen
+![alt text](assets/fig1.png)
+
+
+`boltzgen run` takes a [design specification](#how-to-make-a-design-specification-yaml) `.yaml` and produces a set of ranked designs.\
+⚠️ It downloads models (~6GB) to `~/.cache`. This can be changed by passing `--cache YOUR_PATH` or by setting `$HF_HOME`.\
+⚠️ If your run is ever interrupted, you can restart it with `--reuse`. No progress is lost.
+
+
+```bash
+boltzgen run example/vanilla_protein/1g13prot.yaml \
+  --output workbench/test_run \
+  --protocol protein-anything \
+  --num_designs 10 \
+  --budget 2
+# --num_designs is the number of intermediate designs. In practice you will want between 10,000 - 60,000
+# --budget is how many designs should be in the final diversity optimized set
+```
+All command line args are explained in ["All Command Line Arguments"](#all-command-line-arguments).\
+**Step-by-step guide for making your designs:**
+1. Make your `.yaml` file that specifies your target and what you want to design. We provide many examples in
+   `example` such as `example/vanilla_peptide_with_target_binding_site/beetletert.yaml`. Details in
+   ["How to make a design specification .yaml"](#how-to-make-a-design-specification-yaml).
+2. Check whether your design specification is as intended.  
+   1. Run `boltzgen check example/vanilla_peptide_with_target_binding_site/beetletert.yaml`.  
+   2. Visualize the resulting mmcif file in a protein structure viewer (e.g. PyMOL, Chimera, or online: https://molstar.org/viewer/).
+   3. Your viewer should show the binding site in a different color than the rest of the target.
+3. Run the `boltzgen run ...` command as above on your `.yaml` file. 
+4. Your filtered, ranked set of designs will be in `--output`. <img src="assets/fig_seconds_per_design.png" alt="Seconds per design" align="right" width="35%">
+5. You likely want to rerun the filtering step with different settings (takes ~15 sec). Use
+   `boltzgen run --steps filtering --output ...` or the Jupyter notebook `filter.ipynb` which is often more convenient.
+   Detailed explanation in ["Rerunning the Filtering"](#rerunning-the-filtering-recommended).
+
+**How many designs to generate?** \
+More is better. The "minimum" depends on your target.  
+BoltzGen should be run on a GPU. On the right you can see the time required for each step in the pipeline for a single design on an A100 GPU.
+
+We suggest first running with e.g. `--num_designs 50`, checking that everything behaves as desired, and then increasing `--num_designs` to between 10,000 - 60,000.
+
+## Pipeline output
+When the pipeline completes your output directory will have:
+- `config/`, `steps.yaml`: configuration files.
+- `intermediate_designs/`: output of design step
+  - `/*.cif` and `/*.npz`: CIF and NPZ (metadata files) for the designed proteins and targets before inverse folding
+- `intermediate_designs_inverse_folded/`: output of inverse folding, folding, and analysis steps
+  - `/*.cif` and `/*.npz` : CIF and NPZ for designed proteins and targets after inverse folding.  *Note: For designed residues, only the backbone atoms will have coordinates (sidechain coordinates will be 0,0,0).*
+  - `/refold_cif`: refolded complex structures (target and binder). This is the primary input to the analysis and filtering steps.
+  - `/refold_design_cif`: refolded binder structures, without target.
+  - `/aggregate_metrics_analyze.csv`, `/per_target_metrics_analyze.csv` — outputs of the analysis step.
+- `final_ranked_designs/` : outputs of the filtering step
+  - `/intermediate_ranked_<N>_designs/` — top-N quality designs. CIFs are copied from `refold_cif` above.
+  - `/final_<budget>_designs/` — quality + diversity set. CIFs copied from `refold_cif/`.
+  - `/all_designs_metrics.csv` — metrics for all designs considered by filtering.
+  - `/final_designs_metrics_<budget>.csv` — metrics for the selected final set.
+  - `/results_overview.pdf` — plots
+
+# Protocols 
+
+| Protocol (design-target) | Appropriate for                                                           | Major config differences        |
+|--------------------------|---------------------------------------------------------------------------|------------------------|
+| protein-anything         | Design proteins to bind proteins or peptides                              | Includes `design folding` step. |
+| peptide-anything         | Design peptides (including helicons, cyclic peptides) to bind proteins | No Cys are generated in inverse folding. No `design folding` step. Don't compute largest hydrophobic patch. |
+| protein-small_molecule   | Design proteins to bind small molecules                                | Includes binding affinity prediction. Includes `design folding` step. |
+| nanobody-anything        | Design nanobodies (single-domain antibodies)                           | No Cys are generated in inverse folding. No `design folding` step. Don't compute largest hydrophobic patch. |
+
+All configuration parameters can be overridden using the `--config` option; see `boltzgen run --help` or the `Advanced Users` section below for details.
+
+
+
+# How to make a design specification .yaml
+A more detailed explanation of how our <code>.yaml</code> design specification files work is in <a href="example/README.md" target="_blank">example/README.md</a>. Below is an example based explanation, which is sufficient for most tasks.
+
+**IMPORTANT:** ⚠️ All residue indices are specified **starting at 1** and we use the canonical mmcif residue index `label_seq_id`, and **not** the `auth_seq_id` author residue index! 
+You can check the indexing in your mmcif file by opening it in https://molstar.org/viewer/, hovering over a residue, and checking the index on the bottom right. You will see something like this where **41 is the index we use, the auth id 22 is incorrect**:
+
+![](assets/label_seq_id.png)
+
+After you constructed your `.yaml` file we recommend that you run the `check` command on it:
+1. Run `boltzgen check example/vanilla_peptide_with_target_binding_site/beetletert.yaml`.  
+2. Visualize the resulting mmcif file in a protein structure viewer (e.g. PyMOL, Chimera, or online: https://molstar.org/viewer/).
+3. Your viewer should show the binding site in a different color than the rest of the target. 
+
+
+## Example based explanation:
+We provide many example `.yaml` files in the `example/` directory, including:
+
+- `example/design_spec_showcasing_all_functionalities.yaml`
+- `example/vanilla_peptide_with_target_binding_site/beetletert.yaml`
+- `example/peptide_against_specific_site_on_ragc/rragc.yaml`
+- `example/nanobody_against_penguinpox/penguinpox.yaml`
+- `example/denovo_zinc_finger_against_dna/zinc_finger.yaml`
+- `example/protein_binding_small_molecule/chorismite.yaml`
+
+Small example of a protein design against a target protein without binding site specified:
+```yaml
+entities:
+  # Designed protein with between 80 and 140 residues 
+  # (The length is randomly sampled)
+  - protein: 
+      id: B
+      sequence: 80..140
+
+  # The target is extracted from a .cif file
+  - file:
+      # file references are relative to the location of the .yaml file
+      path: 6m1u.cif # .pdb files also work
+
+      # Which chain in the .cif file to use as target (uses all chains if unspecified)
+      include: 
+        - chain:
+            id: A
+```
+
+**IMPORTANT:** ⚠️ File references inside a yaml file (e.g. to cif files) are interpreted relative to the directory of the yaml file.
+
+
+Example highlighting many (not all) functionalities:
+```yaml
+entities:
+  # Specification of the target which is extracted from a .cif file
+  - file:
+      path: 8r3a.cif # .pdb files also work
+      
+      # Which chain and residues in the .cif file to use as target (uses all chains if unspecified)
+      include: 
+        - chain:
+            id: A
+            res_index: 2..50,55.. # residues between 2 and 50 and anything larger than 55
+        - chain:
+            id: B
+
+      # Which regions of the target the design should or should NOT
+      # bind to (this can be left unspecified, then we just bind anywhere)
+      binding_types:
+        - chain:
+            id: A
+            binding: 5..7,13
+        - chain:
+            id: B
+            not_binding: "all" 
+      
+      # Which regions of the target should have their structure specified.
+      # By default, everything is visibility 1 which means that the structure is specified.
+      # If the visibility is 0, then the structure is not specified.
+      structure_groups:
+        - group:
+            visibility: 1
+            id: A
+            res_index: 10..13
+        - group:
+            # The relative positioning of things in structure group 2
+            # is not specified w.r.t. things in structure group 1
+            visibility: 2 
+            id: B
+        # Overwrite the previous visibility setting and set it to 0 for res_index 13
+        - group:
+            visibility: 0
+            id: A
+            res_index: 13 
+
+      # Optionally you can say that some residues in a loaded .cif file should also be redesigned.
+      design:
+        - chain:
+            id: A
+            res_index: 14..19
+
+      # For designed regions you can say what secondary structure they should have
+      secondary_structure:
+        - chain:
+            id: A
+            loop: 14
+            helix: 15..17
+            sheet: 19
+
+  # Specify a NON-designed protein chain
+  - protein: 
+      id: X
+      sequence: AAVTTTTPPP
+
+  # Specify a designed protein chain 
+  # Numbers specify what is being designed
+  - protein: 
+      id: G
+      # random number between 15 and 20 of designed residues (inclusive)
+      sequence: 15..20AAAAAAVTTTT18PPP 
+
+  # A designed helicon 
+  # (see the constraints below that connect the peptide with the WHL ligand)
+  - protein: 
+      id: R
+      # Random number of design residues between 3 and 5,
+      # then a cysteine, then 6 design residues, then ...
+      sequence: 3..5C6C3 
+  - ligand:
+      id: Q
+      ccd: WHL
+  
+  # A designed peptide with 17 residues
+  - protein:
+      id: H
+      sequence: 17
+
+  # specification for a designed peptide with two Cys and a disulfide bond (see constraints)
+  - protein:
+      id: S
+      sequence: 10..14C6C3
+
+constraints:
+    # specify connections as if the minimum possible number of residues was sampled
+  - bond:
+      atom1: [R, 4, SG] # connection for a helicon between small molecule and designed peptide
+      atom2: [Q, 1, CK]
+  - bond:
+      atom1: [R, 11, SG] # connection for a helicon between small molecule and designed peptide
+      atom2: [Q, 1, CH]
+  - bond:
+      atom1: [S, 11, SG] # connection for a disulfide bond between Cys and Cys in designed peptide
+      atom2: [S, 18, SG]
+
+```
+
+
+# Running only specific pipeline steps
+
+You can run only specific parts of the pipeline using the `--steps` flag:
+
+**Run only the design and inverse_folding steps:**
+```bash
+boltzgen run example/cyclotide/3ivq.yaml \
+  --output workbench/partial-run \
+  --protocol peptide-anything \
+  --steps design inverse_folding \
+  --num_designs 2
+```
+
+**Available steps:**
+- `design` - Generate `--num_designs` candidates using the diffusion model based on your design specification
+- `inverse_folding` - Redesign sequences from the previous step using our inverse folding model
+- `folding` - Re-fold the designed binders with their targets using Boltz-2 model
+- `design_folding` - Re-fold the designed binders alone without target (disabled for peptide and nanobody binders)
+- `affinity` - Predict binding affinity between designed proteins and their target small molecules using Boltz-2 (for design of small molecule binders only)
+- `analysis` - Analyze the folded structures using various metrics to assess design quality
+- `filtering` - Filter and rank designs based on analysis results to select the best candidates
+
+
+# Rerunning the filtering (recommended)
+After you generate designs, you will probably want to rerun the filtering step (which runs very fast) several times to tune your criteria for selecting good ones.
+
+You can run the filtering step either using the `boltzgen` command or 
+using a [jupyter notebook](filter.ipynb) that we provide. In most cases the notebook is more convenient. If you'd prefer to use the command-line, here is an example of re-running the filters without the notebook.
+
+First, suppose we initially generated some designs with default filtering options:
+
+```bash
+boltzgen run example/binding_disordered_peptides/tpp4.yaml \
+  --output workbench/tpp4 \
+  --protocol protein-anything \
+  --num_designs 20
+```
+
+After this runs we see that only a few designs passed our filters. We might now adjust the filters by running:
+
+```
+boltzgen run example/binding_disordered_peptides/tpp4.yaml \
+  --output workbench/tpp4 \
+  --protocol protein-anything \
+  --steps filtering \
+  --refolding_rmsd_threshold 3.0 \
+  --filter_biased=false \
+  --additional_filters 'ALA_fraction<0.3' 'filter_rmsd_design<2.5' \
+  --metrics_override plip_hbonds_refolded=4 \
+  --alpha 0.2
+```
+
+# All command line arguments
+
+## `boltzgen run`
+The `boltzgen run` command executes the BoltzGen binder design pipeline. Here are all available options:
+
+### Design Specification
+- `design_spec` - Path(s) to design specification YAML file(s), or a directory containing prepared configs
+
+### General Configuration
+- `--protocol {protein-anything,peptide-anything,protein-small_molecule,nanobody-anything}` - Protocol to use for the design. This determines default settings and in some cases what steps are run. Default: protein-anything. See [Protocols](#protocols) section for details.
+- `--output OUTPUT` - Output directory for pipeline results
+- `--config CONFIG [CONFIG ...]` - Override pipeline step configuration, in format `<step_name> <arg1>=<value1> <arg2>=<value2> ...` (example: `--config folding num_workers=4 trainer.devices=4`). Can be used multiple times.
+- `--devices DEVICES` - Number of devices to use. Default is all devices available.
+- `--num_workers NUM_WORKERS` - Number of DataLoader worker processes.
+- `--config_dir CONFIG_DIR` - Path to the directory of default config files. Default: `/home/bizon/git/foldeverything/config`
+- `--use_kernels {auto,true,false}` - Whether to use kernels. One of 'auto', 'true', or 'false'. Default: auto. If 'auto', will use kernels if the device capability is >= 8.
+- `--moldir MOLDIR` - Path to the moldir. Default: `huggingface:boltzgen/inference-data:mols.zip`
+
+### Design
+- `--num_designs NUM_DESIGNS` - Number of total designs to generate. This commonly would be something like 10,000. After generating 10,000 designs we then filter down to `--budget` many designs in the filter step
+- `--diffusion_batch_size DIFFUSION_BATCH_SIZE` - Number of diffusion samples to generate per trunk run. If not specified, this defaults to 1 if `--num_designs` is less than 100, and 10 otherwise. Note that for design tasks that randomly sample the binder length (or use randomness in other ways), all designs generated in the same batch will share the same length. Having a large diffusion batch size compared to the total number of designs to generate will therefore not evenly sample the possible lengths.
+- `--design_checkpoints DESIGN_CHECKPOINTS [DESIGN_CHECKPOINTS ...]` - Path to the boltzgen checkpoint(s). One or more checkpoints are supported. Just specifying an individual path here will work. Each will be used for an equal fraction of the designs. By default, two checkpoints are used. Default: `['huggingface:boltzgen/boltzgen1_diverse:boltzgen1_diverse.ckpt', 'huggingface:boltzgen/boltzgen1_adherence:boltzgen1_adherence.ckpt']`
+- `--step_scale STEP_SCALE` - Fixed step scale to use (e.g. 1.8). Default is to use a schedule
+- `--noise_scale NOISE_SCALE` - Fixed noise scale to use (e.g. 0.98). Default is to use a schedule
+
+### Inverse Folding
+- `--skip_inverse_folding` - Skip inverse folding step
+- `--inverse_fold_num_sequences INVERSE_FOLD_NUM_SEQUENCES` - Number of sequences per backbone to generate in the inverse fold step. Default: 1
+- `--inverse_fold_checkpoint INVERSE_FOLD_CHECKPOINT` - Path or huggingface repo and filename for the inverse fold checkpoint. Default: `huggingface:boltzgen/boltzgen1_ifold:boltzgen1_ifold.ckpt`
+- `--inverse_fold_avoid INVERSE_FOLD_AVOID` - Disallowed residues as a string of one letter amino acid codes, e.g. 'KEC'. This is implemented at the inverse fold step, so it only affects results if inverse folding is enabled. Default: none for protein design, 'C' for peptide and nanobody design. Pass an empty list if you want cysteines to be generated when you are using a nanobody or peptide protocol.
+- `--only_inverse_fold` - Skip design step and only run inverse folding. Requires a fully specified structure.
+
+### Folding and Affinity Prediction
+- `--folding_checkpoint FOLDING_CHECKPOINT` - Path to the folding checkpoint. Default: `huggingface:boltzgen/boltz2_conf_final:boltz2_conf_final.ckpt`
+- `--affinity_checkpoint AFFINITY_CHECKPOINT` - Path to the affinity predictor checkpoint. Default: `huggingface:boltzgen/boltz2_affinity:boltz2_aff.ckpt`
+
+### Filtering
+- `--budget BUDGET` - How many designs should be in the final diversity optimized set. This is used in the filtering step.
+- `--alpha ALPHA` - Trade-off for sequence diversity selection: 0.0=quality-only, 1.0=diversity-only. Default is 0.01 (peptide-anything protocol) or 0.001 (other protocols).
+- `--filter_biased {true,false}` - Remove amino-acid composition outliers (default caps on ALA/GLY/GLU/LEU/VAL). Default: true.
+- `--metrics_override METRICS_OVERRIDE [METRICS_OVERRIDE ...]` - Per-metric inverse-importance weights for ranking. Format: `metric_name=weight` (e.g., `plip_hbonds_refolded=4 delta_sasa_refolded=2`). A larger value down-weights that metric's rank. Use `metric_name=none` to remove a metric.
+- `--additional_filters ADDITIONAL_FILTERS [ADDITIONAL_FILTERS ...]` - Extra hard filters. Format: `feature>threshold` or `feature<threshold` (e.g., `'design_ALA>0.3' 'design_GLY<0.2'`). Use '>' if higher is better, '<' if lower is better. Make sure to single-quote the strings so your shell doesn't get confused by < and > characters.
+- `--size_buckets SIZE_BUCKETS [SIZE_BUCKETS ...]` - Optional constraint for maximum number of designs in size ranges. Format: `min-max:count` (e.g., `10-20:5 20-30:10 30-40:5`).
+- `--refolding_rmsd_threshold REFOLDING_RMSD_THRESHOLD` - Threshold used for RMSD-based filters (lower is better).
+
+### Execution Options
+- `--reuse` - Reuse existing results across all steps. Generate only as many new designs as are needed to achieve the specified total number of designs.
+- `--no_subprocess` - Run each step in the main process. Will cause issues when devices >1.
+- `--steps {design,inverse_folding,design_folding,folding,affinity,analysis,filtering} [{design,inverse_folding,design_folding,folding,affinity,analysis,filtering} ...]` - Run only the specified pipeline steps (default: run all steps). See the [Running only specific pipeline steps](#running-only-specific-pipeline-steps) section for details.
+
+### Model and Data Download Options
+- `--force_download` - Force a (re)-download of models and data.
+- `--models_token MODELS_TOKEN` - Secret token to use for our models hosting service (Hugging Face). Default: `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`
+- `--cache CACHE` - Directory where downloaded models will be stored. Default: `~/.cache`
+
+## `boltzgen download`
+
+The `boltzgen download` command downloads model weights and data artifacts needed for BoltzGen. In most cases you don't need to use `boltzgen download`, since `boltzgen run` will download what is needed automatically.
+
+Downloaded weights and datasets are stored in `~/.cache` by default but this can be changed by specifying `--cache`.
+
+### Example
+
+```bash
+boltzgen download all # downloads all models
+boltzgen download inverse-fold # downloads only the inverse folding model
+```
+
+### Usage
+```bash
+boltzgen download [-h] [--force_download] [--models_token MODELS_TOKEN] [--cache CACHE] {affinity,design-adherence,design-diverse,folding,inverse-fold,moldir,all} [{affinity,design-adherence,design-diverse,folding,inverse-fold,moldir,all} ...]
+```
+
+### Positional arguments
+- `{affinity,design-adherence,design-diverse,folding,inverse-fold,moldir,all}` - Subset of artifacts to download, or 'all' to download all artifacts.
+
+### Options
+- `--force_download` - Force a (re)-download of models and data.
+- `--models_token MODELS_TOKEN` - Secret token to use for our models hosting service. Not usually required.
+- `--cache CACHE` - Directory where downloaded models will be stored. Default: `~/.cache`
+
+## `boltzgen configure`
+For more control over your design process, you can separate the configuration generation from execution:
+
+### Example
+```bash
+boltzgen configure example/cyclotide/3ivq.yaml \
+  --output workbench/test-peptide-protein \
+  --protocol peptide-anything \
+  --num_designs 2
+```
+
+This creates configuration files in `workbench/test-peptide-protein/` without running the actual design pipeline. You can edit these files if needed and then run `boltzgen execute workbench/test-peptide-protein` to run the workflow.
+
+The options that `boltzgen configure` takes are a subset of the `boltzgen run` options so we don't list them all again here. Try `boltzgen configure --help` if you need help.
+
+## `boltzgen execute`
+
+The `boltzgen execute` command executes a pre-configured pipeline from a directory of config files generated by the `boltzgen configure` command.
+
+### Usage
+```bash
+boltzgen execute [-h] [--reuse] [--no_subprocess] [--steps {design,inverse_folding,design_folding,folding,affinity,analysis,filtering} [{design,inverse_folding,design_folding,folding,affinity,analysis,filtering} ...]] output
+```
+
+### Positional Arguments
+- `output` - Directory containing pre-configured pipeline files (generated by 'configure' command)
+
+### Execution Options
+- `--reuse` - Reuse existing results across all steps. Generate only as many new designs as are needed to achieve the specified total number of designs.
+- `--no_subprocess` - Run each step in the main process. Will cause issues when devices >1.
+- `--steps {design,inverse_folding,design_folding,folding,affinity,analysis,filtering} [{design,inverse_folding,design_folding,folding,affinity,analysis,filtering} ...]` - Run only the specified pipeline steps (default: run all steps)
+
+# Training BoltzGen models
+Install in dev mode, which also installs additional packages such as `wandb`.
+```bash
+git clone https://github.com/HannesStark/boltzgen
+cd boltzgen
+pip install -e ".[dev]"
+```
+### 1 – Download training data and checkpoints
+```bash
+# Choose any location; training_data is the default used in the YAML files
+mkdir -p training_data
+cd training_data
+
+# ─ Targets ─
+wget -O targets.zip "https://huggingface.co/datasets/boltzgen/boltzgen1_train/resolve/main/targets.zip?download=true"
+unzip targets.zip      # → training_data/targets/
+
+# ─ MSAs  ─
+wget -O msa.zip "https://huggingface.co/datasets/boltzgen/boltzgen1_train/resolve/main/msa.zip?download=true"
+unzip msa.zip          # → training_data/msa/
+
+# ─ Small-molecule dictionary ─
+wget -O mols.zip "https://huggingface.co/datasets/boltzgen/inference-data/resolve/main/mols.zip?download=true"
+unzip mols.zip         # → training_data/mols/
+
+# ─ Folding checkpoint  ─
+wget -O boltz2_fold.ckpt "https://huggingface.co/boltzgen/boltzgen-1/resolve/main/boltz2_conf_final.ckpt?download=true"
+
+# ─────────── (optional) pretrained structure-only ckpt ───────────
+# Needed ONLY if you want to resume from a structure-trained model.
+wget -O boltzgen1_structuretrained_small.ckpt \
+  "https://huggingface.co/boltzgen/boltzgen-1/resolve/main/boltzgen1_structuretrained_small.ckpt?download=true"
+```
+Resulting layout
+```
+training_data/
+ ├─ targets/     (used for target_dir in yaml)     
+ ├─ msa/         (used for msa_dir in yaml)     
+ ├─ mols/        (used for moldir in yaml)
+ ├─ boltz2_fold.ckpt    (used for folding_checkpoint in yaml) 
+ └─ boltzgen1_structuretrained_small.ckpt   (used for pretrained in yaml)
+```
+
+The directory `training_data` is the default location referenced in the 
+example YAML configuration files.  If you place the data elsewhere, be sure to update those paths accordingly.
+### 2 – Training YAML files
+
+Below is a quick reference for the three training configurations and how to launch them once paths are set:
+
+| Config file | Purpose | Example command |
+|-------------|---------|-----------------|
+| `config/train/boltzgen_small.yaml` | Train the **small** BoltzGen model (recommended for development, 8 GPUs, gradient accumulation 16) | `python main.py config/train/boltzgen_small.yaml  name=boltzgen_small` |
+| `config/train/boltzgen.yaml` | Train the **large** BoltzGen model | `python main.py config/train/boltzgen.yaml        name=boltzgen_large` |
+| `config/train/inverse_folding.yaml` | Train the **inverse-folding** model only | `python main.py config/train/inverse_folding.yaml name=boltzgen_if` |
+
+If you store the data somewhere other than `./training_data`, search and replace that path **in all three YAML files**.  Typical keys you may need to update are `target_dir`, `msa_dir`, `moldir`, `pretrained`, `folding_checkpoint`, `monomer_target_dir`, and `ligand_target_dir`.
+
+Example of where these keys appear:
+
+```yaml
+data:
+  datasets:
+    - target_dir: ./training_data/targets
+      msa_dir:    ./training_data/msa
+  moldir: ./training_data/mols
+
+pretrained: ./training_data/boltzgen1_structuretrained_small.ckpt
+
+folding_checkpoint: ./training_data/boltz2_fold.ckpt
+```
+
+### 3 – Train the models (example commands)
+
+Small model on 8 GPUs with gradient accumulation 16 (recommended dev setup):
+
+```bash
+CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
+python main.py config/train/boltzgen_small.yaml \
+       name=boltzgen_small
+```
+
+Large model:
+
+```bash
+CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
+python main.py config/train/boltzgen.yaml \
+       name=boltzgen_large
+```
+
+Inverse-folding model:
+
+```bash
+CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
+python main.py config/train/inverse_folding.yaml \
+       name=boltzgen_if
+```
+
+Note: the large model currently expects additional distillation datasets (to be released). You can still explore its hyper-parameters and train solely on PDB data by adjusting the paths.
+
+**Optionally resuming from a checkpoint**
+
+```bash
+CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
+python main.py config/train/boltzgen_small.yaml \
+       pretrained=./training_data/boltzgen1_structuretrained_small.ckpt \
+       name=boltzgen_small_pretrained
+```
--- a/assets/boltzgen.png
+++ b/assets/boltzgen.png
--- a/assets/cover.png
+++ b/assets/cover.png
--- a/assets/fig1.png
+++ b/assets/fig1.png
--- a/assets/fig_seconds_per_design.png
+++ b/assets/fig_seconds_per_design.png
--- a/assets/label_seq_id.png
+++ b/assets/label_seq_id.png
--- a/config/affinity.yaml
+++ b/config/affinity.yaml
@@ -0,0 +1,53 @@
+_target_: boltzgen.task.predict.predict.Predict
+
+debug: false
+
+data:
+  _target_: boltzgen.task.predict.data_from_generated.FromGeneratedDataModule
+  cfg:
+    _target_: boltzgen.task.predict.data_from_generated.DataConfig
+    tokenizer:
+      _target_: boltzgen.data.tokenize.tokenizer.Tokenizer
+      atomize_modified_residues: false
+    featurizer:
+      _target_: boltzgen.data.feature.featurizer.Featurizer
+
+    suffix: .cif
+    suffix_metadata: .npz
+    suffix_native: _native.cif
+    samples_per_target: 1000000000000000
+    num_targets: 10000000000000
+    moldir: null
+
+    batch_size: 1
+    num_workers: 4
+    pin_memory: true
+  design_dir: null
+  return_native: false
+  compute_affinity: true
+  target_templates: false
+  fail_if_no_designs: true
+  
+
+keys_dict_out: []
+
+writer:
+  _target_: boltzgen.task.predict.writer.AffinityWriter
+  design_dir: ${data.design_dir}
+
+trainer:
+  accelerator: gpu
+  logger: false
+  devices: 1
+  precision: bf16-mixed
+
+name: affinity_in_eval_affinity
+output: null
+checkpoint: null
+matmul_precision: null
+recycling_steps: 3
+sampling_steps: 200
+diffusion_samples: 5
+
+override:
+  validators: null
--- a/config/analysis.yaml
+++ b/config/analysis.yaml
@@ -0,0 +1,64 @@
+_target_: boltzgen.task.analyze.analyze.Analyze
+
+name: analyze
+design_dir: null
+debug: false
+num_processes: 32
+
+# Common metrics to compute
+affinity_metrics: false
+backbone_fold_metrics: true
+noncovalents_original: true
+noncovalents_refolded: true
+delta_sasa_original: true
+delta_sasa_refolded: true
+largest_hydrophobic: false
+largest_hydrophobic_refolded: true
+run_clustering: false
+
+# Liability analysis
+liability_analysis: true
+liability_modality: peptide
+liability_peptide_type: linear
+
+# Uncommon metrics
+diversity_original: false
+diversity_refolded: false
+diversity_per_target_original: false
+diversity_per_target_refolded: false
+novelty_original: false
+novelty_refolded: false
+novelty_per_target_original: false
+novelty_per_target_refolded: false
+ss_conditioning_metrics: false 
+sequence_recovery: false
+native: false # This is only required for evaluations when we want to compute sequence recovery 
+compute_lddts: false # This is time intensive to compute
+  
+
+data:
+  _target_: boltzgen.task.predict.data_from_generated.FromGeneratedDataModule
+  cfg:
+    _target_: boltzgen.task.predict.data_from_generated.DataConfig
+    tokenizer:
+      _target_: boltzgen.data.tokenize.tokenizer.Tokenizer
+      atomize_modified_residues: false
+    featurizer:
+      _target_: boltzgen.data.feature.featurizer.Featurizer
+
+    suffix: .cif
+    suffix_metadata: .npz
+    suffix_native: _native.cif
+    samples_per_target: 1000000000000000
+    num_targets: 10000000000000
+    moldir: null
+
+    batch_size: 1
+    num_workers: 4
+    pin_memory: true
+    disulfide_prob: 1.0
+    disulfide_on: true
+  design_dir: ${design_dir}
+  target_templates: false
+  return_native: ${native}
+  fail_if_no_designs: true
--- a/config/design.yaml
+++ b/config/design.yaml
@@ -0,0 +1,99 @@
+_target_: boltzgen.task.predict.predict.Predict
+
+data:
+  _target_: boltzgen.task.predict.data_from_yaml.FromYamlDataModule
+  cfg:
+    _target_: boltzgen.task.predict.data_from_yaml.DataConfig
+    tokenizer:
+      _target_: boltzgen.data.tokenize.tokenizer.Tokenizer
+      atomize_modified_residues: false
+    featurizer:
+      _target_: boltzgen.data.feature.featurizer.Featurizer
+    moldir: null
+    yaml_path: null
+    output_dir: ${output}
+    diffusion_samples: ${diffusion_samples}
+
+    # Design
+    backbone_only: false
+    atom14: true
+    atom37: false
+    disulfide_prob: 1.0
+    disulfide_on: true
+  batch_size: 1
+  num_workers: 4
+  pin_memory: true
+  
+
+writer:
+  _target_: boltzgen.task.predict.writer.DesignWriter
+  output_dir: ${output}
+  res_atoms_only: false
+  atom14: ${data.cfg.atom14}
+  atom37: ${data.cfg.atom37}
+  backbone_only: ${data.cfg.backbone_only}
+  write_native: false
+
+trainer:
+  accelerator: gpu
+  devices: 1
+  precision: bf16-mixed
+
+name: null
+output: null
+checkpoint: null
+matmul_precision: high
+recycling_steps: 3
+sampling_steps: 500
+diffusion_samples: 1
+compile_pairformer: false
+compile_structure: false
+
+override:
+  masker_args:
+    mask: true
+    mask_backbone: false
+
+  validators: null
+
+  step_scale_schedule:
+    - step_scale: 1.8
+      period: 0.25
+    - step_scale: 2.0
+      period: 0.25
+    - step_scale: 1.8
+      period: 0.25
+    - step_scale: 2.0
+      period: 0.25
+
+  noise_scale_schedule:
+    - noise_scale: 0.95
+      period: 0.25
+    - noise_scale: 0.88
+      period: 0.25
+    - noise_scale: 0.95
+      period: 0.25
+    - noise_scale: 0.88
+      period: 0.25
+
+  diffusion_process_args:
+    sigma_min: 0.0004  # min noise level
+    sigma_max: 160.0  # max noise level
+    sigma_data: 16.0  # standard deviation of data distribution
+    rho: 7  # controls the sampling schedule
+    P_mean: -1.2  # mean of log-normal distribution from which noise is drawn for training
+    P_std: 1.5  # standard deviation of log-normal distribution from which noise is drawn for training
+    gamma_0: 0.8
+    gamma_min: 1.0
+    noise_scale: null
+    step_scale: null
+    mse_rotational_alignment: true
+    coordinate_augmentation: true
+    alignment_reverse_diff: true
+    synchronize_sigmas: false
+    sampling_schedule: "dilated"
+    time_dilation: 2.667 
+    time_dilation_start: 0.6 
+    time_dilation_end: 0.8
+    
+
--- a/config/filtering.yaml
+++ b/config/filtering.yaml
@@ -0,0 +1,16 @@
+_target_: boltzgen.task.filter.filter.Filter
+
+budget:  30
+top_budget:  10
+use_affinity:  false  # This changes the filtering metrics to metrics more amenable to small molecule binder design
+filter_cysteine:  false  # [different from peptide-protein] This filters out all designs that have designed cysteines in them (prespecified cysteines in the design are not counted)
+from_inverse_folded:  false  # This makes it so that we use the backbone refolding rmsd instead of the all-atom RMSD
+filter_bindingsite:  false  # This filters out everything that does not have a residue within 4A of a binding site residue
+modality: "peptide"  # peptide or antibody
+peptide_type: "linear"  # linear or cyclic
+alpha:  0.001  # 0 = quality-only, 1 = diversity-only
+random_state:  0
+metrics_override:  null  # overrides metrics; None values delete keys
+num_liability_plots:  0
+plot_seq_logos:  false  # make sequence logo diagrams of designed sequence
+
--- a/config/fold.yaml
+++ b/config/fold.yaml
@@ -0,0 +1,54 @@
+_target_: boltzgen.task.predict.predict.Predict
+
+debug: false
+
+data:
+  _target_: boltzgen.task.predict.data_from_generated.FromGeneratedDataModule
+  cfg:
+    _target_: boltzgen.task.predict.data_from_generated.DataConfig
+    tokenizer:
+      _target_: boltzgen.data.tokenize.tokenizer.Tokenizer
+      atomize_modified_residues: false
+    featurizer:
+      _target_: boltzgen.data.feature.featurizer.Featurizer
+
+    suffix: .cif
+    suffix_metadata: .npz
+    suffix_native: _native.cif
+    samples_per_target: 1000000000000000
+    num_targets: 10000000000000
+    moldir: null
+
+    batch_size: 1
+    num_workers: 4
+    pin_memory: true
+    disulfide_prob: 1.0
+    disulfide_on: true
+  design_dir: null
+  target_templates: true
+  return_native: false
+  fail_if_no_designs: true
+  output_dir: null
+  
+keys_dict_out: ["min_interaction_pae", "min_design_to_target_pae", "interaction_pae", "ligand_iptm", "protein_iptm", "iptm", "design_iptm", "design_iiptm", "design_to_target_iptm", "design_ptm", "target_ptm", "ptm"]
+
+writer:
+  _target_: boltzgen.task.predict.writer.FoldingWriter
+  design_dir: ${data.design_dir}
+
+trainer:
+  accelerator: gpu
+  logger: false
+  devices: 1
+  precision: bf16-mixed
+
+name: null
+output: null
+checkpoint: null
+matmul_precision: null
+recycling_steps: 3
+sampling_steps: 200
+diffusion_samples: 5
+
+override:
+  validators: null
--- a/config/inverse_fold.yaml
+++ b/config/inverse_fold.yaml
@@ -0,0 +1,98 @@
+_target_: boltzgen.task.predict.predict.Predict
+
+data:
+  _target_: boltzgen.task.predict.data_from_generated.FromGeneratedDataModule
+  cfg:
+    _target_: boltzgen.task.predict.data_from_generated.DataConfig
+    tokenizer:
+      _target_: boltzgen.data.tokenize.tokenizer.Tokenizer
+      atomize_modified_residues: false
+    featurizer:
+      _target_: boltzgen.data.feature.featurizer.Featurizer
+
+    suffix: .cif
+    suffix_metadata: .npz
+    suffix_native: _native.cif
+    moldir: null
+    samples_per_target: 1000000000
+
+    # Design
+    design: true
+    backbone_only: true
+    atom14: false
+    max_seqs: 1
+    inverse_fold: true
+
+    batch_size: 1
+    num_workers: 4
+    pin_memory: true
+    num_targets: 1000000000
+    design_mask_override: null
+
+  fail_if_no_designs: true
+  design_dir: null
+  output_dir: ${output}
+
+writer:
+  _target_: boltzgen.task.predict.writer.DesignWriter
+  output_dir: ${output}
+  res_atoms_only: false
+  atom14: ${data.cfg.atom14}
+  inverse_fold: ${data.cfg.inverse_fold}
+  write_native: false
+
+trainer:
+  accelerator: gpu
+  devices: 1
+  precision: 32  # bf16-mixed
+
+name: if_full
+output: null
+checkpoint: null
+matmul_precision: null
+recycling_steps: 3
+sampling_steps: 200
+diffusion_samples: 1
+
+override:
+  masker_args:
+    mask: true
+    mask_backbone: false
+
+  validators: null
+
+  diffusion_process_args:
+    sigma_min: 0.0004  # min noise level
+    sigma_max: 160.0  # max noise level
+    sigma_data: 16.0  # standard deviation of data distribution
+    rho: 7  # controls the sampling schedule
+    P_mean: -1.2  # mean of log-normal distribution from which noise is drawn for training
+    P_std: 1.5  # standard deviation of log-normal distribution from which noise is drawn for training
+    gamma_0: 0.8
+    gamma_min: 1.0
+    noise_scale: 1.0
+    step_scale: 1.0
+    mse_rotational_alignment: true
+    coordinate_augmentation: true
+    alignment_reverse_diff: true
+    synchronize_sigmas: false
+    
+  inverse_fold_args:
+    atom_s: 128
+    atom_z: 16
+    token_s: 384
+    token_z: 128
+    node_dim: 128
+    pair_dim: 128
+    hidden_dim: 128
+    dropout: 0.1
+    softmax_dropout: 0.2
+    num_encoder_layers: 6
+    transformation_scale_factor: 1.0
+    inverse_fold_noise: 0.2
+    topk: 30
+    num_heads: 4
+    num_decoder_layers: 3
+    autoregressive: true
+    enable_input_embedder: true
+    inverse_fold_restriction: []
--- a/config/inverse_fold_only.yaml
+++ b/config/inverse_fold_only.yaml
@@ -0,0 +1,89 @@
+_target_: boltzgen.task.predict.predict.Predict
+
+data:
+  _target_: boltzgen.task.predict.data_from_yaml.FromYamlDataModule
+  cfg:
+    _target_: boltzgen.task.predict.data_from_yaml.DataConfig
+    tokenizer:
+      _target_: boltzgen.data.tokenize.tokenizer.Tokenizer
+      atomize_modified_residues: false
+    featurizer:
+      _target_: boltzgen.data.feature.featurizer.Featurizer
+    moldir: null
+    yaml_path: null
+
+    # Design
+    backbone_only: true
+    atom14: false
+    atom37: false
+    disulfide_prob: 1.0
+    disulfide_on: true
+  batch_size: 1
+  num_workers: 4
+  pin_memory: true
+
+writer:
+  _target_: boltzgen.task.predict.writer.DesignWriter
+  output_dir: ${output}
+  res_atoms_only: false
+  atom14: ${data.cfg.atom14}
+  inverse_fold: true
+  write_native: false
+
+trainer:
+  accelerator: gpu
+  devices: 1
+  precision: 32
+
+name: inverse_fold_only
+output: null
+checkpoint: null
+matmul_precision: null
+recycling_steps: 3
+sampling_steps: 200
+diffusion_samples: 1
+
+override:
+  masker_args:
+    mask: true
+    mask_backbone: false
+
+  inverse_fold: true
+
+  validators: null
+
+  diffusion_process_args:
+    sigma_min: 0.0004  # min noise level
+    sigma_max: 160.0  # max noise level
+    sigma_data: 16.0  # standard deviation of data distribution
+    rho: 7  # controls the sampling schedule
+    P_mean: -1.2  # mean of log-normal distribution from which noise is drawn for training
+    P_std: 1.5  # standard deviation of log-normal distribution from which noise is drawn for training
+    gamma_0: 0.8
+    gamma_min: 1.0
+    noise_scale: 1.0
+    step_scale: 1.0
+    mse_rotational_alignment: true
+    coordinate_augmentation: true
+    alignment_reverse_diff: true
+    synchronize_sigmas: false
+
+  inverse_fold_args:
+    atom_s: 128
+    atom_z: 16
+    token_s: 384
+    token_z: 128
+    node_dim: 128
+    pair_dim: 128
+    hidden_dim: 128
+    dropout: 0.1
+    softmax_dropout: 0.2
+    num_encoder_layers: 6
+    transformation_scale_factor: 1.0
+    inverse_fold_noise: 0.2
+    topk: 30
+    num_heads: 4
+    num_decoder_layers: 3
+    autoregressive: true
+    enable_input_embedder: true
+    inverse_fold_restriction: []
--- a/config/train/boltzgen.yaml
+++ b/config/train/boltzgen.yaml
@@ -0,0 +1,579 @@
+_target_: boltzgen.task.train.train.Training
+
+trainer:
+  accelerator: gpu
+  devices: 8
+  precision: bf16-mixed
+  gradient_clip_val: 10.0
+  accumulate_grad_batches: 1
+  max_epochs: -1
+  num_sanity_val_steps: 3
+  log_every_n_steps: 1
+
+wandb:
+  group: boltzgen
+  project: boltzgen
+  entity: yourwandb
+
+name: a_big_run_resume3
+slurm: true
+output: workdir
+strict_loading: false
+resume: null
+pretrained: null
+debug: false
+save_every_n_train_steps: 2500
+disable_checkpoint: false
+matmul_precision: null
+save_top_k: -1
+
+data:
+  datasets:
+    - _target_: boltzgen.task.train.data.DatasetConfig
+      target_dir: ./training_data/targets
+      msa_dir: ./training_data/msa
+      prob: 0.6
+      filters:
+        - _target_: boltzgen.data.filter.dynamic.size.SizeFilter
+          min_chains: 1
+          max_chains: 300
+        - _target_: boltzgen.data.filter.dynamic.date.DateFilter
+          date: "2023-06-01"
+          ref: released
+        - _target_: boltzgen.data.filter.dynamic.resolution.ResolutionFilter
+          resolution: 9.0
+      sampler:
+        _target_: boltzgen.data.sample.cluster.ClusterSampler
+      cropper:
+        _target_: boltzgen.data.crop.multimer.MultimerCropper
+        neighborhood_sizes: [ 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40 ]
+      split: ./data/pdb_sequences/boltz2/validation_ids_boltz2_all.txt
+      symmetry_correction: false
+      val_group: "RCSB"
+
+    # AFDB Distillation Data
+    - _target_: boltzgen.task.train.data.DatasetConfig
+      manifest_path: ./training_data/afdb/afdb_manifest_foldseek_c75_confidence.json
+      target_dir: ./training_data/afdb/targets
+      msa_dir: ./training_data/afdb/msa
+      prob: 0.3
+      filters:
+        - _target_: boltzgen.data.filter.dynamic.size.SizeFilter
+          min_chains: 1
+          max_chains: 300
+        - _target_: boltzgen.data.filter.dynamic.confidence.ConfidenceFilter
+          composition_op: "AND"
+          metrics: ["confidence_score"]
+          compare_ops: ["greater"]
+          thresholds: [70]
+      sampler:
+        _target_: boltzgen.data.sample.cluster.ClusterSampler
+      cropper:
+        _target_: boltzgen.data.crop.multimer.MultimerCropper
+        neighborhood_sizes: [ 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40 ]
+      symmetry_correction: true
+      override_method: "AFDB"
+      override_bfactor: true
+
+    # Protein-Ligand Distillation Data
+    - _target_: boltzgen.task.train.data.DatasetConfig
+      target_dir: ./training_data/protein_ligand/targets
+      msa_dir: ./training_data/protein_ligand/msa
+      moldir: ./training_data/protein_ligand/mols
+      prob: 0.03
+      filters:
+        - _target_: boltzgen.data.filter.dynamic.size.SizeFilter
+          min_chains: 1
+          max_chains: 300
+        - _target_: boltzgen.data.filter.dynamic.confidence.ConfidenceFilter
+          composition_op: "AND"
+          metrics: ["complex_ipde", "complex_pde", "iptm"]
+          compare_ops: ["lesser", "lesser", "greater"]
+          thresholds: [1.5, 1.5, 0.9]
+      sampler:
+        _target_: boltzgen.data.sample.cluster.ClusterSampler
+        beta_chain: 0.05
+      cropper:
+        _target_: boltzgen.data.crop.multimer.MultimerCropper
+        neighborhood_sizes: [ 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40 ]
+      symmetry_correction: true
+      override_method: "BOLTZ-1"
+
+    # RNA Distillation Data
+    - _target_: boltzgen.task.train.data.DatasetConfig
+      target_dir: ./training_data/rna/targets
+      msa_dir: ./training_data/rna/msa
+      prob: 0.04
+      filters:
+        - _target_: boltzgen.data.filter.dynamic.size.SizeFilter
+          min_chains: 1
+          max_chains: 300
+        - _target_: boltzgen.data.filter.dynamic.confidence.ConfidenceFilter
+          composition_op: "OR"
+          metrics: ["complex_pde"]
+          compare_ops: ["lesser"]
+          thresholds: [2.0]
+      sampler:
+        _target_: boltzgen.data.sample.cluster.ClusterSampler
+      cropper:
+        _target_: boltzgen.data.crop.multimer.MultimerCropper
+        neighborhood_sizes: [ 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40 ]
+      symmetry_correction: true
+      override_method: "BOLTZ-1"
+
+    # Protein-DNA Distillation Data
+    - _target_: boltzgen.task.train.data.DatasetConfig
+      target_dir: ./training_data/protein_dna/targets
+      msa_dir: ./training_data/protein_dna/msa
+      prob: 0.03
+      filters:
+        - _target_: boltzgen.data.filter.dynamic.size.SizeFilter
+          min_chains: 1
+          max_chains: 300
+        - _target_: boltzgen.data.filter.dynamic.confidence.ConfidenceFilter
+          composition_op: "AND"
+          metrics: ["complex_ipde", "complex_pde", "iptm"]
+          compare_ops: ["lesser", "lesser", "greater"]
+          thresholds: [1.0, 2.0, 0.7]
+      sampler:
+        _target_: boltzgen.data.sample.cluster.ClusterSampler
+        beta_chain: 0.05
+      cropper:
+        _target_: boltzgen.data.crop.multimer.MultimerCropper
+        neighborhood_sizes: [ 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40 ]
+      symmetry_correction: true
+      override_method: "BOLTZ-1"
+
+
+  tokenizer:
+    _target_: boltzgen.data.tokenize.tokenizer.Tokenizer
+    atomize_modified_residues: false
+  featurizer:
+    _target_: boltzgen.data.feature.featurizer.Featurizer
+  moldir: ./training_data/mols
+  max_tokens: 512
+  max_atoms: 5120
+  max_seqs: 4096
+  pad_to_max_tokens: true
+  pad_to_max_atoms: true
+  pad_to_max_seqs: true
+  samples_per_epoch: 100000
+  batch_size: 1
+  num_workers: 2
+  random_seed: 42
+  pin_memory: false
+  overfit: null
+  return_train_symmetries: false
+  return_val_symmetries: false
+
+  
+  atoms_per_window_queries: 32
+  min_dist: 2.0
+  max_dist: 22.0
+  num_bins: 64
+  single_sequence_prop_training: 0.1
+  msa_sampling_training: true
+
+
+  # Design
+  design: true
+  backbone_only: false
+  atom14: true
+  atom37: false
+  selector:
+    _target_: boltzgen.data.select.protein.ProteinSelector
+    design_neighborhood_sizes: [2, 4, 6,8,10,12,14,16,18]
+    substructure_neighborhood_sizes: [2,4,6,8,10,12,24]
+    structure_condition_prob: 0.4
+    distance_noise_std:  1
+    run_selection: true
+    specify_binding_sites: true
+    ss_condition_prob: 0.1
+    select_all: false
+
+  # Design datasets
+  monomer_split: data/pdb_sequences/val_monomers_boltzgen_min50_max220.txt
+  monomer_target_dir: ./training_data/targets
+  monomer_target_structure_condition: true
+  monomer_seq_len: 100
+
+  ligand_split: data/pdb_sequences/val_ccd_pdb_pairs_boltzgen.txt
+  ligand_target_dir: ./training_data/targets
+  ligand_seq_len: 100
+
+
+model:
+  _target_: boltzgen.model.models.boltz.Boltz
+  atom_s: 128
+  atom_z: 16
+  token_s: 384
+  token_z: 128
+  num_bins: 64
+  atom_feature_dim: 388
+  atoms_per_window_queries: 32
+  atoms_per_window_keys: 128
+  use_miniformer: false
+  ema: true
+  ema_decay: 0.999
+  exclude_ions_from_lddt: true
+  num_val_datasets: 1               # New
+  ignore_ckpt_shape_mismatch: false # New
+  aggregate_distogram: true         # New
+  bond_type_feature: true
+  predict_bfactor: true
+  checkpoint_diffusion_conditioning: true
+  use_kernels: true
+
+
+  validators:
+    - _target_: boltzgen.model.validation.design.DesignValidator
+      val_names: ["RCSB"]
+      confidence_prediction: ${model.confidence_prediction}
+      atom14: ${data.atom14}
+      atom37: ${data.atom37}
+
+  masker_args:
+    mask: true
+    mask_backbone: false
+    mask_disto: true
+
+  embedder_args:
+    atom_encoder_depth: 3
+    atom_encoder_heads: 4
+    add_mol_type_feat: true
+    add_method_conditioning: true
+    add_modified_flag: true
+    add_cyclic_flag: true
+    add_design_mask_flag: true
+    add_binding_specification: true
+    add_ss_specification: true
+
+  freeze_template_weights: true
+  use_templates: true
+  template_args:
+    template_dim: 64
+    template_blocks: 2
+    activation_checkpointing: false
+
+
+  use_token_distances: true
+  token_distance_args:
+    token_distance_dim: 64
+    token_distance_blocks: 2
+    use_token_distance_feats: true
+    distance_gaussian_dim: 32
+    activation_checkpointing: true
+
+
+  msa_args:
+    msa_s: 64
+    msa_blocks: 4
+    msa_dropout: 0.15
+    z_dropout: 0.25
+    miniformer_blocks: false
+    pairwise_head_width: 32
+    pairwise_num_heads: 4
+    use_paired_feature: true
+    activation_checkpointing: true
+
+
+  pairformer_args:
+    num_blocks: 64
+    num_heads: 16
+    dropout: 0.25
+    post_layer_norm: false
+    activation_checkpointing: true
+
+
+  score_model_args:
+    sigma_data: 16
+    dim_fourier: 256
+    atom_encoder_depth: 3
+    atom_encoder_heads: 4
+
+    # token level args
+    token_layers: 1
+    token_transformer_depth: 24
+    token_transformer_heads: 16
+    diffusion_pairformer_args:
+      num_blocks: 0
+      num_heads: 2
+      dropout: 0
+      use_s_to_z: false
+
+
+
+    atom_decoder_depth: 3
+    atom_decoder_heads: 4
+    conditioning_transition_layers: 2
+    transformer_post_ln: false
+    activation_checkpointing: true
+
+  confidence_prediction: false
+  structure_prediction_training: true
+  
+  training_args:
+    recycling_steps: 3
+    sampling_steps: 20
+    diffusion_multiplicity: 32
+    diffusion_samples: 1
+    confidence_loss_weight: 1e-4
+    diffusion_loss_weight: 4.0
+    distogram_loss_weight: 3e-2
+    bfactor_loss_weight: 1e-3
+    adam_beta_1: 0.9
+    adam_beta_2: 0.95
+    adam_eps: 0.00000001
+    lr_scheduler: af3
+    base_lr: 0.0
+    max_lr: 0.0005
+    lr_warmup_no_steps: 1000
+    lr_start_decay_after_n_steps: 50000
+    lr_decay_every_n_steps: 50000
+    lr_decay_factor: 0.95
+    weight_decay: 0.003
+    weight_decay_exclude: true
+
+  validation_args:
+    recycling_steps: 3
+    sampling_steps: 200
+    diffusion_samples: 1
+    symmetry_correction: false
+
+  diffusion_process_args:
+    sigma_min: 0.0004  # min noise level
+    sigma_max: 160.0  # max noise level
+    sigma_data: 16.0  # standard deviation of data distribution
+    rho: 7  # controls the sampling schedule
+    P_mean: -1.2  # mean of log-normal distribution from which noise is drawn for training
+    P_std: 1.5  # standard deviation of log-normal distribution from which noise is drawn for training
+    gamma_0: 0.8
+    gamma_min: 1.0
+    noise_scale: 1.0
+    step_scale: 1.0
+    mse_rotational_alignment: true
+    coordinate_augmentation: true
+    alignment_reverse_diff: true
+    synchronize_sigmas: false
+
+  diffusion_loss_args:
+    add_smooth_lddt_loss: true
+    add_bond_loss: false
+    nucleotide_loss_weight: 5.0
+    ligand_loss_weight: 10.0
+
+  refolding_validator:
+    _target_: boltzgen.model.validation.refolding.RefoldingValidator
+    val_names: ["RCSB"]
+    step_scale: 1.5
+    noise_scale: 0.75
+    atom14: ${data.atom14}
+    atom37: ${data.atom37}
+    val_monomer: ${data.monomer_split}
+    val_ligand: ${data.ligand_split}
+    analyze_task:
+      _target_: boltzgen.task.analyze.analyze.Analyze
+      name: ${name}
+      debug: ${debug}
+      design_dir: null
+      num_processes: 1
+
+      # Common metrics to compute
+      affinity_metrics: false
+      allatom_fold_metrics: true
+      backbone_fold_metrics: true
+      noncovalents_original: false
+      noncovalents_refolded: false
+      delta_sasa_original: false
+      delta_sasa_refolded: false
+      largest_hydrophobic: false
+      largest_hydrophobic_refolded: false
+      run_clustering: false
+
+      # Liability analysis
+      liability_analysis: false
+      liability_modality: peptide
+      liability_peptide_type: linear
+
+      # Uncommon metrics
+      diversity_original: true
+      diversity_refolded: true
+      diversity_per_target_original: false
+      diversity_per_target_refolded: false
+      novelty_original: false
+      novelty_refolded: false
+      novelty_per_target_original: false
+      novelty_per_target_refolded: false
+
+      wandb: null
+
+      data:
+        _target_: boltzgen.task.predict.data_from_generated.FromGeneratedDataModule
+        cfg:
+          _target_: boltzgen.task.predict.data_from_generated.DataConfig
+          tokenizer:
+            _target_: boltzgen.data.tokenize.tokenizer.Tokenizer
+            atomize_modified_residues: false
+          featurizer:
+            _target_: boltzgen.data.feature.featurizer.Featurizer
+
+          suffix: .cif
+          suffix_metadata: .npz
+          suffix_native: _native.cif
+          samples_per_target: 1
+          num_targets: 100000000
+          moldir: ./training_data/mols
+
+          batch_size: 1
+          num_workers: 4
+          pin_memory: true
+        return_native: true
+      predict_task: null
+
+    folding_checkpoint: ./training_data/boltz2_fold.ckpt
+
+    folding_args:
+      recycling_steps: 3
+      sampling_steps: 200
+      diffusion_samples: 1
+
+    folding_model_args:
+      atom_s: 128
+      atom_z: 16
+      token_s: 384
+      token_z: 128
+      num_bins: 64
+      atom_feature_dim: 388
+      atoms_per_window_queries: 32
+      atoms_per_window_keys: 128
+      compile_pairformer: false
+      compile_templates: false
+      compile_msa: false
+      use_miniformer: false
+      ema: true
+      ema_decay: 0.999
+      exclude_ions_from_lddt: true
+      num_val_datasets: 4
+      ignore_ckpt_shape_mismatch: false
+      aggregate_distogram: true
+      bond_type_feature: true
+      conditioning_cutoff_min: 4.0
+      conditioning_cutoff_max: 20.0
+      use_templates: true
+      predict_bfactor: true
+      checkpoint_diffusion_conditioning: false
+      use_kernels: true
+
+      validators: null
+
+      embedder_args:
+        atom_encoder_depth: 3
+        atom_encoder_heads: 4
+        add_mol_type_feat: true
+        add_method_conditioning: true
+        add_modified_flag: true
+        add_cyclic_flag: true
+
+      msa_args:
+        msa_s: 64
+        msa_blocks: 4
+        msa_dropout: 0.15
+        z_dropout: 0.25
+        miniformer_blocks: false
+        pairwise_head_width: 32
+        pairwise_num_heads: 4
+        use_paired_feature: true
+        activation_checkpointing: false
+
+
+      template_args:
+        template_dim: 64
+        template_blocks: 2
+        activation_checkpointing: false
+
+
+      pairformer_args:
+        num_blocks: 64
+        num_heads: 16
+        dropout: 0.25
+        post_layer_norm: false
+        activation_checkpointing: false
+
+
+      score_model_args:
+        sigma_data: 16
+        dim_fourier: 256
+        atom_encoder_depth: 3
+        atom_encoder_heads: 4
+        token_transformer_depth: 24
+        token_transformer_heads: 16
+        atom_decoder_depth: 3
+        atom_decoder_heads: 4
+        conditioning_transition_layers: 2
+        transformer_post_ln: false
+        activation_checkpointing: false
+
+      confidence_prediction: false
+      affinity_prediction: false
+      structure_prediction_training: true
+      affinity_model_args:
+        num_dist_bins: 64
+        max_dist: 22
+        no_trunk_feats: false
+        add_s_to_z_prod: false
+        add_s_input_to_s: false
+        confidence_args:
+          num_plddt_bins: 50
+          num_pde_bins: 64
+          num_pae_bins: 64
+
+      training_args:
+        recycling_steps: 3
+        sampling_steps: 20
+        diffusion_multiplicity: 48
+        diffusion_samples: 1
+        affinity_loss_weight: 3e-3
+        confidence_loss_weight: 1e-4
+        diffusion_loss_weight: 4.0
+        distogram_loss_weight: 3e-2
+        bfactor_loss_weight: 1e-3
+        adam_beta_1: 0.9
+        adam_beta_2: 0.95
+        adam_eps: 0.00000001
+        lr_scheduler: af3
+        base_lr: 0.0
+        max_lr: 0.001
+        lr_warmup_no_steps: 1000
+        lr_start_decay_after_n_steps: 50000
+        lr_decay_every_n_steps: 50000
+        lr_decay_factor: 0.95
+        weight_decay: 0.003
+        weight_decay_exclude: true
+
+      validation_args:
+        recycling_steps: 3
+        sampling_steps: 200
+        diffusion_samples: 5
+        symmetry_correction: false
+
+      diffusion_process_args:
+        sigma_min: 0.0004  # min noise level
+        sigma_max: 160.0  # max noise level
+        sigma_data: 16.0  # standard deviation of data distribution
+        rho: 7  # controls the sampling schedule
+        P_mean: -1.2  # mean of log-normal distribution from which noise is drawn for training
+        P_std: 1.5  # standard deviation of log-normal distribution from which noise is drawn for training
+        gamma_0: 0.8
+        gamma_min: 1.0
+        noise_scale: 1.0
+        step_scale: 1.0
+        mse_rotational_alignment: true
+        coordinate_augmentation: true
+        alignment_reverse_diff: true
+        synchronize_sigmas: false
+
+      diffusion_loss_args:
+        add_smooth_lddt_loss: true
+        add_bond_loss: false
+        nucleotide_loss_weight: 5.0
+        ligand_loss_weight: 10.0
--- a/config/train/boltzgen_small.yaml
+++ b/config/train/boltzgen_small.yaml
@@ -0,0 +1,346 @@
+_target_: boltzgen.task.train.train.Training
+
+trainer:
+  accelerator: gpu
+  devices: 8
+  precision: bf16-mixed
+  gradient_clip_val: 10.0
+  accumulate_grad_batches: 16
+  max_epochs: -1
+  num_sanity_val_steps: 3
+  log_every_n_steps: 1
+
+wandb:
+  group: boltzgen
+  project: boltzgen
+  entity: yourwandb
+
+name: small
+output: workdir
+strict_loading: false
+resume: null
+pretrained: ./training_data/boltzgen1_structuretrained_small.ckpt
+debug: false
+save_every_n_train_steps: 2500
+disable_checkpoint: false
+matmul_precision: null
+save_top_k: -1
+# ddp_timeout_seconds: 1000
+
+data:
+  datasets:
+    - _target_: boltzgen.task.train.data.DatasetConfig
+      target_dir: ./training_data/targets
+      msa_dir: ./training_data/msa
+      prob: 1
+      filters:
+        - _target_: boltzgen.data.filter.dynamic.size.SizeFilter
+          min_chains: 1
+          max_chains: 300
+        - _target_: boltzgen.data.filter.dynamic.date.DateFilter
+          date: "2023-06-01"
+          ref: released
+        - _target_: boltzgen.data.filter.dynamic.resolution.ResolutionFilter
+          resolution: 9.0
+      sampler:
+        _target_: boltzgen.data.sample.cluster.ClusterSampler
+      cropper:
+        _target_: boltzgen.data.crop.multimer.MultimerCropper
+        neighborhood_sizes: [ 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40 ]
+      split: ./data/pdb_sequences/boltz2/validation_ids_boltz2_all.txt
+      symmetry_correction: false
+      val_group: "RCSB"
+
+
+  tokenizer:
+    _target_: boltzgen.data.tokenize.tokenizer.Tokenizer
+    atomize_modified_residues: false
+  featurizer:
+    _target_: boltzgen.data.feature.featurizer.Featurizer
+  moldir: ./training_data/mols
+  max_tokens: 256
+  max_atoms: 2048
+  max_seqs: 1024
+  pad_to_max_tokens: true
+  pad_to_max_atoms: true
+  pad_to_max_seqs: true
+  samples_per_epoch: 100000
+  batch_size: 1
+  num_workers: 4
+  random_seed: 42
+  pin_memory: true
+  overfit: null
+  return_train_symmetries: false
+  return_val_symmetries: false
+
+  
+  atoms_per_window_queries: 32
+  min_dist: 2.0
+  max_dist: 22.0
+  num_bins: 64
+  single_sequence_prop_training: 0.1
+  msa_sampling_training: true
+
+
+  # Design
+  design: true
+  backbone_only: false
+  atom14: true
+  atom37: false
+  selector:
+    _target_: boltzgen.data.select.protein.ProteinSelector
+    design_neighborhood_sizes: [2, 4, 6,8,10,12,14,16,18]
+    substructure_neighborhood_sizes: [2,4,6,8,10,12,24]
+    structure_condition_prob: 0.4
+    distance_noise_std:  1
+    run_selection: true
+    specify_binding_sites: true
+    ss_condition_prob: 0.1
+    select_all: false
+    chain_reindexing: false
+
+  # Design datasets
+  monomer_split: data/pdb_sequences/val_monomers_boltzgen_min50_max220.txt
+  monomer_target_dir: ./training_data/targets
+  monomer_target_structure_condition: true
+  monomer_seq_len: 100
+
+  ligand_split: data/pdb_sequences/val_ccd_pdb_pairs_boltzgen.txt
+  ligand_target_dir: ./training_data/targets
+  ligand_seq_len: 100
+
+
+model:
+  _target_: boltzgen.model.models.boltz.Boltz
+  atom_s: 128
+  atom_z: 16
+  token_s: 384
+  token_z: 128
+  num_bins: 64
+  atom_feature_dim: 388
+  atoms_per_window_queries: 32
+  atoms_per_window_keys: 128
+  use_miniformer: true
+  ema: true
+  ema_decay: 0.999
+  exclude_ions_from_lddt: true
+  num_val_datasets: 1               # New
+  ignore_ckpt_shape_mismatch: false # New
+  aggregate_distogram: true         # New
+  bond_type_feature: true
+  predict_bfactor: true
+  predict_res_type: false
+  checkpoint_diffusion_conditioning: false
+  use_kernels: true
+
+
+  validators:
+    - _target_: boltzgen.model.validation.design.DesignValidator
+      val_names: ["RCSB"]
+      confidence_prediction: ${model.confidence_prediction}
+      backbone_only: ${data.backbone_only}
+      atom14: ${data.atom14}
+      atom37: ${data.atom37}
+
+  masker_args:
+    mask: true
+    mask_backbone: false
+    mask_disto: true
+
+  embedder_args:
+    atom_encoder_depth: 3
+    atom_encoder_heads: 4
+    add_mol_type_feat: true
+    add_method_conditioning: true
+    add_modified_flag: true
+    add_cyclic_flag: true
+    add_design_mask_flag: true
+    add_binding_specification: true
+    add_ss_specification: true
+
+  freeze_template_weights: true
+  use_templates: true
+  template_args:
+    template_dim: 64
+    template_blocks: 2
+    miniformer_blocks: true
+    activation_checkpointing: false
+
+  use_token_distances: true
+  token_distance_args:
+    token_distance_dim: 64
+    token_distance_blocks: 2
+    use_token_distance_feats: true
+    distance_gaussian_dim: 32
+
+  msa_args:
+    msa_s: 64
+    msa_blocks: 3
+    msa_dropout: 0.15
+    z_dropout: 0.25
+    miniformer_blocks: true
+    pairwise_head_width: 32
+    pairwise_num_heads: 4
+    use_paired_feature: true
+    activation_checkpointing: false
+
+  pairformer_args:
+    num_blocks: 12
+    num_heads: 16
+    dropout: 0.25
+    post_layer_norm: false
+    activation_checkpointing: false
+
+  score_model_args:
+    sigma_data: 16
+    dim_fourier: 256
+    atom_encoder_depth: 3
+    atom_encoder_heads: 4
+
+    # token level args
+    token_layers: 1
+    token_transformer_depth: 8
+    token_transformer_heads: 16
+    diffusion_pairformer_args:
+      num_blocks: 0
+      num_heads: 2
+      dropout: 0
+      use_s_to_z: false
+
+    atom_decoder_depth: 3
+    atom_decoder_heads: 4
+    conditioning_transition_layers: 2
+    transformer_post_ln: false
+    activation_checkpointing: false
+
+  confidence_prediction: false
+  structure_prediction_training: true
+  training_args:
+    recycling_steps: 3
+    sampling_steps: 20
+    diffusion_multiplicity: 12
+    diffusion_samples: 1
+    confidence_loss_weight: 1e-4
+    diffusion_loss_weight: 4.0
+    distogram_loss_weight: 3e-2
+    bfactor_loss_weight: 1e-3
+    res_type_loss_weight: 3e-2
+    adam_beta_1: 0.9
+    adam_beta_2: 0.95
+    adam_eps: 0.00000001
+    lr_scheduler: af3
+    base_lr: 0.0
+    max_lr: 0.0018
+    lr_warmup_no_steps: 1000
+    lr_start_decay_after_n_steps: 50000
+    lr_decay_every_n_steps: 50000
+    lr_decay_factor: 0.95
+    weight_decay: 0.003
+    weight_decay_exclude: true
+
+  validation_args:
+    recycling_steps: 3
+    sampling_steps: 200
+    diffusion_samples: 1
+    symmetry_correction: false
+
+  diffusion_process_args:
+    sigma_min: 0.0004  # min noise level
+    sigma_max: 160.0  # max noise level
+    sigma_data: 16.0  # standard deviation of data distribution
+    rho: 7  # controls the sampling schedule
+    P_mean: -1.2  # mean of log-normal distribution from which noise is drawn for training
+    P_std: 1.5  # standard deviation of log-normal distribution from which noise is drawn for training
+    gamma_0: 0.8
+    gamma_min: 1.0
+    noise_scale: 1.0
+    step_scale: 1.0
+    mse_rotational_alignment: true
+    coordinate_augmentation: true
+    alignment_reverse_diff: true
+    synchronize_sigmas: false
+
+  diffusion_loss_args:
+    add_smooth_lddt_loss: true
+    add_bond_loss: false
+    nucleotide_loss_weight: 5.0
+    ligand_loss_weight: 10.0
+
+  refolding_validator:
+    _target_: boltzgen.model.validation.refolding.RefoldingValidator
+    val_names: ["RCSB"]
+    step_scale: 1.5
+    noise_scale: 0.75
+    atom14: ${data.atom14}
+    atom37: ${data.atom37}
+    backbone_only: ${data.backbone_only}
+    val_monomer: ${data.monomer_split}
+    val_ligand: ${data.ligand_split}
+    analyze_task:
+      _target_: boltzgen.task.analyze.analyze.Analyze
+      name: ${name}
+      debug: ${debug}
+      design_dir: null
+      num_processes: 1
+
+      # Common metrics to compute
+      affinity_metrics: false
+      allatom_fold_metrics: true
+      backbone_fold_metrics: true
+      noncovalents_original: false
+      noncovalents_refolded: false
+      delta_sasa_original: false
+      delta_sasa_refolded: false
+      largest_hydrophobic: false
+      largest_hydrophobic_refolded: false
+      run_clustering: false
+
+      # Liability analysis
+      liability_analysis: false
+      liability_modality: peptide
+      liability_peptide_type: linear
+
+      # Uncommon metrics
+      diversity_original: true
+      diversity_refolded: true
+      diversity_per_target_original: false
+      diversity_per_target_refolded: false
+      novelty_original: false
+      novelty_refolded: false
+      novelty_per_target_original: false
+      novelty_per_target_refolded: false
+
+      wandb: null
+
+      data:
+        _target_: boltzgen.task.predict.data_from_generated.FromGeneratedDataModule
+        cfg:
+          _target_: boltzgen.task.predict.data_from_generated.DataConfig
+          tokenizer:
+            _target_: boltzgen.data.tokenize.tokenizer.Tokenizer
+            atomize_modified_residues: false
+          featurizer:
+            _target_: boltzgen.data.feature.featurizer.Featurizer
+
+          suffix: .cif
+          suffix_metadata: .npz
+          suffix_native: _native.cif
+          samples_per_target: 1
+          num_targets: 100000000
+          moldir: ./training_data/mols
+
+          batch_size: 1
+          num_workers: 4
+          pin_memory: true
+        target_templates: true
+        return_native: true
+
+    folding_checkpoint: ./training_data/boltz2_fold.ckpt
+
+    folding_args:
+      recycling_steps: 3
+      sampling_steps: 200
+      diffusion_samples: 1
+
+    folding_model_args:
+      validators: null
--- a/config/train/inverse_folding.yaml
+++ b/config/train/inverse_folding.yaml
@@ -0,0 +1,376 @@
+_target_: boltzgen.task.train.train.Training
+
+trainer:
+  accelerator: cuda
+  devices: 4
+  precision: 32
+  gradient_clip_val: 10.0
+  accumulate_grad_batches: 1
+  max_epochs: 5
+  num_sanity_val_steps: 1
+  log_every_n_steps: 1
+
+wandb:
+  group: boltzgen
+  project: boltzgen
+  entity: yourwandb
+
+name: if_lr_scheduler
+output: workdir
+strict_loading: false
+resume: null
+debug: false
+save_every_n_train_steps: 2500
+disable_checkpoint: false
+matmul_precision: null
+save_top_k: -1
+
+data:
+  datasets:
+    - _target_: boltzgen.task.train.data.DatasetConfig
+      target_dir: ./training_data/targets
+      msa_dir: ./training_data/msa
+      prob: 1
+      filters:
+        - _target_: boltzgen.data.filter.dynamic.size.SizeFilter
+          min_chains: 1
+          max_chains: 300
+        - _target_: boltzgen.data.filter.dynamic.date.DateFilter
+          date: "2023-06-01"
+          ref: released
+        - _target_: boltzgen.data.filter.dynamic.resolution.ResolutionFilter
+          resolution: 9.0
+        - _target_: boltzgen.data.filter.dynamic.min_protein_residues.MinProteinResiduesFilter
+          min_residues: 5
+        - _target_: boltzgen.data.filter.dynamic.pdb_id_txtfile.FilterIDFromTXT
+          paths:
+            - data/exclude_ids/fibril.txt
+            - data/exclude_ids/transmembrane.txt
+      sampler:
+        _target_: boltzgen.data.sample.cluster.ClusterSampler
+      cropper:
+        _target_: boltzgen.data.crop.multimer.MultimerCropper
+        neighborhood_sizes: [ 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40 ]
+      split: ./data/pdb_sequences/boltz2/validation_ids_boltz2_all.txt
+      symmetry_correction: false
+      val_group: "RCSB"
+
+
+  tokenizer:
+    _target_: boltzgen.data.tokenize.tokenizer.Tokenizer
+    atomize_modified_residues: false
+  featurizer:
+    _target_: boltzgen.data.feature.featurizer.Featurizer
+  moldir: ./training_data/mols
+  max_tokens: 1024 
+  max_atoms: 8192
+  max_seqs: 1
+  pad_to_max_tokens: true
+  pad_to_max_atoms: true
+  pad_to_max_seqs: true
+  samples_per_epoch: 600000
+  batch_size: 2
+  num_workers: 32
+  random_seed: 42
+  pin_memory: true
+  overfit: null
+  return_train_symmetries: false
+  return_val_symmetries: false
+  compute_frames: false
+
+  
+  atoms_per_window_queries: 32
+  min_dist: 2.0
+  max_dist: 22.0
+  num_bins: 64
+  single_sequence_prop_training: 0.05
+  msa_sampling_training: true
+
+  # Design
+  design: true
+  backbone_only: true
+  atom14: false
+  atom37: false
+  inverse_fold: ${model.inverse_fold}
+  use_msa: false
+  selector:
+    _target_: boltzgen.data.select.protein.ProteinSelector
+    design_neighborhood_sizes: [2, 4, 6,8,10,12,14,16,18]
+    substructure_neighborhood_sizes: [2,4,6,8,10,12,24]
+    structure_condition_prob: 0.5
+    distance_noise_std:  1
+    run_selection: true
+    specify_binding_sites: false
+    ss_condition_prob: 0
+    select_all: true
+    complete_structure_mask: true
+
+  # Design datasets
+  monomer_split: data/pdb_sequences/val_monomers_boltzgen_min50_max220.txt
+  monomer_target_dir: ./training_data/targets
+  monomer_target_structure_condition: true
+  monomer_seq_len: 100
+
+  ligand_split: null
+  ligand_target_dir: ./training_data/targets
+  ligand_seq_len: 100
+
+
+model:
+  _target_: boltzgen.model.models.boltz.Boltz
+  atom_s: 128
+  atom_z: 16
+  token_s: 384
+  token_z: ${model.inverse_fold_args.pair_dim}
+  num_bins: 64
+  atom_feature_dim: 388
+  atoms_per_window_queries: 32
+  atoms_per_window_keys: 128
+  use_miniformer: true
+  ema: true
+  ema_decay: 0.999
+  exclude_ions_from_lddt: true
+  num_val_datasets: 1               # New
+  ignore_ckpt_shape_mismatch: false # New
+  aggregate_distogram: true         # New
+  bond_type_feature: true
+  predict_bfactor: true
+  predict_res_type: true
+  checkpoint_diffusion_conditioning: false
+  inverse_fold: true
+  inverse_fold_args:
+    atom_s: ${model.atom_s}
+    atom_z: ${model.atom_z}
+    token_s: ${model.token_s}
+    token_z: ${model.token_z}
+    node_dim: 128
+    pair_dim: 128
+    hidden_dim: 128
+    dropout: 0.1
+    softmax_dropout: 0.2
+    num_encoder_layers: 6
+    num_decoder_layers: 3
+    autoregressive: true
+    transformation_scale_factor: 1.0
+    inverse_fold_noise: 0.2
+    topk: 30
+    num_heads: 4
+    enable_input_embedder: True
+    sampling_temperature: -1.0
+
+  validators:
+    - _target_: boltzgen.model.validation.design.DesignValidator
+      val_names: ["RCSB"]
+      confidence_prediction: ${model.confidence_prediction}
+      atom14: ${data.atom14}
+      atom37: ${data.atom37}
+      backbone_only: ${data.backbone_only}
+      inverse_fold: ${model.inverse_fold}
+
+  masker_args:
+    mask: true
+    mask_backbone: false
+    mask_disto: false
+
+  embedder_args:
+    atom_encoder_depth: 1
+    atom_encoder_heads: 4
+    add_mol_type_feat: true
+    add_method_conditioning: true
+    add_modified_flag: true
+    add_cyclic_flag: true
+    add_design_mask_flag: false
+    add_binding_specification: false
+    add_ss_specification: false
+
+  use_token_distances: false
+  token_distance_args:
+    token_distance_dim: ${model.inverse_fold_args.pair_dim}
+    token_distance_blocks: 0
+    use_token_distance_feats: true
+    distance_gaussian_dim: 32
+    disable_token_distance_transition: true
+    use_relative_position_encoding: true 
+
+  # MSA module is not used in inverse folding
+  msa_args:
+    msa_s: 2
+    msa_blocks: 0
+    msa_dropout: 0
+    z_dropout: 0
+    miniformer_blocks: true
+    pairwise_head_width: 2
+    pairwise_num_heads: 1
+    use_paired_feature: true
+    activation_checkpointing: false
+
+  pairformer_args:
+    num_blocks: 2
+    num_heads: 16
+    dropout: 0.25
+    post_layer_norm: false
+    activation_checkpointing: false
+
+  score_model_args:
+    sigma_data: 16
+    dim_fourier: 256
+    atom_encoder_depth: 3
+    atom_encoder_heads: 4
+
+    # token level args
+    token_layers: 1
+    token_transformer_depth: 3
+    token_transformer_heads: 16
+    diffusion_pairformer_args:
+      num_blocks: 0
+      num_heads: 2
+      dropout: 0
+      use_s_to_z: false
+
+    atom_decoder_depth: 3
+    atom_decoder_heads: 4
+    conditioning_transition_layers: 2
+    transformer_post_ln: false
+    activation_checkpointing: false
+
+  confidence_prediction: false
+  affinity_prediction: false
+  structure_prediction_training: true
+  affinity_model_args:
+    num_dist_bins: 64
+    max_dist: 22
+    no_trunk_feats: false
+    add_s_to_z_prod: false
+    add_s_input_to_s: false
+
+    confidence_args:
+      num_plddt_bins: 50
+      num_pde_bins: 64
+      num_pae_bins: 64
+
+  training_args:
+    recycling_steps: 0
+    sampling_steps: 20
+    diffusion_multiplicity: 2
+    diffusion_samples: 1
+    affinity_loss_weight: 3e-3
+    confidence_loss_weight: 1e-4
+    diffusion_loss_weight: 4.0
+    distogram_loss_weight: 3e-2
+    bfactor_loss_weight: 1e-3
+    res_type_loss_weight: 1
+    adam_beta_1: 0.9
+    adam_beta_2: 0.95 
+    adam_eps: 0.00000001
+    lr_scheduler: onecycle
+    base_lr: 0.0
+    max_lr: 0.001
+    weight_decay: 0.003
+    weight_decay_exclude: true 
+
+  validation_args:
+    recycling_steps: 0
+    sampling_steps: 200
+    diffusion_samples: 1
+    symmetry_correction: false
+
+  diffusion_process_args:
+    sigma_min: 0.0004  # min noise level
+    sigma_max: 160.0  # max noise level
+    sigma_data: 16.0  # standard deviation of data distribution
+    rho: 7  # controls the sampling schedule
+    P_mean: -1.2  # mean of log-normal distribution from which noise is drawn for training
+    P_std: 1.5  # standard deviation of log-normal distribution from which noise is drawn for training
+    gamma_0: 0.8
+    gamma_min: 1.0
+    noise_scale: 1.0
+    step_scale: 1.0
+    mse_rotational_alignment: true
+    coordinate_augmentation: true
+    alignment_reverse_diff: true
+    synchronize_sigmas: false
+
+  diffusion_loss_args:
+    add_smooth_lddt_loss: true
+    add_bond_loss: false
+    nucleotide_loss_weight: 5.0
+    ligand_loss_weight: 10.0
+
+  refolding_validator:
+    _target_: boltzgen.model.validation.refolding.RefoldingValidator
+    val_names: ["RCSB"]
+    step_scale: 1.5
+    noise_scale: 0.75
+    atom14: ${data.atom14}
+    atom37: ${data.atom37}
+    val_monomer: ${data.monomer_split}
+    val_ligand: ${data.ligand_split}
+    inverse_fold: ${model.inverse_fold}
+    analyze_task:
+      _target_: boltzgen.task.analyze.analyze.Analyze
+      name: ${name}
+      debug: ${debug}
+      design_dir: null
+      num_processes: 1
+
+      # Common metrics to compute
+      affinity_metrics: false
+      allatom_fold_metrics: true
+      backbone_fold_metrics: true
+      noncovalents_original: false
+      noncovalents_refolded: false
+      delta_sasa_original: false
+      delta_sasa_refolded: false
+      largest_hydrophobic: false
+      largest_hydrophobic_refolded: false
+      run_clustering: false
+
+      # Liability analysis
+      liability_analysis: false
+      liability_modality: peptide
+      liability_peptide_type: linear
+
+      # Uncommon metrics
+      diversity_original: true
+      diversity_refolded: true
+      diversity_per_target_original: false
+      diversity_per_target_refolded: false
+      novelty_original: false
+      novelty_refolded: false
+      novelty_per_target_original: false
+      novelty_per_target_refolded: false
+
+      wandb: null
+
+      data:
+        _target_: boltzgen.task.predict.data_from_generated.FromGeneratedDataModule
+        cfg:
+          _target_: boltzgen.task.predict.data_from_generated.DataConfig
+          tokenizer:
+            _target_: boltzgen.data.tokenize.tokenizer.Tokenizer
+            atomize_modified_residues: false
+          featurizer:
+            _target_: boltzgen.data.feature.featurizer.Featurizer
+
+          suffix: .cif
+          suffix_metadata: .npz
+          suffix_native: _native.cif
+          samples_per_target: 1
+          num_targets: 100000000
+          moldir: ./training_data/mols
+
+          batch_size: 1
+          num_workers: 1
+          pin_memory: false
+        target_templates: true
+        return_native: true
+
+    folding_checkpoint: ./training_data/boltz2_fold.ckpt
+
+    folding_args:
+      recycling_steps: 3
+      sampling_steps: 200
+      diffusion_samples: 1
+
+    folding_model_args:
+      validators: null
--- a/example/7rpz.cif
+++ b/example/7rpz.cif
--- a/example/8r3a.cif
+++ b/example/8r3a.cif
--- a/example/README.md
+++ b/example/README.md
@@ -0,0 +1,429 @@
+# How to make a design specification .yaml
+
+
+**IMPORTANT:** ⚠️ All residue indices are specified **starting at 1** and we use the canonical mmcif residue index `label_seq_id`, and **not** the `auth_seq_id` author residue index! 
+You can check the indexing in your mmcif file by opening it in https://molstar.org/viewer/, hovering over a residue, and checking the index on the bottom right. You will see something like this where **41 is the index we use, the auth id 22 is incorrect**:
+
+![](../assets/label_seq_id.png)
+
+After you have constructed your `.yaml` file, we recommend that you run the `check` command on it:
+1. Run `boltzgen check example/vanilla_peptide_with_target_binding_site/beetletert.yaml`.  
+2. Visualize the resulting mmcif file in a protein structure viewer (e.g. PyMOL, Chimera, or online: https://molstar.org/viewer/).
+3. Your viewer should show the binding site in a different color than the rest of the target. 
+
+
+# Example-based explanation:
+We provide many example `.yaml` files in the `example/` directory, including:
+
+- [design_spec_showcasing_all_functionalities.yaml](design_spec_showcasing_all_functionalities.yaml)
+- [vanilla_peptide_with_target_binding_site/beetletert.yaml](vanilla_peptide_with_target_binding_site/beetletert.yaml)
+- [peptide_against_specific_site_on_ragc/rragc.yaml](peptide_against_specific_site_on_ragc/rragc.yaml)
+- [nanobody_against_penguinpox/penguinpox.yaml](nanobody_against_penguinpox/penguinpox.yaml)
+- [denovo_zinc_finger_against_dna/zinc_finger.yaml](denovo_zinc_finger_against_dna/zinc_finger.yaml)
+- [protein_binding_small_molecule/chorismite.yaml](protein_binding_small_molecule/chorismite.yaml)
+
+Small example of a protein design against a target protein without a binding site specified:
+```yaml
+entities:
+  # Designed protein with between 80 and 140 residues 
+  # (The length is randomly sampled)
+  - protein: 
+      id: B
+      sequence: 80..140
+
+  # The target is extracted from a .cif file
+  - file:
+      path: hard_targets/6m1u.cif
+
+      # Which chain in the .cif file to use as target (uses all chains if unspecified)
+      include: 
+        - chain:
+            id: A
+```
+
+**IMPORTANT:** ⚠️ File references inside a yaml file (e.g. to cif files) are interpreted relative to the directory of the yaml file.
+
+
+Example highlighting many (not all) functionalities:
+```yaml
+entities:
+  # Specification of the target which is extracted from a .cif file
+  - file:
+      path: 8r3a.cif
+      
+      # Which chain and residues in the .cif file to use as target (uses all chains if unspecified)
+      include: 
+        - chain:
+            id: A
+            res_index: 2..50,55.. # residues between 2 and 50 and anything larger than 55
+        - chain:
+            id: B
+
+      # Which regions of the target the design should or should NOT
+      # bind to (this can be left unspecified, then we just bind anywhere)
+      binding_types:
+        - chain:
+            id: A
+            binding: 5..7,13
+        - chain:
+            id: B
+            not_binding: "all" 
+      
+      # Which regions of the target should have their structure specified.
+      # By default, everything is visibility 1 which means that the structure is specified.
+      # If the visibility is 0, then the structure is not specified.
+      structure_groups:
+        - group:
+            visibility: 1
+            id: A
+            res_index: 10..13
+        - group:
+            # The relative positioning of things in structure group 2
+            # is not specified w.r.t. things in structure group 1
+            visibility: 2 
+            id: B
+        # Overwrite the previous visibility setting and set it to 0 for res_index 13
+        - group:
+            visibility: 0
+            id: A
+            res_index: 13 
+
+      # Optionally you can say that some residues in a loaded .cif file should also be redesigned.
+      design:
+        - chain:
+            id: A
+            res_index: 14..19
+
+      # For designed regions you can say what secondary structure they should have
+      secondary_structure:
+        - chain:
+            id: A
+            loop: 14
+            helix: 15..17
+            sheet: 19
+
+  # Specify a NON-designed protein chain
+  - protein: 
+      id: X
+      sequence: AAVTTTTPPP
+
+  # Specify a designed protein chain 
+  # Numbers specify what is being designed
+  - protein: 
+      id: G
+      # random number between 15 and 20 of designed residues (inclusive)
+      sequence: 15..20AAAAAAVTTTT18PPP 
+
+  # A designed helicon 
+  # (see the constraints below that connect the peptide with the WHL ligand)
+  - protein: 
+      id: R
+      # Random number of design residues between 3 and 5,
+      # then a Cysteine, then 6 design residues, then ...
+      sequence: 3..5C6C3 
+  - ligand:
+      id: Q
+      ccd: WHL
+  
+  # A designed peptide with 17 residues
+  - protein:
+      id: H
+      sequence: 17
+
+  # specification for a designed peptide with two Cys and a disulfide bond (see constraints)
+  - protein:
+      id: S
+      sequence: 10..14C6C3
+
+constraints:
+    # specify connections as if the minimum possible number of residues was sampled
+  - bond:
+      atom1: [R, 4, SG] # connection for a helicon between small molecule and designed peptide
+      atom2: [Q, 1, CK]
+  - bond:
+      atom1: [R, 11, SG] # connection for a helicon between small molecule and designed peptide
+      atom2: [Q, 1, CH]
+  - bond:
+      atom1: [S, 11, SG] # connection for a disulfide bond between Cys and Cys in designed peptide
+      atom2: [S, 18, SG]
+
+```
+
+
+# Detailed Explanation
+
+```yaml
+entities:
+  # Define proteins, ligands, and structure files
+  - protein: ...
+  - ligand: ...
+  - file: ...
+
+constraints:
+  # Define bonds and total length constraints
+  - bond: ...
+  - total_len: ...
+```
+
+### Entities Section
+
+The `entities` section defines all the components of your design:
+
+#### Protein Sequences
+
+Define custom protein sequences with design flexibility:
+
+```yaml
+entities:
+  - protein:
+      id: G                   # Unique identifier 
+      sequence: 15..20AAAAAAVTTTT18PPP  # Mix of fixed residues and design regions
+      binding_types: uuuuBBBuNNNuBuu    # Binding specifications (optional)
+      secondary_structure: HHHLLLEEE     # Secondary structure constraints for designed regions (optional)
+```
+
+**Sequence notation:**
+- `15..20` - Design between 15-20 residues (inclusive)
+- `AAAA` - Fixed amino acid sequence
+- `18` - Design exactly 18 residues
+- `3..5C6C3` - Variable design residues, then fixed Cys, then more design
+
+**Binding types:**
+- `B` - Binding residue
+- `N` - Non-binding residue  
+- `u` - Unspecified (default)
+- Can specify as string: `uuuuBBBuNNNuBuu`
+- Or as ranges:
+  ```yaml
+  binding_types:
+    binding: 5..7,13      # Residues 5-7 and 13 are binding
+    not_binding: 9..11    # Residues 9-11 are non-binding
+  ```
+
+#### Ligands
+
+Define small molecule ligands using CCD codes or SMILES:
+
+```yaml
+# Using Chemical Component Dictionary (CCD) code
+entities:
+  - ligand:
+      id: [E, F] # specify list of IDs to copy the entity
+      ccd: WHL
+      binding_types: B
+
+# Using SMILES string
+entities:
+  - ligand:
+      id: Q
+      smiles: 'N[C@@H](Cc1ccc(O)cc1)C(=O)O'
+      binding_types: B
+```
+
+#### Structure Files
+
+Include existing protein structures from PDB/mmCIF files:
+
+```yaml
+entities:
+  - file:
+      path: 7rpz.cif
+      
+      # Include specific chains
+      include:
+        - chain:
+            id: A
+        - chain:
+            id: B
+      
+      # Include by proximity
+      include_proximity:
+        - chain:
+            id: A
+            res_index: 10..16
+            radius: 35
+      
+      # Exclude specific regions
+      exclude:
+        - chain:
+            id: A
+            res_index: ..5    # Exclude residues 1-5
+      
+      # Reset residue numbering
+      reset_res_index:
+        - chain:
+            id: A
+```
+
+
+# Advanced Options 
+
+**Design regions:** Specify which residues to redesign
+
+```yaml
+entities:
+  ...
+  - file:
+    ...
+    design:
+      - chain:
+          id: A
+          res_index: ..4,20..27  # Redesign residues 1-4 and 20-27
+```
+
+**Secondary structure constraints:**
+```yaml
+entities:
+  ...
+  - protein:
+    ...
+    secondary_structure:
+      - chain:
+          id: A
+          loop: 1        # Residue 1 should be loop
+          helix: 2..3    # Residues 2-3 should be helix
+          sheet: 4       # Residue 4 should be sheet
+```
+
+**Structure visibility groups:**
+```yaml
+entities:
+  ...
+  - file:
+    ...
+    structure_groups:
+      - group:
+          visibility: 1    # 0 = structure not specified, 1 = structure specified (default); groups with different values (e.g. 1 and 2) are not positioned relative to each other
+          id: A
+          res_index: 10..16
+```
+
+**Design insertions:**
+```yaml
+entities:
+  ...
+  - file:
+    ...
+    design_insertions:
+      - insertion:
+          id: A
+          res_index: 20              # Insert after residue 20
+          num_residues: 2..9         # Insert 2-9 residues
+          secondary_structure: HELIX  # UNSPECIFIED, LOOP, HELIX, or SHEET
+```
+
+**Binding type specifications:**
+```yaml
+entities:
+  ...
+  - protein:
+    ...
+    binding_types:
+      - chain:
+          id: A
+          binding: 5..7,13
+      - chain:
+          id: B
+          not_binding: "all"
+```
+
+### Constraints Section
+
+Define structural constraints between components:
+
+#### Bond Constraints
+
+Create covalent bonds between specific atoms:
+
+```yaml
+constraints:
+  - bond:
+      atom1: [R, 4, SG]   # [chain_id, residue_number, atom_name]
+      atom2: [Q, 1, CK]   # Connect sulfur of Cys-4 in chain R to atom CK in ligand Q
+```
+
+
+Below is a comprehensive list of all the keys that can appear in a design specification YAML file, with an explanation of each.
+
+***
+
+### Top-Level Keys
+
+* `entities`: The main list containing all molecular components of the system, such as proteins, ligands, or imported files.
+* `constraints`: A list of rules or conditions to apply to the system, like specific bonds between entities or total length restrictions.
+
+---
+
+### Entity Types (Keys within the `entities` list)
+
+* `protein`: Defines a protein entity.
+* `ligand`: Defines a small molecule ligand.
+* `file`: Specifies an external structure file (e.g., a `.cif` file) to import parts of the system from.
+
+---
+
+### Keys for `protein` Entities
+
+* `id`: A unique identifier for the protein chain (e.g., 'A', 'G').
+* `sequence`: Defines the amino acid sequence of the protein. This can include numbers to specify lengths of residues to be designed.
+* `secondary_structure`: Specifies the secondary structure of the protein.
+* `binding_types`: Defines which residues are involved in binding. Can be a string or a more detailed dictionary.
+* `cyclic`: A boolean (`true` or `false`) indicating if the protein is cyclic.
+
+---
+
+### Keys for `ligand` Entities
+
+* `id`: A unique identifier for the ligand. Can be a single ID or a list of IDs.
+* `ccd`: The Chemical Component Dictionary ID for the ligand (e.g., 'SAH').
+* `smiles`: The SMILES string representing the ligand's chemical structure.
+* `binding_types`: Specifies binding information, often a simple character like 'B' for binding.
+
+---
+
+### Keys for `file` Entities
+
+* `path`: The file path to the structure file to be included (e.g., 'example/7rpz.cif').
+* `msa`: A global flag for Multiple Sequence Alignment for the chains in the file. Can be overwritten by individual chain settings.
+* `include`: Specifies which parts of the file to include. Can be the string `"all"` or a list of chains.
+* `exclude`: Specifies which parts of an included file to exclude.
+* `fuse`: Specifies a chain to which subsequent protein entities will be fused.
+* `include_proximity`: Includes residues from the file that are within a certain distance of a specified chain.
+* `binding_types`: Defines binding interactions for specific chains within the file.
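+* `structure_groups`: Defines groups of residues and whether their structure is specified (visible) to the model. The relative positioning of residues in different groups is not specified.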
+* `design`: Specifies which residues in the included chains are designable.
+* `secondary_structure`: Defines the secondary structure for specific residues within included chains.
+* `design_insertions`: Specifies where to insert new designable residues.
+
+---
+
+### Keys for `constraints`
+
+* `bond`: Defines a covalent bond to be formed between two specified atoms in the system.
+    * `atom1`: The first atom in the bond.
+    * `atom2`: The second atom in the bond.
+* `total_len`: Constrains the total length of the polymeric system.
+    * `min`: The minimum allowed total length.
+    * `max`: The maximum allowed total length.
+
+---
+
+### Nested Keys (found within multiple entity types)
+
+* `chain`: A sub-dictionary used in `include`, `exclude`, `binding_types`, `design`, and `secondary_structure` to specify a particular protein chain.
+    * `id`: The identifier of the chain.
+    * `msa`: A specific MSA setting for this chain, overriding the global `msa` flag.
+    * `res_index`: Specifies a range or list of residue indices.
+    * `radius`: Used in `include_proximity` to define a distance in Angstroms.
+    * `binding`: Specifies residues that are part of a binding site.
+    * `not_binding`: Specifies residues that are not part of a binding site.
+    * `loop`, `helix`, `sheet`: Used in `secondary_structure` to define the structure of specific residues.
+* `group`: Used in `structure_groups` to define a residue group.
+    * `id`: The identifier of the chain or `"all"`.
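+    * `visibility`: A numerical value that controls whether the structure of the group is specified: `0` means the structure is not specified, `1` (the default) means it is specified, and another value such as `2` places the residues in a separate structure group whose positioning relative to group `1` is not specified.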
+    * `res_index`: The residues included in this group.
+* `insertion`: Used in `design_insertions`.
+    * `id`: The chain ID where the insertion occurs.
+    * `res_index`: The residue index after which the insertion is made.
+    * `num_residues`: The number or range of residues to be inserted.
+    * `secondary_structure`: The desired secondary structure for the inserted residues (e.g., `HELIX`).
--- a/example/binding_disordered_peptides/tpp4.yaml
+++ b/example/binding_disordered_peptides/tpp4.yaml
@@ -0,0 +1,7 @@
+entities:
+  - protein:
+      id: A
+      sequence: 120..140
+  - protein:
+      id: B
+      sequence: GGGILPWKWPWWPWRRGGG
--- a/example/binding_disordered_regions_of_proteins/hoxd13.cif
+++ b/example/binding_disordered_regions_of_proteins/hoxd13.cif
--- a/example/binding_disordered_regions_of_proteins/hoxd13.yaml
+++ b/example/binding_disordered_regions_of_proteins/hoxd13.yaml
@@ -0,0 +1,40 @@
+entities:
+  - protein:
+      id: B
+      sequence: 40..80
+  - file:
+      path: hoxd13.cif
+      include: 
+        - chain:
+            id: A 
+            res_index: ..71
+      structure_groups:
+        - group:
+            visibility: 0
+            id: "all"
+      binding_types:
+        - chain:
+            id: A
+            binding: 57..71
+
+  
+  - protein:
+      id: C
+      fuse: A
+      sequence: AAAAAAAA
+      binding_types: BBBBBBBB
+  - file:
+      path: hoxd13.cif
+      fuse: A
+      include:
+        - chain:
+            id: A
+            res_index: 72..
+      structure_groups:
+        - group:
+            visibility: 0
+            id: "all"
+        - group:
+            visibility: 1
+            id: A
+            res_index: 281..
--- a/example/binding_disordered_regions_of_proteins/npm1.cif
+++ b/example/binding_disordered_regions_of_proteins/npm1.cif
--- a/example/binding_disordered_regions_of_proteins/npm1.yaml
+++ b/example/binding_disordered_regions_of_proteins/npm1.yaml
@@ -0,0 +1,20 @@
+entities:
+  - protein:
+      id: G
+      sequence: 40..80
+  - file:
+      path: npm1.cif
+      include:
+        - chain:
+            id: A
+      binding_types:
+        - chain:
+            id: A
+            binding: 123..240
+            not_binding: 1..122
+      structure_groups:
+        - group:
+            visibility: 1
+            id: A
+            res_index: 12..118, 243..291
+        
--- a/example/binding_disordered_regions_of_proteins/nup98.cif
+++ b/example/binding_disordered_regions_of_proteins/nup98.cif
--- a/example/binding_disordered_regions_of_proteins/nup98.yaml
+++ b/example/binding_disordered_regions_of_proteins/nup98.yaml
@@ -0,0 +1,14 @@
+entities:
+  - protein:
+      id: G
+      sequence: 40..80
+  - file:
+      path: nup98.cif
+      include:
+        - chain:
+            id: A
+            res_index: 1..400
+      structure_groups:
+        - group:
+            visibility: 0
+            id: "all"
--- a/example/cyclic_against_hiv_antibody_site/9d3d.cif
+++ b/example/cyclic_against_hiv_antibody_site/9d3d.cif
--- a/example/cyclic_against_hiv_antibody_site/9d3d.yaml
+++ b/example/cyclic_against_hiv_antibody_site/9d3d.yaml
@@ -0,0 +1,33 @@
+entities:
+  - file:
+      path: 9d3d.cif
+       
+      include:
+        - chain:
+            id: A
+        - chain:
+            id: B
+        - chain:
+            id: C
+
+      include_proximity:
+        - chain: 
+            id: G
+            res_index: 106..118
+            radius: 30
+      
+      binding_types:
+        - chain:
+            id: A
+            binding: 91,128,131
+        - chain:
+            id: B
+            binding: 91,128,131
+        - chain:
+            id: C
+            binding: 91,128,131
+
+  - protein:
+      id: E
+      sequence: 8..18
+      cyclic: True
--- a/example/cyclotide/3ivq.cif
+++ b/example/cyclotide/3ivq.cif
--- a/example/cyclotide/3ivq.yaml
+++ b/example/cyclotide/3ivq.yaml
@@ -0,0 +1,25 @@
+entities:
+  - protein: 
+      id: B
+      sequence: 3C8C6C5C3C1C2
+      cyclic: true
+
+  - file:
+      path: 3ivq.cif
+       
+      include:
+        - chain:
+            id: A
+
+      structure_groups: "all"
+
+constraints:
+  - bond:
+      atom1: [B, 4, SG] 
+      atom2: [B, 26, SG]
+  - bond:
+      atom1: [B, 13, SG] 
+      atom2: [B, 30, SG]
+  - bond:
+      atom1: [B, 20, SG] 
+      atom2: [B, 32, SG]
--- a/example/cyclotide/5wrd.cif
+++ b/example/cyclotide/5wrd.cif
--- a/example/cyclotide/5wrd.yaml
+++ b/example/cyclotide/5wrd.yaml
@@ -0,0 +1,25 @@
+entities:
+  - protein: 
+      id: B
+      sequence: 3C8C6C5C3C1C2
+      cyclic: true
+
+  - file:
+      path: 5wrd.cif
+       
+      include:
+        - chain:
+            id: A
+
+      structure_groups: "all"
+
+constraints:
+  - bond:
+      atom1: [B, 4, SG] 
+      atom2: [B, 26, SG]
+  - bond:
+      atom1: [B, 13, SG] 
+      atom2: [B, 30, SG]
+  - bond:
+      atom1: [B, 20, SG] 
+      atom2: [B, 32, SG]
--- a/example/cylcic_against_kras_with_specific_site/8jjs.cif
+++ b/example/cylcic_against_kras_with_specific_site/8jjs.cif
--- a/example/cylcic_against_kras_with_specific_site/cyclicdesign.yaml
+++ b/example/cylcic_against_kras_with_specific_site/cyclicdesign.yaml
@@ -0,0 +1,19 @@
+entities:
+  - protein: 
+      id: B
+      sequence: 8..16
+      cyclic: true
+  - file:
+      path: 8jjs.cif
+       
+      include: 
+        - chain:
+            id: A
+        - chain:
+            id: C
+
+      binding_types:
+        - chain:
+            id: A
+            binding: 12,14,61,63,73,76,77,83,101,104,108
+          
--- a/example/denovo_zinc_finger_against_dna/vanilla_protein.yaml
+++ b/example/denovo_zinc_finger_against_dna/vanilla_protein.yaml
@@ -0,0 +1,12 @@
+entities:
+
+  - protein: 
+      id: G
+      sequence: 40..120
+  - file:
+      path: zf.cif
+      include: 
+        - chain:
+            id: C1
+        - chain:
+            id: B1
--- a/example/denovo_zinc_finger_against_dna/zf.cif
+++ b/example/denovo_zinc_finger_against_dna/zf.cif
--- a/example/denovo_zinc_finger_against_dna/zinc_finger.yaml
+++ b/example/denovo_zinc_finger_against_dna/zinc_finger.yaml
@@ -0,0 +1,30 @@
+entities:
+  - file:
+      path: zf.cif
+      include: "all"
+      exclude:
+        - chain:
+            id: A1
+            res_index: ..10,63..69,185..
+      design_insertions:
+        - insertion:
+            id: A1
+            res_index: 63 
+            num_residues: 3..8
+          
+      structure_groups:
+        - group:
+            visibility: 0
+            id: "all"
+
+      design:
+        - chain:
+            id: A1
+            res_index: 11..184
+      not_design:
+        - chain:
+            id: A1
+            res_index: 11..20,29,33,39..48,57,61,72..81,90,94,100..109,118,122,129..138,147,151,157..166,175,179
+      reset_res_index:
+        - chain:
+            id: A1
--- a/example/design_spec_showcasing_all_functionalities.yaml
+++ b/example/design_spec_showcasing_all_functionalities.yaml
@@ -0,0 +1,184 @@
+entities:
+  - protein: 
+      id: G
+      sequence: 15..20AAAAAAVTTTT18PPP # range between 15 and 20 inclusive on both sides
+  - protein: 
+      id: R
+      sequence: 3..5C6C3 # Random number of design residues between 3 and 5, then a Cysteine, then 6 design residues, then ...
+  - ligand:
+      id: Q
+      ccd: WHL
+  - protein:
+      id: H
+      sequence: 17
+      secondary_structure: # No secondary structure specified, so the default is used
+  - file:
+      path: 7rpz.cif
+       
+      include: 
+        - chain:
+            id: A
+        - chain:
+            id: B
+
+      include_proximity:
+        - chain: 
+            id: A
+            res_index: 10..16
+            radius: 35
+
+      binding_types:
+        - chain:
+            id: A
+            binding: 5..7,13
+        - chain:
+            id: B
+            not_binding: "all" 
+          
+
+
+      structure_groups:
+        - group:
+            visibility: 1
+            id: A
+            res_index: 10..16
+        - group:
+            visibility: 2
+            id: B
+        - group:
+            visibility: 0
+            id: A
+            res_index: 13
+
+      design:
+        - chain:
+            id: A
+            res_index: ..4,20..27
+
+      secondary_structure:
+        - chain:
+            id: A
+            loop: 1
+            helix: 2..3
+            sheet: 4
+        
+      design_insertions:
+        - insertion:
+            id: A
+            res_index: 20 # The 20th residue will be a designed one (starting to count from 1)
+            num_residues: 2..9
+            secondary_structure: HELIX # One of UNSPECIFIED (default), LOOP, HELIX, SHEET. 
+
+  - protein: 
+      id: A
+      sequence: AAAAAAAAAAAAAAAAAAAAAAAA
+      binding_types: uuuuBBBuNNNuBuu # the missing specifications will be 'u' by default
+  - file:
+      path: 7rpz.cif
+      fuse: A
+      include: 
+        - chain:
+            id: A
+            res_index: ..5
+            
+  - protein:
+      id: B
+      sequence: AAAAAAAAAAAAAAAAAAAAAAAA
+      binding_types:
+        binding: 5..7,13
+        not_binding: 9..11
+  - ligand:
+      id: [C, D]
+      ccd: SAH
+  - ligand:
+      id: [E, F]
+      smiles: 'N[C@@H](Cc1ccc(O)cc1)C(=O)O'
+      binding_types: B
+
+
+  - file:
+      path: 7rpz.cif
+      include: "all"
+      exclude: 
+        - chain:
+            id: A
+            res_index: ..5
+
+      structure_groups:
+        - group:
+            visibility: 1
+            id: "all"
+
+        - group:
+            visibility: 0
+            id: A
+            res_index: 10..16
+  - file:
+      path: 8r3a.cif
+       
+      include: 
+        - chain:
+            id: A
+        - chain:
+            id: B
+
+      binding_types:
+        - chain:
+            id: A
+            binding: 5..7,13
+        - chain:
+            id: B
+            not_binding: "all" 
+          
+
+
+      structure_groups:
+        - group:
+            visibility: 1
+            id: A
+            res_index: 10..13
+        - group:
+            visibility: 2
+            id: B
+        - group:
+            visibility: 0
+            id: A
+            res_index: 13
+
+      design:
+        - chain:
+            id: A
+            res_index: 14..19
+
+      secondary_structure:
+        - chain:
+            id: A
+            loop: 14
+            helix: 15..17
+            sheet: 19
+  - protein:
+      id: S
+      sequence: 10C6C3
+  - protein:
+      id: T
+      sequence: C10C6C3C
+      cyclic: true
+
+constraints:
+    # specify connections as if the minimum possible number of residues was sampled
+  - bond:
+      atom1: [R, 4, SG] # connection for a helicon
+      atom2: [Q, 1, CK]
+  - bond:
+      atom1: [R, 11, SG]
+      atom2: [Q, 1, CH]
+  - bond:
+      atom1: [S, 11, SG] # connection for a disulfide bond
+      atom2: [S, 18, SG]
+  - bond:
+      atom1: [T, 12, SG]
+      atom2: [T, 19, SG]
+
+  - total_len:
+      min: 10
+      max: 20
--- a/example/disulfide_peptide_with_betahairpin_conditioning/7nre.cif
+++ b/example/disulfide_peptide_with_betahairpin_conditioning/7nre.cif
--- a/example/disulfide_peptide_with_betahairpin_conditioning/cropped_target.yaml
+++ b/example/disulfide_peptide_with_betahairpin_conditioning/cropped_target.yaml
@@ -0,0 +1,26 @@
+entities:
+  - protein:
+      id: B
+      sequence: 1C11..16C1
+
+      secondary_structure:
+          sheet: 1,3..11
+
+  - file:
+      path: 7nre.cif
+      include:
+        - chain:
+            id: A
+            res_index: 24..
+      
+      binding_types:
+        - chain:
+            id: A
+            binding: 26..31,381,408
+  
+
+constraints:
+  - bond:
+      atom1: [B, 2, SG] 
+      atom2: [B, 14, SG]
+
--- a/example/disulfide_peptide_with_betahairpin_conditioning/proximity_cropped_target.yaml
+++ b/example/disulfide_peptide_with_betahairpin_conditioning/proximity_cropped_target.yaml
@@ -0,0 +1,34 @@
+entities:
+  - protein:
+      id: B
+      sequence: 1C11..16C1
+
+      secondary_structure:
+          sheet: 1,3..11
+
+  - file:
+      path: 7nre.cif
+      include:
+        - chain:
+            id: A
+            res_index: 24..
+      
+      binding_types:
+        - chain:
+            id: A
+            binding: 26..31,381,408
+
+      include_proximity:
+        - chain: 
+            id: A
+            res_index: 26..31,381,408
+            radius: 28
+
+      
+  
+
+constraints:
+  - bond:
+      atom1: [B, 2, SG] 
+      atom2: [B, 14, SG]
+
--- a/example/double_disulfide_peptide_against_specific_site/8wtw.cif
+++ b/example/double_disulfide_peptide_against_specific_site/8wtw.cif
--- a/example/double_disulfide_peptide_against_specific_site/norepinephrine.yaml
+++ b/example/double_disulfide_peptide_against_specific_site/norepinephrine.yaml
@@ -0,0 +1,24 @@
+entities:
+  - protein:
+      id: B
+      sequence: 1..3CC4C1..3C1..3
+
+  - file:
+      path: 8wtw.cif
+       
+      include: 
+        - chain:
+            id: A
+
+      binding_types:
+        - chain:
+            id: A
+            binding: 24
+          
+constraints:
+  - bond:
+      atom1: [B, 2, SG] 
+      atom2: [B, 10, SG]
+  - bond:
+      atom1: [B, 3, SG] 
+      atom2: [B, 8, SG]
--- a/example/hard_targets/1g13.cif
+++ b/example/hard_targets/1g13.cif
--- a/example/hard_targets/1g13nano.yaml
+++ b/example/hard_targets/1g13nano.yaml
@@ -0,0 +1,15 @@
+entities:
+  - file:
+      path: 1g13.cif
+       
+      include: 
+          - chain:
+              id: A
+  - file:
+      path: 
+        - ../nanobody_scaffolds/7eow.yaml
+        - ../nanobody_scaffolds/7xl0.yaml
+        - ../nanobody_scaffolds/8coh.yaml
+        - ../nanobody_scaffolds/8z8v.yaml
+
+
--- a/example/hard_targets/1g13prot.yaml
+++ b/example/hard_targets/1g13prot.yaml
@@ -0,0 +1,11 @@
+entities:
+  - protein: 
+      id: C
+      sequence: 80..140
+  - file:
+      path: 1g13.cif
+       
+      include: 
+        - chain:
+            id: A
+
--- a/example/hard_targets/1jqd.cif
+++ b/example/hard_targets/1jqd.cif
--- a/example/hard_targets/1jqdnano.yaml
+++ b/example/hard_targets/1jqdnano.yaml
@@ -0,0 +1,15 @@
+entities:
+  - file:
+      path: 1jqd.cif
+       
+      include: 
+        - chain:
+            id: A
+  - file:
+      path: 
+        - ../nanobody_scaffolds/7eow.yaml
+        - ../nanobody_scaffolds/7xl0.yaml
+        - ../nanobody_scaffolds/8coh.yaml
+        - ../nanobody_scaffolds/8z8v.yaml
+
+
--- a/example/hard_targets/1jqdprot.yaml
+++ b/example/hard_targets/1jqdprot.yaml
@@ -0,0 +1,11 @@
+entities:
+  - protein: 
+      id: C
+      sequence: 80..140
+  - file:
+      path: 1jqd.cif
+       
+      include: 
+        - chain:
+            id: A
+
--- a/example/hard_targets/1nb0.cif
+++ b/example/hard_targets/1nb0.cif
--- a/example/hard_targets/1nb0nano.yaml
+++ b/example/hard_targets/1nb0nano.yaml
@@ -0,0 +1,15 @@
+entities:
+  - file:
+      path: 1nb0.cif
+       
+      include: 
+        - chain:
+            id: A
+  - file:
+      path: 
+        - ../nanobody_scaffolds/7eow.yaml
+        - ../nanobody_scaffolds/7xl0.yaml
+        - ../nanobody_scaffolds/8coh.yaml
+        - ../nanobody_scaffolds/8z8v.yaml
+
+
--- a/example/hard_targets/1nb0prot.yaml
+++ b/example/hard_targets/1nb0prot.yaml
@@ -0,0 +1,11 @@
+entities:
+  - protein: 
+      id: C
+      sequence: 80..140
+  - file:
+      path: 1nb0.cif
+       
+      include: 
+        - chain:
+            id: A
+
--- a/example/hard_targets/2a1x.cif
+++ b/example/hard_targets/2a1x.cif
--- a/example/hard_targets/2a1xnano.yaml
+++ b/example/hard_targets/2a1xnano.yaml
@@ -0,0 +1,15 @@
+entities:
+  - file:
+      path: 2a1x.cif
+       
+      include: 
+        - chain:
+            id: A
+  - file:
+      path: 
+        - ../nanobody_scaffolds/7eow.yaml
+        - ../nanobody_scaffolds/7xl0.yaml
+        - ../nanobody_scaffolds/8coh.yaml
+        - ../nanobody_scaffolds/8z8v.yaml
+
+
--- a/example/hard_targets/2a1xprot.yaml
+++ b/example/hard_targets/2a1xprot.yaml
@@ -0,0 +1,11 @@
+entities:
+  - protein: 
+      id: C
+      sequence: 80..140
+  - file:
+      path: 2a1x.cif
+       
+      include: 
+        - chain:
+            id: A
+
--- a/example/hard_targets/2pny.cif
+++ b/example/hard_targets/2pny.cif
--- a/example/hard_targets/2pnynano.yaml
+++ b/example/hard_targets/2pnynano.yaml
@@ -0,0 +1,15 @@
+entities:
+  - file:
+      path: 2pny.cif
+       
+      include: 
+        - chain:
+            id: A
+  - file:
+      path: 
+        - ../nanobody_scaffolds/7eow.yaml
+        - ../nanobody_scaffolds/7xl0.yaml
+        - ../nanobody_scaffolds/8coh.yaml
+        - ../nanobody_scaffolds/8z8v.yaml
+
+
--- a/example/hard_targets/2pnyprot.yaml
+++ b/example/hard_targets/2pnyprot.yaml
@@ -0,0 +1,11 @@
+entities:
+  - protein: 
+      id: C
+      sequence: 80..140
+  - file:
+      path: 2pny.cif
+       
+      include: 
+        - chain:
+            id: A
+
--- a/example/hard_targets/3apu.cif
+++ b/example/hard_targets/3apu.cif
--- a/example/hard_targets/3apunano.yaml
+++ b/example/hard_targets/3apunano.yaml
@@ -0,0 +1,15 @@
+entities:
+  - file:
+      path: 3apu.cif
+       
+      include: 
+        - chain:
+            id: A
+  - file:
+      path: 
+        - ../nanobody_scaffolds/7eow.yaml
+        - ../nanobody_scaffolds/7xl0.yaml
+        - ../nanobody_scaffolds/8coh.yaml
+        - ../nanobody_scaffolds/8z8v.yaml
+
+
--- a/example/hard_targets/3apuprot.yaml
+++ b/example/hard_targets/3apuprot.yaml
@@ -0,0 +1,11 @@
+entities:
+  - protein: 
+      id: C
+      sequence: 80..140
+  - file:
+      path: 3apu.cif
+       
+      include: 
+        - chain:
+            id: A
+
--- a/example/hard_targets/3ch4.cif
+++ b/example/hard_targets/3ch4.cif
--- a/example/hard_targets/3ch4nano.yaml
+++ b/example/hard_targets/3ch4nano.yaml
@@ -0,0 +1,15 @@
+entities:
+  - file:
+      path: 3ch4.cif
+       
+      include: 
+        - chain:
+            id: A
+  - file:
+      path: 
+        - ../nanobody_scaffolds/7eow.yaml
+        - ../nanobody_scaffolds/7xl0.yaml
+        - ../nanobody_scaffolds/8coh.yaml
+        - ../nanobody_scaffolds/8z8v.yaml
+
+
--- a/example/hard_targets/3ch4prot.yaml
+++ b/example/hard_targets/3ch4prot.yaml
@@ -0,0 +1,11 @@
+entities:
+  - protein: 
+      id: C
+      sequence: 80..140
+  - file:
+      path: 3ch4.cif
+       
+      include: 
+        - chain:
+            id: A
+
--- a/example/hard_targets/3qkg.cif
+++ b/example/hard_targets/3qkg.cif
--- a/example/hard_targets/3qkgnano.yaml
+++ b/example/hard_targets/3qkgnano.yaml
@@ -0,0 +1,15 @@
+entities:
+  - file:
+      path: 3qkg.cif
+       
+      include: 
+        - chain:
+            id: A
+  - file:
+      path: 
+        - ../nanobody_scaffolds/7eow.yaml
+        - ../nanobody_scaffolds/7xl0.yaml
+        - ../nanobody_scaffolds/8coh.yaml
+        - ../nanobody_scaffolds/8z8v.yaml
+
+
--- a/example/hard_targets/3qkgprot.yaml
+++ b/example/hard_targets/3qkgprot.yaml
@@ -0,0 +1,11 @@
+entities:
+  - protein: 
+      id: C
+      sequence: 80..140
+  - file:
+      path: 3qkg.cif
+       
+      include: 
+        - chain:
+            id: A
+
--- a/example/hard_targets/6m1u.cif
+++ b/example/hard_targets/6m1u.cif
--- a/example/hard_targets/6m1unano.yaml
+++ b/example/hard_targets/6m1unano.yaml
@@ -0,0 +1,15 @@
+entities:
+  - file:
+      path: 6m1u.cif
+       
+      include: 
+        - chain:
+            id: A
+  - file:
+      path: 
+        - ../nanobody_scaffolds/7eow.yaml
+        - ../nanobody_scaffolds/7xl0.yaml
+        - ../nanobody_scaffolds/8coh.yaml
+        - ../nanobody_scaffolds/8z8v.yaml
+
+
--- a/example/hard_targets/6m1uprot.yaml
+++ b/example/hard_targets/6m1uprot.yaml
@@ -0,0 +1,11 @@
+entities:
+  - protein: 
+      id: C
+      sequence: 80..140
+  - file:
+      path: 6m1u.cif
+       
+      include: 
+        - chain:
+            id: A
+
--- a/example/hard_targets/7aah.cif
+++ b/example/hard_targets/7aah.cif
--- a/example/hard_targets/7aahnano.yaml
+++ b/example/hard_targets/7aahnano.yaml
@@ -0,0 +1,15 @@
+entities:
+  - file:
+      path: 7aah.cif
+       
+      include: 
+        - chain:
+            id: A
+  - file:
+      path: 
+        - ../nanobody_scaffolds/7eow.yaml
+        - ../nanobody_scaffolds/7xl0.yaml
+        - ../nanobody_scaffolds/8coh.yaml
+        - ../nanobody_scaffolds/8z8v.yaml
+
+
--- a/example/hard_targets/7aahprot.yaml
+++ b/example/hard_targets/7aahprot.yaml
@@ -0,0 +1,12 @@
+entities:
+  - protein: 
+      id: C
+      sequence: 80..140
+  - file:
+      path: 7aah.cif
+       
+      include: 
+        - chain:
+            id: A
+
+
--- a/example/helicon_against_peptide_in_pmhc/3mrp.cif
+++ b/example/helicon_against_peptide_in_pmhc/3mrp.cif
--- a/example/helicon_against_peptide_in_pmhc/mart1.yaml
+++ b/example/helicon_against_peptide_in_pmhc/mart1.yaml
@@ -0,0 +1,35 @@
+entities:
+  - protein: 
+      id: G
+      sequence: 3..7C6C3..7 
+  - ligand:
+      id: F
+      ccd: WHL
+
+  - file:
+      path: 3mrp.cif
+       
+      include:
+        - chain:
+            id: A # MHC
+        - chain:
+            id: C # Peptide
+        - chain:
+            id: B # other part of MHC
+
+
+      binding_types:
+        - chain:
+            id: C
+            binding: "all"
+         
+          
+      structure_groups: "all"
+
+constraints:
+  - bond:
+      atom1: [G, 4, SG] # specify connection as if the minimum possible number of residues was sampled
+      atom2: [F, 1, CK]
+  - bond:
+      atom1: [G, 11, SG]
+      atom2: [F, 1, CH]
--- a/example/nanobody_against_penguinpox/9bkq-assembly2.cif
+++ b/example/nanobody_against_penguinpox/9bkq-assembly2.cif
--- a/example/nanobody_against_penguinpox/penguinpox.yaml
+++ b/example/nanobody_against_penguinpox/penguinpox.yaml
@@ -0,0 +1,13 @@
+entities:
+  - file:
+      path: 9bkq-assembly2.cif # penguinpox target
+      include: 
+        - chain: 
+            id: B
+
+  - file:
+      path: 
+        - ../nanobody_scaffolds/7eow.yaml
+        - ../nanobody_scaffolds/7xl0.yaml
+        - ../nanobody_scaffolds/8coh.yaml
+        - ../nanobody_scaffolds/8z8v.yaml
--- a/example/nanobody_scaffolds/7eow.cif
+++ b/example/nanobody_scaffolds/7eow.cif
--- a/example/nanobody_scaffolds/7eow.yaml
+++ b/example/nanobody_scaffolds/7eow.yaml
@@ -0,0 +1,59 @@
+path: 7eow.cif
+include: 
+  - chain: 
+      id: B
+
+design:
+  - chain:
+      id: B
+      res_index: 26..34,52..59,98..118
+
+structure_groups:
+  - group:
+      id: B
+      visibility: 2
+  - group:
+      id: B
+      visibility: 0
+      res_index: 26..34,52..59,98..118
+
+# Flexible lengths for CDR 1
+exclude: 
+  - chain:
+      id: B
+      res_index: 26..28 # take out 3
+
+design_insertions:
+  - insertion:
+      id: B
+      res_index: 26 # The res_index'th residue will be a designed one (starting to count from 1)
+      num_residues: 1..5
+
+# Flexible lengths for CDR 2
+exclude: 
+  - chain:
+      id: B
+      res_index: 52..54 # take out 3
+
+design_insertions:
+  - insertion:
+      id: B
+      res_index: 52 # The res_index'th residue will be a designed one (starting to count from 1)
+      num_residues: 1..5
+
+# Flexible lengths for CDR 3
+exclude: 
+  - chain:
+      id: B
+      res_index: 98..104 # take out seven
+
+design_insertions:
+  - insertion:
+      id: B
+      res_index: 98 # The res_index'th residue will be a designed one (starting to count from 1)
+      num_residues: 1..14
+
+# reindex the residue index which is used in the positional encoding
+reset_res_index:
+  - chain:
+      id: B
--- a/example/nanobody_scaffolds/7xl0.cif
+++ b/example/nanobody_scaffolds/7xl0.cif
--- a/example/nanobody_scaffolds/7xl0.yaml
+++ b/example/nanobody_scaffolds/7xl0.yaml
@@ -0,0 +1,59 @@
+path: 7xl0.cif
+include: 
+  - chain: 
+      id: A
+design:
+  - chain:
+      id: A
+      res_index: 26..33,51..57,97..110
+
+structure_groups:
+  - group:
+      id: A
+      visibility: 2
+  - group:
+      id: A
+      visibility: 0
+      res_index: 26..33,51..57,97..110
+
+
+# Flexible lengths for CDR 1
+exclude: 
+  - chain:
+      id: A
+      res_index: 26..28 # take out 3
+
+design_insertions:
+  - insertion:
+      id: A
+      res_index: 26 # The res_index'th residue will be a designed one (starting to count from 1)
+      num_residues: 1..5
+
+# Flexible lengths for CDR 2
+exclude: 
+  - chain:
+      id: A
+      res_index: 51..53 # take out 3
+
+design_insertions:
+  - insertion:
+      id: A
+      res_index: 51 # The res_index'th residue will be a designed one (starting to count from 1)
+      num_residues: 1..5
+
+# Flexible lengths for CDR 3
+exclude: 
+  - chain:
+      id: A
+      res_index: 97..102 # take out 6
+
+design_insertions:
+  - insertion:
+      id: A
+      res_index: 97 # The res_index'th residue will be a designed one (starting to count from 1)
+      num_residues: 1..12
+
+# reindex the residue index which is used in the positional encoding
+reset_res_index:
+  - chain:
+      id: A
--- a/example/nanobody_scaffolds/8coh.cif
+++ b/example/nanobody_scaffolds/8coh.cif
--- a/example/nanobody_scaffolds/8coh.yaml
+++ b/example/nanobody_scaffolds/8coh.yaml
@@ -0,0 +1,60 @@
+path: 8coh.cif
+include: 
+  - chain: 
+      id: A
+      res_index: ..126
+design:
+  - chain:
+      id: A
+      res_index: 26..33,51..58,97..115
+
+structure_groups:
+  - group:
+      id: A
+      visibility: 2
+  - group:
+      id: A
+      visibility: 0
+      res_index: 26..33,51..58,97..115
+
+
+# Flexible lengths for CDR 1
+exclude: 
+  - chain:
+      id: A
+      res_index: 26..28 # take out 3
+
+design_insertions:
+  - insertion:
+      id: A
+      res_index: 26 # The res_index'th residue will be a designed one (starting to count from 1)
+      num_residues: 1..5
+
+# Flexible lengths for CDR 2
+exclude: 
+  - chain:
+      id: A
+      res_index: 51..53 # take out 3
+
+design_insertions:
+  - insertion:
+      id: A
+      res_index: 51 # The res_index'th residue will be a designed one (starting to count from 1)
+      num_residues: 1..5
+
+# Flexible lengths for CDR 3
+exclude: 
+  - chain:
+      id: A
+      res_index: 97..103 # take out seven
+
+design_insertions:
+  - insertion:
+      id: A
+      res_index: 98 # The res_index'th residue will be a designed one (starting to count from 1)
+      num_residues: 1..14
+
+# reindex the residue index which is used in the positional encoding
+reset_res_index:
+  - chain:
+      id: A
--- a/example/nanobody_scaffolds/8z8v.cif
+++ b/example/nanobody_scaffolds/8z8v.cif
--- a/example/nanobody_scaffolds/8z8v.yaml
+++ b/example/nanobody_scaffolds/8z8v.yaml
@@ -0,0 +1,60 @@
+path: 8z8v.cif
+include: 
+  - chain: 
+      id: B
+design:
+  - chain:
+      id: B
+      res_index: 26..33,51..58,98..108
+
+structure_groups:
+  - group:
+      id: B
+      visibility: 2
+  - group:
+      id: B
+      visibility: 0
+      res_index: 26..33,51..58,98..108
+
+
+
+# Flexible lengths for CDR 1
+exclude: 
+  - chain:
+      id: B
+      res_index: 26..28 # take out 3
+
+design_insertions:
+  - insertion:
+      id: B
+      res_index: 26 # The res_index'th residue will be a designed one (starting to count from 1)
+      num_residues: 1..5
+
+# Flexible lengths for CDR 2
+exclude: 
+  - chain:
+      id: B
+      res_index: 51..53 # take out 3
+
+design_insertions:
+  - insertion:
+      id: B
+      res_index: 51 # The res_index'th residue will be a designed one (starting to count from 1)
+      num_residues: 1..5
+
+# Flexible lengths for CDR 3
+exclude: 
+  - chain:
+      id: B
+      res_index: 98..100 # take out 3
+
+design_insertions:
+  - insertion:
+      id: B
+      res_index: 98 # The res_index'th residue will be a designed one (starting to count from 1)
+      num_residues: 1..12
+
+# reindex the residue index which is used in the positional encoding
+reset_res_index:
+  - chain:
+      id: B
--- a/example/peptide_against_disordered_region_of_protein/cryptochrome4.yaml
+++ b/example/peptide_against_disordered_region_of_protein/cryptochrome4.yaml
@@ -0,0 +1,24 @@
+entities:
+  - protein: 
+      id: B
+      sequence: 12..20
+  - file:
+      path: cryptochrome4_european_robin_bird_boltz_prediction.cif
+       
+      include:
+        - chain:
+            id: A
+
+      structure_groups:
+        - group:
+            visibility: 1
+            id: A
+        - group:
+            visibility: 0
+            id: A
+            res_index: 494..
+
+      binding_types:
+        - chain:
+            id: A
+            binding: 494..507
--- a/example/peptide_against_disordered_region_of_protein/cryptochrome4_european_robin_bird_boltz_prediction.cif
+++ b/example/peptide_against_disordered_region_of_protein/cryptochrome4_european_robin_bird_boltz_prediction.cif
--- a/example/peptide_against_specific_site_on_ragc/6wj3.cif
+++ b/example/peptide_against_specific_site_on_ragc/6wj3.cif
--- a/example/peptide_against_specific_site_on_ragc/rragc.yaml
+++ b/example/peptide_against_specific_site_on_ragc/rragc.yaml
@@ -0,0 +1,16 @@
+entities:
+  - protein: 
+      id: P
+      sequence: 5..20
+  - file:
+      path: 6wj3.cif
+       
+      include: 
+        - chain:
+            id: G
+
+      binding_types:
+        - chain:
+            id: G
+            binding: 190,193,194,258,259,262,263,205,214,215,216,217,218,219,220,221,222,232,236,239,278,279,280,281,282,283,284,285,286,240,245,246,249,250,253,254,256,257,261,262
+          
--- a/example/peptide_against_specific_site_on_ragc/rragc_site1.yaml
+++ b/example/peptide_against_specific_site_on_ragc/rragc_site1.yaml
@@ -0,0 +1,16 @@
+entities:
+  - protein: 
+      id: P
+      sequence: 10..22
+  - file:
+      path: 6wj3.cif
+       
+      include: 
+        - chain:
+            id: G
+
+      binding_types:
+        - chain:
+            id: G
+            binding: 190,193,194,258,259,262,263,205,214,215,216,217,218,219,220,221,222
+          
--- a/example/peptide_against_specific_site_on_ragc/rragc_site2.yaml
+++ b/example/peptide_against_specific_site_on_ragc/rragc_site2.yaml
@@ -0,0 +1,16 @@
+entities:
+  - protein: 
+      id: P
+      sequence: 10..22
+  - file:
+      path: 6wj3.cif
+       
+      include: 
+        - chain:
+            id: G
+
+      binding_types:
+        - chain:
+            id: G
+            binding: 232,236,239,278,279,280,281,282,283,284,285,286,240,245,246,249,250,253,254,256,257,261,262
+          
--- a/example/protein_binding_small_molecule/brilacidin.yaml
+++ b/example/protein_binding_small_molecule/brilacidin.yaml
@@ -0,0 +1,7 @@
+entities:
+  - protein:
+      id: A
+      sequence: 140..180
+  - ligand:
+      id: B
+      smiles: "C1CNC[C@@H]1OC2=C(C=C(C=C2NC(=O)C3=CC(=NC=N3)C(=O)NC4=CC(=CC(=C4O[C@@H]5CCNC5)NC(=O)CCCCN=C(N)N)C(F)(F)F)C(F)(F)F)NC(=O)CCCCN=C(N)N"
--- a/example/protein_binding_small_molecule/chorismite.yaml
+++ b/example/protein_binding_small_molecule/chorismite.yaml
@@ -0,0 +1,7 @@
+entities:
+  - protein:
+      id: A
+      sequence: 140..180
+  - ligand:
+      id: B
+      ccd: TSA
--- a/example/protein_binding_small_molecule/rucaparib.yaml
+++ b/example/protein_binding_small_molecule/rucaparib.yaml
@@ -0,0 +1,7 @@
+    entities:
+    - protein:
+        id: A
+        sequence: 140..180
+    - ligand:
+        id: B
+        ccd: RPB
--- a/example/streptavidin_partially_flexible_target/1mk5.cif
+++ b/example/streptavidin_partially_flexible_target/1mk5.cif
--- a/example/streptavidin_partially_flexible_target/cyclic.yaml
+++ b/example/streptavidin_partially_flexible_target/cyclic.yaml
@@ -0,0 +1,22 @@
+entities:
+  - protein: 
+      id: C
+      sequence: 8..18
+      cyclic: true
+  - file:
+      path: 1mk5.cif
+       
+      include: 
+        - chain:
+            id: A
+
+      structure_groups:
+        - group:
+            id: A
+            visibility: 1
+        - group:
+            id: A
+            visibility: 0
+            res_index: 32..42
+
+
--- a/example/streptavidin_partially_flexible_target/disulfide.yaml
+++ b/example/streptavidin_partially_flexible_target/disulfide.yaml
@@ -0,0 +1,24 @@
+entities:
+  - protein: 
+      id: C
+      sequence: 1..5C6C1..5
+  - file:
+      path: 1mk5.cif
+       
+      include: 
+        - chain:
+            id: A
+
+      structure_groups:
+        - group:
+            id: A
+            visibility: 1
+        - group:
+            id: A
+            visibility: 0
+            res_index: 32..42
+
+constraints:
+  - bond:
+      atom1: [C, 2, SG] 
+      atom2: [C, 9, SG]
--- a/Show More
+++ b/Show More