add documentation to examples. fix design insertions such that more than 1 insertion is now possible. fix 1-indexing in design insertion size specification

2026-06-04 11:54:23 +08:00 · 2025-11-08 18:31:10 +00:00
parent 6a82850a6e
commit 71cf7884a1
19 changed files with 156 additions and 40 deletions
--- a/3
+++ b/3
@@ -41,9 +41,10 @@ COPY . /app
 RUN pip install --no-cache-dir -e /app

 ARG DOWNLOAD_WEIGHTS=false
+ARG HF_TOKEN=""
 RUN mkdir -p "${HF_HOME}" && \
    if [ "${DOWNLOAD_WEIGHTS}" = "true" ]; then \
-        boltzgen download all --cache "${HF_HOME}" --force_download; \
+        HF_TOKEN="${HF_TOKEN}" boltzgen download --models-cache-dir "${HF_HOME}" --force-download --show-paths; \
    fi

 ARG USERNAME=boltzgen
--- a/example/binding_disordered_peptides/tpp4.yaml
+++ b/example/binding_disordered_peptides/tpp4.yaml
@@ -1,7 +1,11 @@
 entities:
+  # Specify a designed protein chain 
+  # random number between 120 and 140 of designed residues (inclusive)
  - protein:
      id: A
      sequence: 120..140
+  # Specify a non-designed protein chain
+  # fixed 19-mer, all residues fixed
  - protein:
      id: B
      sequence: GGGILPWKWPWWPWRRGGG
--- a/example/binding_disordered_regions_of_proteins/hoxd13.yaml
+++ b/example/binding_disordered_regions_of_proteins/hoxd13.yaml
@@ -1,35 +1,50 @@
 entities:
+  # Specify a designed protein chain 
  - protein:
      id: B
+      # random number between 40 and 80 of designed residues (inclusive)
      sequence: 40..80
+  # Specification of the target which is extracted from a .cif file
  - file:
+      # path to the target structure
      path: hoxd13.cif
+      # Which chain and residues in the .cif file to use as target (uses only A: 1..71 here)
      include: 
        - chain:
            id: A 
            res_index: ..71
+      # Which regions of the target should have their structure specified
+      # Here we hide the entire target fragment (no coordinates specified)
      structure_groups:
        - group:
-            visibility: 0
+            visibility: 0 
            id: "all"
+      # Which regions of the target the design should or should NOT bind to
+      # Here we specify that the design should bind to residues 57..71 on chain A
      binding_types:
        - chain:
            id: A
            binding: 57..71

-  
+  # Specify a non-designed protein chain fused to chain A
  - protein:
      id: C
      fuse: A
+      # fixed 8-mer, all residues fixed, and all residues marked as binding
      sequence: AAAAAAAA
      binding_types: BBBBBBBB
+
+  # Continue pulling from the rest of chain A of the same file and fuse it to chain A
  - file:
      path: hoxd13.cif
+      # Subsequently included residues are fused onto chain A
      fuse: A
+      # Include the remainder of chain A (residue 72 to the end)
      include:
        - chain:
            id: A
            res_index: 72..
+      # Start with all hidden, then explicitly show structure of a tail segment (281 to the end)
      structure_groups:
        - group:
            visibility: 0
--- a/example/binding_disordered_regions_of_proteins/npm1.yaml
+++ b/example/binding_disordered_regions_of_proteins/npm1.yaml
@@ -1,17 +1,25 @@
 entities:
+  # Specify a designed protein chain 
  - protein:
      id: G
+      # random number between 40 and 80 of designed residues (inclusive)
      sequence: 40..80
+  # Specification of the target which is extracted from a .cif file
  - file:
      path: npm1.cif
+      # Which chain and residues in the .cif file to use as target (uses only chain A here)
      include:
        - chain:
            id: A
+      # Which regions of the target the design should or should NOT bind to
+      # Here we specify that the design should bind to residues 123..240 on chain A and not bind to residues 1..122 on chain A
      binding_types:
        - chain:
            id: A
            binding: 123..240
            not_binding: 1..122
+      # Which regions of the target should have their structure specified
+      # Here we keep structure specified (visible) for two disjoint segments of chain A
      structure_groups:
        - group:
            visibility: 1
--- a/example/binding_disordered_regions_of_proteins/nup98.yaml
+++ b/example/binding_disordered_regions_of_proteins/nup98.yaml
@@ -1,13 +1,19 @@
 entities:
+  # Specify a designed protein chain 
+  # random number between 40 and 80 of designed residues (inclusive)
  - protein:
      id: G
      sequence: 40..80
+  # Specification of the target which is extracted from a .cif file
  - file:
      path: nup98.cif
+      # Which chain and residues in the .cif file to use as target (uses only A: 1..400 here)
      include:
        - chain:
            id: A
            res_index: 1..400
+      # Which regions of the target should have their structure specified
+      # Here we hide the entire included target fragment (no coordinates specified)
      structure_groups:
        - group:
            visibility: 0
--- a/example/cyclic_against_hiv_antibody_site/9d3d.yaml
+++ b/example/cyclic_against_hiv_antibody_site/9d3d.yaml
@@ -1,7 +1,9 @@
 entities:
+  # Specification of the target which is extracted from a .cif file
  - file:
      path: 9d3d.cif
-       
+      # Which chain and residues in the .cif file to use as target
+      # Here we include chains A, B, and C (all residues on each chain)
      include:
        - chain:
            id: A
@@ -9,13 +11,15 @@ entities:
            id: B
        - chain:
            id: C
-
+      # Include residues that are within a radius of a reference region
+      # Here we include residues within a radius of 30 of G:106..118
      include_proximity:
        - chain: 
            id: G
            res_index: 106..118
            radius: 30
-      
+      # Which regions of the target the design should or should NOT bind to
+      # Here we specify that the design should bind to residues 91, 128, and 131 on chains A, B, and C
      binding_types:
        - chain:
            id: A
@@ -27,7 +31,10 @@ entities:
            id: C
            binding: 91,128,131

+  # Specify a designed protein chain
+  # random number between 8 and 18 of designed residues (inclusive)
  - protein:
      id: E
      sequence: 8..18
+      # Make the designed protein chain cyclic
      cyclic: True
--- a/example/cyclotide/3ivq.yaml
+++ b/example/cyclotide/3ivq.yaml
@@ -1,25 +1,34 @@
 entities:
+  # Specify a designed protein chain
  - protein: 
      id: B
+      # 3 design residues, Cysteine, 8 design residues, Cysteine,
+      # 6 design residues, Cysteine, 5 design residues, Cysteine,
+      # 3 design residues, Cysteine, 1 design residue, Cysteine, 2 design residues
      sequence: 3C8C6C5C3C1C2
+      # Make the designed peptide cyclic
      cyclic: true

+  # Specification of the target which is extracted from a .cif file
  - file:
      path: 3ivq.cif
-       
+      # Which chain and residues in the .cif file to use as target (here includes all residues on chain A)
      include:
        - chain:
            id: A

+      # Which regions of the target should have their structure specified
+      # Here we specify that all included target residues should have their structure specified
      structure_groups: "all"

 constraints:
+  # Specify covalent disulfide bonds between the designed residues on chain B 
  - bond:
-      atom1: [B, 4, SG] 
-      atom2: [B, 26, SG]
+      atom1: [B, 4, SG]  # connect SG of residue 4 to SG of residue 26 on chain B
+      atom2: [B, 26, SG] 
  - bond:
-      atom1: [B, 13, SG] 
-      atom2: [B, 30, SG]
+      atom1: [B, 13, SG]  # connect SG of residue 13 to SG of residue 30 on chain B
+      atom2: [B, 30, SG] 
  - bond:
-      atom1: [B, 20, SG] 
-      atom2: [B, 32, SG]
+      atom1: [B, 20, SG]  # connect SG of residue 20 to SG of residue 32 on chain B
+      atom2: [B, 32, SG] 
--- a/example/cyclotide/5wrd.yaml
+++ b/example/cyclotide/5wrd.yaml
@@ -1,25 +1,34 @@
 entities:
+  # Specify a designed protein chain
+  # 3 design residues, Cysteine, 8 design residues, Cysteine,
+  # 6 design residues, Cysteine, 5 design residues, Cysteine,
+  # 3 design residues, Cysteine, 1 design residue, Cysteine, 2 design residues
  - protein: 
      id: B
      sequence: 3C8C6C5C3C1C2
+      # Make the designed peptide cyclic
      cyclic: true

+  # Specification of the target which is extracted from a .cif file
  - file:
      path: 5wrd.cif
-       
+      # Which chain and residues in the .cif file to use as target (here all residues on chain A)
      include:
        - chain:
            id: A

+      # Which regions of the target should have their structure specified
+      # Here we specify that all included target residues should have their structure specified
      structure_groups: "all"

 constraints:
+  # Specify covalent disulfide bonds within designed chain B
  - bond:
-      atom1: [B, 4, SG] 
+      atom1: [B, 4, SG]  # connect SG of residue 4 to SG of residue 26 on chain B
      atom2: [B, 26, SG]
  - bond:
-      atom1: [B, 13, SG] 
+      atom1: [B, 13, SG]  # connect SG of residue 13 to SG of residue 30 on chain B
      atom2: [B, 30, SG]
  - bond:
-      atom1: [B, 20, SG] 
+      atom1: [B, 20, SG]  # connect SG of residue 20 to SG of residue 32 on chain B
      atom2: [B, 32, SG]
--- a/example/cylcic_against_kras_with_specific_site/cyclicdesign.yaml
+++ b/example/cylcic_against_kras_with_specific_site/cyclicdesign.yaml
@@ -1,17 +1,24 @@
 entities:
+  # Specify a designed protein chain
+  # random number between 8 and 16 of designed residues (inclusive)
  - protein: 
      id: B
      sequence: 8..16
+      # Make the designed peptide cyclic
      cyclic: true
+
+  # Specification of the target which is extracted from a .cif file
  - file:
      path: 8jjs.cif
-       
+      # Which chain and residues in the .cif file to use as target
+      # Here we include chains A and C (all residues on each chain)
      include: 
        - chain:
            id: A
        - chain:
            id: C
-
+      # Which regions of the target the design should or should NOT bind to
+      # Here we specify discrete binding residues on chain A
      binding_types:
        - chain:
            id: A
--- a/example/denovo_zinc_finger_against_dna/vanilla_protein.yaml
+++ b/example/denovo_zinc_finger_against_dna/vanilla_protein.yaml
@@ -1,11 +1,15 @@
 entities:
-
+  # Specify a designed protein chain
+  # random number between 40 and 120 of designed residues (inclusive)
  - protein: 
      id: G
      sequence: 40..120
+
+  # Specification of the target which is extracted from a .cif file
  - file:
      path: zf.cif
      include: 
+        # Which chain and residues in the .cif file to use as target (here includes all residues on chains C1 and B1)
        - chain:
            id: C1
        - chain:
--- a/example/denovo_zinc_finger_against_dna/zinc_finger.yaml
+++ b/example/denovo_zinc_finger_against_dna/zinc_finger.yaml
@@ -1,30 +1,40 @@
 entities:
+  # Specification of the target which is extracted from a .cif file
  - file:
      path: zf.cif
-      include: "all"
+      # Which parts of the .cif file to include as target (here includes all chains/residues)
+      include: "all" 
+      # Which parts of the included content to exclude (here excludes residues up to 10, 63..69, and 185 to the end in chain A1)
      exclude:
        - chain:
            id: A1
            res_index: ..10,63..69,185..
+      # Where to insert new designable residues into existing chains
+      # Here we insert 3..8 residues after residue 63 in chain A1
      design_insertions:
        - insertion:
            id: A1
            res_index: 63 
            num_residues: 3..8
-          
+      # Which regions of the target should have their structure specified
+      # Here we hide everything (no structure specified)
      structure_groups:
        - group:
            visibility: 0
            id: "all"
-
+      # Which residues in the target should be redesigned
+      # Here we declare A1:11..184 to be redesigned
      design:
        - chain:
            id: A1
            res_index: 11..184
+      # Which residues in the target are explicitly not redesignable (override entries in design)
+      # Here we carve out fixed positions on chain A1 to not be redesigned
      not_design:
        - chain:
            id: A1
            res_index: 11..20,29,33,39..48,57,61,72..81,90,94,100..109,118,122,129..138,147,151,157..166,175,179
+      # Reset residue numbering to be contiguous for chain A1
      reset_res_index:
        - chain:
            id: A1
--- a/example/disulfide_peptide_with_betahairpin_conditioning/cropped_target.yaml
+++ b/example/disulfide_peptide_with_betahairpin_conditioning/cropped_target.yaml
@@ -1,18 +1,24 @@
 entities:
+  # Specify a designed protein chain
+  # 1 design residue, Cysteine, 11..16 design residues, Cysteine, 1 design residue
  - protein:
      id: B
      sequence: 1C11..16C1
-
+      # Specify the secondary structure of the designed peptide (here we require sheet at residues 1 and 3..11)
      secondary_structure:
          sheet: 1,3..11

+  # Specification of the target which is extracted from a .cif file
  - file:
      path: 7nre.cif
+      # Which chain and residues in the .cif file to use as target (here includes residues 24.. on chain A, i.e. residue 24 to the end)
      include:
        - chain:
            id: A
            res_index: 24..
      
+      # Which regions of the target the design should or should NOT bind to
+      # Here we specify that the design should bind to residues 26..31,381,408 on chain A
      binding_types:
        - chain:
            id: A
@@ -20,7 +26,8 @@ entities:
  

 constraints:
+  # Specify covalent disulfide bonds between the designed residues on chain B 
  - bond:
-      atom1: [B, 2, SG] 
+      atom1: [B, 2, SG]  # connect SG of residue 2 to SG of residue 14 on chain B
      atom2: [B, 14, SG]

--- a/example/disulfide_peptide_with_betahairpin_conditioning/proximity_cropped_target.yaml
+++ b/example/disulfide_peptide_with_betahairpin_conditioning/proximity_cropped_target.yaml
@@ -1,23 +1,31 @@
 entities:
+  # Specify a designed protein chain
+  # 1 design residue, Cysteine, 11..16 design residues, Cysteine, 1 design residue
  - protein:
      id: B
      sequence: 1C11..16C1
-
+      # Specify the secondary structure of the designed peptide (here we require sheet at residues 1 and 3..11)
      secondary_structure:
          sheet: 1,3..11

+  # Specification of the target which is extracted from a .cif file
  - file:
      path: 7nre.cif
+      # Which chain and residues in the .cif file to use as target (here includes residues 24.. on chain A, i.e. residue 24 to the end)
      include:
        - chain:
            id: A
            res_index: 24..
      
+      # Which regions of the target the design should or should NOT bind to
+      # Here we specify that the design should bind to residues 26..31,381,408 on chain A
      binding_types:
        - chain:
            id: A
            binding: 26..31,381,408

+      # Include residues within a radius of a reference region
+      # Here we include residues within a radius of 28 of residues 26..31,381,408 on chain A
      include_proximity:
        - chain: 
            id: A
@@ -28,7 +36,8 @@ entities:
  

 constraints:
+  # Specify covalent disulfide bonds between the designed residues on chain B 
  - bond:
-      atom1: [B, 2, SG] 
+      atom1: [B, 2, SG]  # connect SG of residue 2 to SG of residue 14 on chain B
      atom2: [B, 14, SG]

--- a/example/double_disulfide_peptide_against_specific_site/norepinephrine.yaml
+++ b/example/double_disulfide_peptide_against_specific_site/norepinephrine.yaml
@@ -1,24 +1,29 @@
 entities:
+  # Specify a designed protein chain
+  # 1..3 design residues, Cysteine, Cysteine, 4 design residues, Cysteine, 1..3 design residues, Cysteine, 1..3 design residues
  - protein:
      id: B
      sequence: 1..3CC4C1..3C1..3
-
+  # Specification of the target which is extracted from a .cif file
  - file:
      path: 8wtw.cif
-       
+      # Which chain and residues in the .cif file to use as target (here includes all residues on chain A)
      include: 
        - chain:
            id: A

+      # Which regions of the target the design should or should NOT bind to
+      # Here we specify that the design should bind to residue 24 on chain A
      binding_types:
        - chain:
            id: A
            binding: 24
          
 constraints:
+  # Specify covalent disulfide bonds between the designed residues on chain B 
  - bond:
-      atom1: [B, 2, SG] 
+      atom1: [B, 2, SG]  # connect SG of residue 2 to SG of residue 10 on chain B 
      atom2: [B, 10, SG]
  - bond:
-      atom1: [B, 3, SG] 
+      atom1: [B, 3, SG]  # connect SG of residue 3 to SG of residue 8 on chain B
      atom2: [B, 8, SG]
--- a/example/vanilla_peptide_with_target_binding_site/beetletert.yaml
+++ b/example/vanilla_peptide_with_target_binding_site/beetletert.yaml
@@ -1,19 +1,24 @@
 entities:
+  # Specify a designed protein chain
+  # random number between 12 and 20 of designed residues (inclusive)
  - protein: 
      id: G
      sequence: 12..20

+  # Specification of the target which is extracted from a .cif file
  - file:
      path: 5cqg.cif
-       
+      # Which chain and residues in the .cif file to use as target (uses only chain A here)
      include:
        - chain:
            id: A
-
+      # Which regions of the target the design should or should NOT bind to
+      # Here we specify that the design should bind to residues 343, 344, and 251 on chain A
      binding_types:
        - chain:
            id: A
            binding: 343,344,251
-          
+      # Which regions of the target should have their structure specified
+      # Here we specify that all included target residues should have their structure specified
      structure_groups: "all"

--- a/example/vanilla_protein/1g13prot.yaml
+++ b/example/vanilla_protein/1g13prot.yaml
@@ -1,10 +1,13 @@
 entities:
+  # Specify a designed protein chain
+  # random number between 80 and 140 of designed residues (inclusive)
  - protein: 
      id: C
      sequence: 80..140
+  # Specification of the target which is extracted from a .cif file
  - file:
      path: 1g13.cif
-       
+      # Which chain and residues in the .cif file to use as target (uses only chain A here)
      include: 
        - chain:
            id: A
--- a/src/boltzgen/cli/boltzgen.py
+++ b/src/boltzgen/cli/boltzgen.py
@@ -1202,7 +1202,6 @@ def check_design_spec(
    parsed = parser.parse_yaml(design_spec, mols, moldir)
    structure = parsed.structure
    design_info = parsed.design_info
-
    design_color_features = np.ones_like(design_info.res_binding_type) * 0.8
    design_color_features[design_info.res_binding_type.astype(bool)] = 1.0
    extract_mask = np.zeros(len(structure.residues), dtype=bool)
--- a/src/boltzgen/data/parse/schema.py
+++ b/src/boltzgen/data/parse/schema.py
@@ -1072,7 +1072,9 @@ class YamlDesignParser:
                raise ValueError(f"Unsupported file type: {str(path)}")

        name = path.stem
-        target = self.parse_boltzgen_schema(name, data, mols, mol_dir, base_file_path=path.parent)
+        target = self.parse_boltzgen_schema(
+            name, data, mols, mol_dir, base_file_path=path.parent
+        )
        return target

    def log_once(self, msg: str):
@@ -1570,7 +1572,7 @@ class YamlDesignParser:
        if isinstance(path, list) or Path(path).suffix == ".yaml":
            if isinstance(path, list):
                path = random.choice(path)
-            
+
            resolved_path = (base_file_path / path).resolve()
            with resolved_path.open("r") as f:
                file = yaml.safe_load(f)
@@ -1968,6 +1970,7 @@ class YamlDesignParser:

        # Parse and apply design insertions
        if design_insertions is not None:
+            num_inserted = 0
            for list_element in design_insertions:
                insertion = list_element["insertion"]
                if "id" not in insertion:
@@ -1978,10 +1981,15 @@ class YamlDesignParser:
                    raise ValueError(msg)
                chain_id = insertion["id"]
                res_index = insertion["res_index"] - 1  # 1 index input to 0 indexed
+                res_index += num_inserted
                ss_insert_type = insertion.get("secondary_structure", "UNSPECIFIED")
+
+                # The +1 applied to num_residues below is needed because parse_range is usually used for indexing, where the 1-based inputs are converted to 0-based indexing
                num_residues = insertion["num_residues"]
                num_residues = parse_range(num_residues)
                num_residues = np.random.choice(num_residues).item()
+                num_residues += 1
+                num_inserted += num_residues

                if chain_id not in structure.chains["name"]:
                    msg = f"Specified chain id {chain_id} not in file {path}."
--- a/src/boltzgen/task/filter/filter.py
+++ b/src/boltzgen/task/filter/filter.py
@@ -279,7 +279,7 @@ class Filter(Task):
                    {
                        "feature": "ALA_fraction",
                        "lower_is_better": True,
-                        "threshold": 0.2,
+                        "threshold": 0.3,
                    },
                    {
                        "feature": "GLY_fraction",
@@ -294,7 +294,7 @@ class Filter(Task):
                    {
                        "feature": "LEU_fraction",
                        "lower_is_better": True,
-                        "threshold": 0.2,
+                        "threshold": 0.3,
                    },
                    {
                        "feature": "VAL_fraction",