add documentation to examples. fix design insertions such that more than 1 insertion is now possible. fix 1 indexing in design insertion size specification

This commit is contained in:
HannesStark
2025-11-08 18:31:10 +00:00
parent 6a82850a6e
commit 71cf7884a1
19 changed files with 156 additions and 40 deletions

View File

@@ -41,9 +41,10 @@ COPY . /app
RUN pip install --no-cache-dir -e /app
ARG DOWNLOAD_WEIGHTS=false
ARG HF_TOKEN=""
RUN mkdir -p "${HF_HOME}" && \
if [ "${DOWNLOAD_WEIGHTS}" = "true" ]; then \
boltzgen download all --cache "${HF_HOME}" --force_download; \
HF_TOKEN="${HF_TOKEN}" boltzgen download --models-cache-dir "${HF_HOME}" --force-download --show-paths; \
fi
ARG USERNAME=boltzgen

View File

@@ -1,7 +1,11 @@
entities:
# Specify a designed protein chain
# random number between 120 and 140 of designed residues (inclusive)
- protein:
id: A
sequence: 120..140
# Specify a non-designed protein chain
# fixed 19-mer, all residues fixed
- protein:
id: B
sequence: GGGILPWKWPWWPWRRGGG

View File

@@ -1,35 +1,50 @@
entities:
# Specify a designed protein chain
- protein:
id: B
# random number between 40 and 80 of designed residues (inclusive)
sequence: 40..80
# Specification of the target which is extracted from a .cif file
- file:
# path to the target structure
path: hoxd13.cif
# Which chain and residues in the .cif file to use as target (uses only A: 1..71 here)
include:
- chain:
id: A
res_index: ..71
# Which regions of the target should have their structure specified
# Here we hide the entire target fragment (no coordinates specified)
structure_groups:
- group:
visibility: 0
visibility: 0
id: "all"
# Which regions of the target the design should or should NOT bind to
# Here we specify that the design should bind to residues 57..71 on chain A
binding_types:
- chain:
id: A
binding: 57..71
# Specify a non-designed protein chain fused to chain A
- protein:
id: C
fuse: A
# fixed 8-mer, all residues fixed, and all residues marked as binding
sequence: AAAAAAAA
binding_types: BBBBBBBB
# Continue pulling from the rest of chain A of the same file and fuse it to chain A
- file:
path: hoxd13.cif
# Subsequently included residues are fused onto chain A
fuse: A
# Include the remainder of chain A (residue 72 to the end)
include:
- chain:
id: A
res_index: 72..
# Start with all hidden, then explicitly show structure of a tail segment (281 to the end)
structure_groups:
- group:
visibility: 0

View File

@@ -1,17 +1,25 @@
entities:
# Specify a designed protein chain
- protein:
id: G
# random number between 40 and 80 of designed residues (inclusive)
sequence: 40..80
# Specification of the target which is extracted from a .cif file
- file:
path: npm1.cif
# Which chain and residues in the .cif file to use as target (uses only chain A here)
include:
- chain:
id: A
# Which regions of the target the design should or should NOT bind to
# Here we specify that the design should bind to residues 123..240 on chain A and not bind to residues 1..122 on chain A
binding_types:
- chain:
id: A
binding: 123..240
not_binding: 1..122
# Which regions of the target should have their structure specified
# Here we keep structure specified (visible) for two disjoint segments of chain A
structure_groups:
- group:
visibility: 1

View File

@@ -1,13 +1,19 @@
entities:
# Specify a designed protein chain
# random number between 40 and 80 of designed residues (inclusive)
- protein:
id: G
sequence: 40..80
# Specification of the target which is extracted from a .cif file
- file:
path: nup98.cif
# Which chain and residues in the .cif file to use as target (uses only A: 1..400 here)
include:
- chain:
id: A
res_index: 1..400
# Which regions of the target should have their structure specified
# Here we hide the entire included target fragment (no coordinates specified)
structure_groups:
- group:
visibility: 0

View File

@@ -1,7 +1,9 @@
entities:
# Specification of the target which is extracted from a .cif file
- file:
path: 9d3d.cif
# Which chain and residues in the .cif file to use as target
# Here we include chains A, B, and C (all residues on each chain)
include:
- chain:
id: A
@@ -9,13 +11,15 @@ entities:
id: B
- chain:
id: C
# Include residues that are within a radius of a reference region
# Here we include residues within a radius 30 of G:106..118
include_proximity:
- chain:
id: G
res_index: 106..118
radius: 30
# Which regions of the target the design should or should NOT bind to
# Here we specify that the design should bind to residues 91, 128, and 131 on chains A, B, and C
binding_types:
- chain:
id: A
@@ -27,7 +31,10 @@ entities:
id: C
binding: 91,128,131
# Specify a designed protein chain
# random number between 8 and 18 of designed residues (inclusive)
- protein:
id: E
sequence: 8..18
# Make the designed protein chain cyclic
cyclic: True

View File

@@ -1,25 +1,34 @@
entities:
# Specify a designed protein chain
- protein:
id: B
# 3 design residues, Cysteine, 8 design residues, Cysteine,
# 6 design residues, Cysteine, 5 design residues, Cysteine,
# 3 design residues, Cysteine, 1 design residue, Cysteine, 2 design residues
sequence: 3C8C6C5C3C1C2
# Make the designed peptide cyclic
cyclic: true
# Specification of the target which is extracted from a .cif file
- file:
path: 3ivq.cif
# Which chain and residues in the .cif file to use as target (here includes all residues on chain A)
include:
- chain:
id: A
# Which regions of the target should have their structure specified
# Here we specify that all included target residues should have their structure specified
structure_groups: "all"
constraints:
# Specify covalent disulfide bonds between the designed residues on chain B
- bond:
atom1: [B, 4, SG]
atom2: [B, 26, SG]
atom1: [B, 4, SG] # connect SG of residue 4 to SG of residue 26 on chain B
atom2: [B, 26, SG]
- bond:
atom1: [B, 13, SG]
atom2: [B, 30, SG]
atom1: [B, 13, SG] # connect SG of residue 13 to SG of residue 30 on chain B
atom2: [B, 30, SG]
- bond:
atom1: [B, 20, SG]
atom2: [B, 32, SG]
atom1: [B, 20, SG] # connect SG of residue 20 to SG of residue 32 on chain B
atom2: [B, 32, SG]

View File

@@ -1,25 +1,34 @@
entities:
# Specify a designed protein chain
# 3 design residues, Cysteine, 8 design residues, Cysteine,
# 6 design residues, Cysteine, 5 design residues, Cysteine,
# 3 design residues, Cysteine, 1 design residue, Cysteine, 2 design residues
- protein:
id: B
sequence: 3C8C6C5C3C1C2
# Make the designed peptide cyclic
cyclic: true
# Specification of the target which is extracted from a .cif file
- file:
path: 5wrd.cif
# Which chain and residues in the .cif file to use as target (here all residues on chain A)
include:
- chain:
id: A
# Which regions of the target should have their structure specified
# Here we specify that all included target residues should have their structure specified
structure_groups: "all"
constraints:
# Specify covalent disulfide bonds within designed chain B
- bond:
atom1: [B, 4, SG]
atom1: [B, 4, SG] # connect SG of residue 4 to SG of residue 26 on chain B
atom2: [B, 26, SG]
- bond:
atom1: [B, 13, SG]
atom1: [B, 13, SG] # connect SG of residue 13 to SG of residue 30 on chain B
atom2: [B, 30, SG]
- bond:
atom1: [B, 20, SG]
atom1: [B, 20, SG] # connect SG of residue 20 to SG of residue 32 on chain B
atom2: [B, 32, SG]

View File

@@ -1,17 +1,24 @@
entities:
# Specify a designed protein chain
# random number between 8 and 16 of designed residues (inclusive)
- protein:
id: B
sequence: 8..16
# Make the designed peptide cyclic
cyclic: true
# Specification of the target which is extracted from a .cif file
- file:
path: 8jjs.cif
# Which chain and residues in the .cif file to use as target
# Here we include chains A and C (all residues on each chain)
include:
- chain:
id: A
- chain:
id: C
# Which regions of the target the design should or should NOT bind to
# Here we specify discrete binding residues on chain A
binding_types:
- chain:
id: A

View File

@@ -1,11 +1,15 @@
entities:
# Specify a designed protein chain
# random number between 40 and 120 of designed residues (inclusive)
- protein:
id: G
sequence: 40..120
# Specification of the target which is extracted from a .cif file
- file:
path: zf.cif
include:
# Which chain and residues in the .cif file to use as target (here includes all residues on chains C1 and B1)
- chain:
id: C1
- chain:

View File

@@ -1,30 +1,40 @@
entities:
# Specification of the target which is extracted from a .cif file
- file:
path: zf.cif
include: "all"
# Which parts of the .cif file to include as target (here includes all chains/residues)
include: "all"
# Which parts of the included content to exclude (here excludes residues up to 10, 63..69, and 185 to the end in chain A1)
exclude:
- chain:
id: A1
res_index: ..10,63..69,185..
# Where to insert new designable residues into existing chains
# Here we insert 3..8 residues after residue 63 in chain A1
design_insertions:
- insertion:
id: A1
res_index: 63
num_residues: 3..8
# Which regions of the target should have their structure specified
# Here we hide everything (no structure specified)
structure_groups:
- group:
visibility: 0
id: "all"
# Which residues in the target should be redesigned
# Here we declare A1:11..184 to be redesigned
design:
- chain:
id: A1
res_index: 11..184
# Which residues in the target are explicitly not redesignable (override entries in design)
# Here we carve out fixed positions on chain A1 to not be redesigned
not_design:
- chain:
id: A1
res_index: 11..20,29,33,39..48,57,61,72..81,90,94,100..109,118,122,129..138,147,151,157..166,175,179
# Reset residue numbering to be contiguous for chain A1
reset_res_index:
- chain:
id: A1

View File

@@ -1,18 +1,24 @@
entities:
# Specify a designed protein chain
# 1 design residue, Cysteine, 11..16 design residues, Cysteine, 1 design residue
- protein:
id: B
sequence: 1C11..16C1
# Specify the secondary structure of the designed peptide (here we require sheet at residues 1 and 3..11)
secondary_structure:
sheet: 1,3..11
# Specification of the target which is extracted from a .cif file
- file:
path: 7nre.cif
# Which chain and residues in the .cif file to use as target (here includes residues 24 to the end on chain A)
include:
- chain:
id: A
res_index: 24..
# Which regions of the target the design should or should NOT bind to
# Here we specify that the design should bind to residues 26..31,381,408 on chain A
binding_types:
- chain:
id: A
@@ -20,7 +26,8 @@ entities:
constraints:
# Specify covalent disulfide bonds between the designed residues on chain B
- bond:
atom1: [B, 2, SG]
atom1: [B, 2, SG] # connect SG of residue 2 to SG of residue 14 on chain B
atom2: [B, 14, SG]

View File

@@ -1,23 +1,31 @@
entities:
# Specify a designed protein chain
# 1 design residue, Cysteine, 11..16 design residues, Cysteine, 1 design residue
- protein:
id: B
sequence: 1C11..16C1
# Specify the secondary structure of the designed peptide (here we require sheet at residues 1 and 3..11)
secondary_structure:
sheet: 1,3..11
# Specification of the target which is extracted from a .cif file
- file:
path: 7nre.cif
# Which chain and residues in the .cif file to use as target (here includes residues 24 to the end on chain A)
include:
- chain:
id: A
res_index: 24..
# Which regions of the target the design should or should NOT bind to
# Here we specify that the design should bind to residues 26..31,381,408 on chain A
binding_types:
- chain:
id: A
binding: 26..31,381,408
# Include residues within a radius of a reference region
# Here we include residues within a radius 28 of residues 26..31,381,408 on chain A
include_proximity:
- chain:
id: A
@@ -28,7 +36,8 @@ entities:
constraints:
# Specify covalent disulfide bonds between the designed residues on chain B
- bond:
atom1: [B, 2, SG]
atom1: [B, 2, SG] # connect SG of residue 2 to SG of residue 14 on chain B
atom2: [B, 14, SG]

View File

@@ -1,24 +1,29 @@
entities:
# Specify a designed protein chain
# 1..3 design residues, Cysteine, Cysteine, 4 design residues, Cysteine, 1..3 design residues, Cysteine, 1..3 design residues
- protein:
id: B
sequence: 1..3CC4C1..3C1..3
# Specification of the target which is extracted from a .cif file
- file:
path: 8wtw.cif
# Which chain and residues in the .cif file to use as target (here includes all residues on chain A)
include:
- chain:
id: A
# Which regions of the target the design should or should NOT bind to
# Here we specify that the design should bind to residue 24 on chain A
binding_types:
- chain:
id: A
binding: 24
constraints:
# Specify covalent disulfide bonds between the designed residues on chain B
- bond:
atom1: [B, 2, SG]
atom1: [B, 2, SG] # connect SG of residue 2 to SG of residue 10 on chain B
atom2: [B, 10, SG]
- bond:
atom1: [B, 3, SG]
atom1: [B, 3, SG] # connect SG of residue 3 to SG of residue 8 on chain B
atom2: [B, 8, SG]

View File

@@ -1,19 +1,24 @@
entities:
# Specify a designed protein chain
# random number between 12 and 20 of designed residues (inclusive)
- protein:
id: G
sequence: 12..20
# Specification of the target which is extracted from a .cif file
- file:
path: 5cqg.cif
# Which chain and residues in the .cif file to use as target (uses only chain A here)
include:
- chain:
id: A
# Which regions of the target the design should or should NOT bind to
# Here we specify that the design should bind to residues 343, 344, and 251 on chain A
binding_types:
- chain:
id: A
binding: 343,344,251
# Which regions of the target should have their structure specified
# Here we specify that all included target residues should have their structure specified
structure_groups: "all"

View File

@@ -1,10 +1,13 @@
entities:
# Specify a designed protein chain
# random number between 80 and 140 of designed residues (inclusive)
- protein:
id: C
sequence: 80..140
# Specification of the target which is extracted from a .cif file
- file:
path: 1g13.cif
# Which chain and residues in the .cif file to use as target (uses only chain A here)
include:
- chain:
id: A

View File

@@ -1202,7 +1202,6 @@ def check_design_spec(
parsed = parser.parse_yaml(design_spec, mols, moldir)
structure = parsed.structure
design_info = parsed.design_info
design_color_features = np.ones_like(design_info.res_binding_type) * 0.8
design_color_features[design_info.res_binding_type.astype(bool)] = 1.0
extract_mask = np.zeros(len(structure.residues), dtype=bool)

View File

@@ -1072,7 +1072,9 @@ class YamlDesignParser:
raise ValueError(f"Unsupported file type: {str(path)}")
name = path.stem
target = self.parse_boltzgen_schema(name, data, mols, mol_dir, base_file_path=path.parent)
target = self.parse_boltzgen_schema(
name, data, mols, mol_dir, base_file_path=path.parent
)
return target
def log_once(self, msg: str):
@@ -1570,7 +1572,7 @@ class YamlDesignParser:
if isinstance(path, list) or Path(path).suffix == ".yaml":
if isinstance(path, list):
path = random.choice(path)
resolved_path = (base_file_path / path).resolve()
with resolved_path.open("r") as f:
file = yaml.safe_load(f)
@@ -1968,6 +1970,7 @@ class YamlDesignParser:
# Parse and apply design insertions
if design_insertions is not None:
num_inserted = 0
for list_element in design_insertions:
insertion = list_element["insertion"]
if "id" not in insertion:
@@ -1978,10 +1981,15 @@ class YamlDesignParser:
raise ValueError(msg)
chain_id = insertion["id"]
res_index = insertion["res_index"] - 1 # 1 index input to 0 indexed
res_index += num_inserted
ss_insert_type = insertion.get("secondary_structure", "UNSPECIFIED")
# We add +1 because the parse_range function is usually used for indexing where we then convert the 1 based inputs to 0 indexing
num_residues = insertion["num_residues"]
num_residues = parse_range(num_residues)
num_residues = np.random.choice(num_residues).item()
num_residues += 1
num_inserted += num_residues
if chain_id not in structure.chains["name"]:
msg = f"Specified chain id {chain_id} not in file {path}."

View File

@@ -279,7 +279,7 @@ class Filter(Task):
{
"feature": "ALA_fraction",
"lower_is_better": True,
"threshold": 0.2,
"threshold": 0.3,
},
{
"feature": "GLY_fraction",
@@ -294,7 +294,7 @@ class Filter(Task):
{
"feature": "LEU_fraction",
"lower_is_better": True,
"threshold": 0.2,
"threshold": 0.3,
},
{
"feature": "VAL_fraction",