Update amlt and config files for full run

2026-06-04 13:30:33 +08:00 · 2022-08-19 20:33:07 +00:00
parent b57da0647b
commit aad06dd86a
2 changed files with 50 additions and 11 deletions
--- a/config_jsons/full_run.json
+++ b/config_jsons/full_run.json
@@ -0,0 +1,29 @@
+{
+    "shift_angles_zero_twopi": false,
+    "noise_prior": "gaussian",
+    "timesteps": 250,
+    "variance_schedule": "linear",
+    "variance_scale": 1.0,
+    "time_encoding": "gaussian_fourier",
+    "implementation": "huggingface_encoder",
+    "position_embedding_type": "absolute",
+    "num_hidden_layers": 12,
+    "hidden_size": 384,
+    "intermediate_size": 768,
+    "num_heads": 12,
+    "dropout_p": 0.1,
+    "decoder": "mlp",
+    "gradient_clip": 1.0,
+    "lr": 5e-5,
+    "loss": "radian_l1_smooth",
+    "l2_norm": 0.0,
+    "l1_norm": 0.0,
+    "circle_reg": 0.0,
+    "lr_scheduler": "",
+    "min_epochs": 10000,
+    "max_epochs": 10000,
+    "early_stop_patience": 0,
+    "use_swa": false,
+    "batch_size": 64,
+    "multithread": true
+}
--- a/scripts/amlt.yaml
+++ b/scripts/amlt.yaml
@@ -1,5 +1,4 @@
-description: Initial run of CATH dataset
-# Build takes about 30 minutes
+description: Initial full run of CATH dataset

 target:
  service: sing  # Target service platform
@@ -7,22 +6,33 @@ target:
  workspace_name: msrresrchws  # AML workspace name to use

 environment: # https://singularitydocs.azurewebsites.net/docs/container_images/
-  # image: amlt-sing/pytorch-1.10.0-a100 # run amlt cache base-images
  image: amlt-sing/pytorch-1.11.0
-  # https://hub.docker.com/layers/pytorch/pytorch/pytorch/1.12.0-cuda11.3-cudnn8-runtime/images/sha256-1ef1f61b13738de8086ae7e1ce57c89f154e075dae0b165f7590b9405efeb6fe?context=explore
-  # image: pytorch/pytorch:1.12.0-cuda11.3-cudnn8-runtime # Local debugging
-  conda_yaml_file: $CONFIG_DIR/../environment.yml
-  skip_conda_packages_on_sing: ['python', 'torch', 'tensorflow', 'cudatoolkit', 'deepspeed', 'pip', 'jupyter', 'black', 'gpustat']
+  # core packages
+  # numpy - included
+  # pandas - included
+  # tqdm - included
+  # matplotlib - included
+  # seaborn - installed here
+  # mpl-scatter-density - installed here
+  # astropy - installed here
+  # pytorch - included
+  # pytorch lightning - installed here 
+  # transformers - installed here
  setup:
-    - pip install sequence-models
+    - pip install seaborn  # https://seaborn.pydata.org/installing.html
+    - pip install mpl-scatter-density  # https://github.com/astrofrog/mpl-scatter-density
+    - pip install astropy  # https://docs.astropy.org/en/stable/install.html
+    - pip install transformers==4.11.3  # https://huggingface.co/docs/transformers/installation
+    - pip install pytorch-lightning==1.6.4  # https://www.pytorchlightning.ai/
+    - pip install sequence-models  # https://github.com/microsoft/protein-sequence-models

 code:
  local_dir: $CONFIG_DIR/..  # Relative to config file directory

 jobs:
- name: training_cath  # Unique name for each job
-  sku:  24G2-P40 # 16G4-P100 = 16GB memory (these may be more free), 4 GPU P100 (16GB VRAM); G1 = any 1 GPU, 8C1 = 8 GB ram, 1 core; 80G1-A100 = A100 GPU, run amlt target list singularity -v
+- name:   # Unique name for each job
+  sku:  16G4-V100 # Examples: 16G4-P100 = 16GB memory (these may be more free), 4 GPU P100 (16GB VRAM); G1 = any 1 GPU; 8C1 = 8 GB RAM, 1 core; 80G1-A100 = A100 GPU. See also: amlt target list singularity -v
  priority: high
  sla_tier: premium
  command:
-  - python bin/train.py -o $$AMLT_OUTPUT_DIR/results
+  - python bin/train.py config_jsons/full_run.json -o $$AMLT_OUTPUT_DIR/results