Files
foundry/scripts/slurm/launch.sh
Nathaniel Corley 5a492032d5 refactor: new modelhub (#109)
* Initial commit of chiral changes

Initial checkin of chiral feature code

Add chiral metric

* Update the way chiral features are incorporated into the model

Move initialization to new func

use default pytorch reset parameters

fix initialization for chirals

config

rename argument of confidence head

fix initialization for chirals

* refactor: src nest, rename rf2aa to modelhub

* refactor: initial commit without projects

* Initial commit of chiral changes

* Initial checkin of chiral feature code

* Add chiral metric

* Remove option for double residual connection. Add kq_norm options to base (20250125) config.

* Restoring flag

* config

* rename argument of confidence head

* Update the way chiral features are incorporated into the model

* config

* rename argument of confidence head

* Update the way chiral features are incorporated into the model

* Initial commit of chiral changes

Initial checkin of chiral feature code

Add chiral metric

* Update the way chiral features are incorporated into the model

Move initialization to new func

use default pytorch reset parameters

fix initialization for chirals

config

rename argument of confidence head

fix initialization for chirals

* refactor: new modelhub

---------

Co-authored-by: fdimaio <dimaio@uw.edu>
Co-authored-by: HaotianZhangAI4Science <haotianzhang@zju.edu.cn>
2025-04-08 13:33:17 -07:00

79 lines
3.1 KiB
Bash
Raw Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/bin/bash
#SBATCH -p gpu-train
#SBATCH --nodes 2
#SBATCH --gres=gpu:l40:8
#SBATCH --ntasks-per-node 8
#SBATCH --mem=512g
#SBATCH -t 7-00:00:00
#SBATCH -J af3-old-msas-pdb-only-experimental
#SBATCH -o slurm_logs/%x_%j.out
#SBATCH -e slurm_logs/%x_%j.err
#SBATCH --no-kill=off
### Excluded Nodes:
### To call this script run: `sbatch launch.sh` from this directory
### For reference, see the Lightning Fabric + SLURM guide: https://lightning.ai/docs/fabric/stable/guide/multi_node/slurm.html
# Clear variables that may be left over from a debug session so they cannot
# leak into this job (PROJECT_PATH is exported again further down)
unset DEBUG_PORT
unset PROJECT_PATH
# Rendezvous settings exported for the launched tasks.
# MASTER_PORT: a randomly chosen unprivileged port. NOTE: bash's $RANDOM is at
# most 32767, so the modulus never wraps and the effective range is 1024-33791;
# a clash between two jobs is unlikely but not impossible.
export MASTER_PORT=$((1024 + RANDOM % 64512))
# MASTER_ADDR: first hostname of the job's node list (intended to be the node
# hosting rank 0). Assignment is kept separate from `export` so the result can
# be checked; `export VAR=$(...)` would hide a failed lookup.
MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
if [[ -z "$MASTER_ADDR" ]]; then
  echo "ERROR: could not resolve MASTER_ADDR from SLURM_JOB_NODELIST='${SLURM_JOB_NODELIST:-}'" >&2
  exit 1
fi
export MASTER_ADDR
### Set custom paths
# WARNING: These are user-specific absolute paths; update them to match your local setup
# ... Python import roots for datahub, cifutils and modelhub
PYTHONPATH="/home/ncorley/projects/datahub/src:/home/ncorley/projects/cifutils/src:/home/ncorley/projects/modelhub/src"
# ... project path (if not using root src/modelhub)
PROJECT_PATH="/home/ncorley/projects/modelhub/projects/rfscore"
# ... cache directory for Triton kernels (e.g., DeepSpeed4Science fused kernels)
TRITON_CACHE_DIR="/home/ncorley/.triton" # Change this to a directory with write permissions
export PYTHONPATH PROJECT_PATH TRITON_CACHE_DIR
### Environment flags
# Debugging flags (optional): NCCL internal debugging, and Python fault handler
# output on core dumps (e.g., segmentation faults)
NCCL_DEBUG=INFO
PYTHONFAULTHANDLER=1
# Expand CUDA memory
PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
# Turn off NVLink (L40 do not have NVLink)
NCCL_P2P_DISABLE=1
export NCCL_DEBUG PYTHONFAULTHANDLER PYTORCH_CUDA_ALLOC_CONF NCCL_P2P_DISABLE
# OPENMP and OPENBLAS thread counts
# https://pytorch.org/tutorials/recipes/recipes/tuning_guide.html#utilize-openmp
# NOTE: Must be optimized per-system; see: https://github.com/pytorch/pytorch/blob/65e6194aeb3269a182cfe2c05c122159da12770f/torch/distributed/run.py#L596-L608
OMP_NUM_THREADS=4
OPENBLAS_NUM_THREADS=4
export OMP_NUM_THREADS OPENBLAS_NUM_THREADS
#######################################################################################################
### WARNING: The command below is just an example. It will fail if you don't update the experiment ###
### config in the command below. Please adapt according to your target experiment ###
#######################################################################################################
### Set the effective batch size
EFFECTIVE_BATCH_SIZE=16

#######################################
# Compute the number of gradient-accumulation steps needed to reach a target
# effective batch size across all devices.
# Arguments:
#   $1 - effective batch size
#   $2 - devices (tasks) per node
#   $3 - number of nodes
# Outputs:
#   Writes floor($1 / ($2 * $3)) to stdout; diagnostics go to stderr.
# Returns:
#   0 on success; 1 if any argument is not a positive integer, or if the
#   device count exceeds the effective batch size (the division would give 0).
#######################################
compute_grad_accum_steps() {
  local batch_size=$1
  local devices_per_node=$2
  local num_nodes=$3
  local value
  for value in "$batch_size" "$devices_per_node" "$num_nodes"; do
    # Also rejects empty strings and leading zeros (which bash would read as octal)
    if [[ ! "$value" =~ ^[1-9][0-9]*$ ]]; then
      echo "ERROR: expected a positive integer, got '$value'" >&2
      return 1
    fi
  done
  local world_size=$((devices_per_node * num_nodes))
  if ((world_size > batch_size)); then
    echo "ERROR: effective batch size $batch_size is smaller than the number of devices ($world_size); grad_accum_steps would be 0" >&2
    return 1
  fi
  if ((batch_size % world_size != 0)); then
    # Keep the floor result, but make the truncation visible
    echo "WARNING: effective batch size $batch_size is not divisible by $world_size devices; rounding grad_accum_steps down" >&2
  fi
  printf '%s\n' "$((batch_size / world_size))"
}

### Compose the training script
DEVICES_PER_NODE=${SLURM_NTASKS_PER_NODE:-8} # Default to 8 if not set
NUM_NODES=${SLURM_NNODES:-1}                 # Default to 1 if not set
echo "Running on $NUM_NODES nodes with $DEVICES_PER_NODE tasks per node"
### Calculate grad_accum_steps (abort rather than launch with an invalid value)
GRAD_ACCUM_STEPS=$(compute_grad_accum_steps "$EFFECTIVE_BATCH_SIZE" "$DEVICES_PER_NODE" "$NUM_NODES") || exit 1
echo "Grad Accumulation Steps: $GRAD_ACCUM_STEPS"
# Build the launch command as an array so every argument reaches srun as
# exactly one word. Expanding a flat string unquoted would re-split and glob
# any value containing whitespace or wildcard characters (e.g. the job name).
# The experiment config is selected by the SLURM job name.
launch_cmd=(
  srun --kill-on-bad-exit ../../src/modelhub/train.py
  "experiment=$SLURM_JOB_NAME"
  "++trainer.devices_per_node=$DEVICES_PER_NODE"
  "++trainer.num_nodes=$SLURM_NNODES"
  "++trainer.grad_accum_steps=$GRAD_ACCUM_STEPS"
)
# Log the command; a fixed printf format keeps backslashes in values literal
# (echo -e would interpret them)
printf 'command\t%s\n' "${launch_cmd[*]}"
# Let 'er rip (the script's exit status is srun's exit status)
"${launch_cmd[@]}"