mirror of
https://github.com/RosettaCommons/foundry.git
synced 2026-06-04 13:24:22 +08:00
* Initial commit of chiral changes Initial checkin of chiral feature code Add chiral metric * Update the way chiral features are incorporated into the model Move initialization to new func use default pytorch reset parameters fix initialization for chirals config rename argument of confidence head fix initialization for chirals * refactor: src nest, rename rf2aa to modelhub * refactor: initial commit without projects * Initial commit of chiral changes * Initial checkin of chiral feature code * Add chiral metric * Remove option for double residual connection. Add kq_norm oiptions to base (20250125) config. * Restoring flag * config * rename argument of confidence head * Update the way chiral features are incorporated into the model * config * rename argument of confidence head * Update the way chiral features are incorporated into the model * Initial commit of chiral changes Initial checkin of chiral feature code Add chiral metric * Update the way chiral features are incorporated into the model Move initialization to new func use default pytorch reset parameters fix initialization for chirals config rename argument of confidence head fix initialization for chirals * refactor: new modelhub --------- Co-authored-by: fdimaio <dimaio@uw.edu> Co-authored-by: HaotianZhangAI4Science <haotianzhang@zju.edu.cn>
79 lines
3.1 KiB
Bash
79 lines
3.1 KiB
Bash
#!/bin/bash
#SBATCH -p gpu-train
#SBATCH --nodes 2
#SBATCH --gres=gpu:l40:8
#SBATCH --ntasks-per-node 8
#SBATCH --mem=512g
#SBATCH -t 7-00:00:00
#SBATCH -J af3-old-msas-pdb-only-experimental
#SBATCH -o slurm_logs/%x_%j.out
#SBATCH -e slurm_logs/%x_%j.err
#SBATCH --no-kill=off

### Excluded Nodes:

### Multi-node SLURM launcher for a modelhub training run.
###
### To call this script run: `sbatch launch.sh` from this directory
### (the training entry point below is referenced by a relative path).
### For reference, see the Lightning Fabric + SLURM guide: https://lightning.ai/docs/fabric/stable/guide/multi_node/slurm.html
###
### NOTE(review): the -o/-e paths above point into slurm_logs/; presumably that
### directory has to exist before submission -- confirm for your cluster.

#######################################
# Compute the number of gradient-accumulation steps so that
#   devices_per_node * num_nodes * steps == effective batch size.
# Arguments:
#   $1 - effective (global) batch size
#   $2 - devices (tasks) per node
#   $3 - number of nodes
# Outputs:
#   Writes the step count to stdout; diagnostics go to stderr.
# Returns:
#   0 on success; 1 if an argument is not a positive integer or if the world
#   size exceeds the effective batch size (which would yield 0 steps).
#######################################
compute_grad_accum_steps() {
  local effective_batch_size=$1
  local devices_per_node=$2
  local num_nodes=$3
  local value

  # Reject empty/unset, zero, negative and non-numeric values up front so the
  # arithmetic below can never divide by zero. Leading zeros are rejected too,
  # since bash arithmetic would read them as octal.
  for value in "$effective_batch_size" "$devices_per_node" "$num_nodes"; do
    if [[ ! "$value" =~ ^[1-9][0-9]*$ ]]; then
      printf 'error: expected a positive integer, got "%s"\n' "$value" >&2
      return 1
    fi
  done

  local world_size=$((devices_per_node * num_nodes))
  if ((world_size > effective_batch_size)); then
    printf 'error: world size %d exceeds effective batch size %d\n' \
      "$world_size" "$effective_batch_size" >&2
    return 1
  fi

  # Integer division floors; keep that behaviour but make it visible.
  if ((effective_batch_size % world_size != 0)); then
    printf 'warning: effective batch size %d is not a multiple of world size %d; rounding down\n' \
      "$effective_batch_size" "$world_size" >&2
  fi

  printf '%d\n' "$((effective_batch_size / world_size))"
}

# (In case we're still running in debug mode)
unset DEBUG_PORT
unset PROJECT_PATH

# (SLURM setup: pick a randomised rendezvous port and use the first node of
# the allocation as the master address.)
# RANDOM is at most 32767, so the port lands in 1024..33791. It is random, not
# guaranteed unique, but collisions between concurrent jobs are unlikely.
export MASTER_PORT=$((1024 + RANDOM % 64512))

# Assign first, export afterwards, so a failing `scontrol` is not masked by
# the always-successful `export`.
if node_list=$(scontrol show hostnames "${SLURM_JOB_NODELIST:-}") && [[ -n "$node_list" ]]; then
  # First line of the expanded host list.
  MASTER_ADDR=${node_list%%$'\n'*}
else
  MASTER_ADDR=""
  printf 'warning: could not resolve MASTER_ADDR from SLURM_JOB_NODELIST\n' >&2
fi
export MASTER_ADDR

### Set custom paths
# WARNING: You will need to update these paths to match your local setup
# ... cifutils and datahub
export PYTHONPATH="/home/ncorley/projects/datahub/src:/home/ncorley/projects/cifutils/src:/home/ncorley/projects/modelhub/src"
# ... project path (if not using root src/modelhub)
export PROJECT_PATH="/home/ncorley/projects/modelhub/projects/rfscore"
# ... cache directory for Triton kernels (e.g., DeepSpeed4Science fused kernels)
export TRITON_CACHE_DIR="/home/ncorley/.triton" # Change this to a directory with write permissions

### Environment flags

# Debugging flags (optional)
export NCCL_DEBUG=INFO       # NCCL internal debugging
export PYTHONFAULTHANDLER=1  # Catches Python core dumps (e.g., segmentation faults)

# Expand CUDA memory
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True

# Turn off NVLink (L40 do not have NVLink)
export NCCL_P2P_DISABLE=1

# OPENMP and OPENBLAS optimizations
# https://pytorch.org/tutorials/recipes/recipes/tuning_guide.html#utilize-openmp
# NOTE: Must be optimized per-system; see: https://github.com/pytorch/pytorch/blob/65e6194aeb3269a182cfe2c05c122159da12770f/torch/distributed/run.py#L596-L608
export OMP_NUM_THREADS=4
export OPENBLAS_NUM_THREADS=4

#######################################################################################################
### WARNING: The command below is just an example. It will fail if you don't update the experiment ###
### config in the command below. Please adapt according to your target experiment                  ###
#######################################################################################################

### Set the effective batch size
EFFECTIVE_BATCH_SIZE=16

### Compose the training script
DEVICES_PER_NODE=${SLURM_NTASKS_PER_NODE:-8} # Default to 8 if not set
NUM_NODES=${SLURM_NNODES:-1}                 # Default to a single node if not set
echo "Running on $NUM_NODES nodes with $DEVICES_PER_NODE tasks per node"

### Calculate grad_accum_steps
GRAD_ACCUM_STEPS=$(compute_grad_accum_steps \
  "$EFFECTIVE_BATCH_SIZE" "$DEVICES_PER_NODE" "$NUM_NODES") || exit 1
echo "Grad Accumulation Steps: $GRAD_ACCUM_STEPS"

# Build the command as an argv array so every argument stays exactly one word,
# regardless of what the SLURM-provided values contain.
launch_cmd=(
  srun --kill-on-bad-exit ../../src/modelhub/train.py
  "experiment=${SLURM_JOB_NAME:-}"
  "++trainer.devices_per_node=$DEVICES_PER_NODE"
  "++trainer.num_nodes=$NUM_NODES"
  "++trainer.grad_accum_steps=$GRAD_ACCUM_STEPS"
)

printf 'command\t%s\n' "${launch_cmd[*]}"

# Let 'er rip (kept as the last command so its exit status becomes the job's)
"${launch_cmd[@]}"