#!/bin/bash
#SBATCH -p gpu-train
#SBATCH --nodes 2
#SBATCH --gres=gpu:l40:8
#SBATCH --ntasks-per-node 8
#SBATCH --mem=512g
#SBATCH -t 7-00:00:00
#SBATCH -J af3-old-msas-pdb-only-experimental
#SBATCH -o slurm_logs/%x_%j.out
#SBATCH -e slurm_logs/%x_%j.err
#SBATCH --no-kill=off

### Excluded Nodes:

### To call this script, run `sbatch launch.sh` from this directory (create `slurm_logs/` first: SLURM does not create the directory used by -o/-e)
### For reference, see the Lightning Fabric + SLURM guide: https://lightning.ai/docs/fabric/stable/guide/multi_node/slurm.html

# Drop variables that may have been inherited from a debug session.
unset DEBUG_PORT PROJECT_PATH

# Rendezvous settings for the distributed job.
# The port is randomized per submission (not guaranteed unique): $RANDOM is at
# most 32767, so the value always falls in 1024-33791.
MASTER_PORT=$((RANDOM % 64512 + 1024))
export MASTER_PORT
# The first hostname of the allocation acts as the master address (rank 0).
MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
export MASTER_ADDR

### Site-specific paths
# WARNING: These are absolute paths from one user's setup; edit them to match yours.

# Import roots for the datahub, cifutils and modelhub sources.
# NOTE: this replaces any PYTHONPATH inherited from the submitting shell.
PYTHONPATH="/home/ncorley/projects/datahub/src:/home/ncorley/projects/cifutils/src:/home/ncorley/projects/modelhub/src"

# Project directory (only needed when not using the root src/modelhub).
PROJECT_PATH="/home/ncorley/projects/modelhub/projects/rfscore"

# Cache directory for Triton kernels (e.g., DeepSpeed4Science fused kernels);
# point this at a directory you can write to.
TRITON_CACHE_DIR="/home/ncorley/.triton"

export PYTHONPATH PROJECT_PATH TRITON_CACHE_DIR

### Environment flags

# Optional diagnostics:
#   NCCL_DEBUG=INFO        -> verbose NCCL logging
#   PYTHONFAULTHANDLER=1   -> Python prints a traceback on fatal signals (e.g., segfaults)
export NCCL_DEBUG=INFO \
       PYTHONFAULTHANDLER=1

# Let the CUDA caching allocator use expandable segments.
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True

# Disable NCCL's peer-to-peer transport (direct GPU-to-GPU access over NVLink or
# PCIe). Rationale from the original setup: L40 GPUs have no NVLink.
export NCCL_P2P_DISABLE=1

# Thread counts for OpenMP and OpenBLAS.
# https://pytorch.org/tutorials/recipes/recipes/tuning_guide.html#utilize-openmp
# NOTE: Tune per system; see: https://github.com/pytorch/pytorch/blob/65e6194aeb3269a182cfe2c05c122159da12770f/torch/distributed/run.py#L596-L608
cpu_threads_per_task=4
export OMP_NUM_THREADS=$cpu_threads_per_task \
       OPENBLAS_NUM_THREADS=$cpu_threads_per_task

#######################################################################################################
### WARNING: The command below is just an example. It will fail if you don't update the experiment  ###
###          config in the command below. Please adapt according to your target experiment          ###
#######################################################################################################

### Set the effective batch size (the target that gradient accumulation is sized for)
EFFECTIVE_BATCH_SIZE=16

#######################################
# Derive the number of gradient-accumulation steps for a target batch size.
# Arguments:
#   $1 - effective batch size
#   $2 - devices (tasks) per node
#   $3 - number of nodes
# Outputs:
#   Writes floor($1 / ($2 * $3)) to stdout; diagnostics go to stderr.
# Returns:
#   0 on success; 1 if an argument is not a positive integer, or if the world
#   size exceeds the effective batch size (which would give 0 steps).
#######################################
compute_grad_accum_steps() {
  local batch_size=$1 devices=$2 nodes=$3
  local value
  for value in "$batch_size" "$devices" "$nodes"; do
    # Reject empty, zero, negative and non-numeric values up front; this also
    # rules out a division by zero below.
    if ! [[ "$value" =~ ^[1-9][0-9]*$ ]]; then
      printf 'ERROR: expected a positive integer, got "%s"\n' "$value" >&2
      return 1
    fi
  done

  local world_size=$((devices * nodes))
  local steps=$((batch_size / world_size))
  if ((steps < 1)); then
    printf 'ERROR: world size %d exceeds effective batch size %d (grad_accum_steps would be 0)\n' \
      "$world_size" "$batch_size" >&2
    return 1
  fi
  if ((batch_size % world_size != 0)); then
    # Integer division truncates; make the silent mismatch visible.
    printf 'WARNING: effective batch size %d is not a multiple of world size %d; rounding down to %d steps (%d samples per optimizer step)\n' \
      "$batch_size" "$world_size" "$steps" "$((steps * world_size))" >&2
  fi
  printf '%s\n' "$steps"
}

### Compose the training script
DEVICES_PER_NODE=${SLURM_NTASKS_PER_NODE:-8}  # Default to 8 if not set
NUM_NODES=${SLURM_NNODES:-1}                  # Default to 1 if not set
echo "Running on $NUM_NODES nodes with $DEVICES_PER_NODE tasks per node"

### Calculate grad_accum_steps (abort rather than launch with an invalid value)
GRAD_ACCUM_STEPS=$(compute_grad_accum_steps "$EFFECTIVE_BATCH_SIZE" "$DEVICES_PER_NODE" "$NUM_NODES") || exit 1
echo "Grad Accumulation Steps: $GRAD_ACCUM_STEPS"

# Build the launch command as an array so each argument reaches srun as exactly
# one word, even if a value (e.g., a job name overridden via `sbatch -J`)
# contains whitespace or glob characters.
launch_cmd=(
  srun --kill-on-bad-exit ../../src/modelhub/train.py
  "experiment=$SLURM_JOB_NAME"
  "++trainer.devices_per_node=$DEVICES_PER_NODE"
  "++trainer.num_nodes=$SLURM_NNODES"
  "++trainer.grad_accum_steps=$GRAD_ACCUM_STEPS"
)

# Log the command before running it (arguments joined by single spaces).
printf 'command\t%s\n' "${launch_cmd[*]}"

# Let 'er rip (kept as the last command so the script exits with srun's status)
"${launch_cmd[@]}"