#!/usr/bin/env bash
# hf-xet-diag-macos.sh — macOS-only diagnostics runner
#
# Runs a target command, periodically snapshots stacks with `sample`,
# detects hangs, and can dump cores with `lldb`.
# Installs hf-xet debug symbols if available.
# Output directory defaults to include mangled command string for easy
# correlation.

set -Eeuo pipefail

# Defaults
INTERVAL=120
OUTDIR=""
OUTDIR_SET=""

# Print the command-line help text to stdout.
print_usage() {
  cat <<'USAGE'
Usage: hf-xet-diag-macos.sh [options] -- <command> [args...]

Runs a target command, periodically snapshots stacks (via `sample`),
detects hangs, and can dump cores (via `lldb`).
Also installs hf-xet debug symbols if available.

Requires: sample, lldb (Xcode Command Line Tools), curl, unzip, pip

Options:
  -i, --interval SECONDS   Stack snapshot cadence (default: 120)
  -o, --outdir DIR         Output directory (default: diag_<command>_<timestamp>)
  -h, --help               Show this help

Examples:
  ./hf-xet-diag-macos.sh -- python hfxet-test.py "Qwen/Qwen2.5-VL-3B-Instruct"
  ./hf-xet-diag-macos.sh -i 60 -- ./myapp --flag
USAGE
}

# Report a command-line error on stderr, show the usage text, and exit 2.
# Arguments: $* - error message
usage_error() {
  printf '%s\n' "$*" >&2
  print_usage >&2
  exit 2
}

# Write a message to stdout and append it to the console log.
# Globals:   CONSOLE_LOG (read) — must be set before the first call.
# Arguments: $* - message
log() {
  printf '%s\n' "$*" | tee -a "$CONSOLE_LOG"
}

# --- option parsing ---
while [[ $# -gt 0 ]]; do
  case "$1" in
    -i|--interval)
      # Check arity explicitly: a bare `shift 2` would abort silently under -e.
      [[ $# -ge 2 ]] || usage_error "ERROR: $1 requires an argument."
      INTERVAL="$2"
      shift 2
      ;;
    -o|--outdir)
      # An empty DIR would make the later mkdir operate on "/stacks", "/dumps".
      [[ $# -ge 2 && -n "$2" ]] || usage_error "ERROR: $1 requires a non-empty argument."
      OUTDIR="$2"
      OUTDIR_SET=1
      shift 2
      ;;
    -h|--help)
      print_usage
      exit 0
      ;;
    --)
      shift
      break
      ;;
    *)
      break
      ;;
  esac
done

# INTERVAL feeds an arithmetic comparison in the monitoring loop, so it has to
# be a plain positive decimal (no leading zero, which bash would read as octal).
if ! [[ "$INTERVAL" =~ ^[1-9][0-9]*$ ]]; then
  usage_error "ERROR: --interval must be a positive integer (got '$INTERVAL')."
fi

if [[ $# -lt 1 ]]; then
  usage_error "ERROR: No command provided."
fi
CMD=( "$@" )

# Tool availability check
missing=()
for cmd in sample lldb curl unzip; do
  if ! command -v "$cmd" >/dev/null 2>&1; then
    missing+=("$cmd")
  fi
done
if [[ ${#missing[@]} -ne 0 ]]; then
  echo "Missing required tools: ${missing[*]}"
  echo "Install Xcode Command Line Tools with:"
  echo "  xcode-select --install"
  exit 2
fi

# If no outdir given, generate one based on command
if [[ -z "$OUTDIR_SET" ]]; then
  # printf (not echo) so no trailing newline gets mangled into an extra "_";
  # LC_ALL=C makes tr operate byte-wise regardless of the user's locale.
  SAFE_CMD=$(printf '%s' "${CMD[*]}" | LC_ALL=C tr -c 'A-Za-z0-9' '_')
  # Cap the command part so a long command line cannot exceed the filesystem's
  # per-component name limit.
  OUTDIR="diag_${SAFE_CMD:0:100}_$(date +%Y%m%d%H%M%S)"
fi

mkdir -p "$OUTDIR/stacks" "$OUTDIR/dumps"
CONSOLE_LOG="$OUTDIR/console.log"
ENV_LOG="$OUTDIR/env.log"
PID_FILE="$OUTDIR/pid"

echo "Diagnostics output: $OUTDIR"
echo "Stack trace interval: ${INTERVAL}s"
echo "Command: ${CMD[*]}"

# --- collect some quick system info ---
# Best effort: any failure in here must not stop the diagnostics run.
{
  echo "=== $(date "+%Y-%m-%dT%H:%M:%S%z") ==="
  echo "uname -a:"; uname -a
  echo
  echo "top snapshot:"; top -l 1 | grep -E "^CPU|^Phys" || true
  echo
  echo "ulimit -a:"; ulimit -a || true
  echo
  echo "python version:"; python3 -VV || true
  echo
} > "$ENV_LOG" 2>&1 || true

# --- download hf-xet dbg symbols ---
# Query pip once. `|| true` keeps -e/pipefail from aborting before the
# explicit "not installed" message below can be printed.
PIP_INFO="$(pip show hf-xet || true)"
WHEEL_VERSION="$(awk '/^Version:/{print $2; exit}' <<<"$PIP_INFO")"
if [[ -z "$WHEEL_VERSION" ]]; then
  echo "Error: hf-xet package is not installed. Please install it before running this script." >&2
  exit 1
fi
echo "hf-xet wheel version: $WHEEL_VERSION"

SYMBOL_DIR="symbols-$WHEEL_VERSION"
if [[ -d "$SYMBOL_DIR" ]]; then
  echo "Existing symbols dir found, assuming previously installed."
else
  # Strip the field label instead of taking $2 so a path with spaces survives.
  SITE_PACKAGES="$(awk '/^Location:/{sub(/^Location:[ \t]*/, ""); print; exit}' <<<"$PIP_INFO")"
  WHEEL_DIR="$SITE_PACKAGES/hf_xet"
  DIST_INFO="$SITE_PACKAGES/hf_xet-$WHEEL_VERSION.dist-info"
  WHEEL_FILE="$DIST_INFO/WHEEL"
  if [[ ! -f "$WHEEL_FILE" ]]; then
    echo "Error: wheel metadata not found at $WHEEL_FILE" >&2
    exit 1
  fi

  # Reconstruct wheel name from wheel version and wheel tag.
  # Only the first Tag: line is used; concatenating several would not form a
  # usable file name.
  WHEEL_TAG="$(awk '/^Tag:/{print $2; exit}' "$WHEEL_FILE")"
  SYMBOL_FILENAME="hf_xet-$WHEEL_VERSION-$WHEEL_TAG.dylib.dSYM"
  echo "Downloading debug symbols: $SYMBOL_FILENAME"

  # If the version is of format "1.1.10rc0", change it to our release tag
  # format like "1.1.10-rc0"
  RELEASE_TAG="$(printf '%s' "$WHEEL_VERSION" | sed 's/\([0-9]\)\(rc.*\)$/\1-\2/')"
  DOWNLOAD_URL="https://github.com/huggingface/xet-core/releases/download/v${RELEASE_TAG}/dbg-symbols.zip"

  # Test curl directly: under `set -e` a separate `$?` check is never reached.
  if ! curl -fL "$DOWNLOAD_URL" -o dbg-symbols.zip; then
    echo "Error: Failed to download debug symbols from $DOWNLOAD_URL" >&2
    exit 1
  fi

  # Extract the archive and copy the needed symbol file to the package
  # directory. The existence of $SYMBOL_DIR is what marks symbols as installed
  # on later runs, so remove it again if either step fails.
  if ! unzip dbg-symbols.zip -d "$SYMBOL_DIR" \
    || ! cp -r "$SYMBOL_DIR/dbg-symbols/$SYMBOL_FILENAME" "$WHEEL_DIR/"; then
    rm -rf -- "${SYMBOL_DIR:?}"
    echo "Error: Failed to install debug symbol $SYMBOL_FILENAME" >&2
    exit 1
  fi
  echo "Installed dbg symbol $SYMBOL_FILENAME to $WHEEL_DIR"
fi

# --- launch target ---
SCRIPT_START_TIME=$(date +%s)
REF_FILE="$OUTDIR/.ref_timestamp"
touch "$REF_FILE"  # Reference file for finding logs created after this point

# Ensure REF_FILE is cleaned up on exit
trap 'rm -f "$REF_FILE"' EXIT

log "Launching target at $(date "+%Y-%m-%dT%H:%M:%S%z") ..."

# Drop any PID file left by an earlier run into the same outdir; otherwise the
# wait loop below would accept the stale PID immediately.
rm -f -- "$PID_FILE"

# The subshell backgrounds the target and records its PID; all target output
# is teed into the console log.
( "${CMD[@]}" & echo $! > "$PID_FILE" ) 2>&1 | tee -a "$CONSOLE_LOG" &
LOGGER_BG=$!

# read PID (poll for up to ~5 seconds)
for _ in {1..50}; do
  [[ -s "$PID_FILE" ]] && break
  sleep 0.1
done

if [[ ! -s "$PID_FILE" ]]; then
  log "ERROR: Could not determine child PID."
  exit 1
fi
TARGET_PID="$(cat "$PID_FILE")"
if ! [[ "$TARGET_PID" =~ ^[0-9]+$ ]]; then
  log "ERROR: Could not determine child PID."
  exit 1
fi
log "Started PID: $TARGET_PID"

# --- stack capture + hang detection ---
declare -a LAST_STACKS=()

# Print a stack file with hex addresses and empty lines removed.
# Arguments: $1 - path to a `sample` report
# Outputs:   normalized text on stdout (empty if nothing remains)
normalize_stack() {
  # -E gives a portable `+`; the `\+` BRE spelling is a GNU extension.
  # `|| true`: grep -v exits 1 when it prints nothing, which is not an error.
  sed -E 's/0x[0-9a-fA-F]+//g' "$1" | grep -v '^$' || true
}

# Attach lldb to the target and write a core file under $OUTDIR/dumps.
# Globals: OUTDIR, TARGET_PID, CONSOLE_LOG (read)
# Returns: always 0; failure is logged, not fatal.
take_core_dump() {
  local ts core_file
  ts="$(date +%Y%m%d%H%M%S)"
  core_file="$OUTDIR/dumps/dump_${TARGET_PID}_${ts}.core"
  # Only report success when a non-empty core file actually exists.
  if lldb -p "$TARGET_PID" -o "process save-core \"$core_file\"" -o "quit" \
      >>"$CONSOLE_LOG" 2>&1 && [[ -s "$core_file" ]]; then
    log "Core dump saved: $core_file"
  else
    log "WARNING: core dump failed; see lldb output in $CONSOLE_LOG"
  fi
}

# Declare a hang when the last three snapshots are identical after
# normalization, then take a core dump and reset the snapshot window.
# Globals: LAST_STACKS (read, reset on hang)
check_hang() {
  local norm1 norm2 norm3

  # need three snapshots to decide
  if (( ${#LAST_STACKS[@]} < 3 )); then
    return 0
  fi

  # NOTE(review): `sample` reports appear to carry per-capture header fields
  # (e.g. timestamps); if so, otherwise-identical stacks would still compare
  # as different here — confirm and filter the header if needed.
  norm1="$(normalize_stack "${LAST_STACKS[0]}")"
  norm2="$(normalize_stack "${LAST_STACKS[1]}")"
  norm3="$(normalize_stack "${LAST_STACKS[2]}")"

  # Any difference => stacks changed -> NOT a hang
  if [[ "$norm1" != "$norm2" || "$norm2" != "$norm3" ]]; then
    return 0
  fi

  # Stacks the same across 3 snapshots => HANG
  log "⚠️ Hang detected at $(date "+%Y-%m-%dT%H:%M:%S%z") — taking core dump."
  take_core_dump
  LAST_STACKS=()
}

# Take one 5-second `sample` of the target, keep a window of the last three
# snapshot files, and run hang detection.
# Globals: OUTDIR, TARGET_PID (read); LAST_STACKS (written)
# Returns: always 0; a failed capture is logged and skipped.
capture_stack() {
  local ts stack_file
  ts="$(date +%Y%m%d%H%M%S)"
  stack_file="$OUTDIR/stacks/stack_${ts}.txt"

  # A failed capture must not enter the window: three missing files would
  # normalize to identical empty strings and be reported as a hang.
  if ! sample "$TARGET_PID" 5 -file "$stack_file" || [[ ! -s "$stack_file" ]]; then
    log "$(date "+%Y-%m-%dT%H:%M:%S%z") stack capture failed for PID $TARGET_PID"
    return 0
  fi
  log "$(date "+%Y-%m-%dT%H:%M:%S%z") captured stack -> $stack_file"

  LAST_STACKS+=("$stack_file")
  if (( ${#LAST_STACKS[@]} > 3 )); then
    LAST_STACKS=("${LAST_STACKS[@]: -3}")
  fi
  check_hang
}

# --- monitoring loop ---
# LAST_SNAPSHOT_AT=0 makes the first snapshot happen right after launch.
LAST_SNAPSHOT_AT=0
while kill -0 "$TARGET_PID" 2>/dev/null; do
  now=$(date +%s)
  if (( now - LAST_SNAPSHOT_AT >= INTERVAL )); then
    capture_stack || true
    LAST_SNAPSHOT_AT=$now
  fi
  sleep 1
done

log "Process $TARGET_PID has exited at $(date "+%Y-%m-%dT%H:%M:%S%z")."

# --- collect xet log files from this execution ---
HF_HOME="${HF_HOME:-$HOME/.cache/huggingface}"
XET_LOG_DIR="$HF_HOME/xet/logs"

if [[ -d "$XET_LOG_DIR" ]]; then
  log "Collecting xet logs from $XET_LOG_DIR ..."
  mkdir -p "$OUTDIR/xet_logs"

  # Find log files created after script start using reference file.
  # Best effort: a find or cp error must not abort the script at this point,
  # so the loop reads from a process substitution instead of a pipefail'd pipe.
  while IFS= read -r logfile; do
    if cp "$logfile" "$OUTDIR/xet_logs/" 2>/dev/null; then
      log "  Copied: $(basename "$logfile")"
    fi
  done < <(find "$XET_LOG_DIR" -name "xet_*.log" -type f -newer "$REF_FILE" 2>/dev/null || true)
else
  log "No xet log directory found at $XET_LOG_DIR"
fi

echo "Logs and stacks are in: $OUTDIR"

# Detach the background logger pipeline so the shell does not report on it.
disown "$LOGGER_BG" 2>/dev/null || true