Files
xet-core/scripts/diag/hf-xet-diag-linux.sh
Rajat Arya c0f7980616 feat: smoke tests using hf CLI with bucket and large-file coverage (#710)
## Summary

- Rewrites smoke tests to drive everything through the `hf` CLI rather
than the huggingface_hub Python API, covering the actual user-facing
surface area of hf-xet
- Moves smoke tests and diagnostic scripts into a `scripts/` directory
for cleaner repo layout
- Adds storage bucket test suite exercising the full bucket lifecycle
- Adds 50 MB and 100 MB files to repo upload/download tests

## Test matrix (14 tests, all passing)

**Repository tests** (`hf upload` / `hf download`)
- Upload single file, upload folder
- Download individual files + SHA-256 verify
- Download entire repo + SHA-256 verify
- Overwrite file and verify new content served
- Delete file and confirm absent

**Bucket tests** (`hf buckets`)
- `cp` upload / download + verify
- `sync` upload / download + verify
- Recursive list confirms expected paths
- Overwrite via `cp` + verify
- `sync --delete` removes extraneous remote files
- `rm` + confirm absent from listing

## Test plan
- [x] Run `HF_TOKEN=... ./scripts/smoke_tests/run.sh` and confirm all 14
tests pass
- [x] Run `./scripts/smoke_tests/run.sh --skip-buckets` for repo-only
path
- [x] Run with `--hf-xet-version <version>` to confirm PyPI cache bypass
works

🤖 Generated with [Claude Code](https://claude.com/claude-code)

---------

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-17 19:07:05 -07:00

298 lines
9.1 KiB
Bash
Executable File

#!/usr/bin/env bash
# hf-xet-diag-linux.sh — diagnostics wrapper for Linux hosts.
#
# Starts a target command, snapshots its stacks at a fixed cadence, applies a
# hang heuristic to those snapshots, and can write core dumps. When no output
# directory is requested, one is derived from the mangled command string so a
# result directory is easy to match to the run that produced it.

# Strict mode: stop on errors (-e), on unset variables (-u) and on a failing
# pipeline stage (pipefail); -E makes ERR traps apply inside functions too.
set -Eeuo pipefail

# Defaults (command-line options may override them)
OUTDIR=""            # output directory; derived from the command when unset
OUTDIR_SET=""        # becomes non-empty once -o/--outdir is given
INTERVAL=120         # seconds between stack snapshots
PRELOAD_HELPER=true  # preload the ptrace-bypass helper unless --no-preload
#######################################
# Print the command-line help text.
# Arguments: none
# Outputs:   the usage text on stdout, one line per entry below
#######################################
print_usage() {
  printf '%s\n' \
    'Usage: hf-xet-diag-linux.sh [options] -- <command> [args...]' \
    'Runs a target command, periodically snapshots stacks, detects hangs, and can dump cores.' \
    'Output directory defaults to include mangled command string for easy correlation.' \
    'Uses gdb for stack snapshots and hang detection.' \
    'Requires gdb, gcc, gcore. Install on Linux with:' \
    'sudo apt-get install gdb build-essential' \
    'Options:' \
    '-i, --interval SECONDS Stack snapshot cadence (default: 120)' \
    '-o, --outdir DIR Output directory (default: diag_<CMD>_<timestamp>)' \
    '--no-preload Do NOT preload ptrace-bypass helper' \
    '-h, --help Show this help' \
    'Examples:' \
    './hf-xet-diag-linux.sh -- python hfxet-test.py "Qwen/Qwen2.5-VL-3B-Instruct"' \
    './hf-xet-diag-linux.sh -i 30 -o diag -- ./server --port 8080'
}
# --- option parsing ---
#######################################
# Parse the leading options of the command line.
#
# Parsing stops at "--" (which is consumed) or at the first word that is not a
# known option (which is left in place as the start of the target command).
# A function cannot shift its caller's positional parameters, so the number of
# words consumed is reported through OPTS_CONSUMED and shifted by the caller.
#
# Globals:   INTERVAL, OUTDIR, OUTDIR_SET, PRELOAD_HELPER (written)
#            OPTS_CONSUMED (written) - count of leading words consumed
# Arguments: the script's command-line words
# Outputs:   error text on stderr; the usage text on stdout for -h/--help
# Returns:   0 on success, 2 on a missing or invalid option value;
#            exits 0 directly after printing help
#######################################
parse_options() {
  OPTS_CONSUMED=0
  while [[ $# -gt 0 ]]; do
    case "$1" in
      -i|--interval)
        # Without this check a trailing "-i" made `shift 2` fail, which under
        # errexit ended the script with no message at all. The digit limit
        # keeps the value inside shell integer range.
        if [[ $# -lt 2 || ! "${2-}" =~ ^[0-9]{1,9}$ ]]; then
          echo "ERROR: $1 requires a non-negative integer number of seconds." >&2
          return 2
        fi
        # Force base 10 so a value such as "030" is not read as octal later.
        INTERVAL=$((10#$2))
        shift 2
        OPTS_CONSUMED=$((OPTS_CONSUMED + 2))
        ;;
      -o|--outdir)
        # An empty value would make the later mkdir target "/stacks" and
        # "/dumps", so it is rejected together with a missing value.
        if [[ $# -lt 2 || -z "${2-}" ]]; then
          echo "ERROR: $1 requires a directory argument." >&2
          return 2
        fi
        OUTDIR="$2"
        OUTDIR_SET=1
        shift 2
        OPTS_CONSUMED=$((OPTS_CONSUMED + 2))
        ;;
      --no-preload)
        PRELOAD_HELPER=false
        shift
        OPTS_CONSUMED=$((OPTS_CONSUMED + 1))
        ;;
      -h|--help)
        print_usage
        exit 0
        ;;
      --)
        OPTS_CONSUMED=$((OPTS_CONSUMED + 1))
        break
        ;;
      *)
        break
        ;;
    esac
  done
  return 0
}
parse_options "$@" || exit 2
shift "$OPTS_CONSUMED"
# Whatever remains after option parsing is the target command; it is mandatory.
(( $# >= 1 )) || {
  echo "ERROR: No command provided."
  print_usage
  exit 2
}
CMD=( "$@" )

# Gather every required debugger tool that cannot be found, so that a single
# message names all of them before the script gives up.
missing=()
for cmd in gdb gcore; do
  command -v "$cmd" >/dev/null 2>&1 || missing+=("$cmd")
done
if (( ${#missing[@]} > 0 )); then
  printf '%s\n' \
    "Missing required tools: ${missing[*]}" \
    "" \
    "Requires gdb & gcore. Install on Linux with:" \
    " sudo apt-get install gdb"
  exit 2
fi
# If no outdir given, generate one based on command
#######################################
# Turn a command line into text that is safe to embed in a directory name.
#
# Every run of characters outside A-Za-z0-9._- (including the terminating
# newline) collapses into a single "_". The result is capped in length: a long
# command line would otherwise produce a name beyond the filesystem's
# per-component limit and make the mkdir below fail.
#
# Arguments: the command words
# Outputs:   the mangled string on stdout, without a trailing newline
#######################################
sanitize_cmd_for_dirname() {
  local -r max_len=120
  local mangled
  # printf instead of echo: the text is arbitrary and must not be parsed as
  # echo options.
  mangled=$(printf '%s\n' "$*" | tr -cs 'A-Za-z0-9._-' '_')
  if (( ${#mangled} > max_len )); then
    mangled="${mangled:0:max_len-1}"
    mangled="${mangled%_}_"   # keep the usual single trailing "_"
  fi
  printf '%s' "$mangled"
}
if [[ -z "$OUTDIR_SET" ]]; then
  CMD_STR="${CMD[*]}"
  SAFE_CMD=$(sanitize_cmd_for_dirname "$CMD_STR")
  OUTDIR="diag_${SAFE_CMD}_$(date +%Y%m%d%H%M%S)"
fi
# Layout: stacks/ holds the periodic snapshots, dumps/ the core dumps.
mkdir -p "$OUTDIR"/{stacks,dumps}
CONSOLE_LOG="$OUTDIR/console.log"
ENV_LOG="$OUTDIR/env.log"
PID_FILE="$OUTDIR/pid"
echo "Diagnostics output: $OUTDIR"
echo "Stack trace interval: ${INTERVAL}s"
echo "Command: ${CMD[*]}"
# --- collect some quick system info ---
# Best-effort snapshot of the host written to ENV_LOG: kernel identification,
# the CPU/memory summary lines of top, resource limits and the python version.
# Nothing in here may stop the run, hence the trailing `|| true` guards.
{
  printf '=== %s ===\n' "$(date -Is)"

  printf 'uname -a:\n'
  uname -a
  printf '\n'

  printf 'top snapshot:\n'
  top -b -n 1 | grep -E "^%Cpu|^MiB Mem|^MiB Swap" || true
  printf '\n'

  printf 'ulimit -a:\n'
  ulimit -a || true
  printf '\n'

  printf 'python version:\n'
  python -VV || true
  printf '\n'
} > "$ENV_LOG" 2>&1 || true
# --- ptrace helper build (optional) ---
# Shared object that gets LD_PRELOADed into the target. The name carries the
# effective UID so that users sharing /tmp do not compete for one file.
ALLOW_PTRACE_SO="/tmp/liballow_ptrace.${EUID}.so"

#######################################
# Build the ptrace-bypass preload library unless a trustworthy copy exists.
#
# The library's constructor calls prctl(PR_SET_PTRACER, PR_SET_PTRACER_ANY).
# Because the file is later injected into the target through LD_PRELOAD, an
# existing file at the predictable /tmp path is reused only when it is a
# regular, non-symlink file owned by the current user; anything else could
# have been planted by another local user. The source is fed to gcc on stdin
# and the output goes through a mktemp name, so no predictable temporary file
# is ever written.
#
# Globals:   PRELOAD_HELPER (read; set to false when the path holds an
#              untrusted file or the built library cannot be installed)
#            ALLOW_PTRACE_SO, CONSOLE_LOG (read)
# Arguments: none
# Outputs:   progress and warnings on stdout and appended to CONSOLE_LOG
# Returns:   0 always; the helper is optional
#######################################
maybe_build_ptrace_helper() {
  local tmp_so

  [[ "$PRELOAD_HELPER" == true ]] || return 0

  if [[ -e "$ALLOW_PTRACE_SO" || -L "$ALLOW_PTRACE_SO" ]]; then
    if [[ -f "$ALLOW_PTRACE_SO" && ! -L "$ALLOW_PTRACE_SO" && -O "$ALLOW_PTRACE_SO" ]]; then
      return 0   # built by this user on an earlier run
    fi
    echo "Warning: $ALLOW_PTRACE_SO is not a regular file owned by the current user; not preloading it." | tee -a "$CONSOLE_LOG"
    # The launch step preloads any file found at this path, so switch the
    # feature off rather than merely skipping the build.
    PRELOAD_HELPER=false
    return 0
  fi

  if ! command -v gcc >/dev/null 2>&1; then
    echo "Note: gcc not found; skipping ptrace helper." | tee -a "$CONSOLE_LOG"
    return 0
  fi

  if ! tmp_so=$(mktemp "${ALLOW_PTRACE_SO}.XXXXXX" 2>>"$CONSOLE_LOG"); then
    echo "Warning: failed to build ptrace helper; proceeding without." | tee -a "$CONSOLE_LOG"
    return 0
  fi

  if ! gcc -shared -fPIC -O2 -x c -o "$tmp_so" - 2>>"$CONSOLE_LOG" <<'EOF'
#define _GNU_SOURCE
#include <sys/prctl.h>
#ifndef PR_SET_PTRACER
#define PR_SET_PTRACER 0x59616d61
#endif
#ifndef PR_SET_PTRACER_ANY
#define PR_SET_PTRACER_ANY ((unsigned long)-1)
#endif
__attribute__((constructor))
static void allow_ptrace_ctor(void) {
    prctl(PR_SET_PTRACER, PR_SET_PTRACER_ANY, 0, 0, 0);
}
EOF
  then
    rm -f -- "$tmp_so"
    echo "Warning: failed to build ptrace helper; proceeding without." | tee -a "$CONSOLE_LOG"
    return 0
  fi

  # Publish the finished library under its final name in one step. If that
  # fails, something else occupies the path, and it must not be preloaded.
  if ! mv -f -- "$tmp_so" "$ALLOW_PTRACE_SO" 2>>"$CONSOLE_LOG"; then
    rm -f -- "$tmp_so"
    echo "Warning: could not install ptrace helper at $ALLOW_PTRACE_SO; proceeding without." | tee -a "$CONSOLE_LOG"
    PRELOAD_HELPER=false
    return 0
  fi

  echo "Built ptrace helper: $ALLOW_PTRACE_SO" | tee -a "$CONSOLE_LOG"
}
maybe_build_ptrace_helper
# --- download hf-xet dbg symbols ---
# Downloads the debug-symbol archive of the xet-core release that matches the
# installed hf-xet wheel and copies the wheel's .so.dbg file into the installed
# hf_xet package directory. A "symbols-<version>" directory in the current
# directory marks an earlier successful install and skips the download.
# Exits 1 when hf-xet is not installed or the symbols cannot be installed.

# `pip show` exits non-zero for a missing package. With errexit and pipefail
# that would end the script inside this assignment, before the explanatory
# message below, so the failure is absorbed here and detected via the empty
# result instead.
WHEEL_VERSION=$(pip show hf-xet | awk '/^Version:/{printf "%s", $2}' || true)
if [[ -z "$WHEEL_VERSION" ]]; then
  echo "Error: hf-xet package is not installed. Please install it before running this script." >&2
  exit 1
fi
echo "hf-xet wheel version: $WHEEL_VERSION"
SYMBOL_DIR="symbols-$WHEEL_VERSION"
if [[ -d "$SYMBOL_DIR" ]]; then
  echo "Existing symbols dir found, assuming previously installed."
else
  # Take the whole remainder of the Location line so that an install path
  # containing spaces is not cut at the first one.
  SITE_PACKAGES=$(pip show hf-xet | awk '/^Location:/{sub(/^Location:[ \t]*/, ""); printf "%s", $0}')
  WHEEL_DIR="$SITE_PACKAGES/hf_xet"
  DIST_INFO="$SITE_PACKAGES/hf_xet-$WHEEL_VERSION.dist-info"
  WHEEL_FILE="$DIST_INFO/WHEEL"
  # Reconstruct wheel name from wheel version and wheel tag
  # NOTE(review): if WHEEL lists several "Tag:" lines their values are joined
  # without a separator, exactly as before — confirm against the names used
  # inside dbg-symbols.zip.
  WHEEL_TAG=$(awk '/^Tag:/{printf "%s", $2}' "$WHEEL_FILE")
  SYMBOL_FILENAME="hf_xet-$WHEEL_VERSION-$WHEEL_TAG.so.dbg"
  echo "Downloading debug symbols: $SYMBOL_FILENAME"
  # If the version is of format "1.1.10rc0", change it to our release tag format like "1.1.10-rc0"
  RELEASE_TAG=$(printf '%s' "$WHEEL_VERSION" | sed 's/\([0-9]\)\(rc.*\)$/\1-\2/')
  DOWNLOAD_URL="https://github.com/huggingface/xet-core/releases/download/v${RELEASE_TAG}/dbg-symbols.zip"
  # Test curl directly: under errexit a separate `$?` check is never reached.
  if ! curl -fL "$DOWNLOAD_URL" -o dbg-symbols.zip; then
    echo "Error: Failed to download debug symbols from $DOWNLOAD_URL" >&2
    exit 1
  fi
  # Unpack the archive, then copy the one symbol file this wheel needs into the
  # package directory. On failure the partially filled directory is removed so
  # the next run retries instead of assuming the symbols are installed.
  if ! unzip dbg-symbols.zip -d "$SYMBOL_DIR" ||
     ! cp -r "$SYMBOL_DIR/dbg-symbols/$SYMBOL_FILENAME" "$WHEEL_DIR/"; then
    rm -rf -- "${SYMBOL_DIR:?}"
    echo "Error: Failed to install debug symbol $SYMBOL_FILENAME into $WHEEL_DIR" >&2
    exit 1
  fi
  echo "Installed dbg symbol $SYMBOL_FILENAME to $WHEEL_DIR"
fi
# --- launch target ---
# Starts the target in the background with its stdout and stderr duplicated
# into CONSOLE_LOG through tee. The target is started inside a subshell that
# feeds the pipe, so the subshell writes the target's PID to PID_FILE and this
# shell polls that file to learn it.
SCRIPT_START_TIME=$(date +%s)
echo "Launching target at $(date -Is) ..." | tee -a "$CONSOLE_LOG"
LAUNCH_ENV=()
if [[ "$PRELOAD_HELPER" == true && -f "$ALLOW_PTRACE_SO" ]]; then
  # Put the helper first and keep whatever LD_PRELOAD the caller already set.
  LAUNCH_ENV=( "LD_PRELOAD=$ALLOW_PTRACE_SO${LD_PRELOAD:+:$LD_PRELOAD}" )
  echo "Using LD_PRELOAD=$ALLOW_PTRACE_SO to relax ptrace restrictions." | tee -a "$CONSOLE_LOG"
fi
# A PID file left behind by an earlier run into the same output directory
# (e.g. a repeated "-o diag") would satisfy the polling loop below at once and
# make the script monitor a stale PID; drop it before launching.
rm -f -- "$PID_FILE"
if [[ ${#LAUNCH_ENV[@]} -gt 0 ]]; then
  (
    env "${LAUNCH_ENV[@]}" "${CMD[@]}" & echo $! > "$PID_FILE"
  ) 2>&1 | tee -a "$CONSOLE_LOG" &
else
  (
    "${CMD[@]}" & echo $! > "$PID_FILE"
  ) 2>&1 | tee -a "$CONSOLE_LOG" &
fi
LOGGER_BG=$!   # the background logging pipeline, released at the end of the script
# read PID (wait up to ~5 seconds for the subshell to write it)
for _ in {1..50}; do
  [[ -s "$PID_FILE" ]] && break
  sleep 0.1
done
if [[ ! -s "$PID_FILE" ]]; then
  echo "ERROR: Could not determine child PID." | tee -a "$CONSOLE_LOG"
  exit 1
fi
TARGET_PID="$(cat "$PID_FILE")"
echo "Started PID: $TARGET_PID" | tee -a "$CONSOLE_LOG"
# --- stack capture + hang detection ---
# Paths of the most recent stack snapshots, oldest first; capped at three.
declare -a LAST_STACKS=()

#######################################
# Write one stack snapshot of the target and run the hang heuristic.
#
# The first available tool among gdb, eu-stack and pstack is used; with none of
# them present the per-task /proc stack files are concatenated instead. A
# failing tool never aborts the capture: its output, error text included, is
# what ends up in the snapshot file.
#
# Globals:   OUTDIR, TARGET_PID, CONSOLE_LOG (read); LAST_STACKS (updated)
# Arguments: none
# Outputs:   a "captured stack" line on stdout and in CONSOLE_LOG
# Returns:   the status of check_hang
#######################################
capture_stack() {
  local stamp snapshot candidate task_stack
  local backend=""

  stamp="$(date +%Y%m%d%H%M%S)"
  snapshot="$OUTDIR/stacks/stack_${stamp}.txt"

  # Preference order of the stack tools.
  for candidate in gdb eu-stack pstack; do
    if command -v "$candidate" >/dev/null 2>&1; then
      backend="$candidate"
      break
    fi
  done

  case "$backend" in
    gdb)
      gdb -p "$TARGET_PID" -batch \
        -ex "set pagination off" \
        -ex "thread apply all bt full" >"$snapshot" 2>&1 || true
      ;;
    eu-stack)
      eu-stack -p "$TARGET_PID" >"$snapshot" 2>&1 || true
      ;;
    pstack)
      pstack "$TARGET_PID" >"$snapshot" 2>&1 || true
      ;;
    *)
      for task_stack in /proc/"$TARGET_PID"/task/*/stack; do
        echo "=== $task_stack ==="
        cat "$task_stack"
        echo
      done >"$snapshot" 2>&1 || true
      ;;
  esac

  echo "$(date -Is) captured stack -> $snapshot" | tee -a "$CONSOLE_LOG"

  # Remember the snapshot and drop the oldest entries beyond three.
  LAST_STACKS+=("$snapshot")
  while (( ${#LAST_STACKS[@]} > 3 )); do
    LAST_STACKS=("${LAST_STACKS[@]:1}")
  done

  check_hang
}
#######################################
# Hang heuristic over the three most recent stack snapshots.
#
# Each snapshot is normalized by deleting hexadecimal addresses and blank
# lines. When all three normalized snapshots are identical the target is taken
# to be hung: a core dump is requested and the history is cleared so the next
# verdict needs three fresh snapshots.
#
# A snapshot without a single frame line ("#<n>" as printed by gdb, eu-stack
# and pstack, or "[<" as found in /proc/<pid>/task/*/stack) is a failed
# capture. Three identical error messages such as a ptrace denial, or three
# empty files, are therefore not reported as a hang.
#
# Globals:   LAST_STACKS (read; cleared after a hang), CONSOLE_LOG (read)
# Arguments: none
# Outputs:   a warning line on stdout and in CONSOLE_LOG when a hang is seen
# Returns:   0
#######################################
check_hang() {
  local snapshot
  local -a normalized=()

  (( ${#LAST_STACKS[@]} >= 3 )) || return 0

  for snapshot in "${LAST_STACKS[@]:0:3}"; do
    # No frames captured: there is nothing to base a verdict on.
    grep -Eq '^(#[0-9]+|\[<)' "$snapshot" 2>/dev/null || return 0
    normalized+=("$(sed 's/0x[0-9a-f]\+//g' "$snapshot" | grep -v '^$' || true)")
  done

  # if no diff between stacks 1-2 and 2-3, we have a hang
  if [[ "${normalized[0]}" != "${normalized[1]}" ||
        "${normalized[1]}" != "${normalized[2]}" ]]; then
    return 0
  fi

  echo "⚠️ Hang heuristic triggered at $(date -Is)" | tee -a "$CONSOLE_LOG"
  take_core_dump
  LAST_STACKS=()
}
#######################################
# Write a core dump of the target into $OUTDIR/dumps (best effort).
#
# Uses gcore when it is available; otherwise falls back to copying up to 50 MiB
# from /proc/<pid>/mem. The outcome is reported truthfully: the "saved"
# message appears only when the dumping command succeeded.
#
# Globals:   OUTDIR, TARGET_PID, CONSOLE_LOG (read)
# Arguments: none
# Outputs:   status lines on stdout and in CONSOLE_LOG; tool output is
#            appended to CONSOLE_LOG
# Returns:   0 always, so a failed dump never stops the monitoring
#######################################
take_core_dump() {
  local stamp core_prefix
  stamp="$(date +%Y%m%d%H%M%S)"
  core_prefix="$OUTDIR/dumps/core_${stamp}"

  if command -v gcore >/dev/null 2>&1; then
    # gcore appends ".<pid>" to the prefix given with -o.
    if gcore -o "$core_prefix" "$TARGET_PID" >>"$CONSOLE_LOG" 2>&1; then
      echo "Core dump saved: ${core_prefix}.${TARGET_PID}" | tee -a "$CONSOLE_LOG"
    else
      echo "Warning: gcore failed for PID $TARGET_PID; no core dump saved (details in $CONSOLE_LOG)" | tee -a "$CONSOLE_LOG"
    fi
  else
    echo "gcore not available, saving partial /proc/$TARGET_PID/mem dump" | tee -a "$CONSOLE_LOG"
    # NOTE(review): this reads from offset 0 of the address space, which is
    # usually unmapped, so the fallback may well fail — confirm it is useful.
    if dd if="/proc/$TARGET_PID/mem" of="${core_prefix}.raw" bs=1M count=50 status=none 2>>"$CONSOLE_LOG"; then
      echo "Partial raw dump saved: ${core_prefix}.raw" | tee -a "$CONSOLE_LOG"
    else
      echo "Warning: could not read /proc/$TARGET_PID/mem; raw dump is incomplete or missing (details in $CONSOLE_LOG)" | tee -a "$CONSOLE_LOG"
    fi
  fi
  return 0
}
# --- monitoring loop ---
# Poll once per second for as long as the target exists. A snapshot is taken
# whenever at least INTERVAL seconds have passed since the previous one; the
# initial value 0 makes the very first pass take one right away.
LAST_SNAPSHOT_AT=0
while true; do
  # Signal 0 only probes for existence; nothing is delivered to the target.
  kill -0 "$TARGET_PID" 2>/dev/null || break
  now=$(date +%s)
  if (( now >= LAST_SNAPSHOT_AT + INTERVAL )); then
    # A failed snapshot must not end the monitoring.
    capture_stack || true
    LAST_SNAPSHOT_AT=$now
  fi
  sleep 1
done
printf '%s\n' "Process $TARGET_PID has exited at $(date -Is)." | tee -a "$CONSOLE_LOG"
# --- collect xet log files from this execution ---
#######################################
# Copy the xet log files modified after a point in time into $OUTDIR/xet_logs.
#
# Collection is best effort. Previously a failing find, or a failed copy of
# the last file, made the find|while pipeline fail under errexit + pipefail and
# ended the script before its final summary line. File names are read
# NUL-delimited so that unusual names survive intact.
#
# Globals:   OUTDIR, CONSOLE_LOG (read)
# Arguments: $1 - directory to search (recursively)
#            $2 - epoch seconds; only files modified after it are copied
# Outputs:   progress lines on stdout and in CONSOLE_LOG
# Returns:   0 always
#######################################
collect_xet_logs() {
  local log_dir=$1
  local since_epoch=$2
  local logfile

  if [[ ! -d "$log_dir" ]]; then
    echo "No xet log directory found at $log_dir" | tee -a "$CONSOLE_LOG"
    return 0
  fi

  echo "Collecting xet logs from $log_dir ..." | tee -a "$CONSOLE_LOG"
  if ! mkdir -p "$OUTDIR/xet_logs"; then
    echo " Warning: could not create $OUTDIR/xet_logs" | tee -a "$CONSOLE_LOG"
    return 0
  fi

  # Find log files created during or after script start time using GNU find
  while IFS= read -r -d '' logfile; do
    if cp -- "$logfile" "$OUTDIR/xet_logs/" 2>/dev/null; then
      echo " Copied: ${logfile##*/}" | tee -a "$CONSOLE_LOG"
    else
      echo " Warning: could not copy $logfile" | tee -a "$CONSOLE_LOG"
    fi
  done < <(find "$log_dir" -name "xet_*.log" -type f -newermt "@$since_epoch" -print0 2>/dev/null)
  return 0
}
HF_HOME="${HF_HOME:-$HOME/.cache/huggingface}"
XET_LOG_DIR="$HF_HOME/xet/logs"
collect_xet_logs "$XET_LOG_DIR" "$SCRIPT_START_TIME"
echo "Logs and stacks are in: $OUTDIR"
# Release the background logging pipeline instead of waiting for it.
disown "$LOGGER_BG" 2>/dev/null || true