Files
xet-core/scripts/diag/hf-xet-diag-windows.sh
Rajat Arya c0f7980616 feat: smoke tests using hf CLI with bucket and large-file coverage (#710)
## Summary

- Rewrites smoke tests to drive everything through the `hf` CLI rather
than the huggingface_hub Python API, covering the actual user-facing
surface area of hf-xet
- Moves smoke tests and diagnostic scripts into a `scripts/` directory
for cleaner repo layout
- Adds storage bucket test suite exercising the full bucket lifecycle
- Adds 50 MB and 100 MB files to repo upload/download tests

## Test matrix (14 tests, all passing)

**Repository tests** (`hf upload` / `hf download`)
- Upload single file, upload folder
- Download individual files + SHA-256 verify
- Download entire repo + SHA-256 verify
- Overwrite file and verify new content served
- Delete file and confirm absent

**Bucket tests** (`hf buckets`)
- `cp` upload / download + verify
- `sync` upload / download + verify
- Recursive list confirms expected paths
- Overwrite via `cp` + verify
- `sync --delete` removes extraneous remote files
- `rm` + confirm absent from listing

## Test plan
- [x] Run `HF_TOKEN=... ./scripts/smoke_tests/run.sh` and confirm all 14
tests pass
- [x] Run `./scripts/smoke_tests/run.sh --skip-buckets` for repo-only
path
- [x] Run with `--hf-xet-version <version>` to confirm PyPI cache bypass
works

🤖 Generated with [Claude Code](https://claude.com/claude-code)

---------

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-17 19:07:05 -07:00

182 lines
5.7 KiB
Bash
Executable File

#!/usr/bin/env bash
# hf-xet-diag-windows.sh — Windows diagnostics runner (Git-Bash)
# Runs a target command, periodically snapshots stacks with procdump, and saves dumps.
# Also downloads & installs hf-xet debug symbols (PDB).
# By default, the output directory name includes the sanitized command string for easy correlation.
# Strict mode: -e exits on an unhandled command failure, -u treats unset
# variables as errors, -o pipefail makes a pipeline fail if any stage fails,
# and -E lets an ERR trap (if one is set) be inherited by functions/subshells.
set -Eeuo pipefail
# Defaults
# Seconds between successive procdump snapshots (overridden by -i/--interval).
INTERVAL=120
# Output directory (overridden by -o/--outdir); when left empty a name is
# generated later from the command string and a timestamp.
OUTDIR=""
# Set to a non-empty value by the option parser when -o/--outdir was given.
OUTDIR_SET=""
# Write the command-line help (synopsis, options, examples) to stdout.
# Takes no arguments; the heredoc delimiter is quoted so nothing inside the
# help text is expanded by the shell.
print_usage() {
  cat <<'HELP_TEXT'
Usage: hf-xet-diag-windows.sh [options] -- <command> [args...]
Runs a target command, periodically snapshots stacks with procdump, and saves dumps.
Also downloads & installs hf-xet debug symbols (PDB).
Options:
-i, --interval SECONDS Dump snapshot cadence (default: 120)
-o, --outdir DIR Output directory (default: diag_<CMD>_<timestamp>)
-h, --help Show this help
Examples:
./hf-xet-diag-windows.sh -- python hfxet-test.py "Qwen/Qwen2.5-VL-3B-Instruct"
./hf-xet-diag-windows.sh -i 30 -- ./server.exe --port 8080
HELP_TEXT
}
# --- option parsing ---
# Consume leading options. Parsing stops at `--` (which is dropped) or at the
# first word that is not a recognized option; whatever remains in "$@" is the
# target command and its arguments.
while [[ $# -gt 0 ]]; do
  case "$1" in
    -i|--interval)
      # Without this check `shift 2` fails and set -e exits with no message.
      if [[ $# -lt 2 ]]; then
        echo "ERROR: $1 requires a value." >&2
        print_usage >&2
        exit 2
      fi
      INTERVAL="$2"
      shift 2
      ;;
    -o|--outdir)
      if [[ $# -lt 2 ]]; then
        echo "ERROR: $1 requires a value." >&2
        print_usage >&2
        exit 2
      fi
      OUTDIR="$2"
      OUTDIR_SET=1
      shift 2
      ;;
    -h|--help)
      print_usage
      exit 0
      ;;
    --)
      shift
      break
      ;;
    *)
      break
      ;;
  esac
done
# INTERVAL is later used inside (( )) arithmetic, where a non-numeric string is
# evaluated as a variable name and a leading zero is parsed as octal, so only
# plain decimal integers are accepted.
if ! [[ "$INTERVAL" =~ ^(0|[1-9][0-9]*)$ ]]; then
  echo "ERROR: --interval must be a whole number of seconds (got '$INTERVAL')." >&2
  exit 2
fi
# An explicitly empty --outdir would make later "$OUTDIR"/... paths resolve
# against the filesystem root.
if [[ -n "$OUTDIR_SET" && -z "$OUTDIR" ]]; then
  echo "ERROR: --outdir must not be empty." >&2
  exit 2
fi
if [[ $# -lt 1 ]]; then
  echo "ERROR: No command provided." >&2
  print_usage >&2
  exit 2
fi
# Keep the command as an array so arguments containing spaces survive intact.
CMD=( "$@" )
# --- ensure procdump available ---
# procdump.exe is expected in the current working directory; when it is not
# there (or not executable), fetch the Sysinternals zip and unpack it here.
if ! command -v ./procdump.exe >/dev/null 2>&1; then
  echo "procdump.exe not found — downloading..."
  # -f makes curl fail on an HTTP error instead of saving the error page as
  # Procdump.zip, which would otherwise surface later as a confusing unzip error.
  if ! curl -fL -o Procdump.zip https://download.sysinternals.com/files/Procdump.zip; then
    echo "Error: Failed to download Procdump.zip" >&2
    exit 1
  fi
  unzip -o Procdump.zip
  chmod +x procdump.exe
fi
# --- build outdir ---
# Reduce an arbitrary string to a token that is safe inside a directory name.
# Arguments: $1 - raw string
#            $2 - maximum length of the result (default: 64)
# Outputs:   the sanitized token on stdout, without a trailing newline.
# Every run of characters outside [A-Za-z0-9._-] collapses to a single "_".
sanitize_for_dirname() {
  local raw=$1
  local max_len=${2:-64}
  local safe
  # printf (not echo) so no trailing newline is fed to tr and turned into "_".
  safe=$(printf '%s' "$raw" | tr -cs 'A-Za-z0-9._-' '_')
  # Cap the length so a long command line cannot produce an over-long path.
  safe=${safe:0:$max_len}
  # Drop a dangling separator left by the translation or the cut.
  safe=${safe%_}
  printf '%s' "$safe"
}
# Generate a default name when -o/--outdir was not given (or was given empty).
if [[ -z "$OUTDIR_SET" || -z "$OUTDIR" ]]; then
  CMD_STR="${CMD[*]}"
  SAFE_CMD=$(sanitize_for_dirname "$CMD_STR")
  OUTDIR="diag_${SAFE_CMD}_$(date +%Y%m%d%H%M%S)"
fi
mkdir -p "$OUTDIR"/{stacks,dumps}
# Paths of the artifacts written into the output directory.
CONSOLE_LOG="$OUTDIR/console.log"
ENV_LOG="$OUTDIR/env.log"
PID_FILE="$OUTDIR/pid"
echo "Diagnostics output: $OUTDIR"
echo "Dump interval: ${INTERVAL}s"
echo "Command: ${CMD[*]}"
# --- collect system info ---
# Write a one-off environment snapshot to $ENV_LOG (stdout and stderr both).
# Each probe is best-effort: a missing or failing tool must not stop the run.
{
  printf '=== %s ===\n' "$(date -Is)"
  printf '%s\n' "systeminfo:"
  systeminfo || true
  printf '\n'
  printf '%s\n' "tasklist snapshot:"
  tasklist || true
  printf '\n'
  printf '%s\n' "python version:"
  python -VV || true
} > "$ENV_LOG" 2>&1 || true
# --- download hf-xet debug symbols (PDB) ---
# Query pip once. `|| true` keeps set -e / pipefail from aborting here when the
# package is missing, so the explicit error message below is actually reached.
PIP_SHOW_OUTPUT="$(pip show hf-xet || true)"
# print (not printf) so the value is never interpreted as a format string;
# a trailing CR is stripped in case pip emits CRLF line endings.
WHEEL_VERSION="$(awk '/^Version:/{v=$2; sub(/\r$/, "", v); print v; exit}' <<<"$PIP_SHOW_OUTPUT")"
if [[ -z "$WHEEL_VERSION" ]]; then
  echo "Error: hf-xet package is not installed. Please install it before running this script." >&2
  exit 1
fi
echo "hf-xet wheel version: $WHEEL_VERSION"
# The presence of this directory doubles as the "symbols already installed" marker.
SYMBOL_DIR="symbols-$WHEEL_VERSION"
if [[ -d "$SYMBOL_DIR" ]]; then
  echo "Existing symbols dir found, assuming previously installed."
else
  # Take the whole remainder of the Location line so a site-packages path
  # containing spaces is not cut at the first space.
  SITE_PACKAGES="$(awk '/^Location:/{sub(/^Location:[ \t]*/, ""); sub(/\r$/, ""); print; exit}' <<<"$PIP_SHOW_OUTPUT")"
  WHEEL_DIR="$SITE_PACKAGES/hf_xet"
  DIST_INFO="$SITE_PACKAGES/hf_xet-$WHEEL_VERSION.dist-info"
  WHEEL_FILE="$DIST_INFO/WHEEL"
  if [[ ! -f "$WHEEL_FILE" ]]; then
    echo "Error: wheel metadata not found at $WHEEL_FILE" >&2
    exit 1
  fi
  # Reconstruct wheel name from wheel version and wheel tag.
  # Only the first Tag: line is used; concatenating several would yield a bogus name.
  WHEEL_TAG="$(awk '/^Tag:/{t=$2; sub(/\r$/, "", t); print t; exit}' "$WHEEL_FILE")"
  SYMBOL_FILENAME="hf_xet-$WHEEL_VERSION-$WHEEL_TAG.pdb"
  echo "Downloading debug symbols: $SYMBOL_FILENAME"
  # If the version is of format "1.1.10rc0", change it to our release tag format like "1.1.10-rc0"
  RELEASE_TAG="$(printf '%s' "$WHEEL_VERSION" | sed 's/\([0-9]\)\(rc.*\)$/\1-\2/')"
  DOWNLOAD_URL="https://github.com/huggingface/xet-core/releases/download/v${RELEASE_TAG}/dbg-symbols.zip"
  # Test curl directly: under set -e a separate `$?` check would never run.
  if ! curl -fL "$DOWNLOAD_URL" -o dbg-symbols.zip; then
    echo "Error: Failed to download debug symbols from $DOWNLOAD_URL" >&2
    exit 1
  fi
  # Unpack the archive, then copy the matching PDB next to the installed module.
  # On failure remove the marker directory so a later run retries the install
  # instead of assuming the symbols are already in place.
  if ! unzip dbg-symbols.zip -d "$SYMBOL_DIR" \
    || ! cp -- "$SYMBOL_DIR/dbg-symbols/$SYMBOL_FILENAME" "$WHEEL_DIR/hf_xet.pdb"; then
    rm -rf -- "${SYMBOL_DIR:?}"
    echo "Error: Failed to install debug symbol $SYMBOL_FILENAME to $WHEEL_DIR/hf_xet.pdb" >&2
    exit 1
  fi
  echo "Installed dbg symbol $SYMBOL_FILENAME to $WHEEL_DIR/hf_xet.pdb"
fi
# --- launch target ---
# Epoch seconds at launch; used afterwards to select xet logs from this run.
SCRIPT_START_TIME=$(date +%s)
# Remove a PID file left behind by an earlier run into the same --outdir;
# otherwise the wait loop below would accept the stale PID immediately.
rm -f -- "$PID_FILE"
# Start with an empty console log and make every writer open it in append
# mode. A non-append tee keeps its own file offset and would overwrite lines
# that the rest of this script appends to the same file while it runs.
: > "$CONSOLE_LOG"
# Run the command in the background inside a subshell that records its PID,
# with combined stdout/stderr mirrored to the terminal and the console log.
(
  "${CMD[@]}" & echo $! > "$PID_FILE"
) 2>&1 | tee -a "$CONSOLE_LOG" &
# PID of the background pipeline's last stage (tee).
LOGGER_BG=$!
# wait until PID file filled (up to ~5 seconds)
for _ in {1..50}; do
  [[ -s "$PID_FILE" ]] && break
  sleep 0.1
done
if [[ ! -s "$PID_FILE" ]]; then
  echo "ERROR: Could not determine child PID." | tee -a "$CONSOLE_LOG"
  exit 1
fi
TARGET_PID="$(cat "$PID_FILE")"
# map Git-Bash PID to Windows PID
# NOTE(review): assumes the 4th column of Git-Bash `ps` output is WINPID — confirm.
WINPID=$(ps | awk -v pid="$TARGET_PID" '$1 == pid {print $4}')
if [[ -z "$WINPID" ]]; then
  echo "ERROR: Could not map to Windows PID." | tee -a "$CONSOLE_LOG"
  exit 1
fi
echo "Started PID: $TARGET_PID (Windows PID=$WINPID)" | tee -a "$CONSOLE_LOG"
# --- periodic dump loop ---
# Poll once per second while the target is alive; whenever at least INTERVAL
# seconds have passed since the previous capture, ask procdump for a new dump.
# Starting from 0 makes the first capture happen on the first iteration.
LAST_SNAPSHOT_AT=0
while true; do
  # kill -0 only probes for existence; it delivers no signal.
  kill -0 "$TARGET_PID" 2>/dev/null || break
  current_epoch=$(date +%s)
  if (( current_epoch - LAST_SNAPSHOT_AT < INTERVAL )); then
    sleep 1
    continue
  fi
  stamp="$(date +%Y%m%d%H%M%S)"
  dump_path="$OUTDIR/stacks/dump_${stamp}.dmp"
  echo "$(date -Is) capturing dump to $dump_path" | tee -a "$CONSOLE_LOG"
  # A failed capture is tolerated so one bad dump does not end the monitoring.
  ./procdump.exe -accepteula -mp "$WINPID" "$dump_path" >> "$CONSOLE_LOG" 2>&1 || true
  LAST_SNAPSHOT_AT=$current_epoch
  sleep 1
done
echo "Process $TARGET_PID has exited at $(date -Is)." | tee -a "$CONSOLE_LOG"
# --- collect xet log files from this execution ---
# Log collection is best-effort: a failing find or cp must not abort the script
# (set -e + pipefail) before the final summary line is printed.
HF_HOME="${HF_HOME:-$HOME/.cache/huggingface}"
XET_LOG_DIR="$HF_HOME/xet/logs"
if [[ -d "$XET_LOG_DIR" ]]; then
  echo "Collecting xet logs from $XET_LOG_DIR ..." | tee -a "$CONSOLE_LOG"
  mkdir -p "$OUTDIR/xet_logs"
  # Find log files modified at or after script start time using GNU find.
  # NUL-delimited output plus `IFS= read -r -d ''` keeps unusual file names
  # intact; feeding the loop by process substitution keeps find's exit status
  # out of any pipeline.
  while IFS= read -r -d '' logfile; do
    if cp -- "$logfile" "$OUTDIR/xet_logs/" 2>/dev/null; then
      echo " Copied: $(basename -- "$logfile")" | tee -a "$CONSOLE_LOG"
    else
      echo " WARNING: could not copy $(basename -- "$logfile")" | tee -a "$CONSOLE_LOG" >&2
    fi
  done < <(find "$XET_LOG_DIR" -name "xet_*.log" -type f -newermt "@$SCRIPT_START_TIME" -print0 2>/dev/null)
else
  echo "No xet log directory found at $XET_LOG_DIR" | tee -a "$CONSOLE_LOG"
fi
echo "Logs and dumps are in: $OUTDIR"
# Detach the background logger job so the shell does not track it at exit.
disown "$LOGGER_BG" 2>/dev/null || true