mirror of
https://github.com/huggingface/xet-core.git
synced 2026-06-04 13:30:29 +08:00
- also updated README - added analysis script to load latest dump collected --------- Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
298 lines
9.1 KiB
Bash
Executable File
298 lines
9.1 KiB
Bash
Executable File
#!/usr/bin/env bash
|
|
# hf-xet-diag.sh — Linux-only diagnostics runner
|
|
# Runs a target command, periodically snapshots stacks, detects hangs, and can dump cores.
|
|
# Output directory defaults to include mangled command string for easy correlation.
|
|
|
|
set -Eeuo pipefail
|
|
|
|
# Defaults
|
|
INTERVAL=120
|
|
OUTDIR=""
|
|
PRELOAD_HELPER=true
|
|
OUTDIR_SET=""
|
|
|
|
print_usage() {
|
|
cat <<'USAGE'
|
|
Usage: hf-xet-diag-linux.sh [options] -- <command> [args...]
|
|
|
|
Runs a target command, periodically snapshots stacks, detects hangs, and can dump cores.
|
|
Output directory defaults to include mangled command string for easy correlation.
|
|
|
|
Uses gdb for stack snapshots and hang detection.
|
|
|
|
Requires gdb, gcc, gcore. Install on Linux with:
|
|
sudo apt-get install gdb build-essential
|
|
|
|
Options:
|
|
-i, --interval SECONDS Stack snapshot cadence (default: 120)
|
|
-o, --outdir DIR Output directory (default: diag_<CMD>_<timestamp>)
|
|
--no-preload Do NOT preload ptrace-bypass helper
|
|
-h, --help Show this help
|
|
|
|
Examples:
|
|
./hf-xet-diag-linux.sh -- python hfxet-test.py "Qwen/Qwen2.5-VL-3B-Instruct"
|
|
./hf-xet-diag-linux.sh -i 30 -o diag -- ./server --port 8080
|
|
USAGE
|
|
}
|
|
|
|
# --- option parsing ---
|
|
while [[ $# -gt 0 ]]; do
|
|
case "$1" in
|
|
-i|--interval) INTERVAL="${2:-}"; shift 2 ;;
|
|
-o|--outdir) OUTDIR="${2:-}"; OUTDIR_SET=1; shift 2 ;;
|
|
--no-preload) PRELOAD_HELPER=false; shift ;;
|
|
-h|--help) print_usage; exit 0 ;;
|
|
--) shift; break ;;
|
|
*) break ;;
|
|
esac
|
|
done
|
|
if [[ $# -lt 1 ]]; then
|
|
echo "ERROR: No command provided."
|
|
print_usage; exit 2
|
|
fi
|
|
CMD=( "$@" )
|
|
|
|
missing=()
|
|
for cmd in gdb gcore; do
|
|
if ! command -v "$cmd" >/dev/null 2>&1; then
|
|
missing+=("$cmd")
|
|
fi
|
|
done
|
|
|
|
if [ ${#missing[@]} -ne 0 ]; then
|
|
echo "Missing required tools: ${missing[*]}"
|
|
echo ""
|
|
echo "Requires gdb & gcore. Install on Linux with:"
|
|
echo " sudo apt-get install gdb"
|
|
exit 2
|
|
fi
|
|
|
|
# If no outdir given, generate one based on command
|
|
if [[ -z "$OUTDIR_SET" ]]; then
|
|
CMD_STR="${CMD[*]}"
|
|
SAFE_CMD=$(echo "$CMD_STR" | tr -cs 'A-Za-z0-9._-' '_' )
|
|
OUTDIR="diag_${SAFE_CMD}_$(date +%Y%m%d%H%M%S)"
|
|
fi
|
|
|
|
mkdir -p "$OUTDIR"/{stacks,dumps}
|
|
CONSOLE_LOG="$OUTDIR/console.log"
|
|
ENV_LOG="$OUTDIR/env.log"
|
|
PID_FILE="$OUTDIR/pid"
|
|
|
|
echo "Diagnostics output: $OUTDIR"
|
|
echo "Stack trace interval: ${INTERVAL}s"
|
|
echo "Command: ${CMD[*]}"
|
|
|
|
# --- collect some quick system info ---
|
|
{
|
|
echo "=== $(date -Is) ==="
|
|
echo "uname -a:"; uname -a
|
|
echo
|
|
echo "top snapshot:"; top -b -n 1 | grep -E "^%Cpu|^MiB Mem|^MiB Swap" || true
|
|
echo
|
|
echo "ulimit -a:"; ulimit -a || true
|
|
echo
|
|
echo "python version:"; python -VV || true
|
|
echo
|
|
} > "$ENV_LOG" 2>&1 || true
|
|
|
|
# --- ptrace helper build (optional) ---
|
|
ALLOW_PTRACE_SO="/tmp/liballow_ptrace.so"
|
|
maybe_build_ptrace_helper() {
|
|
[[ "$PRELOAD_HELPER" == true ]] || return 0
|
|
[[ -f "$ALLOW_PTRACE_SO" ]] && return 0
|
|
if ! command -v gcc >/dev/null 2>&1; then
|
|
echo "Note: gcc not found; skipping ptrace helper." | tee -a "$CONSOLE_LOG"
|
|
return 0
|
|
fi
|
|
cat >/tmp/allow-ptrace.c <<'EOF'
|
|
#define _GNU_SOURCE
|
|
#include <sys/prctl.h>
|
|
#ifndef PR_SET_PTRACER
|
|
#define PR_SET_PTRACER 0x59616d61
|
|
#endif
|
|
#ifndef PR_SET_PTRACER_ANY
|
|
#define PR_SET_PTRACER_ANY ((unsigned long)-1)
|
|
#endif
|
|
__attribute__((constructor))
|
|
static void allow_ptrace_ctor(void) {
|
|
prctl(PR_SET_PTRACER, PR_SET_PTRACER_ANY, 0, 0, 0);
|
|
}
|
|
EOF
|
|
gcc -shared -fPIC -O2 -o "$ALLOW_PTRACE_SO" /tmp/allow-ptrace.c 2>>"$CONSOLE_LOG" || {
|
|
echo "Warning: failed to build ptrace helper; proceeding without." | tee -a "$CONSOLE_LOG"
|
|
return
|
|
}
|
|
echo "Built ptrace helper: $ALLOW_PTRACE_SO" | tee -a "$CONSOLE_LOG"
|
|
}
|
|
maybe_build_ptrace_helper
|
|
|
|
# --- download hf-xet dbg symbols ---
|
|
WHEEL_VERSION=$(pip show hf-xet | awk '/^Version:/{printf $2}')
|
|
if [ -z "$WHEEL_VERSION" ]; then
|
|
echo "Error: hf-xet package is not installed. Please install it before running this script." >&2
|
|
exit 1
|
|
fi
|
|
echo "hf-xet wheel version: $WHEEL_VERSION"
|
|
SYMBOL_DIR="symbols-$WHEEL_VERSION"
|
|
|
|
if [ -d "$SYMBOL_DIR" ]; then
|
|
echo "Existing symbols dir found, assuming previously installed."
|
|
else
|
|
SITE_PACKAGES="$(pip show hf-xet | awk '/^Location:/{printf $2}')"
|
|
WHEEL_DIR="$SITE_PACKAGES/hf_xet"
|
|
DIST_INFO="$SITE_PACKAGES/hf_xet-$WHEEL_VERSION.dist-info"
|
|
WHEEL_FILE="$DIST_INFO/WHEEL"
|
|
|
|
# Reconstruct wheel name from wheel version and wheel tag
|
|
WHEEL_TAG=$(awk '/^Tag:/{printf $2}' $WHEEL_FILE)
|
|
SYMBOL_FILENAME="hf_xet-$WHEEL_VERSION-$WHEEL_TAG.so.dbg"
|
|
|
|
echo "Downloading debug symbols: $SYMBOL_FILENAME"
|
|
# If the version is of format "1.1.10rc0", change it to our release tag format like "1.1.10-rc0"
|
|
RELEASE_TAG=$(echo -n "$WHEEL_VERSION" | sed 's/\([0-9]\)\(rc.*\)$/\1-\2/')
|
|
DOWNLOAD_URL="https://github.com/huggingface/xet-core/releases/download/v${RELEASE_TAG}/dbg-symbols.zip"
|
|
curl -fL "$DOWNLOAD_URL" -o dbg-symbols.zip
|
|
if [ $? -ne 0 ]; then
|
|
echo "Error: Failed to download debug symbols from $DOWNLOAD_URL" >&2
|
|
exit 1
|
|
fi
|
|
|
|
# Extract just the needed symbol file
|
|
unzip dbg-symbols.zip -d "$SYMBOL_DIR"
|
|
|
|
# Copy to package directory
|
|
cp -r "$SYMBOL_DIR/dbg-symbols/$SYMBOL_FILENAME" "$WHEEL_DIR/"
|
|
echo "Installed dbg symbol $SYMBOL_FILENAME to $WHEEL_DIR"
|
|
fi
|
|
|
|
# --- launch target ---
|
|
SCRIPT_START_TIME=$(date +%s)
|
|
echo "Launching target at $(date -Is) ..." | tee -a "$CONSOLE_LOG"
|
|
|
|
LAUNCH_ENV=()
|
|
if [[ "$PRELOAD_HELPER" == true && -f "$ALLOW_PTRACE_SO" ]]; then
|
|
LAUNCH_ENV=( "LD_PRELOAD=$ALLOW_PTRACE_SO${LD_PRELOAD:+:$LD_PRELOAD}" )
|
|
echo "Using LD_PRELOAD=$ALLOW_PTRACE_SO to relax ptrace restrictions." | tee -a "$CONSOLE_LOG"
|
|
fi
|
|
|
|
if [[ ${#LAUNCH_ENV[@]} -gt 0 ]]; then
|
|
(
|
|
env "${LAUNCH_ENV[@]}" "${CMD[@]}" & echo $! > "$PID_FILE"
|
|
) 2>&1 | tee -a "$CONSOLE_LOG" &
|
|
else
|
|
(
|
|
"${CMD[@]}" & echo $! > "$PID_FILE"
|
|
) 2>&1 | tee -a "$CONSOLE_LOG" &
|
|
fi
|
|
LOGGER_BG=$!
|
|
|
|
# read PID
|
|
for _ in {1..50}; do
|
|
[[ -s "$PID_FILE" ]] && break
|
|
sleep 0.1
|
|
done
|
|
if [[ ! -s "$PID_FILE" ]]; then
|
|
echo "ERROR: Could not determine child PID." | tee -a "$CONSOLE_LOG"
|
|
exit 1
|
|
fi
|
|
TARGET_PID="$(cat "$PID_FILE")"
|
|
echo "Started PID: $TARGET_PID" | tee -a "$CONSOLE_LOG"
|
|
|
|
# --- stack capture + hang detection ---
|
|
declare -a LAST_STACKS=()
|
|
|
|
capture_stack() {
|
|
local ts stack_file
|
|
ts="$(date +%Y%m%d%H%M%S)"
|
|
stack_file="$OUTDIR/stacks/stack_${ts}.txt"
|
|
|
|
if command -v gdb >/dev/null 2>&1; then
|
|
gdb -p "$TARGET_PID" -batch \
|
|
-ex "set pagination off" \
|
|
-ex "thread apply all bt full" >"$stack_file" 2>&1 || true
|
|
elif command -v eu-stack >/dev/null 2>&1; then
|
|
eu-stack -p "$TARGET_PID" >"$stack_file" 2>&1 || true
|
|
elif command -v pstack >/dev/null 2>&1; then
|
|
pstack "$TARGET_PID" >"$stack_file" 2>&1 || true
|
|
else
|
|
for t in /proc/"$TARGET_PID"/task/*/stack; do
|
|
echo "=== $t ==="; cat "$t"; echo
|
|
done >"$stack_file" 2>&1 || true
|
|
fi
|
|
|
|
echo "$(date -Is) captured stack -> $stack_file" | tee -a "$CONSOLE_LOG"
|
|
|
|
LAST_STACKS+=("$stack_file")
|
|
if (( ${#LAST_STACKS[@]} > 3 )); then
|
|
LAST_STACKS=("${LAST_STACKS[@]: -3}")
|
|
fi
|
|
check_hang
|
|
}
|
|
|
|
check_hang() {
|
|
if (( ${#LAST_STACKS[@]} < 3 )); then return; fi
|
|
norm1=$(sed 's/0x[0-9a-f]\+//g' "${LAST_STACKS[0]}" | grep -v '^$')
|
|
norm2=$(sed 's/0x[0-9a-f]\+//g' "${LAST_STACKS[1]}" | grep -v '^$')
|
|
norm3=$(sed 's/0x[0-9a-f]\+//g' "${LAST_STACKS[2]}" | grep -v '^$')
|
|
|
|
diff12=$(diff <(echo "$norm1") <(echo "$norm2") || true)
|
|
diff23=$(diff <(echo "$norm2") <(echo "$norm3") || true)
|
|
|
|
# if no diff between stacks 1-2 and 2-3, we have a hang
|
|
if [[ -n "$diff12" || -n "$diff23" ]]; then
|
|
return
|
|
fi
|
|
|
|
echo "⚠️ Hang heuristic triggered at $(date -Is)" | tee -a "$CONSOLE_LOG"
|
|
take_core_dump
|
|
LAST_STACKS=()
|
|
}
|
|
|
|
take_core_dump() {
|
|
local ts core_file
|
|
ts="$(date +%Y%m%d%H%M%S)"
|
|
core_file="$OUTDIR/dumps/core_${ts}"
|
|
|
|
if command -v gcore >/dev/null 2>&1; then
|
|
gcore -o "$core_file" "$TARGET_PID" >>"$CONSOLE_LOG" 2>&1 || true
|
|
echo "Core dump saved: ${core_file}.${TARGET_PID}" | tee -a "$CONSOLE_LOG"
|
|
else
|
|
echo "gcore not available, saving partial /proc/$TARGET_PID/mem dump" | tee -a "$CONSOLE_LOG"
|
|
dd if="/proc/$TARGET_PID/mem" of="${core_file}.raw" bs=1M count=50 status=none 2>>"$CONSOLE_LOG" || true
|
|
echo "Partial raw dump saved: ${core_file}.raw" | tee -a "$CONSOLE_LOG"
|
|
fi
|
|
}
|
|
|
|
# --- monitoring loop ---
|
|
LAST_SNAPSHOT_AT=0
|
|
while kill -0 "$TARGET_PID" 2>/dev/null; do
|
|
now=$(date +%s)
|
|
if (( now - LAST_SNAPSHOT_AT >= INTERVAL )); then
|
|
capture_stack || true
|
|
LAST_SNAPSHOT_AT=$now
|
|
fi
|
|
sleep 1
|
|
done
|
|
|
|
echo "Process $TARGET_PID has exited at $(date -Is)." | tee -a "$CONSOLE_LOG"
|
|
|
|
# --- collect xet log files from this execution ---
|
|
HF_HOME="${HF_HOME:-$HOME/.cache/huggingface}"
|
|
XET_LOG_DIR="$HF_HOME/xet/logs"
|
|
if [[ -d "$XET_LOG_DIR" ]]; then
|
|
echo "Collecting xet logs from $XET_LOG_DIR ..." | tee -a "$CONSOLE_LOG"
|
|
mkdir -p "$OUTDIR/xet_logs"
|
|
|
|
# Find log files created during or after script start time using GNU find
|
|
find "$XET_LOG_DIR" -name "xet_*.log" -type f -newermt "@$SCRIPT_START_TIME" 2>/dev/null | while read -r logfile; do
|
|
cp "$logfile" "$OUTDIR/xet_logs/" 2>/dev/null && \
|
|
echo " Copied: $(basename "$logfile")" | tee -a "$CONSOLE_LOG"
|
|
done
|
|
else
|
|
echo "No xet log directory found at $XET_LOG_DIR" | tee -a "$CONSOLE_LOG"
|
|
fi
|
|
|
|
echo "Logs and stacks are in: $OUTDIR"
|
|
disown "$LOGGER_BG" 2>/dev/null || true
|