Merge remote-tracking branch 'origin/main' into feat/file-chunk-hashes-and-compose

This commit is contained in:
Adrien
2026-05-21 20:12:16 +02:00
30 changed files with 4598 additions and 173 deletions

View File

@@ -15,14 +15,30 @@ permissions:
jobs:
bump_crates_version:
name: Bump crate versions and create PR
runs-on: ubuntu-latest
runs-on:
group: aws-general-8-plus
timeout-minutes: 60
steps:
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- uses: huggingface/hf-workflows/.github/actions/setup-hf-registry-proxies@21845492ef47d4e46b07a3b2a6e14fd4748c39cc # main
- name: Install Rust 1.94
uses: dtolnay/rust-toolchain@3c5f7ea28cd621ae0bf5283f0e981fb97b8a7af9 # master
with:
toolchain: 1.94.1
- uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6
with:
python-version: '3.10'
- name: Install libpython for pyo3 linking
run: |
set -eux
py_ver=$(python3 -c 'import sys; print(f"{sys.version_info.major}.{sys.version_info.minor}")')
sudo apt-get update
if ! apt-cache show "libpython${py_ver}-dev" >/dev/null 2>&1; then
sudo apt-get install -y software-properties-common
sudo add-apt-repository -y ppa:deadsnakes/ppa
sudo apt-get update
fi
sudo apt-get install -y "libpython${py_ver}-dev"
- uses: ./.github/actions/cache-rust-build
- name: Update crate versions

View File

@@ -13,9 +13,11 @@ concurrency:
jobs:
fmt:
name: Rustfmt
runs-on: ubuntu-latest
runs-on:
group: aws-general-8-plus
steps:
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- uses: huggingface/hf-workflows/.github/actions/setup-hf-registry-proxies@21845492ef47d4e46b07a3b2a6e14fd4748c39cc # main
- uses: dtolnay/rust-toolchain@3c5f7ea28cd621ae0bf5283f0e981fb97b8a7af9 # master
with:
toolchain: nightly
@@ -25,16 +27,25 @@ jobs:
cargo fmt --manifest-path ./Cargo.toml --all -- --check
cargo fmt --manifest-path ./hf_xet/Cargo.toml --all -- --check
detect-unused-dependencies:
runs-on: ubuntu-latest
runs-on:
group: aws-general-8-plus
steps:
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- name: Machete
uses: bnjbvr/cargo-machete@b81ce1560c5fbd0210cb66d88bf210329ff04266 # main
- uses: huggingface/hf-workflows/.github/actions/setup-hf-registry-proxies@21845492ef47d4e46b07a3b2a6e14fd4748c39cc # main
- uses: dtolnay/rust-toolchain@3c5f7ea28cd621ae0bf5283f0e981fb97b8a7af9 # master
with:
toolchain: 1.94.1
- name: Install cargo-machete
run: cargo install --locked cargo-machete@0.9.2
- name: Run cargo-machete
run: cargo machete
check-bench-compiles:
name: Check benchmarks compile
runs-on: ubuntu-latest
runs-on:
group: aws-general-8-plus
steps:
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- uses: huggingface/hf-workflows/.github/actions/setup-hf-registry-proxies@21845492ef47d4e46b07a3b2a6e14fd4748c39cc # main
- uses: dtolnay/rust-toolchain@3c5f7ea28cd621ae0bf5283f0e981fb97b8a7af9 # master
with:
toolchain: 1.94.1
@@ -43,15 +54,31 @@ jobs:
run: |
cargo bench --no-run --workspace --exclude git_xet
build_and_test-linux:
runs-on: ubuntu-latest
runs-on:
group: aws-general-8-plus
steps:
- name: Checkout repository
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- uses: huggingface/hf-workflows/.github/actions/setup-hf-registry-proxies@21845492ef47d4e46b07a3b2a6e14fd4748c39cc # main
- name: Install Rust 1.94
uses: dtolnay/rust-toolchain@3c5f7ea28cd621ae0bf5283f0e981fb97b8a7af9 # master
with:
toolchain: 1.94.1
components: clippy
- uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6
with:
python-version: '3.10'
- name: Install libpython for pyo3 linking
run: |
set -eux
py_ver=$(python3 -c 'import sys; print(f"{sys.version_info.major}.{sys.version_info.minor}")')
sudo apt-get update
if ! apt-cache show "libpython${py_ver}-dev" >/dev/null 2>&1; then
sudo apt-get install -y software-properties-common
sudo add-apt-repository -y ppa:deadsnakes/ppa
sudo apt-get update
fi
sudo apt-get install -y "libpython${py_ver}-dev"
- uses: ./.github/actions/cache-rust-build
- name: Lint
run: |
@@ -68,9 +95,6 @@ jobs:
- name: Build and Test hf_xet
run: |
cd hf_xet && cargo test --verbose --no-fail-fast
- uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6
with:
python-version: '3.10'
- name: Create venv
run: python3 -m venv .venv
- name: Build wheel
@@ -159,10 +183,16 @@ jobs:
pytest hf_xet/tests/ -v
build_and_test-wasm:
name: Build WASM
runs-on: ubuntu-latest
runs-on:
group: aws-general-8-plus
steps:
- name: Checkout repository
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- uses: huggingface/hf-workflows/.github/actions/setup-hf-registry-proxies@21845492ef47d4e46b07a3b2a6e14fd4748c39cc # main
- name: Install Rust 1.94
uses: dtolnay/rust-toolchain@3c5f7ea28cd621ae0bf5283f0e981fb97b8a7af9 # master
with:
toolchain: 1.94.1
- uses: ./.github/actions/build-wasm
- name: Check hf_xet_thin_wasm Cargo.lock has no uncommitted changes
working-directory: wasm/hf_xet_thin_wasm
@@ -176,9 +206,11 @@ jobs:
test -z "$(git status --porcelain Cargo.lock)" || (echo "hf_xet_wasm Cargo.lock has uncommitted changes!" && exit 1)
cargo-audit:
name: Cargo Audit
runs-on: ubuntu-latest
runs-on:
group: aws-general-8-plus
steps:
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- uses: huggingface/hf-workflows/.github/actions/setup-hf-registry-proxies@21845492ef47d4e46b07a3b2a6e14fd4748c39cc # main
- uses: dtolnay/rust-toolchain@3c5f7ea28cd621ae0bf5283f0e981fb97b8a7af9 # master
with:
toolchain: 1.94.1

View File

@@ -14,10 +14,12 @@ permissions:
jobs:
hub-python-tests:
runs-on: ubuntu-latest
runs-on:
group: aws-general-8-plus
steps:
# checkout out xet-core
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- uses: huggingface/hf-workflows/.github/actions/setup-hf-registry-proxies@21845492ef47d4e46b07a3b2a6e14fd4748c39cc # main
# checkout out huggingface_hub
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
with:
@@ -26,6 +28,20 @@ jobs:
- uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6
with:
python-version: '3.10'
- name: Install libpython for pyo3 linking
run: |
set -eux
py_ver=$(python3 -c 'import sys; print(f"{sys.version_info.major}.{sys.version_info.minor}")')
sudo apt-get update
if ! apt-cache show "libpython${py_ver}-dev" >/dev/null 2>&1; then
sudo apt-get install -y software-properties-common
sudo add-apt-repository -y ppa:deadsnakes/ppa
sudo apt-get update
fi
sudo apt-get install -y "libpython${py_ver}-dev"
- uses: dtolnay/rust-toolchain@3c5f7ea28cd621ae0bf5283f0e981fb97b8a7af9 # master
with:
toolchain: 1.94.1
- name: Create venv
run: python3 -m venv .venv
- name: Build wheel

View File

@@ -10,6 +10,9 @@ on:
tag:
description: "Tag to test (e.g., v1.0.3-rc2)"
required: true
permissions: {}
jobs:
trigger_rc_testing:
runs-on: ubuntu-latest

50
Cargo.lock generated
View File

@@ -1129,20 +1129,14 @@ dependencies = [
[[package]]
name = "ctor"
version = "0.6.3"
version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "424e0138278faeb2b401f174ad17e715c829512d74f3d1e81eb43365c2e0590e"
checksum = "f7335955a5f85f95f3188623240e081e7b2059a8ad1bae68944b7cfdd718fb10"
dependencies = [
"ctor-proc-macro",
"dtor",
"link-section",
"linktime-proc-macro",
]
[[package]]
name = "ctor-proc-macro"
version = "0.0.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "52560adf09603e58c9a7ee1fe1dcb95a16927b17c127f0ac02d6e768a0e25bc1"
[[package]]
name = "ctr"
version = "0.9.2"
@@ -1284,21 +1278,6 @@ version = "0.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1435fa1053d8b2fbbe9be7e97eca7f33d37b28409959813daefc1446a14247f1"
[[package]]
name = "dtor"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "404d02eeb088a82cfd873006cb713fe411306c7d182c344905e101fb1167d301"
dependencies = [
"dtor-proc-macro",
]
[[package]]
name = "dtor-proc-macro"
version = "0.0.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f678cf4a922c215c63e0de95eb1ff08a958a81d47e485cf9da1e27bf6305cfa5"
[[package]]
name = "dunce"
version = "1.0.5"
@@ -2732,6 +2711,18 @@ dependencies = [
"vcpkg",
]
[[package]]
name = "link-section"
version = "0.13.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ea2c24837c4fd5ab6a31d64133eae954f5199247523cf29586117e85245c0dd3"
[[package]]
name = "linktime-proc-macro"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a44cd706ff0d503ee32b2071166510ca27e281228de10cd3aa8d35ff94560f81"
[[package]]
name = "linux-raw-sys"
version = "0.12.1"
@@ -3068,15 +3059,14 @@ checksum = "c08d65885ee38876c4f86fa503fb49d7b507c2b62552df7c70b2fce627e06381"
[[package]]
name = "openssl"
version = "0.10.76"
version = "0.10.79"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "951c002c75e16ea2c65b8c7e4d3d51d5530d8dfa7d060b4776828c88cfb18ecf"
checksum = "bf0b434746ee2832f4f0baf10137e1cabb18cbe6912c69e2e33263c45250f542"
dependencies = [
"bitflags 2.11.0",
"cfg-if 1.0.4",
"foreign-types",
"libc",
"once_cell",
"openssl-macros",
"openssl-sys",
]
@@ -3115,9 +3105,9 @@ dependencies = [
[[package]]
name = "openssl-sys"
version = "0.9.112"
version = "0.9.115"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "57d55af3b3e226502be1526dfdba67ab0e9c96fc293004e79576b2b9edb0dbdb"
checksum = "158fe5b292746440aa6e7a7e690e55aeb72d41505e2804c23c6973ad0e9c9781"
dependencies = [
"cc",
"libc",

View File

@@ -12,7 +12,13 @@ members = [
"git_xet",
"simulation",
]
exclude = ["simulation/chunk_cache_bench", "hf_xet", "wasm/hf_xet_wasm", "wasm/hf_xet_thin_wasm"]
exclude = [
"simulation/chunk_cache_bench",
"hf_xet",
"wasm/hf_xet_wasm",
"wasm/hf_xet_thin_wasm",
"examples/xet_pkg_napi",
]
[workspace.package]
version = "1.5.2"
@@ -52,7 +58,7 @@ console-subscriber = "0.5"
countio = { version = "0.3", features = ["futures"] }
crc32fast = "1.5"
csv = "1"
ctor = "0.6"
ctor = "1"
dirs = "6.0"
futures = "0.3"
humantime = "2.1"

View File

@@ -0,0 +1,16 @@
This update adds a new public deduplication helper for computing restart-safe chunk boundaries from existing chunk boundary metadata.
What changed
- Added `xet_data::deduplication::next_stable_chunk_boundary(starting_position, chunk_boundaries) -> Option<usize>`.
- Re-exported it from `xet_data::deduplication` so downstream crates can use it directly.
- The function scans forward from `starting_position` and returns the next chunk boundary that satisfies the stable-boundary condition:
- two consecutive chunk sizes in `[2 * min_chunk, max_chunk - min_chunk)`,
- where `min_chunk` and `max_chunk` are derived from chunking constants.
Why this matters
- Callers that already have chunk-boundary metadata can locate a stable resume boundary without re-reading file bytes.
- This enables deterministic alignment behavior for resumed/partial workflows that need chunk boundaries robust to prefix changes.
Usage notes
- `chunk_boundaries` should be monotonically increasing chunk-end offsets produced by the same chunking configuration.
- `starting_position` may be any byte offset (not necessarily a chunk boundary) and is used as the reference offset from which to search for the next stable chunk boundary.

5
examples/xet_pkg_napi/.gitignore vendored Normal file
View File

@@ -0,0 +1,5 @@
node_modules/
*.node
index.js
index.d.ts
downloads/

3448
examples/xet_pkg_napi/Cargo.lock generated Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,38 @@
# Standalone — not part of the xet-core workspace. The workspace excludes this
# directory, but napi-rs's build flow can resolve cargo against the *original*
# (non-worktree) repo path during development, which would otherwise drag this
# crate back into the workspace it's deliberately outside of.
[workspace]
[package]
name = "xet_pkg_napi"
version = "0.0.1"
edition = "2024"
license = "Apache-2.0"
description = "Smoke-test napi binding for hf-xet (xet_pkg). Verifies that the Rust client builds and links inside a Node.js native addon."
publish = false
[lib]
crate-type = ["cdylib"]
[dependencies]
# hf-xet is published as `hf-xet` but the lib name is `xet`. Pull it via path.
hf-xet = { path = "../../xet_pkg" }
# napi-rs 2.x — the standard for Rust → Node.js native addons.
napi = { version = "2", default-features = false, features = ["napi8"] }
napi-derive = "2"
[build-dependencies]
napi-build = "2"
# cargo-machete can't match `hf-xet` (package name) to `xet::` (lib name in source).
[package.metadata.cargo-machete]
ignored = ["hf-xet"]
[profile.release]
lto = true
opt-level = 3
debug = 1
strip = "symbols"

View File

@@ -0,0 +1,103 @@
# xet_pkg_napi — napi smoke test for hf-xet
A minimal [napi-rs](https://napi.rs) native addon that links against the
`xet_pkg` (`hf-xet`) crate. Verifies that `hf-xet` compiles, links, starts up,
and can actually pull a file from CAS — all from inside a Node.js native
module.
## What it exports
The Rust crate at `src/lib.rs` exposes three functions to Node:
- `initLogging(version: string)` — installs `xet`'s tracing subscriber.
- `smokeTest(): string` — builds a `XetSession` synchronously and constructs
upload-commit + file-download-group builders. No I/O.
- `downloadFile(opts): { destPath, bytesDownloaded }` — actually downloads a
Xet-stored file from the HuggingFace Hub. **Synchronous**: blocks the libuv
main thread until the download finishes, so the JS event loop is paused for
the duration. Acceptable for a smoke test; a real binding should wrap this
in `napi::Task` / `tokio::task::spawn_blocking`.
This crate is **excluded from the workspace** (see the root `Cargo.toml`)
and carries its own `[workspace]` table because it has its own
`crate-type = ["cdylib"]` and ships under the `napi-rs/cli` build flow rather
than `cargo build`.
## Build & run
Requires Node ≥ 18, a Rust toolchain, and outbound network access to
`huggingface.co` and `cas-bridge.xethub.hf.co`.
```sh
cd examples/xet_pkg_napi
npm install
npm run build:debug # or `npm run build` for release
npm run smoke
```
`napi build` writes two artifacts next to `package.json`:
- `xet-pkg-napi.<platform>-<arch>.node` — the compiled cdylib
- `index.js` / `index.d.ts` — a CJS shim that picks the right `.node` for the
current platform
`smoke.mjs`:
1. Issues a `HEAD` against the HF Hub `resolve` URL with a non-default
User-Agent (Cloudfront strips `X-Xet-Hash` on cache hits served to
default UAs).
2. Reads `X-Xet-Hash`, `X-Linked-Size`, and `X-Linked-Etag` from the response.
3. Calls `downloadFile()` with the parsed metadata.
4. Verifies the on-disk size matches `X-Linked-Size`.
### Configuration
All env vars are optional. Defaults target a tiny (~540 KB) public Xet file
so the smoke test runs quickly without an HF token.
| Var | Default |
| -------------- | -------------------------------------------------- |
| `HF_ENDPOINT` | `https://huggingface.co` |
| `HF_REPO_TYPE` | `model` (`model` \| `dataset` \| `space`) |
| `HF_REPO` | `hf-internal-testing/tiny-random-bert` |
| `HF_BRANCH` | `main` |
| `HF_FILENAME` | `pytorch_model.bin` |
| `HF_TOKEN` | _unset_ (required for private repos) |
| `HF_DEST_DIR` | `./downloads` |
## Expected output
```
loaded addon, exports: [ 'initLogging', 'smokeTest', 'downloadFile' ]
Fetching xet metadata for model:hf-internal-testing/tiny-random-bert/pytorch_model.bin@main
https://huggingface.co/hf-internal-testing/tiny-random-bert/resolve/main/pytorch_model.bin
xet-hash: 75402e74462600f62ca4a08b91c9218f36075860d5f6d7eb07f4c29ed7fa4ad6
size: 540,217 bytes
sha256: 9922e8996d0c7e24c7f4e7a5d9c5b7303549f4ee94de0f1138b103014b51be13
smokeTest: xet session built; runtime initialized
Downloading -> downloads/pytorch_model.bin
Result:
bytes downloaded: 540,217
on-disk size: 540,217
elapsed: 1.23s
OK — file downloaded and size matches.
```
## Notes / caveats
- **Synchronous download.** A real binding should expose this as
`#[napi]` async fn or wrap in `napi::Task` so the JS event loop isn't blocked
while xet pulls bytes from CAS.
- **No double runtime.** `xet-runtime` owns its own tokio runtime; it doesn't
piggyback on libuv. The blocking calls used here use `block_on` against
xet's runtime, so napi's main thread is the only thread that gets parked.
- **Metadata source.** The xet hash + file size come from the HF Hub's
`X-Xet-Hash` / `X-Linked-Size` headers. A non-default `User-Agent` is
required because Cloudfront caches strip those headers on cache hits served
to default UAs.
- **napi feature level.** Built against `napi8`. Bumping to `napi9`+ would
unlock newer N-API surfaces if needed.

View File

@@ -0,0 +1,3 @@
fn main() {
napi_build::setup();
}

35
examples/xet_pkg_napi/package-lock.json generated Normal file
View File

@@ -0,0 +1,35 @@
{
"name": "xet-pkg-napi",
"version": "0.0.1",
"lockfileVersion": 3,
"requires": true,
"packages": {
"": {
"name": "xet-pkg-napi",
"version": "0.0.1",
"devDependencies": {
"@napi-rs/cli": "^2.18.4"
},
"engines": {
"node": ">=18"
}
},
"node_modules/@napi-rs/cli": {
"version": "2.18.4",
"resolved": "https://registry.npmjs.org/@napi-rs/cli/-/cli-2.18.4.tgz",
"integrity": "sha512-SgJeA4df9DE2iAEpr3M2H0OKl/yjtg1BnRI5/JyowS71tUWhrfSu2LT0V3vlHET+g1hBVlrO60PmEXwUEKp8Mg==",
"dev": true,
"license": "MIT",
"bin": {
"napi": "scripts/index.js"
},
"engines": {
"node": ">= 10"
},
"funding": {
"type": "github",
"url": "https://github.com/sponsors/Brooooooklyn"
}
}
}
}

View File

@@ -0,0 +1,22 @@
{
"name": "xet-pkg-napi",
"version": "0.0.1",
"description": "napi-rs smoke test for hf-xet",
"private": true,
"main": "index.js",
"type": "commonjs",
"napi": {
"name": "xet-pkg-napi"
},
"scripts": {
"build": "napi build --platform --release",
"build:debug": "napi build --platform",
"smoke": "node smoke.mjs"
},
"devDependencies": {
"@napi-rs/cli": "^2.18.4"
},
"engines": {
"node": ">=18"
}
}

View File

@@ -0,0 +1,112 @@
// Smoke driver: load the napi addon, fetch a public Xet file's metadata from
// the HuggingFace Hub, then download the file via the binding.
//
// Run after `npm run build` (or `npm run build:debug`):
//
// node smoke.mjs
//
// Optional env vars (defaults pick a tiny ~540KB public Xet file):
// HF_ENDPOINT default: https://huggingface.co
// HF_REPO_TYPE default: model (model | dataset | space)
// HF_REPO default: hf-internal-testing/tiny-random-bert
// HF_BRANCH default: main
// HF_FILENAME default: pytorch_model.bin
// HF_TOKEN optional; required for private repos
// HF_DEST_DIR default: ./downloads
import { createRequire } from "node:module";
import { mkdirSync, statSync, rmSync } from "node:fs";
import { join, basename } from "node:path";
const require = createRequire(import.meta.url);
const addon = require("./index.js");
console.log("loaded addon, exports:", Object.keys(addon));
const endpoint = process.env.HF_ENDPOINT ?? "https://huggingface.co";
const repoType = process.env.HF_REPO_TYPE ?? "model";
const repoId = process.env.HF_REPO ?? "hf-internal-testing/tiny-random-bert";
const branch = process.env.HF_BRANCH ?? "main";
const filename = process.env.HF_FILENAME ?? "pytorch_model.bin";
const token = process.env.HF_TOKEN ?? null;
const destDir = process.env.HF_DEST_DIR ?? "./downloads";
const repoPathSegment = repoType === "model" ? "" : `${repoType}s/`;
const apiTypeSegment = `${repoType}s`;
const resolveUrl =
`${endpoint}/${repoPathSegment}${repoId}/resolve/${branch}/${filename}`;
const tokenRefreshUrl =
`${endpoint}/api/${apiTypeSegment}/${repoId}/xet-read-token/${branch}`;
console.log(`\nFetching xet metadata for ${repoType}:${repoId}/${filename}@${branch}`);
console.log(` ${resolveUrl}`);
// HEAD against /resolve/ — Cloudfront strips X-Xet-Hash on cache hits served
// to default UAs, so spoof a hf-xet-style User-Agent + cache-bust the URL.
const headResp = await fetch(`${resolveUrl}?_=${Date.now()}`, {
method: "HEAD",
redirect: "manual",
headers: {
"User-Agent": "xet-pkg-napi-smoke/0.1",
...(token ? { Authorization: `Bearer ${token}` } : {}),
},
});
if (headResp.status !== 302 && headResp.status !== 200) {
throw new Error(
`HEAD ${resolveUrl} returned ${headResp.status} ${headResp.statusText}` +
(token ? "" : " — set HF_TOKEN if the repo is private"),
);
}
const xetHash = headResp.headers.get("x-xet-hash");
const linkedSize = headResp.headers.get("x-linked-size");
const linkedEtag = headResp.headers.get("x-linked-etag");
if (!xetHash || !linkedSize) {
throw new Error(
"Hub did not return X-Xet-Hash / X-Linked-Size headers — is this a Xet-stored file?",
);
}
const sha256 = linkedEtag ? linkedEtag.replace(/"/g, "") : null;
const fileSize = Number(linkedSize);
console.log(` xet-hash: ${xetHash}`);
console.log(` size: ${fileSize.toLocaleString()} bytes`);
console.log(` sha256: ${sha256 ?? "(unknown)"}`);
mkdirSync(destDir, { recursive: true });
const destPath = join(destDir, basename(filename));
rmSync(destPath, { force: true });
addon.initLogging("xet_pkg_napi/0.0.1 smoke");
const smokeResult = addon.smokeTest();
console.log(`smokeTest: ${smokeResult}`);
console.log(`\nDownloading -> ${destPath}`);
const t0 = Date.now();
const result = addon.downloadFile({
tokenRefreshUrl,
...(token ? { authToken: token } : {}),
xetHash,
fileSize,
...(sha256 ? { sha256 } : {}),
destPath,
});
const elapsedSec = (Date.now() - t0) / 1000;
const stat = statSync(destPath);
console.log(`\nResult:`);
console.log(` bytes downloaded: ${result.bytesDownloaded.toLocaleString()}`);
console.log(` on-disk size: ${stat.size.toLocaleString()}`);
console.log(` elapsed: ${elapsedSec.toFixed(2)}s`);
if (stat.size !== fileSize) {
throw new Error(
`size mismatch: expected ${fileSize}, got ${stat.size} on disk`,
);
}
console.log("\nOK — file downloaded and size matches.");

View File

@@ -0,0 +1,106 @@
//! napi smoke-test binding for `hf-xet` (the `xet` crate at `xet_pkg/`).
//!
//! Exposes:
//! - `initLogging(version)` — install xet's tracing subscriber
//! - `smokeTest()` — build a `XetSession` and runtime helpers without doing any I/O
//! - `downloadFile(opts)` — actually download a Xet-stored file from the HuggingFace Hub to a local path
//!
//! `downloadFile` is intentionally synchronous: it blocks the caller until the
//! download completes, internally using `xet`'s `*_blocking` APIs which run on
//! xet-runtime's own tokio runtime. Calling it from JS will block the libuv
//! main thread for the duration — fine for a smoke test, but a real binding
//! should wrap this in `napi::Task` / `tokio::task::spawn_blocking` so the JS
//! event loop stays responsive.
use napi::{Error as NapiError, Status};
use napi_derive::napi;
use xet::xet_session::{HeaderMap, HeaderValue, XetFileInfo, XetSessionBuilder, header};
fn to_napi_err<E: std::fmt::Display>(e: E) -> NapiError {
NapiError::new(Status::GenericFailure, e.to_string())
}
#[napi(js_name = "initLogging")]
pub fn init_logging(version: String) {
xet::init_logging(version);
}
#[napi(js_name = "smokeTest")]
pub fn smoke_test() -> Result<String, NapiError> {
let session = XetSessionBuilder::new().build().map_err(to_napi_err)?;
let _upload = session.new_upload_commit().map_err(to_napi_err)?;
let _download = session.new_file_download_group().map_err(to_napi_err)?;
Ok("xet session built; runtime initialized".to_string())
}
/// Options for [`downloadFile`].
///
/// `xetHash` and `fileSize` come from the HuggingFace Hub's
/// `X-Xet-Hash` and `X-Linked-Size` response headers (issue a HEAD against
/// the `/{repo}/resolve/{ref}/{filename}` URL with a `User-Agent` to see them
/// — Cloudfront strips them on cache hits without a UA hint).
#[napi(object, js_name = "DownloadFileOptions")]
pub struct DownloadFileOptions {
/// The HuggingFace Hub's xet-read-token endpoint, e.g.
/// `https://huggingface.co/api/models/{repo}/xet-read-token/{ref}`.
pub token_refresh_url: String,
/// Optional bearer token for the refresh endpoint. Required for private
/// repos; for public repos this can be `null`.
pub auth_token: Option<String>,
/// The xet content hash (hex string) of the file to download.
pub xet_hash: String,
/// The file's size in bytes. JS `number` is precise up to 2^53; HF files
/// are well under that.
pub file_size: i64,
/// Optional SHA-256 (hex) used by xet to verify the download.
pub sha256: Option<String>,
/// Local filesystem destination for the downloaded file.
pub dest_path: String,
}
/// Result of a successful [`downloadFile`] call.
#[napi(object, js_name = "DownloadFileResult")]
pub struct DownloadFileResult {
pub dest_path: String,
pub bytes_downloaded: i64,
}
#[napi(js_name = "downloadFile")]
pub fn download_file(opts: DownloadFileOptions) -> Result<DownloadFileResult, NapiError> {
let mut headers = HeaderMap::new();
if let Some(token) = opts.auth_token.as_deref() {
let value = HeaderValue::from_str(&format!("Bearer {token}"))
.map_err(|e| NapiError::new(Status::InvalidArg, format!("invalid auth token: {e}")))?;
headers.insert(header::AUTHORIZATION, value);
}
let file_size: u64 = opts
.file_size
.try_into()
.map_err(|_| NapiError::new(Status::InvalidArg, "fileSize must be non-negative"))?;
let file_info = match opts.sha256 {
Some(sha) => XetFileInfo::new_with_sha256(opts.xet_hash, file_size, sha),
None => XetFileInfo::new(opts.xet_hash, file_size),
};
let session = XetSessionBuilder::new().build().map_err(to_napi_err)?;
let group = session
.new_file_download_group()
.map_err(to_napi_err)?
.with_token_refresh_url(opts.token_refresh_url, headers)
.build_blocking()
.map_err(to_napi_err)?;
let dest_path = std::path::PathBuf::from(&opts.dest_path);
group
.download_file_to_path_blocking(file_info, dest_path.clone())
.map_err(to_napi_err)?;
let report = group.finish_blocking().map_err(to_napi_err)?;
let bytes_downloaded: i64 = report.progress.total_bytes_completed.try_into().unwrap_or(i64::MAX);
Ok(DownloadFileResult {
dest_path: opts.dest_path,
bytes_downloaded,
})
}

50
hf_xet/Cargo.lock generated
View File

@@ -633,20 +633,14 @@ dependencies = [
[[package]]
name = "ctor"
version = "0.6.3"
version = "1.0.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "424e0138278faeb2b401f174ad17e715c829512d74f3d1e81eb43365c2e0590e"
checksum = "5c24d2b2b7c12a2fffb7c5c8fd0dcda7ca14b4600fa2d3701b6079aefb6fa180"
dependencies = [
"ctor-proc-macro",
"dtor",
"link-section",
"linktime-proc-macro",
]
[[package]]
name = "ctor-proc-macro"
version = "0.0.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "52560adf09603e58c9a7ee1fe1dcb95a16927b17c127f0ac02d6e768a0e25bc1"
[[package]]
name = "debugid"
version = "0.8.0"
@@ -707,21 +701,6 @@ dependencies = [
"syn",
]
[[package]]
name = "dtor"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "404d02eeb088a82cfd873006cb713fe411306c7d182c344905e101fb1167d301"
dependencies = [
"dtor-proc-macro",
]
[[package]]
name = "dtor-proc-macro"
version = "0.0.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f678cf4a922c215c63e0de95eb1ff08a958a81d47e485cf9da1e27bf6305cfa5"
[[package]]
name = "dunce"
version = "1.0.5"
@@ -1617,6 +1596,18 @@ dependencies = [
"libc",
]
[[package]]
name = "link-section"
version = "0.15.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4641b91711debb59c61b07eb5e30521ed6d9e2bdd9fd04f934e7da3a5bc386d4"
[[package]]
name = "linktime-proc-macro"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a44cd706ff0d503ee32b2071166510ca27e281228de10cd3aa8d35ff94560f81"
[[package]]
name = "linux-raw-sys"
version = "0.12.1"
@@ -1875,15 +1866,14 @@ checksum = "269bca4c2591a28585d6bf10d9ed0332b7d76900a1b02bec41bdc3a2cdcda107"
[[package]]
name = "openssl"
version = "0.10.76"
version = "0.10.79"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "951c002c75e16ea2c65b8c7e4d3d51d5530d8dfa7d060b4776828c88cfb18ecf"
checksum = "bf0b434746ee2832f4f0baf10137e1cabb18cbe6912c69e2e33263c45250f542"
dependencies = [
"bitflags 2.11.0",
"cfg-if 1.0.4",
"foreign-types",
"libc",
"once_cell",
"openssl-macros",
"openssl-sys",
]
@@ -1916,9 +1906,9 @@ dependencies = [
[[package]]
name = "openssl-sys"
version = "0.9.112"
version = "0.9.115"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "57d55af3b3e226502be1526dfdba67ab0e9c96fc293004e79576b2b9edb0dbdb"
checksum = "158fe5b292746440aa6e7a7e690e55aeb72d41505e2804c23c6973ad0e9c9781"
dependencies = [
"cc",
"libc",

View File

@@ -791,20 +791,14 @@ dependencies = [
[[package]]
name = "ctor"
version = "0.6.3"
version = "1.0.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "424e0138278faeb2b401f174ad17e715c829512d74f3d1e81eb43365c2e0590e"
checksum = "5c24d2b2b7c12a2fffb7c5c8fd0dcda7ca14b4600fa2d3701b6079aefb6fa180"
dependencies = [
"ctor-proc-macro",
"dtor",
"link-section",
"linktime-proc-macro",
]
[[package]]
name = "ctor-proc-macro"
version = "0.0.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "52560adf09603e58c9a7ee1fe1dcb95a16927b17c127f0ac02d6e768a0e25bc1"
[[package]]
name = "daemonize"
version = "0.5.0"
@@ -919,21 +913,6 @@ dependencies = [
"const-random",
]
[[package]]
name = "dtor"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "404d02eeb088a82cfd873006cb713fe411306c7d182c344905e101fb1167d301"
dependencies = [
"dtor-proc-macro",
]
[[package]]
name = "dtor-proc-macro"
version = "0.0.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f678cf4a922c215c63e0de95eb1ff08a958a81d47e485cf9da1e27bf6305cfa5"
[[package]]
name = "dunce"
version = "1.0.5"
@@ -1866,12 +1845,24 @@ dependencies = [
"redox_syscall 0.7.3",
]
[[package]]
name = "link-section"
version = "0.15.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4641b91711debb59c61b07eb5e30521ed6d9e2bdd9fd04f934e7da3a5bc386d4"
[[package]]
name = "linked-hash-map"
version = "0.5.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0717cef1bc8b636c6e1c1bbdefc09e6322da8a9321966e8928ef80d20f7f770f"
[[package]]
name = "linktime-proc-macro"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a44cd706ff0d503ee32b2071166510ca27e281228de10cd3aa8d35ff94560f81"
[[package]]
name = "linux-raw-sys"
version = "0.4.15"
@@ -2207,15 +2198,14 @@ dependencies = [
[[package]]
name = "openssl"
version = "0.10.76"
version = "0.10.79"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "951c002c75e16ea2c65b8c7e4d3d51d5530d8dfa7d060b4776828c88cfb18ecf"
checksum = "bf0b434746ee2832f4f0baf10137e1cabb18cbe6912c69e2e33263c45250f542"
dependencies = [
"bitflags 2.11.0",
"cfg-if",
"foreign-types",
"libc",
"once_cell",
"openssl-macros",
"openssl-sys",
]
@@ -2245,9 +2235,9 @@ checksum = "7c87def4c32ab89d880effc9e097653c8da5d6ef28e6b539d313baaacfbafcbe"
[[package]]
name = "openssl-sys"
version = "0.9.112"
version = "0.9.115"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "57d55af3b3e226502be1526dfdba67ab0e9c96fc293004e79576b2b9edb0dbdb"
checksum = "158fe5b292746440aa6e7a7e690e55aeb72d41505e2804c23c6973ad0e9c9781"
dependencies = [
"cc",
"libc",

View File

@@ -471,20 +471,14 @@ dependencies = [
[[package]]
name = "ctor"
version = "0.6.3"
version = "1.0.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "424e0138278faeb2b401f174ad17e715c829512d74f3d1e81eb43365c2e0590e"
checksum = "5c24d2b2b7c12a2fffb7c5c8fd0dcda7ca14b4600fa2d3701b6079aefb6fa180"
dependencies = [
"ctor-proc-macro",
"dtor",
"link-section",
"linktime-proc-macro",
]
[[package]]
name = "ctor-proc-macro"
version = "0.0.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "52560adf09603e58c9a7ee1fe1dcb95a16927b17c127f0ac02d6e768a0e25bc1"
[[package]]
name = "deranged"
version = "0.5.8"
@@ -536,21 +530,6 @@ dependencies = [
"syn",
]
[[package]]
name = "dtor"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "404d02eeb088a82cfd873006cb713fe411306c7d182c344905e101fb1167d301"
dependencies = [
"dtor-proc-macro",
]
[[package]]
name = "dtor-proc-macro"
version = "0.0.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f678cf4a922c215c63e0de95eb1ff08a958a81d47e485cf9da1e27bf6305cfa5"
[[package]]
name = "dunce"
version = "1.0.5"
@@ -1217,6 +1196,18 @@ dependencies = [
"libc",
]
[[package]]
name = "link-section"
version = "0.15.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4641b91711debb59c61b07eb5e30521ed6d9e2bdd9fd04f934e7da3a5bc386d4"
[[package]]
name = "linktime-proc-macro"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a44cd706ff0d503ee32b2071166510ca27e281228de10cd3aa8d35ff94560f81"
[[package]]
name = "linux-raw-sys"
version = "0.12.1"

View File

@@ -492,20 +492,14 @@ dependencies = [
[[package]]
name = "ctor"
version = "0.6.3"
version = "1.0.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "424e0138278faeb2b401f174ad17e715c829512d74f3d1e81eb43365c2e0590e"
checksum = "5c24d2b2b7c12a2fffb7c5c8fd0dcda7ca14b4600fa2d3701b6079aefb6fa180"
dependencies = [
"ctor-proc-macro",
"dtor",
"link-section",
"linktime-proc-macro",
]
[[package]]
name = "ctor-proc-macro"
version = "0.0.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "52560adf09603e58c9a7ee1fe1dcb95a16927b17c127f0ac02d6e768a0e25bc1"
[[package]]
name = "deranged"
version = "0.5.8"
@@ -557,21 +551,6 @@ dependencies = [
"syn",
]
[[package]]
name = "dtor"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "404d02eeb088a82cfd873006cb713fe411306c7d182c344905e101fb1167d301"
dependencies = [
"dtor-proc-macro",
]
[[package]]
name = "dtor-proc-macro"
version = "0.0.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f678cf4a922c215c63e0de95eb1ff08a958a81d47e485cf9da1e27bf6305cfa5"
[[package]]
name = "dunce"
version = "1.0.5"
@@ -1305,6 +1284,18 @@ dependencies = [
"libc",
]
[[package]]
name = "link-section"
version = "0.15.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4641b91711debb59c61b07eb5e30521ed6d9e2bdd9fd04f934e7da3a5bc386d4"
[[package]]
name = "linktime-proc-macro"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a44cd706ff0d503ee32b2071166510ca27e281228de10cd3aa8d35ff94560f81"
[[package]]
name = "linux-raw-sys"
version = "0.12.1"

View File

@@ -156,6 +156,7 @@ impl RemoteClient {
let result = RetryWrapper::new(self.ctx.clone(), api_tag)
.with_429_no_retry()
.with_expected_404()
.log_errors_as_info()
.run(move || client.get(url.clone()).with_extension(Api(api_tag)).send())
.await;
@@ -583,7 +584,7 @@ impl Client for RemoteClient {
// Use the no-read-timeout client for shard uploads. reqwest's per-request timeout()
// does NOT override the client-level read_timeout(), so we use a separate client
// with no read_timeout. Server-side shard processing scales linearly with file entry
// count and can exceed the global read_timeout (120s) for large shards.
// count and can exceed the global read_timeout (300s) for large shards.
#[cfg(not(target_family = "wasm"))]
let client = self.shard_upload_http_client.clone();

View File

@@ -31,6 +31,7 @@ pub struct RetryWrapper {
no_retry_on_429: bool,
retry_on_403: bool,
expected_416: bool,
expected_404: bool,
log_errors_as_info: bool,
api_tag: &'static str,
connection_permit: Option<Mutex<ConnectionPermitInfo>>,
@@ -46,6 +47,7 @@ impl RetryWrapper {
no_retry_on_429: false,
retry_on_403: false,
expected_416: false,
expected_404: false,
log_errors_as_info: false,
api_tag,
connection_permit: None,
@@ -77,6 +79,15 @@ impl RetryWrapper {
self
}
/// Mark 404 responses as expected (e.g. `query_dedup` cache miss). When set,
/// a 404 is still returned as a fatal (non-retried) error to the caller — which is
/// usually handled by the caller converting it to `Ok(None)` — but it is logged as
/// a cache miss instead of the misleading `"Fatal Error"`.
pub fn with_expected_404(mut self) -> Self {
self.expected_404 = true;
self
}
pub fn log_errors_as_info(mut self) -> Self {
self.log_errors_as_info = true;
self
@@ -166,6 +177,9 @@ impl RetryWrapper {
} else if e.status() == Some(StatusCode::RANGE_NOT_SATISFIABLE) && self.expected_416 {
let cas_err = process_error("Reached end of reconstruction 416 (Range Not Satisfiable)", e, true);
Err(RetryableReqwestError::FatalError(cas_err))
} else if e.status() == Some(StatusCode::NOT_FOUND) && self.expected_404 {
let cas_err = process_error("Not Found (cache miss)", e, true);
Err(RetryableReqwestError::FatalError(cas_err))
} else {
let cas_err = process_error("Fatal Error", e, false);
Err(RetryableReqwestError::FatalError(cas_err))
@@ -857,6 +871,37 @@ mod tests {
check_json_unexpected_eof_retry(&server).await;
}
#[tokio::test]
async fn test_404_expected_is_fatal_and_not_retried() {
let server = MockServer::start().await;
Mock::given(method("GET"))
.and(path("/not_found"))
.respond_with(ResponseTemplate::new(404))
.expect(1)
.mount(&server)
.await;
let client = make_client();
let counter = Arc::new(AtomicU32::new(0));
let counter_ = counter.clone();
let result = connection_wrapper("test_404_expected_is_fatal_and_not_retried")
.with_max_attempts(3)
.with_expected_404()
.run(move || {
let url = format!("{}/not_found", server.uri());
counter_.fetch_add(1, Ordering::Relaxed);
client.clone().get(&url).send()
})
.await;
assert!(result.is_err());
let err = result.unwrap_err();
assert_eq!(err.status(), Some(StatusCode::NOT_FOUND));
assert_eq!(counter.load(Ordering::SeqCst), 1);
}
#[tokio::test]
async fn test_403_no_retry_by_default() {
let server = MockServer::start().await;

View File

@@ -185,9 +185,10 @@ impl MDBMinimalShard {
let _ = MDBShardFileHeader::deserialize(reader)?;
let mut file_info_views = Vec::<MDBFileInfoView>::new();
let mut seen_file_hashes = HashSet::new();
process_shard_file_info_section(reader, |fiv: MDBFileInfoView| {
// register the offset here to the file entries
if include_files {
if include_files && seen_file_hashes.insert(fiv.file_hash()) {
file_info_views.push(fiv);
}
Ok(())
@@ -232,11 +233,14 @@ impl MDBMinimalShard {
let _ = MDBShardFileHeader::deserialize(&mut Cursor::new(&buf))?;
let mut file_info_views = Vec::<MDBFileInfoView>::new();
let mut seen_file_hashes = HashSet::new();
process_shard_file_info_section_async(reader, |fiv: MDBFileInfoView| {
// register the offset here to the file entries
if include_files {
file_callback(&fiv)?;
file_info_views.push(fiv);
if seen_file_hashes.insert(fiv.file_hash()) {
file_info_views.push(fiv);
}
}
Ok(())
})
@@ -507,17 +511,28 @@ mod tests {
use rand::rngs::SmallRng;
use rand::{RngExt, SeedableRng};
use super::super::MDBShardInfo;
use super::super::file_structs::MDBFileInfo;
use super::super::file_structs::{FileDataSequenceHeader, MDBFileInfo};
use super::super::shard_file::test_routines::{
convert_to_file, gen_random_shard, gen_random_shard_with_xorb_references,
convert_to_file, gen_random_file_info, gen_random_shard, gen_random_shard_with_xorb_references,
};
use super::super::shard_in_memory::MDBInMemoryShard;
use super::super::xorb_structs::MDBXorbInfo;
use super::super::xorb_structs::{MDBXorbInfo, XorbChunkSequenceHeader};
use super::super::{MDBShardFileHeader, MDBShardInfo};
use super::MDBMinimalShard;
use crate::error::Result;
use crate::merklehash::MerkleHash;
fn file_info_stream(file_infos: &[MDBFileInfo]) -> Vec<u8> {
let mut buffer = Vec::new();
MDBShardFileHeader::default().serialize(&mut buffer).unwrap();
for file_info in file_infos {
file_info.serialize(&mut buffer).unwrap();
}
FileDataSequenceHeader::bookend().serialize(&mut buffer).unwrap();
XorbChunkSequenceHeader::bookend().serialize(&mut buffer).unwrap();
buffer
}
fn verify_serialization(min_shard: &MDBMinimalShard, mem_shard: &MDBInMemoryShard) -> Result<()> {
for verification in [true, false] {
// compute size, with verification if possible only
@@ -665,6 +680,43 @@ mod tests {
Ok(())
}
#[tokio::test]
async fn test_minimal_shard_deduplicates_file_infos_first_wins() {
let mut rng = rand::rngs::StdRng::seed_from_u64(7);
let first = gen_random_file_info(&mut rng, &2, false, false);
let mut duplicate = gen_random_file_info(&mut rng, &3, true, true);
duplicate.metadata.file_hash = first.metadata.file_hash;
let buffer = file_info_stream(&[first.clone(), duplicate.clone()]);
let min_shard = MDBMinimalShard::from_reader(&mut Cursor::new(&buffer), true, true).unwrap();
assert_eq!(min_shard.num_files(), 1);
assert_eq!(MDBFileInfo::from(min_shard.file(0).unwrap()), first);
let mut callback_file_infos = Vec::new();
let min_shard_async = MDBMinimalShard::from_reader_async_with_custom_callbacks(
&mut &buffer[..],
true,
true,
|f| {
callback_file_infos.push(MDBFileInfo::from(f));
Ok(())
},
|_| Ok(()),
)
.await
.unwrap();
assert_eq!(min_shard, min_shard_async);
assert_eq!(callback_file_infos, vec![first.clone(), duplicate]);
let mut reserialized = Vec::new();
min_shard.serialize(&mut reserialized, false).unwrap();
let shard_info = MDBShardInfo::load_from_reader(&mut Cursor::new(&reserialized)).unwrap();
let file_infos = shard_info.read_all_file_info_sections(&mut Cursor::new(&reserialized)).unwrap();
assert_eq!(file_infos, vec![first]);
}
#[tokio::test]
async fn test_shards() -> Result<()> {
let shard = gen_random_shard(0, &[], &[0], false, false)?;

View File

@@ -282,9 +282,15 @@ impl Chunker {
/// partition_scan_bytes is the number of bytes to scan at each
/// proposed partition boundary in search of a valid chunk.
///
/// Due to a known issue in how we do chunking, note that these
/// partitions are not 100% guaranteed to align. See the
/// parallel_chunking.pdf for details.
/// Partition alignment is guaranteed by the hash warmup fix: the
/// chunker feeds `min_chunk - 64 - 1` bytes before scanning for
/// boundaries, ensuring the gear hash window is fully warmed (purely
/// data-dependent) at all accepted trigger positions. This function
/// additionally verifies the absence of hidden triggers by re-chunking
/// with `min_chunk = 0`. See `parallel chunking.lyx` for the proof.
///
/// For finding stable chunk boundaries from existing chunk boundaries (without
/// data access), see [`next_stable_chunk_boundary`].
pub fn find_partitions<R: Read + Seek>(
reader: &mut R,
file_size: usize,
@@ -353,6 +359,53 @@ pub fn find_partitions<R: Read + Seek>(
Ok(partitions)
}
/// Given a list of chunk boundaries in a file and an arbitrary reference position,
/// returns the next stable chunk boundary at or after that position.
///
/// `starting_position` may be any byte offset in the file; it does not need to
/// be an existing chunk boundary. The search starts at the first chunk boundary
/// `>= starting_position`.
///
/// A stable chunk boundary is defined such that any possible changes in the data
/// before `starting_position` would produce the same chunk boundaries at the
/// stable boundary and later. The fixed data between `starting_position` and
/// the returned stable boundary is always sufficient to restore the chunker to
/// its original chunk boundaries.
///
/// The stability condition requires two consecutive chunks after `starting_position`,
/// both with sizes in `[2 * min_chunk, max_chunk - min_chunk)`. The boundary
/// at the end of the second such chunk is the stable chunk boundary.
///
/// The lower bound is `2 * min_chunk` rather than `min_chunk` (as used in
/// [`find_partitions`]) because this function operates on existing chunk
/// boundaries without data access, and cannot verify the absence of hidden
/// hash triggers in the `[c_k, c_k + min_chunk)` skip zone. A shadow-zone
/// trigger can advance a modified chunker by up to `min_chunk`, so the next
/// chunk must be at least `2 * min_chunk` to remain reachable.
///
/// See `parallel chunking.lyx` for the full proof and `find_stable_start` in
/// `merkle_hash_subtree.rs` for the analogous construction in merkle hashing.
pub fn next_stable_chunk_boundary(starting_position: usize, chunk_boundaries: &[usize]) -> Option<usize> {
let minimum_chunk = *TARGET_CHUNK_SIZE / *MINIMUM_CHUNK_DIVISOR;
let maximum_chunk = *TARGET_CHUNK_SIZE * *MAXIMUM_CHUNK_MULTIPLIER;
let start_idx = chunk_boundaries.partition_point(|&x| x < starting_position);
for i in start_idx..chunk_boundaries.len().saturating_sub(2) {
let size_a = chunk_boundaries[i + 1] - chunk_boundaries[i];
let size_b = chunk_boundaries[i + 2] - chunk_boundaries[i + 1];
if size_a >= 2 * minimum_chunk
&& size_a < maximum_chunk - minimum_chunk
&& size_b >= 2 * minimum_chunk
&& size_b < maximum_chunk - minimum_chunk
{
return Some(chunk_boundaries[i + 2]);
}
}
None
}
#[cfg(test)]
mod tests {
use std::collections::HashSet;

View File

@@ -5,7 +5,7 @@ mod defrag_prevention;
mod file_deduplication;
mod interface;
pub use chunking::{Chunker, find_partitions};
pub use chunking::{Chunker, find_partitions, next_stable_chunk_boundary};
pub use data_aggregator::DataAggregator;
pub use dedup_metrics::DeduplicationMetrics;
pub use file_deduplication::FileDeduper;

View File

@@ -827,9 +827,58 @@ Now, with this implementation, can we still perform parallel chunking?
\end_layout
\begin_layout Standard
I do not believe so.
Due to the hash disagreement, under adverserial settings it is possible
to construct two chunk sequences which will *never* align.
\series bold
Update:
\series default
The implementation has been fixed.
Instead of skipping
\begin_inset Formula $m$
\end_inset
bytes (setting HashStreamStart =
\begin_inset Formula $i+m$
\end_inset
), the chunker now starts feeding the hash at
\begin_inset Formula $m-k-1$
\end_inset
bytes from the chunk start.
This ensures the hash window has been fed at least
\begin_inset Formula $k+1$
\end_inset
bytes by position
\begin_inset Formula $m$
\end_inset
, so the hash output at every accepted trigger position (
\begin_inset Formula $\ge m$
\end_inset
from chunk start) depends only on the last
\begin_inset Formula $k$
\end_inset
bytes of data, independent of the chunk starting point.
Therefore
\begin_inset Formula $H(a_{1},b)=H(a_{2},b)$
\end_inset
for all
\begin_inset Formula $b$
\end_inset
where
\begin_inset Formula $b-a_{1}\ge m$
\end_inset
and
\begin_inset Formula $b-a_{2}\ge m$
\end_inset
, and the parallel chunking proof in Section 2 holds.
\end_layout
\end_body

View File

@@ -0,0 +1,312 @@
use std::collections::HashSet;
use rand::rngs::StdRng;
use rand::{RngExt, SeedableRng};
use xet_data::deduplication::constants::TARGET_CHUNK_SIZE;
use xet_data::deduplication::{Chunk, Chunker, next_stable_chunk_boundary};
use xet_runtime::test_set_constants;
test_set_constants! {
TARGET_CHUNK_SIZE = 1024;
}
fn make_random_data(seed: u64, len: usize) -> Vec<u8> {
let mut rng = StdRng::seed_from_u64(seed);
let mut data = vec![0u8; len];
rng.fill(&mut data[..]);
data
}
fn chunk_data(data: &[u8]) -> Vec<Chunk> {
let mut chunker = Chunker::default();
chunker.next_block(data, true)
}
fn get_chunk_boundaries(chunks: &[Chunk]) -> Vec<usize> {
chunks
.iter()
.scan(0usize, |pos, c| {
*pos += c.data.len();
Some(*pos)
})
.collect()
}
fn verify_alignment(
original_boundaries: &[usize],
new_boundaries: &[usize],
stable: usize,
file_size: usize,
starting_position: usize,
mutation_seed: u64,
) {
let orig_set: HashSet<usize> = original_boundaries.iter().copied().collect();
let new_set: HashSet<usize> = new_boundaries.iter().copied().collect();
for &oc in original_boundaries {
if oc >= stable && oc < file_size {
assert!(
new_set.contains(&oc),
"Original chunk boundary {oc} (>= stable {stable}) missing from new chunk boundaries. \
starting_position={starting_position}, mutation_seed={mutation_seed}"
);
}
}
for &nc in new_boundaries {
if nc >= stable && nc < file_size {
assert!(
orig_set.contains(&nc),
"New chunk boundary {nc} (>= stable {stable}) not in original chunk boundaries. \
starting_position={starting_position}, mutation_seed={mutation_seed}"
);
}
}
}
/// For a given data buffer, exercise `next_stable_chunk_boundary` at random
/// starting positions across the full data range with random mutations.
fn stress_test_stable_chunk_boundaries(data: &[u8], seed: u64, num_positions: usize, num_mutations: u64) {
let file_size = data.len();
let chunks = chunk_data(data);
let chunk_boundaries = get_chunk_boundaries(&chunks);
assert!(
chunk_boundaries.len() > 10,
"Need enough chunks for meaningful testing, got {}",
chunk_boundaries.len()
);
let mut rng = StdRng::seed_from_u64(seed);
let mut tested_stable = 0u64;
for trial in 0..num_positions {
let starting_position = rng.random_range(1..file_size);
let stable = match next_stable_chunk_boundary(starting_position, &chunk_boundaries) {
Some(s) => s,
None => continue,
};
assert!(
chunk_boundaries.contains(&stable),
"Stable chunk boundary {stable} is not a member of original chunk boundaries"
);
assert!(
stable >= starting_position,
"Stable chunk boundary {stable} must be at or after starting_position {starting_position}"
);
tested_stable += 1;
for mutation_seed in 0..num_mutations {
let combined_seed = (trial as u64) * 10000 + mutation_seed + 1;
let mut modified = data.to_vec();
let mut mrng = StdRng::seed_from_u64(combined_seed);
mrng.fill(&mut modified[..starting_position]);
let new_chunks = chunk_data(&modified);
let new_boundaries = get_chunk_boundaries(&new_chunks);
verify_alignment(&chunk_boundaries, &new_boundaries, stable, file_size, starting_position, combined_seed);
}
}
let min_expected = (num_positions / 4).max(1);
assert!(
tested_stable >= min_expected as u64,
"Too few starting positions had stable chunk boundaries: {tested_stable} (expected >= {min_expected})"
);
}
#[test]
fn test_stable_chunk_boundary_edge_cases() {
let data = make_random_data(42, 50_000);
let chunks = chunk_data(&data);
let chunk_boundaries = get_chunk_boundaries(&chunks);
// starting_position at 0: should still return a valid point
let stable_0 = next_stable_chunk_boundary(0, &chunk_boundaries);
if let Some(s) = stable_0 {
assert!(chunk_boundaries.contains(&s));
}
// starting_position past all chunk boundaries: should return None
let past_end = *chunk_boundaries.last().unwrap() + 1;
assert!(next_stable_chunk_boundary(past_end, &chunk_boundaries).is_none());
// starting_position near end with too few remaining points
if chunk_boundaries.len() >= 2 {
let near_end = chunk_boundaries[chunk_boundaries.len() - 2];
assert!(next_stable_chunk_boundary(near_end + 1, &chunk_boundaries).is_none());
}
// Degenerate inputs
assert!(next_stable_chunk_boundary(0, &[]).is_none());
assert!(next_stable_chunk_boundary(0, &[100]).is_none());
assert!(next_stable_chunk_boundary(0, &[100, 200]).is_none());
}
#[test]
fn test_stable_chunk_boundary_with_constant_data() {
// Constant data produces max-chunk-sized chunks (forced boundaries).
// With target=1024: max_chunk=2048, min_chunk=128, so max-min=1920.
// Forced boundaries at size 2048 fail the upper bound check of < 1920.
let data = vec![0u8; 50_000];
let chunks = chunk_data(&data);
let chunk_boundaries = get_chunk_boundaries(&chunks);
let stable = next_stable_chunk_boundary(0, &chunk_boundaries);
assert!(stable.is_none(), "Constant data should have no stable chunk boundary (all forced cuts)");
}
#[test]
fn test_stable_chunk_boundary_smoke_stress() {
let data = make_random_data(42, 50_000);
stress_test_stable_chunk_boundaries(&data, 42, 5, 5);
}
#[test]
fn test_stable_chunk_boundary_smoke_varied_seeds() {
for seed in [1, 7, 255] {
let data = make_random_data(seed, 50_000);
stress_test_stable_chunk_boundaries(&data, seed + 77, 5, 5);
}
}
#[cfg(not(feature = "smoke-test"))]
#[test]
fn test_stable_chunk_boundary_stress() {
let data = make_random_data(42, 256_000);
stress_test_stable_chunk_boundaries(&data, 42, 100, 20);
}
#[cfg(not(feature = "smoke-test"))]
#[test]
fn test_stable_chunk_boundary_stress_varied_seeds() {
for seed in [1, 7, 13, 100, 255, 1024, 42424, 999999] {
let data = make_random_data(seed, 100_000);
stress_test_stable_chunk_boundaries(&data, seed + 77, 50, 20);
}
}
#[cfg(not(feature = "smoke-test"))]
#[test]
fn test_stable_chunk_boundary_mutation_types() {
let data = make_random_data(42, 100_000);
let chunks = chunk_data(&data);
let chunk_boundaries = get_chunk_boundaries(&chunks);
let file_size = data.len();
let mid_idx = chunk_boundaries.len() / 2;
let starting_position = chunk_boundaries[mid_idx];
let stable = match next_stable_chunk_boundary(starting_position, &chunk_boundaries) {
Some(s) => s,
None => return,
};
// Zero-fill
{
let mut modified = data.to_vec();
modified[..starting_position].fill(0);
let new_boundaries = get_chunk_boundaries(&chunk_data(&modified));
verify_alignment(&chunk_boundaries, &new_boundaries, stable, file_size, starting_position, 0);
}
// 0xFF-fill
{
let mut modified = data.to_vec();
modified[..starting_position].fill(0xFF);
let new_boundaries = get_chunk_boundaries(&chunk_data(&modified));
verify_alignment(&chunk_boundaries, &new_boundaries, stable, file_size, starting_position, 1);
}
// Reverse the prefix
{
let mut modified = data.to_vec();
modified[..starting_position].reverse();
let new_boundaries = get_chunk_boundaries(&chunk_data(&modified));
verify_alignment(&chunk_boundaries, &new_boundaries, stable, file_size, starting_position, 2);
}
// XOR with a pattern
{
let mut modified = data.to_vec();
for (i, byte) in modified[..starting_position].iter_mut().enumerate() {
*byte ^= (i & 0xFF) as u8;
}
let new_boundaries = get_chunk_boundaries(&chunk_data(&modified));
verify_alignment(&chunk_boundaries, &new_boundaries, stable, file_size, starting_position, 3);
}
// Many different random fills
for seed in 0..200 {
let mut modified = data.to_vec();
let mut rng = StdRng::seed_from_u64(seed + 5000);
rng.fill(&mut modified[..starting_position]);
let new_boundaries = get_chunk_boundaries(&chunk_data(&modified));
verify_alignment(&chunk_boundaries, &new_boundaries, stable, file_size, starting_position, seed + 5000);
}
}
#[cfg(not(feature = "smoke-test"))]
#[test]
fn test_stable_chunk_boundary_is_tight() {
// Verify the stable chunk boundary is actually needed: the chunk boundary
// just before stable should NOT always be stable (there should exist some
// mutation that breaks it).
let data = make_random_data(42, 256_000);
let chunks = chunk_data(&data);
let chunk_boundaries = get_chunk_boundaries(&chunks);
let mut found_non_stable_predecessor = false;
for &starting_position in chunk_boundaries.iter().take(chunk_boundaries.len() / 2) {
if starting_position == 0 {
continue;
}
let stable = match next_stable_chunk_boundary(starting_position, &chunk_boundaries) {
Some(s) => s,
None => continue,
};
let stable_idx = chunk_boundaries.iter().position(|&x| x == stable).unwrap();
if stable_idx == 0 {
continue;
}
let predecessor = chunk_boundaries[stable_idx - 1];
if predecessor <= starting_position {
continue;
}
let orig_set: HashSet<usize> = chunk_boundaries.iter().copied().collect();
for seed in 0..200 {
let mut modified = data.to_vec();
let mut rng = StdRng::seed_from_u64(seed + 90000);
rng.fill(&mut modified[..starting_position]);
let new_boundaries = get_chunk_boundaries(&chunk_data(&modified));
let new_set: HashSet<usize> = new_boundaries.iter().copied().collect();
if !new_set.contains(&predecessor)
|| new_boundaries
.iter()
.any(|&nc| nc >= predecessor && nc < stable && !orig_set.contains(&nc))
{
found_non_stable_predecessor = true;
break;
}
}
if found_non_stable_predecessor {
break;
}
}
assert!(
found_non_stable_predecessor,
"Could not find any case where the predecessor of a stable chunk boundary was actually unstable. \
This suggests the stability condition may be too conservative."
);
}

View File

@@ -54,10 +54,10 @@ crate::config_group!({
/// transfers to complete. If no data is received for this duration, the connection
/// is considered stalled and will timeout.
///
/// The default value is 120 seconds.
/// The default value is 300 seconds.
///
/// Use the environment variable `HF_XET_CLIENT_READ_TIMEOUT` to set this value.
ref read_timeout: Duration = Duration::from_secs(120);
ref read_timeout: Duration = Duration::from_secs(300);
/// Send a report of a successful partial upload every 512kb.
///

View File

@@ -229,7 +229,7 @@ macro_rules! test_set_constants {
)+) => {
use $crate::configuration_utils::ctor_reexport as ctor;
#[ctor::ctor]
#[ctor::ctor(unsafe)]
fn set_constants_on_load() {
$(
let val = $val;
@@ -300,7 +300,7 @@ macro_rules! test_set_config {
)+) => {
use $crate::configuration_utils::ctor_reexport as config_ctor;
#[config_ctor::ctor]
#[config_ctor::ctor(unsafe)]
fn set_config_on_load() {
$(
let group_name_upper = stringify!($group_name).to_uppercase();