mirror of
https://github.com/huggingface/xet-core.git
synced 2026-06-04 13:30:29 +08:00
Merge remote-tracking branch 'origin/main' into feat/file-chunk-hashes-and-compose
This commit is contained in:
18
.github/workflows/bump-crates-version.yml
vendored
18
.github/workflows/bump-crates-version.yml
vendored
@@ -15,14 +15,30 @@ permissions:
|
||||
jobs:
|
||||
bump_crates_version:
|
||||
name: Bump crate versions and create PR
|
||||
runs-on: ubuntu-latest
|
||||
runs-on:
|
||||
group: aws-general-8-plus
|
||||
timeout-minutes: 60
|
||||
steps:
|
||||
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
- uses: huggingface/hf-workflows/.github/actions/setup-hf-registry-proxies@21845492ef47d4e46b07a3b2a6e14fd4748c39cc # main
|
||||
- name: Install Rust 1.94
|
||||
uses: dtolnay/rust-toolchain@3c5f7ea28cd621ae0bf5283f0e981fb97b8a7af9 # master
|
||||
with:
|
||||
toolchain: 1.94.1
|
||||
- uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6
|
||||
with:
|
||||
python-version: '3.10'
|
||||
- name: Install libpython for pyo3 linking
|
||||
run: |
|
||||
set -eux
|
||||
py_ver=$(python3 -c 'import sys; print(f"{sys.version_info.major}.{sys.version_info.minor}")')
|
||||
sudo apt-get update
|
||||
if ! apt-cache show "libpython${py_ver}-dev" >/dev/null 2>&1; then
|
||||
sudo apt-get install -y software-properties-common
|
||||
sudo add-apt-repository -y ppa:deadsnakes/ppa
|
||||
sudo apt-get update
|
||||
fi
|
||||
sudo apt-get install -y "libpython${py_ver}-dev"
|
||||
- uses: ./.github/actions/cache-rust-build
|
||||
|
||||
- name: Update crate versions
|
||||
|
||||
54
.github/workflows/ci.yml
vendored
54
.github/workflows/ci.yml
vendored
@@ -13,9 +13,11 @@ concurrency:
|
||||
jobs:
|
||||
fmt:
|
||||
name: Rustfmt
|
||||
runs-on: ubuntu-latest
|
||||
runs-on:
|
||||
group: aws-general-8-plus
|
||||
steps:
|
||||
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
- uses: huggingface/hf-workflows/.github/actions/setup-hf-registry-proxies@21845492ef47d4e46b07a3b2a6e14fd4748c39cc # main
|
||||
- uses: dtolnay/rust-toolchain@3c5f7ea28cd621ae0bf5283f0e981fb97b8a7af9 # master
|
||||
with:
|
||||
toolchain: nightly
|
||||
@@ -25,16 +27,25 @@ jobs:
|
||||
cargo fmt --manifest-path ./Cargo.toml --all -- --check
|
||||
cargo fmt --manifest-path ./hf_xet/Cargo.toml --all -- --check
|
||||
detect-unused-dependencies:
|
||||
runs-on: ubuntu-latest
|
||||
runs-on:
|
||||
group: aws-general-8-plus
|
||||
steps:
|
||||
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
- name: Machete
|
||||
uses: bnjbvr/cargo-machete@b81ce1560c5fbd0210cb66d88bf210329ff04266 # main
|
||||
- uses: huggingface/hf-workflows/.github/actions/setup-hf-registry-proxies@21845492ef47d4e46b07a3b2a6e14fd4748c39cc # main
|
||||
- uses: dtolnay/rust-toolchain@3c5f7ea28cd621ae0bf5283f0e981fb97b8a7af9 # master
|
||||
with:
|
||||
toolchain: 1.94.1
|
||||
- name: Install cargo-machete
|
||||
run: cargo install --locked cargo-machete@0.9.2
|
||||
- name: Run cargo-machete
|
||||
run: cargo machete
|
||||
check-bench-compiles:
|
||||
name: Check benchmarks compile
|
||||
runs-on: ubuntu-latest
|
||||
runs-on:
|
||||
group: aws-general-8-plus
|
||||
steps:
|
||||
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
- uses: huggingface/hf-workflows/.github/actions/setup-hf-registry-proxies@21845492ef47d4e46b07a3b2a6e14fd4748c39cc # main
|
||||
- uses: dtolnay/rust-toolchain@3c5f7ea28cd621ae0bf5283f0e981fb97b8a7af9 # master
|
||||
with:
|
||||
toolchain: 1.94.1
|
||||
@@ -43,15 +54,31 @@ jobs:
|
||||
run: |
|
||||
cargo bench --no-run --workspace --exclude git_xet
|
||||
build_and_test-linux:
|
||||
runs-on: ubuntu-latest
|
||||
runs-on:
|
||||
group: aws-general-8-plus
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
- uses: huggingface/hf-workflows/.github/actions/setup-hf-registry-proxies@21845492ef47d4e46b07a3b2a6e14fd4748c39cc # main
|
||||
- name: Install Rust 1.94
|
||||
uses: dtolnay/rust-toolchain@3c5f7ea28cd621ae0bf5283f0e981fb97b8a7af9 # master
|
||||
with:
|
||||
toolchain: 1.94.1
|
||||
components: clippy
|
||||
- uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6
|
||||
with:
|
||||
python-version: '3.10'
|
||||
- name: Install libpython for pyo3 linking
|
||||
run: |
|
||||
set -eux
|
||||
py_ver=$(python3 -c 'import sys; print(f"{sys.version_info.major}.{sys.version_info.minor}")')
|
||||
sudo apt-get update
|
||||
if ! apt-cache show "libpython${py_ver}-dev" >/dev/null 2>&1; then
|
||||
sudo apt-get install -y software-properties-common
|
||||
sudo add-apt-repository -y ppa:deadsnakes/ppa
|
||||
sudo apt-get update
|
||||
fi
|
||||
sudo apt-get install -y "libpython${py_ver}-dev"
|
||||
- uses: ./.github/actions/cache-rust-build
|
||||
- name: Lint
|
||||
run: |
|
||||
@@ -68,9 +95,6 @@ jobs:
|
||||
- name: Build and Test hf_xet
|
||||
run: |
|
||||
cd hf_xet && cargo test --verbose --no-fail-fast
|
||||
- uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6
|
||||
with:
|
||||
python-version: '3.10'
|
||||
- name: Create venv
|
||||
run: python3 -m venv .venv
|
||||
- name: Build wheel
|
||||
@@ -159,10 +183,16 @@ jobs:
|
||||
pytest hf_xet/tests/ -v
|
||||
build_and_test-wasm:
|
||||
name: Build WASM
|
||||
runs-on: ubuntu-latest
|
||||
runs-on:
|
||||
group: aws-general-8-plus
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
- uses: huggingface/hf-workflows/.github/actions/setup-hf-registry-proxies@21845492ef47d4e46b07a3b2a6e14fd4748c39cc # main
|
||||
- name: Install Rust 1.94
|
||||
uses: dtolnay/rust-toolchain@3c5f7ea28cd621ae0bf5283f0e981fb97b8a7af9 # master
|
||||
with:
|
||||
toolchain: 1.94.1
|
||||
- uses: ./.github/actions/build-wasm
|
||||
- name: Check hf_xet_thin_wasm Cargo.lock has no uncommitted changes
|
||||
working-directory: wasm/hf_xet_thin_wasm
|
||||
@@ -176,9 +206,11 @@ jobs:
|
||||
test -z "$(git status --porcelain Cargo.lock)" || (echo "hf_xet_wasm Cargo.lock has uncommitted changes!" && exit 1)
|
||||
cargo-audit:
|
||||
name: Cargo Audit
|
||||
runs-on: ubuntu-latest
|
||||
runs-on:
|
||||
group: aws-general-8-plus
|
||||
steps:
|
||||
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
- uses: huggingface/hf-workflows/.github/actions/setup-hf-registry-proxies@21845492ef47d4e46b07a3b2a6e14fd4748c39cc # main
|
||||
- uses: dtolnay/rust-toolchain@3c5f7ea28cd621ae0bf5283f0e981fb97b8a7af9 # master
|
||||
with:
|
||||
toolchain: 1.94.1
|
||||
|
||||
18
.github/workflows/hf-xet-tests.yml
vendored
18
.github/workflows/hf-xet-tests.yml
vendored
@@ -14,10 +14,12 @@ permissions:
|
||||
|
||||
jobs:
|
||||
hub-python-tests:
|
||||
runs-on: ubuntu-latest
|
||||
runs-on:
|
||||
group: aws-general-8-plus
|
||||
steps:
|
||||
# checkout out xet-core
|
||||
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
- uses: huggingface/hf-workflows/.github/actions/setup-hf-registry-proxies@21845492ef47d4e46b07a3b2a6e14fd4748c39cc # main
|
||||
# checkout out huggingface_hub
|
||||
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
|
||||
with:
|
||||
@@ -26,6 +28,20 @@ jobs:
|
||||
- uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6
|
||||
with:
|
||||
python-version: '3.10'
|
||||
- name: Install libpython for pyo3 linking
|
||||
run: |
|
||||
set -eux
|
||||
py_ver=$(python3 -c 'import sys; print(f"{sys.version_info.major}.{sys.version_info.minor}")')
|
||||
sudo apt-get update
|
||||
if ! apt-cache show "libpython${py_ver}-dev" >/dev/null 2>&1; then
|
||||
sudo apt-get install -y software-properties-common
|
||||
sudo add-apt-repository -y ppa:deadsnakes/ppa
|
||||
sudo apt-get update
|
||||
fi
|
||||
sudo apt-get install -y "libpython${py_ver}-dev"
|
||||
- uses: dtolnay/rust-toolchain@3c5f7ea28cd621ae0bf5283f0e981fb97b8a7af9 # master
|
||||
with:
|
||||
toolchain: 1.94.1
|
||||
- name: Create venv
|
||||
run: python3 -m venv .venv
|
||||
- name: Build wheel
|
||||
|
||||
3
.github/workflows/pre-release-testing.yml
vendored
3
.github/workflows/pre-release-testing.yml
vendored
@@ -10,6 +10,9 @@ on:
|
||||
tag:
|
||||
description: "Tag to test (e.g., v1.0.3-rc2)"
|
||||
required: true
|
||||
|
||||
permissions: {}
|
||||
|
||||
jobs:
|
||||
trigger_rc_testing:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
50
Cargo.lock
generated
50
Cargo.lock
generated
@@ -1129,20 +1129,14 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "ctor"
|
||||
version = "0.6.3"
|
||||
version = "1.0.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "424e0138278faeb2b401f174ad17e715c829512d74f3d1e81eb43365c2e0590e"
|
||||
checksum = "f7335955a5f85f95f3188623240e081e7b2059a8ad1bae68944b7cfdd718fb10"
|
||||
dependencies = [
|
||||
"ctor-proc-macro",
|
||||
"dtor",
|
||||
"link-section",
|
||||
"linktime-proc-macro",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ctor-proc-macro"
|
||||
version = "0.0.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "52560adf09603e58c9a7ee1fe1dcb95a16927b17c127f0ac02d6e768a0e25bc1"
|
||||
|
||||
[[package]]
|
||||
name = "ctr"
|
||||
version = "0.9.2"
|
||||
@@ -1284,21 +1278,6 @@ version = "0.11.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1435fa1053d8b2fbbe9be7e97eca7f33d37b28409959813daefc1446a14247f1"
|
||||
|
||||
[[package]]
|
||||
name = "dtor"
|
||||
version = "0.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "404d02eeb088a82cfd873006cb713fe411306c7d182c344905e101fb1167d301"
|
||||
dependencies = [
|
||||
"dtor-proc-macro",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "dtor-proc-macro"
|
||||
version = "0.0.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f678cf4a922c215c63e0de95eb1ff08a958a81d47e485cf9da1e27bf6305cfa5"
|
||||
|
||||
[[package]]
|
||||
name = "dunce"
|
||||
version = "1.0.5"
|
||||
@@ -2732,6 +2711,18 @@ dependencies = [
|
||||
"vcpkg",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "link-section"
|
||||
version = "0.13.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ea2c24837c4fd5ab6a31d64133eae954f5199247523cf29586117e85245c0dd3"
|
||||
|
||||
[[package]]
|
||||
name = "linktime-proc-macro"
|
||||
version = "0.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a44cd706ff0d503ee32b2071166510ca27e281228de10cd3aa8d35ff94560f81"
|
||||
|
||||
[[package]]
|
||||
name = "linux-raw-sys"
|
||||
version = "0.12.1"
|
||||
@@ -3068,15 +3059,14 @@ checksum = "c08d65885ee38876c4f86fa503fb49d7b507c2b62552df7c70b2fce627e06381"
|
||||
|
||||
[[package]]
|
||||
name = "openssl"
|
||||
version = "0.10.76"
|
||||
version = "0.10.79"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "951c002c75e16ea2c65b8c7e4d3d51d5530d8dfa7d060b4776828c88cfb18ecf"
|
||||
checksum = "bf0b434746ee2832f4f0baf10137e1cabb18cbe6912c69e2e33263c45250f542"
|
||||
dependencies = [
|
||||
"bitflags 2.11.0",
|
||||
"cfg-if 1.0.4",
|
||||
"foreign-types",
|
||||
"libc",
|
||||
"once_cell",
|
||||
"openssl-macros",
|
||||
"openssl-sys",
|
||||
]
|
||||
@@ -3115,9 +3105,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "openssl-sys"
|
||||
version = "0.9.112"
|
||||
version = "0.9.115"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "57d55af3b3e226502be1526dfdba67ab0e9c96fc293004e79576b2b9edb0dbdb"
|
||||
checksum = "158fe5b292746440aa6e7a7e690e55aeb72d41505e2804c23c6973ad0e9c9781"
|
||||
dependencies = [
|
||||
"cc",
|
||||
"libc",
|
||||
|
||||
10
Cargo.toml
10
Cargo.toml
@@ -12,7 +12,13 @@ members = [
|
||||
"git_xet",
|
||||
"simulation",
|
||||
]
|
||||
exclude = ["simulation/chunk_cache_bench", "hf_xet", "wasm/hf_xet_wasm", "wasm/hf_xet_thin_wasm"]
|
||||
exclude = [
|
||||
"simulation/chunk_cache_bench",
|
||||
"hf_xet",
|
||||
"wasm/hf_xet_wasm",
|
||||
"wasm/hf_xet_thin_wasm",
|
||||
"examples/xet_pkg_napi",
|
||||
]
|
||||
|
||||
[workspace.package]
|
||||
version = "1.5.2"
|
||||
@@ -52,7 +58,7 @@ console-subscriber = "0.5"
|
||||
countio = { version = "0.3", features = ["futures"] }
|
||||
crc32fast = "1.5"
|
||||
csv = "1"
|
||||
ctor = "0.6"
|
||||
ctor = "1"
|
||||
dirs = "6.0"
|
||||
futures = "0.3"
|
||||
humantime = "2.1"
|
||||
|
||||
16
api_changes/update_260424_next_stable_chunk_boundary.md
Normal file
16
api_changes/update_260424_next_stable_chunk_boundary.md
Normal file
@@ -0,0 +1,16 @@
|
||||
This update adds a new public deduplication helper for computing restart-safe chunk boundaries from existing chunk boundary metadata.
|
||||
|
||||
What changed
|
||||
- Added `xet_data::deduplication::next_stable_chunk_boundary(starting_position, chunk_boundaries) -> Option<usize>`.
|
||||
- Re-exported it from `xet_data::deduplication` so downstream crates can use it directly.
|
||||
- The function scans forward from `starting_position` and returns the next chunk boundary that satisfies the stable-boundary condition:
|
||||
- two consecutive chunk sizes in `[2 * min_chunk, max_chunk - min_chunk)`,
|
||||
- where `min_chunk` and `max_chunk` are derived from chunking constants.
|
||||
|
||||
Why this matters
|
||||
- Callers that already have chunk-boundary metadata can locate a stable resume boundary without re-reading file bytes.
|
||||
- This enables deterministic alignment behavior for resumed/partial workflows that need chunk boundaries robust to prefix changes.
|
||||
|
||||
Usage notes
|
||||
- `chunk_boundaries` should be monotonically increasing chunk-end offsets produced by the same chunking configuration.
|
||||
- `starting_position` may be any byte offset (not necessarily a chunk boundary) and is used as the reference offset from which to search for the next stable chunk boundary.
|
||||
5
examples/xet_pkg_napi/.gitignore
vendored
Normal file
5
examples/xet_pkg_napi/.gitignore
vendored
Normal file
@@ -0,0 +1,5 @@
|
||||
node_modules/
|
||||
*.node
|
||||
index.js
|
||||
index.d.ts
|
||||
downloads/
|
||||
3448
examples/xet_pkg_napi/Cargo.lock
generated
Normal file
3448
examples/xet_pkg_napi/Cargo.lock
generated
Normal file
File diff suppressed because it is too large
Load Diff
38
examples/xet_pkg_napi/Cargo.toml
Normal file
38
examples/xet_pkg_napi/Cargo.toml
Normal file
@@ -0,0 +1,38 @@
|
||||
# Standalone — not part of the xet-core workspace. The workspace excludes this
|
||||
# directory, but napi-rs's build flow can resolve cargo against the *original*
|
||||
# (non-worktree) repo path during development, which would otherwise drag this
|
||||
# crate back into the workspace it's deliberately outside of.
|
||||
[workspace]
|
||||
|
||||
[package]
|
||||
name = "xet_pkg_napi"
|
||||
version = "0.0.1"
|
||||
edition = "2024"
|
||||
license = "Apache-2.0"
|
||||
description = "Smoke-test napi binding for hf-xet (xet_pkg). Verifies that the Rust client builds and links inside a Node.js native addon."
|
||||
publish = false
|
||||
|
||||
[lib]
|
||||
crate-type = ["cdylib"]
|
||||
|
||||
[dependencies]
|
||||
# hf-xet is published as `hf-xet` but the lib name is `xet`. Pull it via path.
|
||||
hf-xet = { path = "../../xet_pkg" }
|
||||
|
||||
# napi-rs 2.x — the standard for Rust → Node.js native addons.
|
||||
|
||||
napi = { version = "2", default-features = false, features = ["napi8"] }
|
||||
napi-derive = "2"
|
||||
|
||||
[build-dependencies]
|
||||
napi-build = "2"
|
||||
|
||||
# cargo-machete can't match `hf-xet` (package name) to `xet::` (lib name in source).
|
||||
[package.metadata.cargo-machete]
|
||||
ignored = ["hf-xet"]
|
||||
|
||||
[profile.release]
|
||||
lto = true
|
||||
opt-level = 3
|
||||
debug = 1
|
||||
strip = "symbols"
|
||||
103
examples/xet_pkg_napi/README.md
Normal file
103
examples/xet_pkg_napi/README.md
Normal file
@@ -0,0 +1,103 @@
|
||||
# xet_pkg_napi — napi smoke test for hf-xet
|
||||
|
||||
A minimal [napi-rs](https://napi.rs) native addon that links against the
|
||||
`xet_pkg` (`hf-xet`) crate. Verifies that `hf-xet` compiles, links, starts up,
|
||||
and can actually pull a file from CAS — all from inside a Node.js native
|
||||
module.
|
||||
|
||||
## What it exports
|
||||
|
||||
The Rust crate at `src/lib.rs` exposes three functions to Node:
|
||||
|
||||
- `initLogging(version: string)` — installs `xet`'s tracing subscriber.
|
||||
- `smokeTest(): string` — builds a `XetSession` synchronously and constructs
|
||||
upload-commit + file-download-group builders. No I/O.
|
||||
- `downloadFile(opts): { destPath, bytesDownloaded }` — actually downloads a
|
||||
Xet-stored file from the HuggingFace Hub. **Synchronous**: blocks the libuv
|
||||
main thread until the download finishes, so the JS event loop is paused for
|
||||
the duration. Acceptable for a smoke test; a real binding should wrap this
|
||||
in `napi::Task` / `tokio::task::spawn_blocking`.
|
||||
|
||||
This crate is **excluded from the workspace** (see the root `Cargo.toml`)
|
||||
and carries its own `[workspace]` table because it has its own
|
||||
`crate-type = ["cdylib"]` and ships under the `napi-rs/cli` build flow rather
|
||||
than `cargo build`.
|
||||
|
||||
## Build & run
|
||||
|
||||
Requires Node ≥ 18, a Rust toolchain, and outbound network access to
|
||||
`huggingface.co` and `cas-bridge.xethub.hf.co`.
|
||||
|
||||
```sh
|
||||
cd examples/xet_pkg_napi
|
||||
npm install
|
||||
npm run build:debug # or `npm run build` for release
|
||||
npm run smoke
|
||||
```
|
||||
|
||||
`napi build` writes two artifacts next to `package.json`:
|
||||
|
||||
- `xet-pkg-napi.<platform>-<arch>.node` — the compiled cdylib
|
||||
- `index.js` / `index.d.ts` — a CJS shim that picks the right `.node` for the
|
||||
current platform
|
||||
|
||||
`smoke.mjs`:
|
||||
|
||||
1. Issues a `HEAD` against the HF Hub `resolve` URL with a non-default
|
||||
User-Agent (Cloudfront strips `X-Xet-Hash` on cache hits served to
|
||||
default UAs).
|
||||
2. Reads `X-Xet-Hash`, `X-Linked-Size`, and `X-Linked-Etag` from the response.
|
||||
3. Calls `downloadFile()` with the parsed metadata.
|
||||
4. Verifies the on-disk size matches `X-Linked-Size`.
|
||||
|
||||
### Configuration
|
||||
|
||||
All env vars are optional. Defaults target a tiny (~540 KB) public Xet file
|
||||
so the smoke test runs quickly without an HF token.
|
||||
|
||||
| Var | Default |
|
||||
| -------------- | -------------------------------------------------- |
|
||||
| `HF_ENDPOINT` | `https://huggingface.co` |
|
||||
| `HF_REPO_TYPE` | `model` (`model` \| `dataset` \| `space`) |
|
||||
| `HF_REPO` | `hf-internal-testing/tiny-random-bert` |
|
||||
| `HF_BRANCH` | `main` |
|
||||
| `HF_FILENAME` | `pytorch_model.bin` |
|
||||
| `HF_TOKEN` | _unset_ (required for private repos) |
|
||||
| `HF_DEST_DIR` | `./downloads` |
|
||||
|
||||
## Expected output
|
||||
|
||||
```
|
||||
loaded addon, exports: [ 'initLogging', 'smokeTest', 'downloadFile' ]
|
||||
|
||||
Fetching xet metadata for model:hf-internal-testing/tiny-random-bert/pytorch_model.bin@main
|
||||
https://huggingface.co/hf-internal-testing/tiny-random-bert/resolve/main/pytorch_model.bin
|
||||
xet-hash: 75402e74462600f62ca4a08b91c9218f36075860d5f6d7eb07f4c29ed7fa4ad6
|
||||
size: 540,217 bytes
|
||||
sha256: 9922e8996d0c7e24c7f4e7a5d9c5b7303549f4ee94de0f1138b103014b51be13
|
||||
smokeTest: xet session built; runtime initialized
|
||||
|
||||
Downloading -> downloads/pytorch_model.bin
|
||||
|
||||
Result:
|
||||
bytes downloaded: 540,217
|
||||
on-disk size: 540,217
|
||||
elapsed: 1.23s
|
||||
|
||||
OK — file downloaded and size matches.
|
||||
```
|
||||
|
||||
## Notes / caveats
|
||||
|
||||
- **Synchronous download.** A real binding should expose this as
|
||||
`#[napi]` async fn or wrap in `napi::Task` so the JS event loop isn't blocked
|
||||
while xet pulls bytes from CAS.
|
||||
- **No double runtime.** `xet-runtime` owns its own tokio runtime; it doesn't
|
||||
piggyback on libuv. The blocking calls used here use `block_on` against
|
||||
xet's runtime, so napi's main thread is the only thread that gets parked.
|
||||
- **Metadata source.** The xet hash + file size come from the HF Hub's
|
||||
`X-Xet-Hash` / `X-Linked-Size` headers. A non-default `User-Agent` is
|
||||
required because Cloudfront caches strip those headers on cache hits served
|
||||
to default UAs.
|
||||
- **napi feature level.** Built against `napi8`. Bumping to `napi9`+ would
|
||||
unlock newer N-API surfaces if needed.
|
||||
3
examples/xet_pkg_napi/build.rs
Normal file
3
examples/xet_pkg_napi/build.rs
Normal file
@@ -0,0 +1,3 @@
|
||||
fn main() {
|
||||
napi_build::setup();
|
||||
}
|
||||
35
examples/xet_pkg_napi/package-lock.json
generated
Normal file
35
examples/xet_pkg_napi/package-lock.json
generated
Normal file
@@ -0,0 +1,35 @@
|
||||
{
|
||||
"name": "xet-pkg-napi",
|
||||
"version": "0.0.1",
|
||||
"lockfileVersion": 3,
|
||||
"requires": true,
|
||||
"packages": {
|
||||
"": {
|
||||
"name": "xet-pkg-napi",
|
||||
"version": "0.0.1",
|
||||
"devDependencies": {
|
||||
"@napi-rs/cli": "^2.18.4"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=18"
|
||||
}
|
||||
},
|
||||
"node_modules/@napi-rs/cli": {
|
||||
"version": "2.18.4",
|
||||
"resolved": "https://registry.npmjs.org/@napi-rs/cli/-/cli-2.18.4.tgz",
|
||||
"integrity": "sha512-SgJeA4df9DE2iAEpr3M2H0OKl/yjtg1BnRI5/JyowS71tUWhrfSu2LT0V3vlHET+g1hBVlrO60PmEXwUEKp8Mg==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"bin": {
|
||||
"napi": "scripts/index.js"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">= 10"
|
||||
},
|
||||
"funding": {
|
||||
"type": "github",
|
||||
"url": "https://github.com/sponsors/Brooooooklyn"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
22
examples/xet_pkg_napi/package.json
Normal file
22
examples/xet_pkg_napi/package.json
Normal file
@@ -0,0 +1,22 @@
|
||||
{
|
||||
"name": "xet-pkg-napi",
|
||||
"version": "0.0.1",
|
||||
"description": "napi-rs smoke test for hf-xet",
|
||||
"private": true,
|
||||
"main": "index.js",
|
||||
"type": "commonjs",
|
||||
"napi": {
|
||||
"name": "xet-pkg-napi"
|
||||
},
|
||||
"scripts": {
|
||||
"build": "napi build --platform --release",
|
||||
"build:debug": "napi build --platform",
|
||||
"smoke": "node smoke.mjs"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@napi-rs/cli": "^2.18.4"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=18"
|
||||
}
|
||||
}
|
||||
112
examples/xet_pkg_napi/smoke.mjs
Normal file
112
examples/xet_pkg_napi/smoke.mjs
Normal file
@@ -0,0 +1,112 @@
|
||||
// Smoke driver: load the napi addon, fetch a public Xet file's metadata from
|
||||
// the HuggingFace Hub, then download the file via the binding.
|
||||
//
|
||||
// Run after `npm run build` (or `npm run build:debug`):
|
||||
//
|
||||
// node smoke.mjs
|
||||
//
|
||||
// Optional env vars (defaults pick a tiny ~540KB public Xet file):
|
||||
// HF_ENDPOINT default: https://huggingface.co
|
||||
// HF_REPO_TYPE default: model (model | dataset | space)
|
||||
// HF_REPO default: hf-internal-testing/tiny-random-bert
|
||||
// HF_BRANCH default: main
|
||||
// HF_FILENAME default: pytorch_model.bin
|
||||
// HF_TOKEN optional; required for private repos
|
||||
// HF_DEST_DIR default: ./downloads
|
||||
|
||||
import { createRequire } from "node:module";
|
||||
import { mkdirSync, statSync, rmSync } from "node:fs";
|
||||
import { join, basename } from "node:path";
|
||||
|
||||
const require = createRequire(import.meta.url);
|
||||
const addon = require("./index.js");
|
||||
|
||||
console.log("loaded addon, exports:", Object.keys(addon));
|
||||
|
||||
const endpoint = process.env.HF_ENDPOINT ?? "https://huggingface.co";
|
||||
const repoType = process.env.HF_REPO_TYPE ?? "model";
|
||||
const repoId = process.env.HF_REPO ?? "hf-internal-testing/tiny-random-bert";
|
||||
const branch = process.env.HF_BRANCH ?? "main";
|
||||
const filename = process.env.HF_FILENAME ?? "pytorch_model.bin";
|
||||
const token = process.env.HF_TOKEN ?? null;
|
||||
const destDir = process.env.HF_DEST_DIR ?? "./downloads";
|
||||
|
||||
const repoPathSegment = repoType === "model" ? "" : `${repoType}s/`;
|
||||
const apiTypeSegment = `${repoType}s`;
|
||||
|
||||
const resolveUrl =
|
||||
`${endpoint}/${repoPathSegment}${repoId}/resolve/${branch}/${filename}`;
|
||||
const tokenRefreshUrl =
|
||||
`${endpoint}/api/${apiTypeSegment}/${repoId}/xet-read-token/${branch}`;
|
||||
|
||||
console.log(`\nFetching xet metadata for ${repoType}:${repoId}/${filename}@${branch}`);
|
||||
console.log(` ${resolveUrl}`);
|
||||
|
||||
// HEAD against /resolve/ — Cloudfront strips X-Xet-Hash on cache hits served
|
||||
// to default UAs, so spoof a hf-xet-style User-Agent + cache-bust the URL.
|
||||
const headResp = await fetch(`${resolveUrl}?_=${Date.now()}`, {
|
||||
method: "HEAD",
|
||||
redirect: "manual",
|
||||
headers: {
|
||||
"User-Agent": "xet-pkg-napi-smoke/0.1",
|
||||
...(token ? { Authorization: `Bearer ${token}` } : {}),
|
||||
},
|
||||
});
|
||||
|
||||
if (headResp.status !== 302 && headResp.status !== 200) {
|
||||
throw new Error(
|
||||
`HEAD ${resolveUrl} returned ${headResp.status} ${headResp.statusText}` +
|
||||
(token ? "" : " — set HF_TOKEN if the repo is private"),
|
||||
);
|
||||
}
|
||||
|
||||
const xetHash = headResp.headers.get("x-xet-hash");
|
||||
const linkedSize = headResp.headers.get("x-linked-size");
|
||||
const linkedEtag = headResp.headers.get("x-linked-etag");
|
||||
|
||||
if (!xetHash || !linkedSize) {
|
||||
throw new Error(
|
||||
"Hub did not return X-Xet-Hash / X-Linked-Size headers — is this a Xet-stored file?",
|
||||
);
|
||||
}
|
||||
|
||||
const sha256 = linkedEtag ? linkedEtag.replace(/"/g, "") : null;
|
||||
const fileSize = Number(linkedSize);
|
||||
console.log(` xet-hash: ${xetHash}`);
|
||||
console.log(` size: ${fileSize.toLocaleString()} bytes`);
|
||||
console.log(` sha256: ${sha256 ?? "(unknown)"}`);
|
||||
|
||||
mkdirSync(destDir, { recursive: true });
|
||||
const destPath = join(destDir, basename(filename));
|
||||
rmSync(destPath, { force: true });
|
||||
|
||||
addon.initLogging("xet_pkg_napi/0.0.1 smoke");
|
||||
|
||||
const smokeResult = addon.smokeTest();
|
||||
console.log(`smokeTest: ${smokeResult}`);
|
||||
|
||||
console.log(`\nDownloading -> ${destPath}`);
|
||||
const t0 = Date.now();
|
||||
const result = addon.downloadFile({
|
||||
tokenRefreshUrl,
|
||||
...(token ? { authToken: token } : {}),
|
||||
xetHash,
|
||||
fileSize,
|
||||
...(sha256 ? { sha256 } : {}),
|
||||
destPath,
|
||||
});
|
||||
const elapsedSec = (Date.now() - t0) / 1000;
|
||||
|
||||
const stat = statSync(destPath);
|
||||
console.log(`\nResult:`);
|
||||
console.log(` bytes downloaded: ${result.bytesDownloaded.toLocaleString()}`);
|
||||
console.log(` on-disk size: ${stat.size.toLocaleString()}`);
|
||||
console.log(` elapsed: ${elapsedSec.toFixed(2)}s`);
|
||||
|
||||
if (stat.size !== fileSize) {
|
||||
throw new Error(
|
||||
`size mismatch: expected ${fileSize}, got ${stat.size} on disk`,
|
||||
);
|
||||
}
|
||||
|
||||
console.log("\nOK — file downloaded and size matches.");
|
||||
106
examples/xet_pkg_napi/src/lib.rs
Normal file
106
examples/xet_pkg_napi/src/lib.rs
Normal file
@@ -0,0 +1,106 @@
|
||||
//! napi smoke-test binding for `hf-xet` (the `xet` crate at `xet_pkg/`).
|
||||
//!
|
||||
//! Exposes:
|
||||
//! - `initLogging(version)` — install xet's tracing subscriber
|
||||
//! - `smokeTest()` — build a `XetSession` and runtime helpers without doing any I/O
|
||||
//! - `downloadFile(opts)` — actually download a Xet-stored file from the HuggingFace Hub to a local path
|
||||
//!
|
||||
//! `downloadFile` is intentionally synchronous: it blocks the caller until the
|
||||
//! download completes, internally using `xet`'s `*_blocking` APIs which run on
|
||||
//! xet-runtime's own tokio runtime. Calling it from JS will block the libuv
|
||||
//! main thread for the duration — fine for a smoke test, but a real binding
|
||||
//! should wrap this in `napi::Task` / `tokio::task::spawn_blocking` so the JS
|
||||
//! event loop stays responsive.
|
||||
|
||||
use napi::{Error as NapiError, Status};
|
||||
use napi_derive::napi;
|
||||
use xet::xet_session::{HeaderMap, HeaderValue, XetFileInfo, XetSessionBuilder, header};
|
||||
|
||||
fn to_napi_err<E: std::fmt::Display>(e: E) -> NapiError {
|
||||
NapiError::new(Status::GenericFailure, e.to_string())
|
||||
}
|
||||
|
||||
#[napi(js_name = "initLogging")]
|
||||
pub fn init_logging(version: String) {
|
||||
xet::init_logging(version);
|
||||
}
|
||||
|
||||
#[napi(js_name = "smokeTest")]
|
||||
pub fn smoke_test() -> Result<String, NapiError> {
|
||||
let session = XetSessionBuilder::new().build().map_err(to_napi_err)?;
|
||||
let _upload = session.new_upload_commit().map_err(to_napi_err)?;
|
||||
let _download = session.new_file_download_group().map_err(to_napi_err)?;
|
||||
Ok("xet session built; runtime initialized".to_string())
|
||||
}
|
||||
|
||||
/// Options for [`downloadFile`].
|
||||
///
|
||||
/// `xetHash` and `fileSize` come from the HuggingFace Hub's
|
||||
/// `X-Xet-Hash` and `X-Linked-Size` response headers (issue a HEAD against
|
||||
/// the `/{repo}/resolve/{ref}/{filename}` URL with a `User-Agent` to see them
|
||||
/// — Cloudfront strips them on cache hits without a UA hint).
|
||||
#[napi(object, js_name = "DownloadFileOptions")]
|
||||
pub struct DownloadFileOptions {
|
||||
/// The HuggingFace Hub's xet-read-token endpoint, e.g.
|
||||
/// `https://huggingface.co/api/models/{repo}/xet-read-token/{ref}`.
|
||||
pub token_refresh_url: String,
|
||||
/// Optional bearer token for the refresh endpoint. Required for private
|
||||
/// repos; for public repos this can be `null`.
|
||||
pub auth_token: Option<String>,
|
||||
/// The xet content hash (hex string) of the file to download.
|
||||
pub xet_hash: String,
|
||||
/// The file's size in bytes. JS `number` is precise up to 2^53; HF files
|
||||
/// are well under that.
|
||||
pub file_size: i64,
|
||||
/// Optional SHA-256 (hex) used by xet to verify the download.
|
||||
pub sha256: Option<String>,
|
||||
/// Local filesystem destination for the downloaded file.
|
||||
pub dest_path: String,
|
||||
}
|
||||
|
||||
/// Result of a successful [`downloadFile`] call.
|
||||
#[napi(object, js_name = "DownloadFileResult")]
|
||||
pub struct DownloadFileResult {
|
||||
pub dest_path: String,
|
||||
pub bytes_downloaded: i64,
|
||||
}
|
||||
|
||||
#[napi(js_name = "downloadFile")]
|
||||
pub fn download_file(opts: DownloadFileOptions) -> Result<DownloadFileResult, NapiError> {
|
||||
let mut headers = HeaderMap::new();
|
||||
if let Some(token) = opts.auth_token.as_deref() {
|
||||
let value = HeaderValue::from_str(&format!("Bearer {token}"))
|
||||
.map_err(|e| NapiError::new(Status::InvalidArg, format!("invalid auth token: {e}")))?;
|
||||
headers.insert(header::AUTHORIZATION, value);
|
||||
}
|
||||
|
||||
let file_size: u64 = opts
|
||||
.file_size
|
||||
.try_into()
|
||||
.map_err(|_| NapiError::new(Status::InvalidArg, "fileSize must be non-negative"))?;
|
||||
let file_info = match opts.sha256 {
|
||||
Some(sha) => XetFileInfo::new_with_sha256(opts.xet_hash, file_size, sha),
|
||||
None => XetFileInfo::new(opts.xet_hash, file_size),
|
||||
};
|
||||
|
||||
let session = XetSessionBuilder::new().build().map_err(to_napi_err)?;
|
||||
let group = session
|
||||
.new_file_download_group()
|
||||
.map_err(to_napi_err)?
|
||||
.with_token_refresh_url(opts.token_refresh_url, headers)
|
||||
.build_blocking()
|
||||
.map_err(to_napi_err)?;
|
||||
|
||||
let dest_path = std::path::PathBuf::from(&opts.dest_path);
|
||||
group
|
||||
.download_file_to_path_blocking(file_info, dest_path.clone())
|
||||
.map_err(to_napi_err)?;
|
||||
let report = group.finish_blocking().map_err(to_napi_err)?;
|
||||
|
||||
let bytes_downloaded: i64 = report.progress.total_bytes_completed.try_into().unwrap_or(i64::MAX);
|
||||
|
||||
Ok(DownloadFileResult {
|
||||
dest_path: opts.dest_path,
|
||||
bytes_downloaded,
|
||||
})
|
||||
}
|
||||
50
hf_xet/Cargo.lock
generated
50
hf_xet/Cargo.lock
generated
@@ -633,20 +633,14 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "ctor"
|
||||
version = "0.6.3"
|
||||
version = "1.0.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "424e0138278faeb2b401f174ad17e715c829512d74f3d1e81eb43365c2e0590e"
|
||||
checksum = "5c24d2b2b7c12a2fffb7c5c8fd0dcda7ca14b4600fa2d3701b6079aefb6fa180"
|
||||
dependencies = [
|
||||
"ctor-proc-macro",
|
||||
"dtor",
|
||||
"link-section",
|
||||
"linktime-proc-macro",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ctor-proc-macro"
|
||||
version = "0.0.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "52560adf09603e58c9a7ee1fe1dcb95a16927b17c127f0ac02d6e768a0e25bc1"
|
||||
|
||||
[[package]]
|
||||
name = "debugid"
|
||||
version = "0.8.0"
|
||||
@@ -707,21 +701,6 @@ dependencies = [
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "dtor"
|
||||
version = "0.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "404d02eeb088a82cfd873006cb713fe411306c7d182c344905e101fb1167d301"
|
||||
dependencies = [
|
||||
"dtor-proc-macro",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "dtor-proc-macro"
|
||||
version = "0.0.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f678cf4a922c215c63e0de95eb1ff08a958a81d47e485cf9da1e27bf6305cfa5"
|
||||
|
||||
[[package]]
|
||||
name = "dunce"
|
||||
version = "1.0.5"
|
||||
@@ -1617,6 +1596,18 @@ dependencies = [
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "link-section"
|
||||
version = "0.15.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4641b91711debb59c61b07eb5e30521ed6d9e2bdd9fd04f934e7da3a5bc386d4"
|
||||
|
||||
[[package]]
|
||||
name = "linktime-proc-macro"
|
||||
version = "0.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a44cd706ff0d503ee32b2071166510ca27e281228de10cd3aa8d35ff94560f81"
|
||||
|
||||
[[package]]
|
||||
name = "linux-raw-sys"
|
||||
version = "0.12.1"
|
||||
@@ -1875,15 +1866,14 @@ checksum = "269bca4c2591a28585d6bf10d9ed0332b7d76900a1b02bec41bdc3a2cdcda107"
|
||||
|
||||
[[package]]
|
||||
name = "openssl"
|
||||
version = "0.10.76"
|
||||
version = "0.10.79"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "951c002c75e16ea2c65b8c7e4d3d51d5530d8dfa7d060b4776828c88cfb18ecf"
|
||||
checksum = "bf0b434746ee2832f4f0baf10137e1cabb18cbe6912c69e2e33263c45250f542"
|
||||
dependencies = [
|
||||
"bitflags 2.11.0",
|
||||
"cfg-if 1.0.4",
|
||||
"foreign-types",
|
||||
"libc",
|
||||
"once_cell",
|
||||
"openssl-macros",
|
||||
"openssl-sys",
|
||||
]
|
||||
@@ -1916,9 +1906,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "openssl-sys"
|
||||
version = "0.9.112"
|
||||
version = "0.9.115"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "57d55af3b3e226502be1526dfdba67ab0e9c96fc293004e79576b2b9edb0dbdb"
|
||||
checksum = "158fe5b292746440aa6e7a7e690e55aeb72d41505e2804c23c6973ad0e9c9781"
|
||||
dependencies = [
|
||||
"cc",
|
||||
"libc",
|
||||
|
||||
50
simulation/chunk_cache_bench/Cargo.lock
generated
50
simulation/chunk_cache_bench/Cargo.lock
generated
@@ -791,20 +791,14 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "ctor"
|
||||
version = "0.6.3"
|
||||
version = "1.0.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "424e0138278faeb2b401f174ad17e715c829512d74f3d1e81eb43365c2e0590e"
|
||||
checksum = "5c24d2b2b7c12a2fffb7c5c8fd0dcda7ca14b4600fa2d3701b6079aefb6fa180"
|
||||
dependencies = [
|
||||
"ctor-proc-macro",
|
||||
"dtor",
|
||||
"link-section",
|
||||
"linktime-proc-macro",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ctor-proc-macro"
|
||||
version = "0.0.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "52560adf09603e58c9a7ee1fe1dcb95a16927b17c127f0ac02d6e768a0e25bc1"
|
||||
|
||||
[[package]]
|
||||
name = "daemonize"
|
||||
version = "0.5.0"
|
||||
@@ -919,21 +913,6 @@ dependencies = [
|
||||
"const-random",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "dtor"
|
||||
version = "0.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "404d02eeb088a82cfd873006cb713fe411306c7d182c344905e101fb1167d301"
|
||||
dependencies = [
|
||||
"dtor-proc-macro",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "dtor-proc-macro"
|
||||
version = "0.0.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f678cf4a922c215c63e0de95eb1ff08a958a81d47e485cf9da1e27bf6305cfa5"
|
||||
|
||||
[[package]]
|
||||
name = "dunce"
|
||||
version = "1.0.5"
|
||||
@@ -1866,12 +1845,24 @@ dependencies = [
|
||||
"redox_syscall 0.7.3",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "link-section"
|
||||
version = "0.15.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4641b91711debb59c61b07eb5e30521ed6d9e2bdd9fd04f934e7da3a5bc386d4"
|
||||
|
||||
[[package]]
|
||||
name = "linked-hash-map"
|
||||
version = "0.5.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0717cef1bc8b636c6e1c1bbdefc09e6322da8a9321966e8928ef80d20f7f770f"
|
||||
|
||||
[[package]]
|
||||
name = "linktime-proc-macro"
|
||||
version = "0.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a44cd706ff0d503ee32b2071166510ca27e281228de10cd3aa8d35ff94560f81"
|
||||
|
||||
[[package]]
|
||||
name = "linux-raw-sys"
|
||||
version = "0.4.15"
|
||||
@@ -2207,15 +2198,14 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "openssl"
|
||||
version = "0.10.76"
|
||||
version = "0.10.79"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "951c002c75e16ea2c65b8c7e4d3d51d5530d8dfa7d060b4776828c88cfb18ecf"
|
||||
checksum = "bf0b434746ee2832f4f0baf10137e1cabb18cbe6912c69e2e33263c45250f542"
|
||||
dependencies = [
|
||||
"bitflags 2.11.0",
|
||||
"cfg-if",
|
||||
"foreign-types",
|
||||
"libc",
|
||||
"once_cell",
|
||||
"openssl-macros",
|
||||
"openssl-sys",
|
||||
]
|
||||
@@ -2245,9 +2235,9 @@ checksum = "7c87def4c32ab89d880effc9e097653c8da5d6ef28e6b539d313baaacfbafcbe"
|
||||
|
||||
[[package]]
|
||||
name = "openssl-sys"
|
||||
version = "0.9.112"
|
||||
version = "0.9.115"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "57d55af3b3e226502be1526dfdba67ab0e9c96fc293004e79576b2b9edb0dbdb"
|
||||
checksum = "158fe5b292746440aa6e7a7e690e55aeb72d41505e2804c23c6973ad0e9c9781"
|
||||
dependencies = [
|
||||
"cc",
|
||||
"libc",
|
||||
|
||||
41
wasm/hf_xet_thin_wasm/Cargo.lock
generated
41
wasm/hf_xet_thin_wasm/Cargo.lock
generated
@@ -471,20 +471,14 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "ctor"
|
||||
version = "0.6.3"
|
||||
version = "1.0.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "424e0138278faeb2b401f174ad17e715c829512d74f3d1e81eb43365c2e0590e"
|
||||
checksum = "5c24d2b2b7c12a2fffb7c5c8fd0dcda7ca14b4600fa2d3701b6079aefb6fa180"
|
||||
dependencies = [
|
||||
"ctor-proc-macro",
|
||||
"dtor",
|
||||
"link-section",
|
||||
"linktime-proc-macro",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ctor-proc-macro"
|
||||
version = "0.0.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "52560adf09603e58c9a7ee1fe1dcb95a16927b17c127f0ac02d6e768a0e25bc1"
|
||||
|
||||
[[package]]
|
||||
name = "deranged"
|
||||
version = "0.5.8"
|
||||
@@ -536,21 +530,6 @@ dependencies = [
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "dtor"
|
||||
version = "0.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "404d02eeb088a82cfd873006cb713fe411306c7d182c344905e101fb1167d301"
|
||||
dependencies = [
|
||||
"dtor-proc-macro",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "dtor-proc-macro"
|
||||
version = "0.0.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f678cf4a922c215c63e0de95eb1ff08a958a81d47e485cf9da1e27bf6305cfa5"
|
||||
|
||||
[[package]]
|
||||
name = "dunce"
|
||||
version = "1.0.5"
|
||||
@@ -1217,6 +1196,18 @@ dependencies = [
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "link-section"
|
||||
version = "0.15.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4641b91711debb59c61b07eb5e30521ed6d9e2bdd9fd04f934e7da3a5bc386d4"
|
||||
|
||||
[[package]]
|
||||
name = "linktime-proc-macro"
|
||||
version = "0.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a44cd706ff0d503ee32b2071166510ca27e281228de10cd3aa8d35ff94560f81"
|
||||
|
||||
[[package]]
|
||||
name = "linux-raw-sys"
|
||||
version = "0.12.1"
|
||||
|
||||
41
wasm/hf_xet_wasm/Cargo.lock
generated
41
wasm/hf_xet_wasm/Cargo.lock
generated
@@ -492,20 +492,14 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "ctor"
|
||||
version = "0.6.3"
|
||||
version = "1.0.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "424e0138278faeb2b401f174ad17e715c829512d74f3d1e81eb43365c2e0590e"
|
||||
checksum = "5c24d2b2b7c12a2fffb7c5c8fd0dcda7ca14b4600fa2d3701b6079aefb6fa180"
|
||||
dependencies = [
|
||||
"ctor-proc-macro",
|
||||
"dtor",
|
||||
"link-section",
|
||||
"linktime-proc-macro",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ctor-proc-macro"
|
||||
version = "0.0.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "52560adf09603e58c9a7ee1fe1dcb95a16927b17c127f0ac02d6e768a0e25bc1"
|
||||
|
||||
[[package]]
|
||||
name = "deranged"
|
||||
version = "0.5.8"
|
||||
@@ -557,21 +551,6 @@ dependencies = [
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "dtor"
|
||||
version = "0.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "404d02eeb088a82cfd873006cb713fe411306c7d182c344905e101fb1167d301"
|
||||
dependencies = [
|
||||
"dtor-proc-macro",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "dtor-proc-macro"
|
||||
version = "0.0.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f678cf4a922c215c63e0de95eb1ff08a958a81d47e485cf9da1e27bf6305cfa5"
|
||||
|
||||
[[package]]
|
||||
name = "dunce"
|
||||
version = "1.0.5"
|
||||
@@ -1305,6 +1284,18 @@ dependencies = [
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "link-section"
|
||||
version = "0.15.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4641b91711debb59c61b07eb5e30521ed6d9e2bdd9fd04f934e7da3a5bc386d4"
|
||||
|
||||
[[package]]
|
||||
name = "linktime-proc-macro"
|
||||
version = "0.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a44cd706ff0d503ee32b2071166510ca27e281228de10cd3aa8d35ff94560f81"
|
||||
|
||||
[[package]]
|
||||
name = "linux-raw-sys"
|
||||
version = "0.12.1"
|
||||
|
||||
@@ -156,6 +156,7 @@ impl RemoteClient {
|
||||
|
||||
let result = RetryWrapper::new(self.ctx.clone(), api_tag)
|
||||
.with_429_no_retry()
|
||||
.with_expected_404()
|
||||
.log_errors_as_info()
|
||||
.run(move || client.get(url.clone()).with_extension(Api(api_tag)).send())
|
||||
.await;
|
||||
@@ -583,7 +584,7 @@ impl Client for RemoteClient {
|
||||
// Use the no-read-timeout client for shard uploads. reqwest's per-request timeout()
|
||||
// does NOT override the client-level read_timeout(), so we use a separate client
|
||||
// with no read_timeout. Server-side shard processing scales linearly with file entry
|
||||
// count and can exceed the global read_timeout (120s) for large shards.
|
||||
// count and can exceed the global read_timeout (300s) for large shards.
|
||||
#[cfg(not(target_family = "wasm"))]
|
||||
let client = self.shard_upload_http_client.clone();
|
||||
|
||||
|
||||
@@ -31,6 +31,7 @@ pub struct RetryWrapper {
|
||||
no_retry_on_429: bool,
|
||||
retry_on_403: bool,
|
||||
expected_416: bool,
|
||||
expected_404: bool,
|
||||
log_errors_as_info: bool,
|
||||
api_tag: &'static str,
|
||||
connection_permit: Option<Mutex<ConnectionPermitInfo>>,
|
||||
@@ -46,6 +47,7 @@ impl RetryWrapper {
|
||||
no_retry_on_429: false,
|
||||
retry_on_403: false,
|
||||
expected_416: false,
|
||||
expected_404: false,
|
||||
log_errors_as_info: false,
|
||||
api_tag,
|
||||
connection_permit: None,
|
||||
@@ -77,6 +79,15 @@ impl RetryWrapper {
|
||||
self
|
||||
}
|
||||
|
||||
/// Mark 404 responses as expected (e.g. `query_dedup` cache miss). When set,
|
||||
/// a 404 is still returned as a fatal (non-retried) error to the caller — which is
|
||||
/// usually handled by the caller converting it to `Ok(None)` — but it is logged as
|
||||
/// a cache miss instead of the misleading `"Fatal Error"`.
|
||||
pub fn with_expected_404(mut self) -> Self {
|
||||
self.expected_404 = true;
|
||||
self
|
||||
}
|
||||
|
||||
pub fn log_errors_as_info(mut self) -> Self {
|
||||
self.log_errors_as_info = true;
|
||||
self
|
||||
@@ -166,6 +177,9 @@ impl RetryWrapper {
|
||||
} else if e.status() == Some(StatusCode::RANGE_NOT_SATISFIABLE) && self.expected_416 {
|
||||
let cas_err = process_error("Reached end of reconstruction 416 (Range Not Satisfiable)", e, true);
|
||||
Err(RetryableReqwestError::FatalError(cas_err))
|
||||
} else if e.status() == Some(StatusCode::NOT_FOUND) && self.expected_404 {
|
||||
let cas_err = process_error("Not Found (cache miss)", e, true);
|
||||
Err(RetryableReqwestError::FatalError(cas_err))
|
||||
} else {
|
||||
let cas_err = process_error("Fatal Error", e, false);
|
||||
Err(RetryableReqwestError::FatalError(cas_err))
|
||||
@@ -857,6 +871,37 @@ mod tests {
|
||||
check_json_unexpected_eof_retry(&server).await;
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_404_expected_is_fatal_and_not_retried() {
|
||||
let server = MockServer::start().await;
|
||||
|
||||
Mock::given(method("GET"))
|
||||
.and(path("/not_found"))
|
||||
.respond_with(ResponseTemplate::new(404))
|
||||
.expect(1)
|
||||
.mount(&server)
|
||||
.await;
|
||||
|
||||
let client = make_client();
|
||||
let counter = Arc::new(AtomicU32::new(0));
|
||||
let counter_ = counter.clone();
|
||||
|
||||
let result = connection_wrapper("test_404_expected_is_fatal_and_not_retried")
|
||||
.with_max_attempts(3)
|
||||
.with_expected_404()
|
||||
.run(move || {
|
||||
let url = format!("{}/not_found", server.uri());
|
||||
counter_.fetch_add(1, Ordering::Relaxed);
|
||||
client.clone().get(&url).send()
|
||||
})
|
||||
.await;
|
||||
|
||||
assert!(result.is_err());
|
||||
let err = result.unwrap_err();
|
||||
assert_eq!(err.status(), Some(StatusCode::NOT_FOUND));
|
||||
assert_eq!(counter.load(Ordering::SeqCst), 1);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_403_no_retry_by_default() {
|
||||
let server = MockServer::start().await;
|
||||
|
||||
@@ -185,9 +185,10 @@ impl MDBMinimalShard {
|
||||
let _ = MDBShardFileHeader::deserialize(reader)?;
|
||||
|
||||
let mut file_info_views = Vec::<MDBFileInfoView>::new();
|
||||
let mut seen_file_hashes = HashSet::new();
|
||||
process_shard_file_info_section(reader, |fiv: MDBFileInfoView| {
|
||||
// register the offset here to the file entries
|
||||
if include_files {
|
||||
if include_files && seen_file_hashes.insert(fiv.file_hash()) {
|
||||
file_info_views.push(fiv);
|
||||
}
|
||||
Ok(())
|
||||
@@ -232,11 +233,14 @@ impl MDBMinimalShard {
|
||||
let _ = MDBShardFileHeader::deserialize(&mut Cursor::new(&buf))?;
|
||||
|
||||
let mut file_info_views = Vec::<MDBFileInfoView>::new();
|
||||
let mut seen_file_hashes = HashSet::new();
|
||||
process_shard_file_info_section_async(reader, |fiv: MDBFileInfoView| {
|
||||
// register the offset here to the file entries
|
||||
if include_files {
|
||||
file_callback(&fiv)?;
|
||||
file_info_views.push(fiv);
|
||||
if seen_file_hashes.insert(fiv.file_hash()) {
|
||||
file_info_views.push(fiv);
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
})
|
||||
@@ -507,17 +511,28 @@ mod tests {
|
||||
use rand::rngs::SmallRng;
|
||||
use rand::{RngExt, SeedableRng};
|
||||
|
||||
use super::super::MDBShardInfo;
|
||||
use super::super::file_structs::MDBFileInfo;
|
||||
use super::super::file_structs::{FileDataSequenceHeader, MDBFileInfo};
|
||||
use super::super::shard_file::test_routines::{
|
||||
convert_to_file, gen_random_shard, gen_random_shard_with_xorb_references,
|
||||
convert_to_file, gen_random_file_info, gen_random_shard, gen_random_shard_with_xorb_references,
|
||||
};
|
||||
use super::super::shard_in_memory::MDBInMemoryShard;
|
||||
use super::super::xorb_structs::MDBXorbInfo;
|
||||
use super::super::xorb_structs::{MDBXorbInfo, XorbChunkSequenceHeader};
|
||||
use super::super::{MDBShardFileHeader, MDBShardInfo};
|
||||
use super::MDBMinimalShard;
|
||||
use crate::error::Result;
|
||||
use crate::merklehash::MerkleHash;
|
||||
|
||||
fn file_info_stream(file_infos: &[MDBFileInfo]) -> Vec<u8> {
|
||||
let mut buffer = Vec::new();
|
||||
MDBShardFileHeader::default().serialize(&mut buffer).unwrap();
|
||||
for file_info in file_infos {
|
||||
file_info.serialize(&mut buffer).unwrap();
|
||||
}
|
||||
FileDataSequenceHeader::bookend().serialize(&mut buffer).unwrap();
|
||||
XorbChunkSequenceHeader::bookend().serialize(&mut buffer).unwrap();
|
||||
buffer
|
||||
}
|
||||
|
||||
fn verify_serialization(min_shard: &MDBMinimalShard, mem_shard: &MDBInMemoryShard) -> Result<()> {
|
||||
for verification in [true, false] {
|
||||
// compute size, with verification if possible only
|
||||
@@ -665,6 +680,43 @@ mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_minimal_shard_deduplicates_file_infos_first_wins() {
|
||||
let mut rng = rand::rngs::StdRng::seed_from_u64(7);
|
||||
let first = gen_random_file_info(&mut rng, &2, false, false);
|
||||
let mut duplicate = gen_random_file_info(&mut rng, &3, true, true);
|
||||
duplicate.metadata.file_hash = first.metadata.file_hash;
|
||||
|
||||
let buffer = file_info_stream(&[first.clone(), duplicate.clone()]);
|
||||
|
||||
let min_shard = MDBMinimalShard::from_reader(&mut Cursor::new(&buffer), true, true).unwrap();
|
||||
assert_eq!(min_shard.num_files(), 1);
|
||||
assert_eq!(MDBFileInfo::from(min_shard.file(0).unwrap()), first);
|
||||
|
||||
let mut callback_file_infos = Vec::new();
|
||||
let min_shard_async = MDBMinimalShard::from_reader_async_with_custom_callbacks(
|
||||
&mut &buffer[..],
|
||||
true,
|
||||
true,
|
||||
|f| {
|
||||
callback_file_infos.push(MDBFileInfo::from(f));
|
||||
Ok(())
|
||||
},
|
||||
|_| Ok(()),
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
assert_eq!(min_shard, min_shard_async);
|
||||
assert_eq!(callback_file_infos, vec![first.clone(), duplicate]);
|
||||
|
||||
let mut reserialized = Vec::new();
|
||||
min_shard.serialize(&mut reserialized, false).unwrap();
|
||||
let shard_info = MDBShardInfo::load_from_reader(&mut Cursor::new(&reserialized)).unwrap();
|
||||
let file_infos = shard_info.read_all_file_info_sections(&mut Cursor::new(&reserialized)).unwrap();
|
||||
assert_eq!(file_infos, vec![first]);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_shards() -> Result<()> {
|
||||
let shard = gen_random_shard(0, &[], &[0], false, false)?;
|
||||
|
||||
@@ -282,9 +282,15 @@ impl Chunker {
|
||||
/// partition_scan_bytes is the number of bytes to scan at each
|
||||
/// proposed partition boundary in search of a valid chunk.
|
||||
///
|
||||
/// Due to a known issue in how we do chunking, note that these
|
||||
/// partitions are not 100% guaranteed to align. See the
|
||||
/// parallel_chunking.pdf for details.
|
||||
/// Partition alignment is guaranteed by the hash warmup fix: the
|
||||
/// chunker feeds `min_chunk - 64 - 1` bytes before scanning for
|
||||
/// boundaries, ensuring the gear hash window is fully warmed (purely
|
||||
/// data-dependent) at all accepted trigger positions. This function
|
||||
/// additionally verifies the absence of hidden triggers by re-chunking
|
||||
/// with `min_chunk = 0`. See `parallel chunking.lyx` for the proof.
|
||||
///
|
||||
/// For finding stable chunk boundaries from existing chunk boundaries (without
|
||||
/// data access), see [`next_stable_chunk_boundary`].
|
||||
pub fn find_partitions<R: Read + Seek>(
|
||||
reader: &mut R,
|
||||
file_size: usize,
|
||||
@@ -353,6 +359,53 @@ pub fn find_partitions<R: Read + Seek>(
|
||||
Ok(partitions)
|
||||
}
|
||||
|
||||
/// Given a list of chunk boundaries in a file and an arbitrary reference position,
|
||||
/// returns the next stable chunk boundary at or after that position.
|
||||
///
|
||||
/// `starting_position` may be any byte offset in the file; it does not need to
|
||||
/// be an existing chunk boundary. The search starts at the first chunk boundary
|
||||
/// `>= starting_position`.
|
||||
///
|
||||
/// A stable chunk boundary is defined such that any possible changes in the data
|
||||
/// before `starting_position` would produce the same chunk boundaries at the
|
||||
/// stable boundary and later. The fixed data between `starting_position` and
|
||||
/// the returned stable boundary is always sufficient to restore the chunker to
|
||||
/// its original chunk boundaries.
|
||||
///
|
||||
/// The stability condition requires two consecutive chunks after `starting_position`,
|
||||
/// both with sizes in `[2 * min_chunk, max_chunk - min_chunk)`. The boundary
|
||||
/// at the end of the second such chunk is the stable chunk boundary.
|
||||
///
|
||||
/// The lower bound is `2 * min_chunk` rather than `min_chunk` (as used in
|
||||
/// [`find_partitions`]) because this function operates on existing chunk
|
||||
/// boundaries without data access, and cannot verify the absence of hidden
|
||||
/// hash triggers in the `[c_k, c_k + min_chunk)` skip zone. A shadow-zone
|
||||
/// trigger can advance a modified chunker by up to `min_chunk`, so the next
|
||||
/// chunk must be at least `2 * min_chunk` to remain reachable.
|
||||
///
|
||||
/// See `parallel chunking.lyx` for the full proof and `find_stable_start` in
|
||||
/// `merkle_hash_subtree.rs` for the analogous construction in merkle hashing.
|
||||
pub fn next_stable_chunk_boundary(starting_position: usize, chunk_boundaries: &[usize]) -> Option<usize> {
|
||||
let minimum_chunk = *TARGET_CHUNK_SIZE / *MINIMUM_CHUNK_DIVISOR;
|
||||
let maximum_chunk = *TARGET_CHUNK_SIZE * *MAXIMUM_CHUNK_MULTIPLIER;
|
||||
|
||||
let start_idx = chunk_boundaries.partition_point(|&x| x < starting_position);
|
||||
|
||||
for i in start_idx..chunk_boundaries.len().saturating_sub(2) {
|
||||
let size_a = chunk_boundaries[i + 1] - chunk_boundaries[i];
|
||||
let size_b = chunk_boundaries[i + 2] - chunk_boundaries[i + 1];
|
||||
|
||||
if size_a >= 2 * minimum_chunk
|
||||
&& size_a < maximum_chunk - minimum_chunk
|
||||
&& size_b >= 2 * minimum_chunk
|
||||
&& size_b < maximum_chunk - minimum_chunk
|
||||
{
|
||||
return Some(chunk_boundaries[i + 2]);
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::collections::HashSet;
|
||||
|
||||
@@ -5,7 +5,7 @@ mod defrag_prevention;
|
||||
mod file_deduplication;
|
||||
mod interface;
|
||||
|
||||
pub use chunking::{Chunker, find_partitions};
|
||||
pub use chunking::{Chunker, find_partitions, next_stable_chunk_boundary};
|
||||
pub use data_aggregator::DataAggregator;
|
||||
pub use dedup_metrics::DeduplicationMetrics;
|
||||
pub use file_deduplication::FileDeduper;
|
||||
|
||||
@@ -827,9 +827,58 @@ Now, with this implementation, can we still perform parallel chunking?
|
||||
\end_layout
|
||||
|
||||
\begin_layout Standard
|
||||
I do not believe so.
|
||||
Due to the hash disagreement, under adverserial settings it is possible
|
||||
to construct two chunk sequences which will *never* align.
|
||||
|
||||
\series bold
|
||||
Update:
|
||||
\series default
|
||||
The implementation has been fixed.
|
||||
Instead of skipping
|
||||
\begin_inset Formula $m$
|
||||
\end_inset
|
||||
|
||||
bytes (setting HashStreamStart =
|
||||
\begin_inset Formula $i+m$
|
||||
\end_inset
|
||||
|
||||
), the chunker now starts feeding the hash at
|
||||
\begin_inset Formula $m-k-1$
|
||||
\end_inset
|
||||
|
||||
bytes from the chunk start.
|
||||
This ensures the hash window has been fed at least
|
||||
\begin_inset Formula $k+1$
|
||||
\end_inset
|
||||
|
||||
bytes by position
|
||||
\begin_inset Formula $m$
|
||||
\end_inset
|
||||
|
||||
, so the hash output at every accepted trigger position (
|
||||
\begin_inset Formula $\ge m$
|
||||
\end_inset
|
||||
|
||||
from chunk start) depends only on the last
|
||||
\begin_inset Formula $k$
|
||||
\end_inset
|
||||
|
||||
bytes of data, independent of the chunk starting point.
|
||||
Therefore
|
||||
\begin_inset Formula $H(a_{1},b)=H(a_{2},b)$
|
||||
\end_inset
|
||||
|
||||
for all
|
||||
\begin_inset Formula $b$
|
||||
\end_inset
|
||||
|
||||
where
|
||||
\begin_inset Formula $b-a_{1}\ge m$
|
||||
\end_inset
|
||||
|
||||
and
|
||||
\begin_inset Formula $b-a_{2}\ge m$
|
||||
\end_inset
|
||||
|
||||
, and the parallel chunking proof in Section 2 holds.
|
||||
\end_layout
|
||||
|
||||
\end_body
|
||||
|
||||
Binary file not shown.
312
xet_data/tests/test_stable_chunk_boundary_detection.rs
Normal file
312
xet_data/tests/test_stable_chunk_boundary_detection.rs
Normal file
@@ -0,0 +1,312 @@
|
||||
use std::collections::HashSet;
|
||||
|
||||
use rand::rngs::StdRng;
|
||||
use rand::{RngExt, SeedableRng};
|
||||
use xet_data::deduplication::constants::TARGET_CHUNK_SIZE;
|
||||
use xet_data::deduplication::{Chunk, Chunker, next_stable_chunk_boundary};
|
||||
use xet_runtime::test_set_constants;
|
||||
|
||||
test_set_constants! {
|
||||
TARGET_CHUNK_SIZE = 1024;
|
||||
}
|
||||
|
||||
fn make_random_data(seed: u64, len: usize) -> Vec<u8> {
|
||||
let mut rng = StdRng::seed_from_u64(seed);
|
||||
let mut data = vec![0u8; len];
|
||||
rng.fill(&mut data[..]);
|
||||
data
|
||||
}
|
||||
|
||||
fn chunk_data(data: &[u8]) -> Vec<Chunk> {
|
||||
let mut chunker = Chunker::default();
|
||||
chunker.next_block(data, true)
|
||||
}
|
||||
|
||||
fn get_chunk_boundaries(chunks: &[Chunk]) -> Vec<usize> {
|
||||
chunks
|
||||
.iter()
|
||||
.scan(0usize, |pos, c| {
|
||||
*pos += c.data.len();
|
||||
Some(*pos)
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
fn verify_alignment(
|
||||
original_boundaries: &[usize],
|
||||
new_boundaries: &[usize],
|
||||
stable: usize,
|
||||
file_size: usize,
|
||||
starting_position: usize,
|
||||
mutation_seed: u64,
|
||||
) {
|
||||
let orig_set: HashSet<usize> = original_boundaries.iter().copied().collect();
|
||||
let new_set: HashSet<usize> = new_boundaries.iter().copied().collect();
|
||||
|
||||
for &oc in original_boundaries {
|
||||
if oc >= stable && oc < file_size {
|
||||
assert!(
|
||||
new_set.contains(&oc),
|
||||
"Original chunk boundary {oc} (>= stable {stable}) missing from new chunk boundaries. \
|
||||
starting_position={starting_position}, mutation_seed={mutation_seed}"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
for &nc in new_boundaries {
|
||||
if nc >= stable && nc < file_size {
|
||||
assert!(
|
||||
orig_set.contains(&nc),
|
||||
"New chunk boundary {nc} (>= stable {stable}) not in original chunk boundaries. \
|
||||
starting_position={starting_position}, mutation_seed={mutation_seed}"
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// For a given data buffer, exercise `next_stable_chunk_boundary` at random
|
||||
/// starting positions across the full data range with random mutations.
|
||||
fn stress_test_stable_chunk_boundaries(data: &[u8], seed: u64, num_positions: usize, num_mutations: u64) {
|
||||
let file_size = data.len();
|
||||
let chunks = chunk_data(data);
|
||||
let chunk_boundaries = get_chunk_boundaries(&chunks);
|
||||
|
||||
assert!(
|
||||
chunk_boundaries.len() > 10,
|
||||
"Need enough chunks for meaningful testing, got {}",
|
||||
chunk_boundaries.len()
|
||||
);
|
||||
|
||||
let mut rng = StdRng::seed_from_u64(seed);
|
||||
let mut tested_stable = 0u64;
|
||||
|
||||
for trial in 0..num_positions {
|
||||
let starting_position = rng.random_range(1..file_size);
|
||||
|
||||
let stable = match next_stable_chunk_boundary(starting_position, &chunk_boundaries) {
|
||||
Some(s) => s,
|
||||
None => continue,
|
||||
};
|
||||
|
||||
assert!(
|
||||
chunk_boundaries.contains(&stable),
|
||||
"Stable chunk boundary {stable} is not a member of original chunk boundaries"
|
||||
);
|
||||
assert!(
|
||||
stable >= starting_position,
|
||||
"Stable chunk boundary {stable} must be at or after starting_position {starting_position}"
|
||||
);
|
||||
|
||||
tested_stable += 1;
|
||||
|
||||
for mutation_seed in 0..num_mutations {
|
||||
let combined_seed = (trial as u64) * 10000 + mutation_seed + 1;
|
||||
let mut modified = data.to_vec();
|
||||
let mut mrng = StdRng::seed_from_u64(combined_seed);
|
||||
mrng.fill(&mut modified[..starting_position]);
|
||||
|
||||
let new_chunks = chunk_data(&modified);
|
||||
let new_boundaries = get_chunk_boundaries(&new_chunks);
|
||||
|
||||
verify_alignment(&chunk_boundaries, &new_boundaries, stable, file_size, starting_position, combined_seed);
|
||||
}
|
||||
}
|
||||
|
||||
let min_expected = (num_positions / 4).max(1);
|
||||
assert!(
|
||||
tested_stable >= min_expected as u64,
|
||||
"Too few starting positions had stable chunk boundaries: {tested_stable} (expected >= {min_expected})"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_stable_chunk_boundary_edge_cases() {
|
||||
let data = make_random_data(42, 50_000);
|
||||
let chunks = chunk_data(&data);
|
||||
let chunk_boundaries = get_chunk_boundaries(&chunks);
|
||||
|
||||
// starting_position at 0: should still return a valid point
|
||||
let stable_0 = next_stable_chunk_boundary(0, &chunk_boundaries);
|
||||
if let Some(s) = stable_0 {
|
||||
assert!(chunk_boundaries.contains(&s));
|
||||
}
|
||||
|
||||
// starting_position past all chunk boundaries: should return None
|
||||
let past_end = *chunk_boundaries.last().unwrap() + 1;
|
||||
assert!(next_stable_chunk_boundary(past_end, &chunk_boundaries).is_none());
|
||||
|
||||
// starting_position near end with too few remaining points
|
||||
if chunk_boundaries.len() >= 2 {
|
||||
let near_end = chunk_boundaries[chunk_boundaries.len() - 2];
|
||||
assert!(next_stable_chunk_boundary(near_end + 1, &chunk_boundaries).is_none());
|
||||
}
|
||||
|
||||
// Degenerate inputs
|
||||
assert!(next_stable_chunk_boundary(0, &[]).is_none());
|
||||
assert!(next_stable_chunk_boundary(0, &[100]).is_none());
|
||||
assert!(next_stable_chunk_boundary(0, &[100, 200]).is_none());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_stable_chunk_boundary_with_constant_data() {
|
||||
// Constant data produces max-chunk-sized chunks (forced boundaries).
|
||||
// With target=1024: max_chunk=2048, min_chunk=128, so max-min=1920.
|
||||
// Forced boundaries at size 2048 fail the upper bound check of < 1920.
|
||||
let data = vec![0u8; 50_000];
|
||||
let chunks = chunk_data(&data);
|
||||
let chunk_boundaries = get_chunk_boundaries(&chunks);
|
||||
|
||||
let stable = next_stable_chunk_boundary(0, &chunk_boundaries);
|
||||
assert!(stable.is_none(), "Constant data should have no stable chunk boundary (all forced cuts)");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_stable_chunk_boundary_smoke_stress() {
|
||||
let data = make_random_data(42, 50_000);
|
||||
stress_test_stable_chunk_boundaries(&data, 42, 5, 5);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_stable_chunk_boundary_smoke_varied_seeds() {
|
||||
for seed in [1, 7, 255] {
|
||||
let data = make_random_data(seed, 50_000);
|
||||
stress_test_stable_chunk_boundaries(&data, seed + 77, 5, 5);
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(not(feature = "smoke-test"))]
|
||||
#[test]
|
||||
fn test_stable_chunk_boundary_stress() {
|
||||
let data = make_random_data(42, 256_000);
|
||||
stress_test_stable_chunk_boundaries(&data, 42, 100, 20);
|
||||
}
|
||||
|
||||
#[cfg(not(feature = "smoke-test"))]
|
||||
#[test]
|
||||
fn test_stable_chunk_boundary_stress_varied_seeds() {
|
||||
for seed in [1, 7, 13, 100, 255, 1024, 42424, 999999] {
|
||||
let data = make_random_data(seed, 100_000);
|
||||
stress_test_stable_chunk_boundaries(&data, seed + 77, 50, 20);
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(not(feature = "smoke-test"))]
|
||||
#[test]
|
||||
fn test_stable_chunk_boundary_mutation_types() {
|
||||
let data = make_random_data(42, 100_000);
|
||||
let chunks = chunk_data(&data);
|
||||
let chunk_boundaries = get_chunk_boundaries(&chunks);
|
||||
let file_size = data.len();
|
||||
|
||||
let mid_idx = chunk_boundaries.len() / 2;
|
||||
let starting_position = chunk_boundaries[mid_idx];
|
||||
|
||||
let stable = match next_stable_chunk_boundary(starting_position, &chunk_boundaries) {
|
||||
Some(s) => s,
|
||||
None => return,
|
||||
};
|
||||
|
||||
// Zero-fill
|
||||
{
|
||||
let mut modified = data.to_vec();
|
||||
modified[..starting_position].fill(0);
|
||||
let new_boundaries = get_chunk_boundaries(&chunk_data(&modified));
|
||||
verify_alignment(&chunk_boundaries, &new_boundaries, stable, file_size, starting_position, 0);
|
||||
}
|
||||
|
||||
// 0xFF-fill
|
||||
{
|
||||
let mut modified = data.to_vec();
|
||||
modified[..starting_position].fill(0xFF);
|
||||
let new_boundaries = get_chunk_boundaries(&chunk_data(&modified));
|
||||
verify_alignment(&chunk_boundaries, &new_boundaries, stable, file_size, starting_position, 1);
|
||||
}
|
||||
|
||||
// Reverse the prefix
|
||||
{
|
||||
let mut modified = data.to_vec();
|
||||
modified[..starting_position].reverse();
|
||||
let new_boundaries = get_chunk_boundaries(&chunk_data(&modified));
|
||||
verify_alignment(&chunk_boundaries, &new_boundaries, stable, file_size, starting_position, 2);
|
||||
}
|
||||
|
||||
// XOR with a pattern
|
||||
{
|
||||
let mut modified = data.to_vec();
|
||||
for (i, byte) in modified[..starting_position].iter_mut().enumerate() {
|
||||
*byte ^= (i & 0xFF) as u8;
|
||||
}
|
||||
let new_boundaries = get_chunk_boundaries(&chunk_data(&modified));
|
||||
verify_alignment(&chunk_boundaries, &new_boundaries, stable, file_size, starting_position, 3);
|
||||
}
|
||||
|
||||
// Many different random fills
|
||||
for seed in 0..200 {
|
||||
let mut modified = data.to_vec();
|
||||
let mut rng = StdRng::seed_from_u64(seed + 5000);
|
||||
rng.fill(&mut modified[..starting_position]);
|
||||
let new_boundaries = get_chunk_boundaries(&chunk_data(&modified));
|
||||
verify_alignment(&chunk_boundaries, &new_boundaries, stable, file_size, starting_position, seed + 5000);
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(not(feature = "smoke-test"))]
|
||||
#[test]
|
||||
fn test_stable_chunk_boundary_is_tight() {
|
||||
// Verify the stable chunk boundary is actually needed: the chunk boundary
|
||||
// just before stable should NOT always be stable (there should exist some
|
||||
// mutation that breaks it).
|
||||
let data = make_random_data(42, 256_000);
|
||||
let chunks = chunk_data(&data);
|
||||
let chunk_boundaries = get_chunk_boundaries(&chunks);
|
||||
|
||||
let mut found_non_stable_predecessor = false;
|
||||
|
||||
for &starting_position in chunk_boundaries.iter().take(chunk_boundaries.len() / 2) {
|
||||
if starting_position == 0 {
|
||||
continue;
|
||||
}
|
||||
|
||||
let stable = match next_stable_chunk_boundary(starting_position, &chunk_boundaries) {
|
||||
Some(s) => s,
|
||||
None => continue,
|
||||
};
|
||||
|
||||
let stable_idx = chunk_boundaries.iter().position(|&x| x == stable).unwrap();
|
||||
if stable_idx == 0 {
|
||||
continue;
|
||||
}
|
||||
let predecessor = chunk_boundaries[stable_idx - 1];
|
||||
if predecessor <= starting_position {
|
||||
continue;
|
||||
}
|
||||
|
||||
let orig_set: HashSet<usize> = chunk_boundaries.iter().copied().collect();
|
||||
for seed in 0..200 {
|
||||
let mut modified = data.to_vec();
|
||||
let mut rng = StdRng::seed_from_u64(seed + 90000);
|
||||
rng.fill(&mut modified[..starting_position]);
|
||||
let new_boundaries = get_chunk_boundaries(&chunk_data(&modified));
|
||||
let new_set: HashSet<usize> = new_boundaries.iter().copied().collect();
|
||||
|
||||
if !new_set.contains(&predecessor)
|
||||
|| new_boundaries
|
||||
.iter()
|
||||
.any(|&nc| nc >= predecessor && nc < stable && !orig_set.contains(&nc))
|
||||
{
|
||||
found_non_stable_predecessor = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if found_non_stable_predecessor {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
assert!(
|
||||
found_non_stable_predecessor,
|
||||
"Could not find any case where the predecessor of a stable chunk boundary was actually unstable. \
|
||||
This suggests the stability condition may be too conservative."
|
||||
);
|
||||
}
|
||||
@@ -54,10 +54,10 @@ crate::config_group!({
|
||||
/// transfers to complete. If no data is received for this duration, the connection
|
||||
/// is considered stalled and will timeout.
|
||||
///
|
||||
/// The default value is 120 seconds.
|
||||
/// The default value is 300 seconds.
|
||||
///
|
||||
/// Use the environment variable `HF_XET_CLIENT_READ_TIMEOUT` to set this value.
|
||||
ref read_timeout: Duration = Duration::from_secs(120);
|
||||
ref read_timeout: Duration = Duration::from_secs(300);
|
||||
|
||||
/// Send a report of a successful partial upload every 512kb.
|
||||
///
|
||||
|
||||
@@ -229,7 +229,7 @@ macro_rules! test_set_constants {
|
||||
)+) => {
|
||||
use $crate::configuration_utils::ctor_reexport as ctor;
|
||||
|
||||
#[ctor::ctor]
|
||||
#[ctor::ctor(unsafe)]
|
||||
fn set_constants_on_load() {
|
||||
$(
|
||||
let val = $val;
|
||||
@@ -300,7 +300,7 @@ macro_rules! test_set_config {
|
||||
)+) => {
|
||||
use $crate::configuration_utils::ctor_reexport as config_ctor;
|
||||
|
||||
#[config_ctor::ctor]
|
||||
#[config_ctor::ctor(unsafe)]
|
||||
fn set_config_on_load() {
|
||||
$(
|
||||
let group_name_upper = stringify!($group_name).to_uppercase();
|
||||
|
||||
Reference in New Issue
Block a user