From 062138f845e636c7107173b944a32ec995086d5a Mon Sep 17 00:00:00 2001 From: Assaf Vayner Date: Mon, 6 Apr 2026 13:55:29 -0700 Subject: [PATCH] Improve hf-xet crate documentation (#780) ## Summary - Expand the crate-level docs (`lib.rs`) with a proper introduction explaining what Xet storage is and an end-to-end upload+download example on the landing page - Add runtime detection and `XetConfig` guidance to `XetSessionBuilder` docs - Describe all three upload methods (path, bytes, stream) in the `xet_session` module docs - Add dedicated streaming upload and streaming download code examples - Add module doc comment to `legacy/` pointing users to `xet_session` ## Test plan - [x] `cargo check -p hf-xet` passes - [x] `cargo doc -p hf-xet --no-deps` builds with no new warnings (50 pre-existing warnings unchanged) - [ ] Review rendered docs on docs.rs after merge --- > [!NOTE] > **Low Risk** > Documentation-only changes (module/crate docs and examples) with no functional code or behavior modifications, so runtime risk is low aside from potential doc/example inaccuracies. > > **Overview** > Improves public-facing documentation across `lib.rs`, `xet_session`, and `legacy` by adding a clearer introduction to Xet/CAS, end-to-end upload+download quickstart examples, and guidance on using `XetSessionBuilder` (runtime detection and `XetConfig`). > > Expands `xet_session` docs to describe the three upload modes (path/bytes/stream) and adds dedicated streaming upload and streaming download examples, while clarifying that `legacy` is maintained for backwards compatibility and steering new users to `xet_session`. > > Reviewed by [Cursor Bugbot](https://cursor.com/bugbot) for commit 6e926857a9500674f0cac2732001cf1035e2e13c. Bugbot is set up for automated code reviews on this repo. Configure [here](https://www.cursor.com/dashboard/bugbot). 
--- xet_pkg/src/legacy/mod.rs | 8 +++ xet_pkg/src/lib.rs | 55 +++++++++++++++-- xet_pkg/src/xet_session/mod.rs | 99 ++++++++++++++++++++++++++++-- xet_pkg/src/xet_session/session.rs | 19 +++++- 4 files changed, 169 insertions(+), 12 deletions(-) diff --git a/xet_pkg/src/legacy/mod.rs b/xet_pkg/src/legacy/mod.rs index fd8f8ec7..6a9b5f67 100644 --- a/xet_pkg/src/legacy/mod.rs +++ b/xet_pkg/src/legacy/mod.rs @@ -1,3 +1,11 @@ +//! Legacy helpers re-exported for backward compatibility. +//! +//! This module exposes lower-level types and functions used by the Python +//! bindings (`hf_xet`) and `git_xet`. New code should use the +//! [`xet_session`](crate::xet_session) API instead — it provides a safer, +//! higher-level interface with built-in progress tracking, token refresh, +//! and automatic runtime management. + pub mod data_client; pub mod progress_tracking; diff --git a/xet_pkg/src/lib.rs b/xet_pkg/src/lib.rs index f4c23367..3f208e46 100644 --- a/xet_pkg/src/lib.rs +++ b/xet_pkg/src/lib.rs @@ -1,14 +1,59 @@ -//! Client library for the Hugging Face Xet data storage system. +//! Rust client library for the Hugging Face Xet storage system. //! -//! Provides the high-level [`xet_session::XetSession`] API for uploading -//! and downloading files with chunk-based deduplication, tying together -//! the lower-level [`xet_runtime`], [`xet_core_structures`], -//! [`xet_client`], and [`xet_data`] crates. +//! Xet is the storage backend used by the [Hugging Face Hub](https://huggingface.co) for +//! large files. Files are split into variable-size chunks, deduplicated, and stored in a CAS +//! (Content-Addressed Storage) server. This crate provides a high-level +//! API for uploading and downloading those files. +//! +//! # Getting started +//! +//! All operations go through [`xet_session::XetSession`], which manages a +//! tokio runtime and shared HTTP settings. Create one with +//! [`XetSessionBuilder`](xet_session::XetSessionBuilder), then use it to +//! 
build upload commits or download groups: +//! +//! ```rust,no_run +//! use xet::xet_session::{Sha256Policy, XetFileInfo, XetSessionBuilder}; +//! +//! # fn example() -> Result<(), xet::xet_session::SessionError> { +//! let session = XetSessionBuilder::new().build()?; +//! +//! // Upload a file +//! let commit = session +//! .new_upload_commit()? +//! .with_endpoint("https://cas.example.com") +//! .with_token_info("write-token", 1_700_000_000) +//! .build_blocking()?; +//! let handle = commit.upload_from_path_blocking("file.bin".into(), Sha256Policy::Compute)?; +//! let report = commit.commit_blocking()?; +//! +//! // Download a file using the metadata from the upload +//! let meta = report.uploads.values().next().unwrap(); +//! let group = session +//! .new_file_download_group()? +//! .with_token_info("read-token", 1_700_000_000) +//! .build_blocking()?; +//! group.download_file_to_path_blocking(meta.xet_info.clone(), "out/file.bin".into())?; +//! group.finish_blocking()?; +//! # Ok(()) +//! # } +//! ``` +//! +//! See the [`xet_session`] module for the full API, including async +//! variants, streaming uploads and downloads, and progress tracking. +//! +//! # Modules +//! +//! - [`xet_session`] — the primary API: [`XetSession`](xet_session::XetSession), upload commits, file download groups, +//! and streaming downloads. +//! - [`error`] — [`XetError`], the unified error type for the public API. pub mod error; pub use error::XetError; #[cfg(feature = "python")] pub use error::{XetAuthenticationError, XetObjectNotFoundError, register_exceptions}; +// Legacy helpers re-exported for backward compatibility with `hf_xet` (Python bindings) +// and `git_xet`. New code should use the [`xet_session`] API instead. pub mod legacy; pub mod xet_session; diff --git a/xet_pkg/src/xet_session/mod.rs b/xet_pkg/src/xet_session/mod.rs index c3457535..fa3dd36e 100644 --- a/xet_pkg/src/xet_session/mod.rs +++ b/xet_pkg/src/xet_session/mod.rs @@ -23,11 +23,20 @@ //! 
[`with_token_refresh_url`](AuthGroupBuilder::with_token_refresh_url), then call //! [`build`](AuthGroupBuilder::build) (async) or //! [`build_blocking`](AuthGroupBuilder::build_blocking) (sync). -//! Queue files with [`upload_from_path`](XetUploadCommit::upload_from_path) / -//! [`upload_from_path_blocking`](XetUploadCommit::upload_from_path_blocking) or -//! [`upload_bytes`](XetUploadCommit::upload_bytes) / -//! [`upload_bytes_blocking`](XetUploadCommit::upload_bytes_blocking), then call -//! [`commit`](XetUploadCommit::commit) or +//! +//! There are three ways to queue data for upload: +//! +//! - **From a file path** — [`upload_from_path`](XetUploadCommit::upload_from_path) / +//! [`upload_from_path_blocking`](XetUploadCommit::upload_from_path_blocking). The file is read in a background task. +//! - **From raw bytes** — [`upload_bytes`](XetUploadCommit::upload_bytes) / +//! [`upload_bytes_blocking`](XetUploadCommit::upload_bytes_blocking). Useful when data is already in memory. +//! - **Incrementally via a stream** — [`upload_stream`](XetUploadCommit::upload_stream) / +//! [`upload_stream_blocking`](XetUploadCommit::upload_stream_blocking). Returns an [`XetStreamUpload`] handle; call +//! [`write`](XetStreamUpload::write) to feed chunks, then [`finish`](XetStreamUpload::finish) to finalise. **`finish` +//! must be called before [`commit`](XetUploadCommit::commit).** Use this when data arrives incrementally (e.g. from a +//! network socket or a generator) and you don't want to buffer it all in memory first. +//! +//! Then call [`commit`](XetUploadCommit::commit) or //! [`commit_blocking`](XetUploadCommit::commit_blocking) to wait for all //! transfers to finish and receive a [`XetCommitReport`]. //! @@ -159,6 +168,86 @@ //! # Ok(()) //! # } //! ``` +//! +//! # Streaming upload +//! +//! Use [`upload_stream`](XetUploadCommit::upload_stream) when data arrives +//! incrementally and you don't want to buffer it all in memory or on disk +//! first. 
Call [`write`](XetStreamUpload::write) for each chunk, then +//! [`finish`](XetStreamUpload::finish) before [`commit`](XetUploadCommit::commit). +//! +//! ```rust,no_run +//! use xet::xet_session::{Sha256Policy, XetSessionBuilder}; +//! +//! # async fn example() -> Result<(), Box<dyn std::error::Error>> { +//! let session = XetSessionBuilder::new().build()?; +//! let commit = session +//! .new_upload_commit()? +//! .with_endpoint("https://cas.example.com") +//! .with_token_info("write-token", 1_700_000_000) +//! .build() +//! .await?; +//! +//! // Begin a streaming upload with an optional tracking name +//! let stream = commit +//! .upload_stream(Some("generated-data.bin".into()), Sha256Policy::Compute) +//! .await?; +//! +//! // Feed data in chunks — could come from a network socket, a generator, etc. +//! for chunk in vec![b"hello ".to_vec(), b"world".to_vec()] { +//! stream.write(chunk).await?; +//! } +//! +//! // Finalise the stream and get per-file metadata +//! let meta = stream.finish().await?; +//! println!("hash: {}, size: {:?}", meta.xet_info.hash, meta.xet_info.file_size); +//! +//! // Commit all uploads in this group +//! let report = commit.commit().await?; +//! # Ok(()) +//! # } +//! ``` +//! +//! # Streaming download +//! +//! Use [`XetDownloadStreamGroup`] when you want to consume file data as a +//! byte stream rather than writing it to disk. This is useful for serving +//! data over HTTP, piping it to another process, or processing it on the fly. +//! +//! [`download_stream`](XetDownloadStreamGroup::download_stream) returns +//! chunks in file order. +//! [`download_unordered_stream`](XetDownloadStreamGroup::download_unordered_stream) +//! returns `(offset, Bytes)` chunks in completion order for higher throughput +//! when the consumer can handle out-of-order data. +//! +//! ```rust,no_run +//! use xet::xet_session::{XetFileInfo, XetSessionBuilder}; +//! +//! # async fn example(file_info: XetFileInfo) -> Result<(), Box<dyn std::error::Error>> { +//! 
let session = XetSessionBuilder::new().build()?; +//! let group = session +//! .new_download_stream_group()? +//! .with_token_info("read-token", 1_700_000_000) +//! .build() +//! .await?; +//! +//! // Ordered stream — chunks arrive in file order +//! let mut stream = group.download_stream(file_info.clone(), None).await?; +//! let mut total = 0u64; +//! while let Some(chunk) = stream.next().await? { +//! total += chunk.len() as u64; +//! // process chunk... +//! } +//! println!("received {total} bytes"); +//! +//! // Byte-range request — only download bytes 1000..2000 +//! let mut range_stream = group.download_stream(file_info.clone(), Some(1000..2000)).await?; +//! while let Some(chunk) = range_stream.next().await? { +//! // process partial data... +//! } +//! # Ok(()) +//! # } +//! ``` mod auth_group_builder; mod common; diff --git a/xet_pkg/src/xet_session/session.rs b/xet_pkg/src/xet_session/session.rs index d1f95b80..8a25cdbc 100644 --- a/xet_pkg/src/xet_session/session.rs +++ b/xet_pkg/src/xet_session/session.rs @@ -53,8 +53,17 @@ pub struct XetSessionInner { /// /// All fields are optional; call [`build`](XetSessionBuilder::build) when done. /// -/// [`build`](Self::build) auto-detects a suitable current tokio handle when present, -/// or creates an owned runtime (see [`XetSessionBuilder::with_tokio_handle`]). +/// ## Runtime detection +/// +/// [`build`](Self::build) auto-detects a suitable tokio runtime: +/// +/// - **Inside `#[tokio::main]` or an existing tokio multi-thread runtime** and the runtime meets the requirements — the +/// session wraps the caller's handle; no second thread pool is created. Both async and blocking methods work. +/// - **Outside any runtime** — an owned multi-thread runtime is created internally. Blocking methods (`_blocking` +/// suffix) work from any thread; async methods work via an internal bridge. +/// - **Explicit handle** — call [`with_tokio_handle`](Self::with_tokio_handle) to supply a handle directly. 
If it +/// doesn't meet requirements (multi-thread, time + IO drivers), it is silently ignored and an owned runtime is +/// created instead. /// /// ## Authentication /// @@ -96,6 +105,12 @@ pub struct XetSessionInner { /// .build_blocking()?; /// # Ok::<(), xet::xet_session::SessionError>(()) /// ``` +/// +/// ## `XetConfig` +/// +/// For most use cases, [`new`](Self::new) with the default [`XetConfig`] is +/// sufficient. Use [`new_with_config`](Self::new_with_config) when you need to +/// override runtime settings such as cache directories or concurrency limits. pub struct XetSessionBuilder { config: XetConfig, tokio_handle: Option,