mirror of
https://github.com/huggingface/xet-core.git
synced 2026-06-04 13:30:29 +08:00
Improve hf-xet crate documentation (#780)
## Summary
- Expand the crate-level docs (`lib.rs`) with a proper introduction
explaining what Xet storage is and an end-to-end upload+download example
on the landing page
- Add runtime detection and `XetConfig` guidance to `XetSessionBuilder`
docs
- Describe all three upload methods (path, bytes, stream) in the
`xet_session` module docs
- Add dedicated streaming upload and streaming download code examples
- Add a module doc comment to `legacy/` pointing users to `xet_session`
## Test plan
- [x] `cargo check -p hf-xet` passes
- [x] `cargo doc -p hf-xet --no-deps` builds with no new warnings (50
pre-existing warnings unchanged)
- [ ] Review rendered docs on docs.rs after merge
<!-- CURSOR_SUMMARY -->
---
> [!NOTE]
> **Low Risk**
> Documentation-only changes (module/crate docs and examples) with no
functional code or behavior modifications, so runtime risk is low aside
from potential doc/example inaccuracies.
>
> **Overview**
> Improves public-facing documentation across `lib.rs`, `xet_session`,
and `legacy` by adding a clearer introduction to Xet/CAS, end-to-end
upload+download quickstart examples, and guidance on using
`XetSessionBuilder` (runtime detection and `XetConfig`).
>
> Expands `xet_session` docs to describe the three upload modes
(path/bytes/stream) and adds dedicated streaming upload and streaming
download examples, while clarifying that `legacy` is maintained for
backwards compatibility and steering new users to `xet_session`.
>
> <sup>Reviewed by [Cursor Bugbot](https://cursor.com/bugbot) for commit
6e926857a9. Bugbot is set up for automated
code reviews on this repo. Configure
[here](https://www.cursor.com/dashboard/bugbot).</sup>
<!-- /CURSOR_SUMMARY -->
This commit is contained in:
@@ -1,3 +1,11 @@
|
||||
//! Legacy helpers re-exported for backward compatibility.
|
||||
//!
|
||||
//! This module exposes lower-level types and functions used by the Python
|
||||
//! bindings (`hf_xet`) and `git_xet`. New code should use the
|
||||
//! [`xet_session`](crate::xet_session) API instead — it provides a safer,
|
||||
//! higher-level interface with built-in progress tracking, token refresh,
|
||||
//! and automatic runtime management.
|
||||
|
||||
pub mod data_client;
|
||||
pub mod progress_tracking;
|
||||
|
||||
|
||||
@@ -1,14 +1,59 @@
|
||||
//! Client library for the Hugging Face Xet data storage system.
|
||||
//! Rust client library for the Hugging Face Xet storage system.
|
||||
//!
|
||||
//! Provides the high-level [`xet_session::XetSession`] API for uploading
|
||||
//! and downloading files with chunk-based deduplication, tying together
|
||||
//! the lower-level [`xet_runtime`], [`xet_core_structures`],
|
||||
//! [`xet_client`], and [`xet_data`] crates.
|
||||
//! Xet is the storage backend used by the [Hugging Face Hub](https://huggingface.co) for
|
||||
//! large files. Files are split into variable-size chunks, deduplicated, and stored in a CAS
|
||||
//! (Content-Addressed Storage) server. This crate provides a high-level
|
||||
//! API for uploading and downloading those files.
|
||||
//!
|
||||
//! # Getting started
|
||||
//!
|
||||
//! All operations go through [`xet_session::XetSession`], which manages a
|
||||
//! tokio runtime and shared HTTP settings. Create one with
|
||||
//! [`XetSessionBuilder`](xet_session::XetSessionBuilder), then use it to
|
||||
//! build upload commits or download groups:
|
||||
//!
|
||||
//! ```rust,no_run
|
||||
//! use xet::xet_session::{Sha256Policy, XetSessionBuilder};
|
||||
//!
|
||||
//! # fn example() -> Result<(), xet::xet_session::SessionError> {
|
||||
//! let session = XetSessionBuilder::new().build()?;
|
||||
//!
|
||||
//! // Upload a file
|
||||
//! let commit = session
|
||||
//! .new_upload_commit()?
|
||||
//! .with_endpoint("https://cas.example.com")
|
||||
//! .with_token_info("write-token", 1_700_000_000)
|
||||
//! .build_blocking()?;
|
||||
//! let handle = commit.upload_from_path_blocking("file.bin".into(), Sha256Policy::Compute)?;
|
||||
//! let report = commit.commit_blocking()?;
|
||||
//!
|
||||
//! // Download a file using the metadata from the upload
|
||||
//! let meta = report.uploads.values().next().unwrap();
|
||||
//! let group = session
|
||||
//! .new_file_download_group()?
|
||||
//! .with_token_info("read-token", 1_700_000_000)
|
||||
//! .build_blocking()?;
|
||||
//! group.download_file_to_path_blocking(meta.xet_info.clone(), "out/file.bin".into())?;
|
||||
//! group.finish_blocking()?;
|
||||
//! # Ok(())
|
||||
//! # }
|
||||
//! ```
|
||||
//!
|
||||
//! See the [`xet_session`] module for the full API, including async
|
||||
//! variants, streaming uploads and downloads, and progress tracking.
|
||||
//!
|
||||
//! # Modules
|
||||
//!
|
||||
//! - [`xet_session`] — the primary API: [`XetSession`](xet_session::XetSession), upload commits, file download groups,
|
||||
//! and streaming downloads.
|
||||
//! - [`error`] — [`XetError`], the unified error type for the public API.
|
||||
|
||||
pub mod error;
|
||||
pub use error::XetError;
|
||||
#[cfg(feature = "python")]
|
||||
pub use error::{XetAuthenticationError, XetObjectNotFoundError, register_exceptions};
|
||||
|
||||
// Legacy helpers re-exported for backward compatibility with `hf_xet` (Python bindings)
|
||||
// and `git_xet`. New code should use the [`xet_session`] API instead.
|
||||
pub mod legacy;
|
||||
pub mod xet_session;
|
||||
|
||||
@@ -23,11 +23,20 @@
|
||||
//! [`with_token_refresh_url`](AuthGroupBuilder::with_token_refresh_url), then call
|
||||
//! [`build`](AuthGroupBuilder::build) (async) or
|
||||
//! [`build_blocking`](AuthGroupBuilder::build_blocking) (sync).
|
||||
//! Queue files with [`upload_from_path`](XetUploadCommit::upload_from_path) /
|
||||
//! [`upload_from_path_blocking`](XetUploadCommit::upload_from_path_blocking) or
|
||||
//! [`upload_bytes`](XetUploadCommit::upload_bytes) /
|
||||
//! [`upload_bytes_blocking`](XetUploadCommit::upload_bytes_blocking), then call
|
||||
//! [`commit`](XetUploadCommit::commit) or
|
||||
//!
|
||||
//! There are three ways to queue data for upload:
|
||||
//!
|
||||
//! - **From a file path** — [`upload_from_path`](XetUploadCommit::upload_from_path) /
|
||||
//! [`upload_from_path_blocking`](XetUploadCommit::upload_from_path_blocking). The file is read in a background task.
|
||||
//! - **From raw bytes** — [`upload_bytes`](XetUploadCommit::upload_bytes) /
|
||||
//! [`upload_bytes_blocking`](XetUploadCommit::upload_bytes_blocking). Useful when data is already in memory.
|
||||
//! - **Incrementally via a stream** — [`upload_stream`](XetUploadCommit::upload_stream) /
|
||||
//! [`upload_stream_blocking`](XetUploadCommit::upload_stream_blocking). Returns an [`XetStreamUpload`] handle; call
|
||||
//! [`write`](XetStreamUpload::write) to feed chunks, then [`finish`](XetStreamUpload::finish) to finalise. **`finish`
|
||||
//! must be called before [`commit`](XetUploadCommit::commit).** Use this when data arrives incrementally (e.g. from a
|
||||
//! network socket or a generator) and you don't want to buffer it all in memory first.
|
||||
//!
|
||||
//! Then call [`commit`](XetUploadCommit::commit) or
|
||||
//! [`commit_blocking`](XetUploadCommit::commit_blocking) to wait for all
|
||||
//! transfers to finish and receive a [`XetCommitReport`].
|
||||
//!
|
||||
@@ -159,6 +168,86 @@
|
||||
//! # Ok(())
|
||||
//! # }
|
||||
//! ```
|
||||
//!
|
||||
//! # Streaming upload
|
||||
//!
|
||||
//! Use [`upload_stream`](XetUploadCommit::upload_stream) when data arrives
|
||||
//! incrementally and you don't want to buffer it all in memory or on disk
|
||||
//! first. Call [`write`](XetStreamUpload::write) for each chunk, then
|
||||
//! [`finish`](XetStreamUpload::finish) before [`commit`](XetUploadCommit::commit).
|
||||
//!
|
||||
//! ```rust,no_run
|
||||
//! use xet::xet_session::{Sha256Policy, XetSessionBuilder};
|
||||
//!
|
||||
//! # async fn example() -> Result<(), Box<dyn std::error::Error>> {
|
||||
//! let session = XetSessionBuilder::new().build()?;
|
||||
//! let commit = session
|
||||
//! .new_upload_commit()?
|
||||
//! .with_endpoint("https://cas.example.com")
|
||||
//! .with_token_info("write-token", 1_700_000_000)
|
||||
//! .build()
|
||||
//! .await?;
|
||||
//!
|
||||
//! // Begin a streaming upload with an optional tracking name
|
||||
//! let stream = commit
|
||||
//! .upload_stream(Some("generated-data.bin".into()), Sha256Policy::Compute)
|
||||
//! .await?;
|
||||
//!
|
||||
//! // Feed data in chunks — could come from a network socket, a generator, etc.
|
||||
//! for chunk in vec![b"hello ".to_vec(), b"world".to_vec()] {
|
||||
//! stream.write(chunk).await?;
|
||||
//! }
|
||||
//!
|
||||
//! // Finalise the stream and get per-file metadata
|
||||
//! let meta = stream.finish().await?;
|
||||
//! println!("hash: {}, size: {:?}", meta.xet_info.hash, meta.xet_info.file_size);
|
||||
//!
|
||||
//! // Commit all uploads in this group
|
||||
//! let report = commit.commit().await?;
|
||||
//! # Ok(())
|
||||
//! # }
|
||||
//! ```
|
||||
//!
|
||||
//! # Streaming download
|
||||
//!
|
||||
//! Use [`XetDownloadStreamGroup`] when you want to consume file data as a
|
||||
//! byte stream rather than writing it to disk. This is useful for serving
|
||||
//! data over HTTP, piping it to another process, or processing it on the fly.
|
||||
//!
|
||||
//! [`download_stream`](XetDownloadStreamGroup::download_stream) returns
|
||||
//! chunks in file order.
|
||||
//! [`download_unordered_stream`](XetDownloadStreamGroup::download_unordered_stream)
|
||||
//! returns `(offset, Bytes)` chunks in completion order for higher throughput
|
||||
//! when the consumer can handle out-of-order data.
|
||||
//!
|
||||
//! ```rust,no_run
|
||||
//! use xet::xet_session::{XetFileInfo, XetSessionBuilder};
|
||||
//!
|
||||
//! # async fn example(file_info: XetFileInfo) -> Result<(), Box<dyn std::error::Error>> {
|
||||
//! let session = XetSessionBuilder::new().build()?;
|
||||
//! let group = session
|
||||
//! .new_download_stream_group()?
|
||||
//! .with_token_info("read-token", 1_700_000_000)
|
||||
//! .build()
|
||||
//! .await?;
|
||||
//!
|
||||
//! // Ordered stream — chunks arrive in file order
|
||||
//! let mut stream = group.download_stream(file_info.clone(), None).await?;
|
||||
//! let mut total = 0u64;
|
||||
//! while let Some(chunk) = stream.next().await? {
|
||||
//! total += chunk.len() as u64;
|
||||
//! // process chunk...
|
||||
//! }
|
||||
//! println!("received {total} bytes");
|
||||
//!
|
||||
//! // Byte-range request — only download bytes 1000..2000
|
||||
//! let mut range_stream = group.download_stream(file_info.clone(), Some(1000..2000)).await?;
|
||||
//! while let Some(chunk) = range_stream.next().await? {
|
||||
//! // process partial data...
|
||||
//! }
|
||||
//! # Ok(())
|
||||
//! # }
|
||||
//! ```
|
||||
|
||||
mod auth_group_builder;
|
||||
mod common;
|
||||
|
||||
@@ -53,8 +53,17 @@ pub struct XetSessionInner {
|
||||
///
|
||||
/// All fields are optional; call [`build`](XetSessionBuilder::build) when done.
|
||||
///
|
||||
/// [`build`](Self::build) auto-detects a suitable current tokio handle when present,
|
||||
/// or creates an owned runtime (see [`XetSessionBuilder::with_tokio_handle`]).
|
||||
/// ## Runtime detection
|
||||
///
|
||||
/// [`build`](Self::build) auto-detects a suitable tokio runtime:
|
||||
///
|
||||
/// - **Inside `#[tokio::main]` or an existing tokio multi-thread runtime** that meets the requirements — the
|
||||
/// session wraps the caller's handle; no second thread pool is created. Both async and blocking methods work.
|
||||
/// - **Outside any runtime** — an owned multi-thread runtime is created internally. Blocking methods (`_blocking`
|
||||
/// suffix) work from any thread; async methods work via an internal bridge.
|
||||
/// - **Explicit handle** — call [`with_tokio_handle`](Self::with_tokio_handle) to supply a handle directly. If it
|
||||
/// doesn't meet requirements (multi-thread, time + IO drivers), it is silently ignored and an owned runtime is
|
||||
/// created instead.
|
||||
///
|
||||
/// ## Authentication
|
||||
///
|
||||
@@ -96,6 +105,12 @@ pub struct XetSessionInner {
|
||||
/// .build_blocking()?;
|
||||
/// # Ok::<(), xet::xet_session::SessionError>(())
|
||||
/// ```
|
||||
///
|
||||
/// ## `XetConfig`
|
||||
///
|
||||
/// For most use cases, [`new`](Self::new) with the default [`XetConfig`] is
|
||||
/// sufficient. Use [`new_with_config`](Self::new_with_config) when you need to
|
||||
/// override runtime settings such as cache directories or concurrency limits.
|
||||
pub struct XetSessionBuilder {
|
||||
config: XetConfig,
|
||||
tokio_handle: Option<tokio::runtime::Handle>,
|
||||
|
||||
Reference in New Issue
Block a user