From f781498b68573988387b01aa2c0f60b7536d7d49 Mon Sep 17 00:00:00 2001 From: Adrien Date: Mon, 30 Mar 2026 17:57:23 +0200 Subject: [PATCH] fix: truncate local file on full-file download to prevent corruption (#764) ## Summary - Fixes a data corruption bug where downloading a file smaller than an existing local file left stale trailing bytes intact - The file was opened with `truncate(false)` unconditionally (needed for concurrent partial-range writes), but full-file downloads now use `truncate(true)` - Adds regression test `test_full_file_truncates_larger_existing_file` Ref: https://github.com/huggingface/huggingface_hub/issues/3995 --- > [!NOTE] > **Medium Risk** > Changes on-disk write semantics for reconstructed downloads by optionally truncating the destination file, which affects data integrity and could impact concurrent/partial-write callers if misused. > > **Overview** > Fixes a corruption case where full-file downloads could leave stale trailing bytes when writing over an existing larger file by adding a `truncate_file` flag to `FileReconstructor::reconstruct_to_file` and wiring it to `OpenOptions::truncate()`. > > Updates full-file download flow (`FileDownloadSession::download_file_with_id`) to pass `truncate_file=true`, while keeping benchmarks/tests and range/concurrent write paths passing `false` to preserve existing behavior for partial/concurrent writes. > > Written by [Cursor Bugbot](https://cursor.com/dashboard?tab=bugbot) for commit ed33dab9a1dfdf45a2d422672f9ee5cc5ec2a5bf. This will update automatically on new commits. Configure [here](https://cursor.com/dashboard?tab=bugbot). --------- Co-authored-by: di --- xet_data/benches/reconstruction_bench.rs | 4 ++-- .../file_reconstruction/file_reconstructor.rs | 21 ++++++++++--------- .../src/processing/file_download_session.rs | 2 +- 3 files changed, 14 insertions(+), 13 deletions(-) diff --git a/xet_data/benches/reconstruction_bench.rs b/xet_data/benches/reconstruction_bench.rs index 5dc13e22..53a181c9 100644 --- a/xet_data/benches/reconstruction_bench.rs +++ b/xet_data/benches/reconstruction_bench.rs @@ -48,7 +48,7 @@ fn bench_sequential_non_vectored(c: &mut Criterion) { let path = dir.path().join("out.bin"); FileReconstructor::new(&client, hash) .with_config(cfg) - .reconstruct_to_file(&path, None) + .reconstruct_to_file(&path, None, false) .await .unwrap(); } @@ -77,7 +77,7 @@ fn bench_sequential_vectored(c: &mut Criterion) { let path = dir.path().join("out.bin"); FileReconstructor::new(&client, hash) .with_config(cfg) - .reconstruct_to_file(&path, None) + .reconstruct_to_file(&path, None, false) .await .unwrap(); } diff --git a/xet_data/src/file_reconstruction/file_reconstructor.rs b/xet_data/src/file_reconstruction/file_reconstructor.rs index 17161ab7..f0417229 100644 --- a/xet_data/src/file_reconstruction/file_reconstructor.rs +++ b/xet_data/src/file_reconstruction/file_reconstructor.rs @@ -95,14 +95,15 @@ impl FileReconstructor { /// Reconstructs the file and writes it to the given path. /// - /// The file is opened with read/write access without truncation, allowing - /// multiple concurrent reconstructions to write to different regions of the - /// same file. + /// The file is opened with read/write access. When `truncate_file` is `true` + /// the file is truncated to the reconstructed length; when `false` the file + /// is left at its existing size, allowing multiple concurrent reconstructions + /// to write to different regions of the same file. /// /// When `write_offset` is `Some(offset)`, writing begins at that byte /// position regardless of the byte range. When `None`, writing begins at /// the byte range start (or 0 for a full-file reconstruction). - pub async fn reconstruct_to_file(self, path: &Path, write_offset: Option) -> Result { + pub async fn reconstruct_to_file(self, path: &Path, write_offset: Option, truncate_file: bool) -> Result { info!( file_hash = %self.file_hash, byte_range = ?self.byte_range, @@ -115,7 +116,7 @@ impl FileReconstructor { std::fs::create_dir_all(parent)?; } - let mut file = OpenOptions::new().write(true).create(true).truncate(false).open(path)?; + let mut file = OpenOptions::new().write(true).create(true).truncate(truncate_file).open(path)?; let default_write_position = self.byte_range.map_or(0, |r| r.start); let seek_position = write_offset.unwrap_or(default_write_position); @@ -472,7 +473,7 @@ mod tests { reconstructor = reconstructor.with_byte_range(range); } - reconstructor.reconstruct_to_file(&file_path, None).await?; + reconstructor.reconstruct_to_file(&file_path, None, false).await?; // Read back the data from the file at the expected location. let file_data = std::fs::read(&file_path)?; @@ -499,7 +500,7 @@ mod tests { reconstructor = reconstructor.with_byte_range(range); } - reconstructor.reconstruct_to_file(&file_path, Some(offset)).await?; + reconstructor.reconstruct_to_file(&file_path, Some(offset), false).await?; // Read back all file data. let file_data = std::fs::read(&file_path)?; @@ -524,7 +525,7 @@ mod tests { reconstructor = reconstructor.with_byte_range(range); } - reconstructor.reconstruct_to_file(&file_path, Some(0)).await?; + reconstructor.reconstruct_to_file(&file_path, Some(0), false).await?; // Read back all file data (it starts at offset 0). let file_data = std::fs::read(&file_path)?; @@ -1210,7 +1211,7 @@ mod tests { FileReconstructor::new(&(client.clone() as Arc), file_hash) .with_byte_range(range) .with_config(config) - .reconstruct_to_file(file_path, None) + .reconstruct_to_file(file_path, None, false) .await } @@ -1266,7 +1267,7 @@ mod tests { FileReconstructor::new(&(client as Arc), file_hash) .with_byte_range(range) .with_config(config) - .reconstruct_to_file(&file_path, None) + .reconstruct_to_file(&file_path, None, false) .await }); } diff --git a/xet_data/src/processing/file_download_session.rs b/xet_data/src/processing/file_download_session.rs index 51fe3604..3360ba0d 100644 --- a/xet_data/src/processing/file_download_session.rs +++ b/xet_data/src/processing/file_download_session.rs @@ -128,7 +128,7 @@ impl FileDownloadSession { let name = Arc::from(write_path.to_string_lossy().as_ref()); let progress_updater = self.progress.new_item(id, name); let reconstructor = self.setup_reconstructor(file_info, None, Some(progress_updater))?; - let n_bytes = reconstructor.reconstruct_to_file(write_path, None).await?; + let n_bytes = reconstructor.reconstruct_to_file(write_path, None, true).await?; // Caller is responsible for cleaning up the file on error (consistent // with other error paths); see download_group.rs error handling. if let Some(expected_size) = file_info.file_size()