mirror of
https://github.com/huggingface/xet-core.git
synced 2026-06-04 13:30:29 +08:00
fix: truncate local file on full-file download to prevent corruption (#764)
## Summary
- Fixes a data corruption bug where downloading a file smaller than an
existing local file left stale trailing bytes intact
- The file was opened with `truncate(false)` unconditionally (needed for
concurrent partial-range writes), but full-file downloads now use
`truncate(true)`
- Adds regression test `test_full_file_truncates_larger_existing_file`
Ref: https://github.com/huggingface/huggingface_hub/issues/3995
<!-- CURSOR_SUMMARY -->
---
> [!NOTE]
> **Medium Risk**
> Changes on-disk write semantics for reconstructed downloads by
optionally truncating the destination file, which affects data integrity
and could impact concurrent/partial-write callers if misused.
>
> **Overview**
> Fixes a corruption case where full-file downloads could leave stale
trailing bytes when writing over an existing larger file by adding a
`truncate_file` flag to `FileReconstructor::reconstruct_to_file` and
wiring it to `OpenOptions::truncate()`.
>
> Updates full-file download flow
(`FileDownloadSession::download_file_with_id`) to pass
`truncate_file=true`, while keeping benchmarks/tests and
range/concurrent write paths passing `false` to preserve existing
behavior for partial/concurrent writes.
>
> <sup>Written by [Cursor
Bugbot](https://cursor.com/dashboard?tab=bugbot) for commit
ed33dab9a1. This will update automatically
on new commits. Configure
[here](https://cursor.com/dashboard?tab=bugbot).</sup>
<!-- /CURSOR_SUMMARY -->
---------
Co-authored-by: di <di@huggingface.co>
This commit is contained in:
@@ -48,7 +48,7 @@ fn bench_sequential_non_vectored(c: &mut Criterion) {
|
||||
let path = dir.path().join("out.bin");
|
||||
FileReconstructor::new(&client, hash)
|
||||
.with_config(cfg)
|
||||
.reconstruct_to_file(&path, None)
|
||||
.reconstruct_to_file(&path, None, false)
|
||||
.await
|
||||
.unwrap();
|
||||
}
|
||||
@@ -77,7 +77,7 @@ fn bench_sequential_vectored(c: &mut Criterion) {
|
||||
let path = dir.path().join("out.bin");
|
||||
FileReconstructor::new(&client, hash)
|
||||
.with_config(cfg)
|
||||
.reconstruct_to_file(&path, None)
|
||||
.reconstruct_to_file(&path, None, false)
|
||||
.await
|
||||
.unwrap();
|
||||
}
|
||||
|
||||
@@ -95,14 +95,15 @@ impl FileReconstructor {
|
||||
|
||||
/// Reconstructs the file and writes it to the given path.
|
||||
///
|
||||
/// The file is opened with read/write access without truncation, allowing
|
||||
/// multiple concurrent reconstructions to write to different regions of the
|
||||
/// same file.
|
||||
/// The file is opened with read/write access. When `truncate_file` is `true`
|
||||
/// the file is truncated to the reconstructed length; when `false` the file
|
||||
/// is left at its existing size, allowing multiple concurrent reconstructions
|
||||
/// to write to different regions of the same file.
|
||||
///
|
||||
/// When `write_offset` is `Some(offset)`, writing begins at that byte
|
||||
/// position regardless of the byte range. When `None`, writing begins at
|
||||
/// the byte range start (or 0 for a full-file reconstruction).
|
||||
pub async fn reconstruct_to_file(self, path: &Path, write_offset: Option<u64>) -> Result<u64> {
|
||||
pub async fn reconstruct_to_file(self, path: &Path, write_offset: Option<u64>, truncate_file: bool) -> Result<u64> {
|
||||
info!(
|
||||
file_hash = %self.file_hash,
|
||||
byte_range = ?self.byte_range,
|
||||
@@ -115,7 +116,7 @@ impl FileReconstructor {
|
||||
std::fs::create_dir_all(parent)?;
|
||||
}
|
||||
|
||||
let mut file = OpenOptions::new().write(true).create(true).truncate(false).open(path)?;
|
||||
let mut file = OpenOptions::new().write(true).create(true).truncate(truncate_file).open(path)?;
|
||||
|
||||
let default_write_position = self.byte_range.map_or(0, |r| r.start);
|
||||
let seek_position = write_offset.unwrap_or(default_write_position);
|
||||
@@ -472,7 +473,7 @@ mod tests {
|
||||
reconstructor = reconstructor.with_byte_range(range);
|
||||
}
|
||||
|
||||
reconstructor.reconstruct_to_file(&file_path, None).await?;
|
||||
reconstructor.reconstruct_to_file(&file_path, None, false).await?;
|
||||
|
||||
// Read back the data from the file at the expected location.
|
||||
let file_data = std::fs::read(&file_path)?;
|
||||
@@ -499,7 +500,7 @@ mod tests {
|
||||
reconstructor = reconstructor.with_byte_range(range);
|
||||
}
|
||||
|
||||
reconstructor.reconstruct_to_file(&file_path, Some(offset)).await?;
|
||||
reconstructor.reconstruct_to_file(&file_path, Some(offset), false).await?;
|
||||
|
||||
// Read back all file data.
|
||||
let file_data = std::fs::read(&file_path)?;
|
||||
@@ -524,7 +525,7 @@ mod tests {
|
||||
reconstructor = reconstructor.with_byte_range(range);
|
||||
}
|
||||
|
||||
reconstructor.reconstruct_to_file(&file_path, Some(0)).await?;
|
||||
reconstructor.reconstruct_to_file(&file_path, Some(0), false).await?;
|
||||
|
||||
// Read back all file data (it starts at offset 0).
|
||||
let file_data = std::fs::read(&file_path)?;
|
||||
@@ -1210,7 +1211,7 @@ mod tests {
|
||||
FileReconstructor::new(&(client.clone() as Arc<dyn Client>), file_hash)
|
||||
.with_byte_range(range)
|
||||
.with_config(config)
|
||||
.reconstruct_to_file(file_path, None)
|
||||
.reconstruct_to_file(file_path, None, false)
|
||||
.await
|
||||
}
|
||||
|
||||
@@ -1266,7 +1267,7 @@ mod tests {
|
||||
FileReconstructor::new(&(client as Arc<dyn Client>), file_hash)
|
||||
.with_byte_range(range)
|
||||
.with_config(config)
|
||||
.reconstruct_to_file(&file_path, None)
|
||||
.reconstruct_to_file(&file_path, None, false)
|
||||
.await
|
||||
});
|
||||
}
|
||||
|
||||
@@ -128,7 +128,7 @@ impl FileDownloadSession {
|
||||
let name = Arc::from(write_path.to_string_lossy().as_ref());
|
||||
let progress_updater = self.progress.new_item(id, name);
|
||||
let reconstructor = self.setup_reconstructor(file_info, None, Some(progress_updater))?;
|
||||
let n_bytes = reconstructor.reconstruct_to_file(write_path, None).await?;
|
||||
let n_bytes = reconstructor.reconstruct_to_file(write_path, None, true).await?;
|
||||
// Caller is responsible for cleaning up the file on error (consistent
|
||||
// with other error paths); see download_group.rs error handling.
|
||||
if let Some(expected_size) = file_info.file_size()
|
||||
|
||||
Reference in New Issue
Block a user