fix: truncate local file on full-file download to prevent corruption (#764)

## Summary

- Fixes a data corruption bug where downloading a file smaller than an
existing local file left stale trailing bytes intact
- The file was opened with `truncate(false)` unconditionally (needed for
concurrent partial-range writes), but full-file downloads now use
`truncate(true)`
- Adds regression test `test_full_file_truncates_larger_existing_file`

Ref: https://github.com/huggingface/huggingface_hub/issues/3995

<!-- CURSOR_SUMMARY -->
---

> [!NOTE]
> **Medium Risk**
> Changes on-disk write semantics for reconstructed downloads by
optionally truncating the destination file, which affects data integrity
and could impact concurrent/partial-write callers if misused.
> 
> **Overview**
> Fixes a corruption case where full-file downloads could leave stale
trailing bytes when writing over an existing larger file by adding a
`truncate_file` flag to `FileReconstructor::reconstruct_to_file` and
wiring it to `OpenOptions::truncate()`.
> 
> Updates full-file download flow
(`FileDownloadSession::download_file_with_id`) to pass
`truncate_file=true`, while keeping benchmarks/tests and
range/concurrent write paths passing `false` to preserve existing
behavior for partial/concurrent writes.
> 
> <sup>Written by [Cursor
Bugbot](https://cursor.com/dashboard?tab=bugbot) for commit
ed33dab9a1. This will update automatically
on new commits. Configure
[here](https://cursor.com/dashboard?tab=bugbot).</sup>
<!-- /CURSOR_SUMMARY -->

---------

Co-authored-by: di <di@huggingface.co>
This commit is contained in:
Adrien
2026-03-30 17:57:23 +02:00
committed by GitHub
parent 15011cb230
commit f781498b68
3 changed files with 14 additions and 13 deletions

View File

@@ -48,7 +48,7 @@ fn bench_sequential_non_vectored(c: &mut Criterion) {
let path = dir.path().join("out.bin");
FileReconstructor::new(&client, hash)
.with_config(cfg)
.reconstruct_to_file(&path, None)
.reconstruct_to_file(&path, None, false)
.await
.unwrap();
}
@@ -77,7 +77,7 @@ fn bench_sequential_vectored(c: &mut Criterion) {
let path = dir.path().join("out.bin");
FileReconstructor::new(&client, hash)
.with_config(cfg)
.reconstruct_to_file(&path, None)
.reconstruct_to_file(&path, None, false)
.await
.unwrap();
}

View File

@@ -95,14 +95,15 @@ impl FileReconstructor {
/// Reconstructs the file and writes it to the given path.
///
/// The file is opened with read/write access without truncation, allowing
/// multiple concurrent reconstructions to write to different regions of the
/// same file.
/// The file is opened with read/write access. When `truncate_file` is `true`
/// the file is truncated to the reconstructed length; when `false` the file
/// is left at its existing size, allowing multiple concurrent reconstructions
/// to write to different regions of the same file.
///
/// When `write_offset` is `Some(offset)`, writing begins at that byte
/// position regardless of the byte range. When `None`, writing begins at
/// the byte range start (or 0 for a full-file reconstruction).
pub async fn reconstruct_to_file(self, path: &Path, write_offset: Option<u64>) -> Result<u64> {
pub async fn reconstruct_to_file(self, path: &Path, write_offset: Option<u64>, truncate_file: bool) -> Result<u64> {
info!(
file_hash = %self.file_hash,
byte_range = ?self.byte_range,
@@ -115,7 +116,7 @@ impl FileReconstructor {
std::fs::create_dir_all(parent)?;
}
let mut file = OpenOptions::new().write(true).create(true).truncate(false).open(path)?;
let mut file = OpenOptions::new().write(true).create(true).truncate(truncate_file).open(path)?;
let default_write_position = self.byte_range.map_or(0, |r| r.start);
let seek_position = write_offset.unwrap_or(default_write_position);
@@ -472,7 +473,7 @@ mod tests {
reconstructor = reconstructor.with_byte_range(range);
}
reconstructor.reconstruct_to_file(&file_path, None).await?;
reconstructor.reconstruct_to_file(&file_path, None, false).await?;
// Read back the data from the file at the expected location.
let file_data = std::fs::read(&file_path)?;
@@ -499,7 +500,7 @@ mod tests {
reconstructor = reconstructor.with_byte_range(range);
}
reconstructor.reconstruct_to_file(&file_path, Some(offset)).await?;
reconstructor.reconstruct_to_file(&file_path, Some(offset), false).await?;
// Read back all file data.
let file_data = std::fs::read(&file_path)?;
@@ -524,7 +525,7 @@ mod tests {
reconstructor = reconstructor.with_byte_range(range);
}
reconstructor.reconstruct_to_file(&file_path, Some(0)).await?;
reconstructor.reconstruct_to_file(&file_path, Some(0), false).await?;
// Read back all file data (it starts at offset 0).
let file_data = std::fs::read(&file_path)?;
@@ -1210,7 +1211,7 @@ mod tests {
FileReconstructor::new(&(client.clone() as Arc<dyn Client>), file_hash)
.with_byte_range(range)
.with_config(config)
.reconstruct_to_file(file_path, None)
.reconstruct_to_file(file_path, None, false)
.await
}
@@ -1266,7 +1267,7 @@ mod tests {
FileReconstructor::new(&(client as Arc<dyn Client>), file_hash)
.with_byte_range(range)
.with_config(config)
.reconstruct_to_file(&file_path, None)
.reconstruct_to_file(&file_path, None, false)
.await
});
}

View File

@@ -128,7 +128,7 @@ impl FileDownloadSession {
let name = Arc::from(write_path.to_string_lossy().as_ref());
let progress_updater = self.progress.new_item(id, name);
let reconstructor = self.setup_reconstructor(file_info, None, Some(progress_updater))?;
let n_bytes = reconstructor.reconstruct_to_file(write_path, None).await?;
let n_bytes = reconstructor.reconstruct_to_file(write_path, None, true).await?;
// Caller is responsible for cleaning up the file on error (consistent
// with other error paths); see download_group.rs error handling.
if let Some(expected_size) = file_info.file_size()