use u64 rather than usize in file hashing paths (#485)

Using the file hashing components in WASM revealed a bug: using a 32-bit
usize causes errors when hashing the file.

This PR enforces the use of u64 everywhere along that path (and also
pins the wasm-bindgen version)
This commit is contained in:
Assaf Vayner
2025-09-08 14:27:58 -07:00
committed by GitHub
parent 4d948d1a76
commit e01896e074
5 changed files with 32 additions and 34 deletions

View File

@@ -1037,7 +1037,7 @@ impl CasObject {
};
let chunk_hash = merklehash::compute_data_hash(&data);
hash_chunks.push((chunk_hash, chunk_uncompressed_length as usize));
hash_chunks.push((chunk_hash, chunk_uncompressed_length as u64));
cumulative_compressed_length += compressed_chunk_length as u32;
unpacked_chunk_offset += chunk_uncompressed_length;
@@ -1598,7 +1598,7 @@ pub mod test_utils {
let bytes = gen_random_bytes(chunk_size);
let chunk_hash = merklehash::compute_data_hash(&bytes);
chunks.push((chunk_hash, bytes.len()));
chunks.push((chunk_hash, bytes.len() as u64));
data_contents_raw.extend_from_slice(&bytes);

View File

@@ -33,7 +33,7 @@ pub struct FileDeduper<DataInterfaceType: DeduplicationDataInterface> {
new_data_hash_lookup: HashMap<MerkleHash, usize>,
/// The current chunk hashes for this file.
chunk_hashes: Vec<(MerkleHash, usize)>,
chunk_hashes: Vec<(MerkleHash, u64)>,
/// The current file data entries.
file_info: Vec<FileDataSequenceEntry>,
@@ -262,7 +262,7 @@ impl<DataInterfaceType: DeduplicationDataInterface> FileDeduper<DataInterfaceTyp
}
self.deduplication_metrics.merge_in(&dedup_metrics);
self.chunk_hashes.extend(chunks.iter().map(|c| (c.hash, c.data.len())));
self.chunk_hashes.extend(chunks.iter().map(|c| (c.hash, c.data.len() as u64)));
// Register the xorb dependencies as needed.
if !xorb_dependencies.is_empty() {

View File

@@ -37,7 +37,7 @@ impl RawXorbData {
debug_assert_le!(num_bytes, *MAX_XORB_BYTES);
let hash_and_len: Vec<_> = chunks.iter().map(|c| (c.hash, c.data.len())).collect();
let hash_and_len: Vec<_> = chunks.iter().map(|c| (c.hash, c.data.len() as u64)).collect();
let cas_hash = xorb_hash(&hash_and_len);
// Build the MDBCASInfo struct.

View File

@@ -2,15 +2,16 @@ use merklehash::{DataHashHexParseError, MerkleHash, xorb_hash};
use serde::{Deserialize, Serialize};
use wasm_bindgen::prelude::*;
// un-comment following 2 sections for console_log!() printing support
// macro_rules! console_log {
// ($($t:tt)*) => (log(&format_args!($($t)*).to_string()))
// }
#[wasm_bindgen]
extern "C" {
#[wasm_bindgen(js_namespace = console)]
fn log(s: &str);
}
// #[wasm_bindgen]
// extern "C" {
// #[wasm_bindgen(js_namespace = console)]
// fn log(s: &str);
// }
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct JsChunkIn {
@@ -79,18 +80,22 @@ impl JsChunker {
}
}
/// Converts a JS array of chunk descriptors into `(hash, length)` pairs.
///
/// `chunks_array` is deserialized as a `Vec<JsChunkIn>`; each entry's `hash`
/// is parsed with `MerkleHash::from_hex` and its `length` is cast to `u64`.
///
/// # Errors
///
/// Returns the underlying error's message as a `JsValue` string when the
/// deserialization fails or when any `hash` is rejected by
/// `MerkleHash::from_hex`.
fn parse_chunks_in(chunks_array: JsValue) -> Result<Vec<(MerkleHash, u64)>, JsValue> {
    let entries = serde_wasm_bindgen::from_value::<Vec<JsChunkIn>>(chunks_array)
        .map_err(|err| JsValue::from(err.to_string()))?;

    // Collecting into a `Result` stops at the first hash that fails to parse.
    let parsed: Result<Vec<(MerkleHash, u64)>, DataHashHexParseError> = entries
        .into_iter()
        .map(|entry| {
            let hash = MerkleHash::from_hex(&entry.hash)?;
            Ok((hash, entry.length as u64))
        })
        .collect();

    parsed.map_err(|err| JsValue::from(err.to_string()))
}
/// takes an Array of Objects of the form { "hash": string, "length": number }
/// and returns a string of a hash
#[wasm_bindgen]
pub fn compute_xorb_hash(chunks_array: JsValue) -> Result<String, JsValue> {
let js_chunks: Vec<JsChunkIn> =
serde_wasm_bindgen::from_value::<Vec<JsChunkIn>>(chunks_array).map_err(|e| JsValue::from(e.to_string()))?;
let chunks: Vec<(MerkleHash, usize)> = js_chunks
.into_iter()
.map(|jsc| Ok((MerkleHash::from_hex(&jsc.hash)?, jsc.length as usize)))
.collect::<Result<_, DataHashHexParseError>>()
.map_err(|e| JsValue::from(e.to_string()))?;
let chunks = parse_chunks_in(chunks_array)?;
Ok(xorb_hash(&chunks).hex())
}
@@ -99,16 +104,9 @@ pub fn compute_xorb_hash(chunks_array: JsValue) -> Result<String, JsValue> {
/// and returns a string of a hash
#[wasm_bindgen]
pub fn compute_file_hash(chunks_array: JsValue) -> Result<String, JsValue> {
let js_chunks =
serde_wasm_bindgen::from_value::<Vec<JsChunkIn>>(chunks_array).map_err(|e| JsValue::from(e.to_string()))?;
let chunks = parse_chunks_in(chunks_array)?;
let chunk_list: Vec<(MerkleHash, usize)> = js_chunks
.into_iter()
.map(|jsc| Ok((MerkleHash::from_hex(&jsc.hash)?, jsc.length as usize)))
.collect::<Result<_, DataHashHexParseError>>()
.map_err(|e| JsValue::from(e.to_string()))?;
Ok(merklehash::file_hash(&chunk_list).hex())
Ok(merklehash::file_hash(&chunks).hex())
}
/// takes an Array of hashes as strings and returns the verification hash for that range of chunk hashes

View File

@@ -25,7 +25,7 @@ pub const AGGREGATED_HASHES_MEAN_TREE_BRANCHING_FACTOR: u64 = 4;
/// children: This ensures that the graph always has at most 1/2 the number of parents as children, and that we don't
/// have branches that are too wide.
#[inline]
fn next_merge_cut(hashes: &[(MerkleHash, usize)]) -> usize {
fn next_merge_cut(hashes: &[(MerkleHash, u64)]) -> usize {
if hashes.len() <= 2 {
return hashes.len();
}
@@ -45,7 +45,7 @@ fn next_merge_cut(hashes: &[(MerkleHash, usize)]) -> usize {
/// Merge the hashes together, including the size information and returning the new (hash, size) pair.
#[inline]
fn merged_hash_of_sequence(hash: &[(MerkleHash, usize)]) -> (MerkleHash, usize) {
fn merged_hash_of_sequence(hash: &[(MerkleHash, u64)]) -> (MerkleHash, u64) {
// Use a threadlocal buffer to avoid the overhead of reallocations.
thread_local! {
static BUFFER: RefCell<String> =
@@ -70,7 +70,7 @@ fn merged_hash_of_sequence(hash: &[(MerkleHash, usize)]) -> (MerkleHash, usize)
/// Iteratively collapse the list of hashes using the criteria in next_merge_cut
/// until only one hash remains; this is the aggregated hash.
#[inline]
fn aggregated_node_hash(chunks: &[(MerkleHash, usize)]) -> MerkleHash {
fn aggregated_node_hash(chunks: &[(MerkleHash, u64)]) -> MerkleHash {
if chunks.is_empty() {
return MerkleHash::default();
}
@@ -100,7 +100,7 @@ fn aggregated_node_hash(chunks: &[(MerkleHash, usize)]) -> MerkleHash {
/// The xorb hash
#[inline]
pub fn xorb_hash(chunks: &[(MerkleHash, usize)]) -> MerkleHash {
pub fn xorb_hash(chunks: &[(MerkleHash, u64)]) -> MerkleHash {
if chunks.is_empty() {
return MerkleHash::default();
}
@@ -110,7 +110,7 @@ pub fn xorb_hash(chunks: &[(MerkleHash, usize)]) -> MerkleHash {
/// The file hash when a salt is needed.
#[inline]
pub fn file_hash_with_salt(chunks: &[(MerkleHash, usize)], salt: &[u8; 32]) -> MerkleHash {
pub fn file_hash_with_salt(chunks: &[(MerkleHash, u64)], salt: &[u8; 32]) -> MerkleHash {
if chunks.is_empty() {
return MerkleHash::default();
}
@@ -120,7 +120,7 @@ pub fn file_hash_with_salt(chunks: &[(MerkleHash, usize)], salt: &[u8; 32]) -> M
/// The file hash calculation from a series of chunks; to be used when there isn't a salt.
#[inline]
pub fn file_hash(chunks: &[(MerkleHash, usize)]) -> MerkleHash {
pub fn file_hash(chunks: &[(MerkleHash, u64)]) -> MerkleHash {
file_hash_with_salt(chunks, &[0; 32])
}
@@ -172,7 +172,7 @@ mod tests {
}
println!("],");
let hash_list: Vec<_> = v.iter().map(|&hi| (rh(hi), (hi * 100) as usize)).collect();
let hash_list: Vec<_> = v.iter().map(|&hi| (rh(hi), (hi * 100))).collect();
print!("\"{:?}\",", xorb_hash(&hash_list));
// Now do a few salts along with the 0 salt to ensure we get good coverage there.