use u64 rather than usize in file hashing paths (#485)

Using the file hashing components in WASM uncovered a bug: a 32-bit usize causes errors when hashing the file. This PR enforces the use of u64 everywhere along that path (and also pins the wasm-bindgen version).
2026-06-04 13:30:29 +08:00 · 2025-09-08 14:27:58 -07:00
parent 4d948d1a76
commit e01896e074
5 changed files with 32 additions and 34 deletions
--- a/cas_object/src/cas_object_format.rs
+++ b/cas_object/src/cas_object_format.rs
@@ -1037,7 +1037,7 @@ impl CasObject {
            };

            let chunk_hash = merklehash::compute_data_hash(&data);
-            hash_chunks.push((chunk_hash, chunk_uncompressed_length as usize));
+            hash_chunks.push((chunk_hash, chunk_uncompressed_length as u64));

            cumulative_compressed_length += compressed_chunk_length as u32;
            unpacked_chunk_offset += chunk_uncompressed_length;
@@ -1598,7 +1598,7 @@ pub mod test_utils {
            let bytes = gen_random_bytes(chunk_size);

            let chunk_hash = merklehash::compute_data_hash(&bytes);
-            chunks.push((chunk_hash, bytes.len()));
+            chunks.push((chunk_hash, bytes.len() as u64));

            data_contents_raw.extend_from_slice(&bytes);

--- a/deduplication/src/file_deduplication.rs
+++ b/deduplication/src/file_deduplication.rs
@@ -33,7 +33,7 @@ pub struct FileDeduper<DataInterfaceType: DeduplicationDataInterface> {
    new_data_hash_lookup: HashMap<MerkleHash, usize>,

    /// The current chunk hashes for this file.
-    chunk_hashes: Vec<(MerkleHash, usize)>,
+    chunk_hashes: Vec<(MerkleHash, u64)>,

    /// The current file data entries.
    file_info: Vec<FileDataSequenceEntry>,
@@ -262,7 +262,7 @@ impl<DataInterfaceType: DeduplicationDataInterface> FileDeduper<DataInterfaceTyp
        }

        self.deduplication_metrics.merge_in(&dedup_metrics);
-        self.chunk_hashes.extend(chunks.iter().map(|c| (c.hash, c.data.len())));
+        self.chunk_hashes.extend(chunks.iter().map(|c| (c.hash, c.data.len() as u64)));

        // Register the xorb dependencies as needed.
        if !xorb_dependencies.is_empty() {
--- a/deduplication/src/raw_xorb_data.rs
+++ b/deduplication/src/raw_xorb_data.rs
@@ -37,7 +37,7 @@ impl RawXorbData {

        debug_assert_le!(num_bytes, *MAX_XORB_BYTES);

-        let hash_and_len: Vec<_> = chunks.iter().map(|c| (c.hash, c.data.len())).collect();
+        let hash_and_len: Vec<_> = chunks.iter().map(|c| (c.hash, c.data.len() as u64)).collect();
        let cas_hash = xorb_hash(&hash_and_len);

        // Build the MDBCASInfo struct.
--- a/hf_xet_thin_wasm/src/lib.rs
+++ b/hf_xet_thin_wasm/src/lib.rs
@@ -2,15 +2,16 @@ use merklehash::{DataHashHexParseError, MerkleHash, xorb_hash};
 use serde::{Deserialize, Serialize};
 use wasm_bindgen::prelude::*;

+// un-comment following 2 sections for console_log!() printing support
 // macro_rules! console_log {
 //     ($($t:tt)*) => (log(&format_args!($($t)*).to_string()))
 // }

-#[wasm_bindgen]
-extern "C" {
-    #[wasm_bindgen(js_namespace = console)]
-    fn log(s: &str);
-}
+// #[wasm_bindgen]
+// extern "C" {
+//     #[wasm_bindgen(js_namespace = console)]
+//     fn log(s: &str);
+// }

 #[derive(Clone, Debug, Serialize, Deserialize)]
 pub struct JsChunkIn {
@@ -79,18 +80,22 @@ impl JsChunker {
    }
 }

+fn parse_chunks_in(chunks_array: JsValue) -> Result<Vec<(MerkleHash, u64)>, JsValue> {
+    let js_chunks: Vec<JsChunkIn> =
+        serde_wasm_bindgen::from_value::<Vec<JsChunkIn>>(chunks_array).map_err(|e| JsValue::from(e.to_string()))?;
+
+    js_chunks
+        .into_iter()
+        .map(|jsc| Ok((MerkleHash::from_hex(&jsc.hash)?, jsc.length as u64)))
+        .collect::<Result<_, DataHashHexParseError>>()
+        .map_err(|e| JsValue::from(e.to_string()))
+}
+
 /// takes an Array of Objects of the form { "hash": string, "length": number }
 /// and returns a string of a hash
 #[wasm_bindgen]
 pub fn compute_xorb_hash(chunks_array: JsValue) -> Result<String, JsValue> {
-    let js_chunks: Vec<JsChunkIn> =
-        serde_wasm_bindgen::from_value::<Vec<JsChunkIn>>(chunks_array).map_err(|e| JsValue::from(e.to_string()))?;
-
-    let chunks: Vec<(MerkleHash, usize)> = js_chunks
-        .into_iter()
-        .map(|jsc| Ok((MerkleHash::from_hex(&jsc.hash)?, jsc.length as usize)))
-        .collect::<Result<_, DataHashHexParseError>>()
-        .map_err(|e| JsValue::from(e.to_string()))?;
+    let chunks = parse_chunks_in(chunks_array)?;

    Ok(xorb_hash(&chunks).hex())
 }
@@ -99,16 +104,9 @@ pub fn compute_xorb_hash(chunks_array: JsValue) -> Result<String, JsValue> {
 /// and returns a string of a hash
 #[wasm_bindgen]
 pub fn compute_file_hash(chunks_array: JsValue) -> Result<String, JsValue> {
-    let js_chunks =
-        serde_wasm_bindgen::from_value::<Vec<JsChunkIn>>(chunks_array).map_err(|e| JsValue::from(e.to_string()))?;
+    let chunks = parse_chunks_in(chunks_array)?;

-    let chunk_list: Vec<(MerkleHash, usize)> = js_chunks
-        .into_iter()
-        .map(|jsc| Ok((MerkleHash::from_hex(&jsc.hash)?, jsc.length as usize)))
-        .collect::<Result<_, DataHashHexParseError>>()
-        .map_err(|e| JsValue::from(e.to_string()))?;
-
-    Ok(merklehash::file_hash(&chunk_list).hex())
+    Ok(merklehash::file_hash(&chunks).hex())
 }

 /// takes an Array of hashes as strings and returns the verification hash for that range of chunk hashes
--- a/merklehash/src/aggregated_hashes.rs
+++ b/merklehash/src/aggregated_hashes.rs
@@ -25,7 +25,7 @@ pub const AGGREGATED_HASHES_MEAN_TREE_BRANCHING_FACTOR: u64 = 4;
 ///    children: This ensures that the graph always has at most 1/2 the number of parents as children. and we don't have
 ///    too wide branches.
 #[inline]
-fn next_merge_cut(hashes: &[(MerkleHash, usize)]) -> usize {
+fn next_merge_cut(hashes: &[(MerkleHash, u64)]) -> usize {
    if hashes.len() <= 2 {
        return hashes.len();
    }
@@ -45,7 +45,7 @@ fn next_merge_cut(hashes: &[(MerkleHash, usize)]) -> usize {

 /// Merge the hashes together, including the size information and returning the new (hash, size) pair.
 #[inline]
-fn merged_hash_of_sequence(hash: &[(MerkleHash, usize)]) -> (MerkleHash, usize) {
+fn merged_hash_of_sequence(hash: &[(MerkleHash, u64)]) -> (MerkleHash, u64) {
    // Use a threadlocal buffer to avoid the overhead of reallocations.
    thread_local! {
        static BUFFER: RefCell<String> =
@@ -70,7 +70,7 @@ fn merged_hash_of_sequence(hash: &[(MerkleHash, usize)]) -> (MerkleHash, usize)
 /// Iteratively collapse the list of hashes using the criteria in next_merge_cut
 /// until only one hash remains; this is the aggregated hash.
 #[inline]
-fn aggregated_node_hash(chunks: &[(MerkleHash, usize)]) -> MerkleHash {
+fn aggregated_node_hash(chunks: &[(MerkleHash, u64)]) -> MerkleHash {
    if chunks.is_empty() {
        return MerkleHash::default();
    }
@@ -100,7 +100,7 @@ fn aggregated_node_hash(chunks: &[(MerkleHash, usize)]) -> MerkleHash {

 /// The xorb hash
 #[inline]
-pub fn xorb_hash(chunks: &[(MerkleHash, usize)]) -> MerkleHash {
+pub fn xorb_hash(chunks: &[(MerkleHash, u64)]) -> MerkleHash {
    if chunks.is_empty() {
        return MerkleHash::default();
    }
@@ -110,7 +110,7 @@ pub fn xorb_hash(chunks: &[(MerkleHash, usize)]) -> MerkleHash {

 /// The file hash when a salt is needed.
 #[inline]
-pub fn file_hash_with_salt(chunks: &[(MerkleHash, usize)], salt: &[u8; 32]) -> MerkleHash {
+pub fn file_hash_with_salt(chunks: &[(MerkleHash, u64)], salt: &[u8; 32]) -> MerkleHash {
    if chunks.is_empty() {
        return MerkleHash::default();
    }
@@ -120,7 +120,7 @@ pub fn file_hash_with_salt(chunks: &[(MerkleHash, usize)], salt: &[u8; 32]) -> M

 /// The file hash calculation from a series of chunks; to be used when there isn't a salt.
 #[inline]
-pub fn file_hash(chunks: &[(MerkleHash, usize)]) -> MerkleHash {
+pub fn file_hash(chunks: &[(MerkleHash, u64)]) -> MerkleHash {
    file_hash_with_salt(chunks, &[0; 32])
 }

@@ -172,7 +172,7 @@ mod tests {
            }
            println!("],");

-            let hash_list: Vec<_> = v.iter().map(|&hi| (rh(hi), (hi * 100) as usize)).collect();
+            let hash_list: Vec<_> = v.iter().map(|&hi| (rh(hi), (hi * 100))).collect();
            print!("\"{:?}\",", xorb_hash(&hash_list));

            // Now do a few salts along with the 0 salt to ensure we get good coverage there.