mirror of
https://github.com/huggingface/xet-core.git
synced 2026-06-04 13:30:29 +08:00
use u64 rather than usize in file hashing paths (#485)
Using the file hashing components in WASM revealed a bug: on that target `usize` is only 32 bits wide, which causes errors when hashing the file. This PR enforces the use of u64 everywhere along that path (and also pins the wasm-bindgen version).
This commit is contained in:
@@ -1037,7 +1037,7 @@ impl CasObject {
|
||||
};
|
||||
|
||||
let chunk_hash = merklehash::compute_data_hash(&data);
|
||||
hash_chunks.push((chunk_hash, chunk_uncompressed_length as usize));
|
||||
hash_chunks.push((chunk_hash, chunk_uncompressed_length as u64));
|
||||
|
||||
cumulative_compressed_length += compressed_chunk_length as u32;
|
||||
unpacked_chunk_offset += chunk_uncompressed_length;
|
||||
@@ -1598,7 +1598,7 @@ pub mod test_utils {
|
||||
let bytes = gen_random_bytes(chunk_size);
|
||||
|
||||
let chunk_hash = merklehash::compute_data_hash(&bytes);
|
||||
chunks.push((chunk_hash, bytes.len()));
|
||||
chunks.push((chunk_hash, bytes.len() as u64));
|
||||
|
||||
data_contents_raw.extend_from_slice(&bytes);
|
||||
|
||||
|
||||
@@ -33,7 +33,7 @@ pub struct FileDeduper<DataInterfaceType: DeduplicationDataInterface> {
|
||||
new_data_hash_lookup: HashMap<MerkleHash, usize>,
|
||||
|
||||
/// The current chunk hashes for this file.
|
||||
chunk_hashes: Vec<(MerkleHash, usize)>,
|
||||
chunk_hashes: Vec<(MerkleHash, u64)>,
|
||||
|
||||
/// The current file data entries.
|
||||
file_info: Vec<FileDataSequenceEntry>,
|
||||
@@ -262,7 +262,7 @@ impl<DataInterfaceType: DeduplicationDataInterface> FileDeduper<DataInterfaceTyp
|
||||
}
|
||||
|
||||
self.deduplication_metrics.merge_in(&dedup_metrics);
|
||||
self.chunk_hashes.extend(chunks.iter().map(|c| (c.hash, c.data.len())));
|
||||
self.chunk_hashes.extend(chunks.iter().map(|c| (c.hash, c.data.len() as u64)));
|
||||
|
||||
// Register the xorb dependencies as needed.
|
||||
if !xorb_dependencies.is_empty() {
|
||||
|
||||
@@ -37,7 +37,7 @@ impl RawXorbData {
|
||||
|
||||
debug_assert_le!(num_bytes, *MAX_XORB_BYTES);
|
||||
|
||||
let hash_and_len: Vec<_> = chunks.iter().map(|c| (c.hash, c.data.len())).collect();
|
||||
let hash_and_len: Vec<_> = chunks.iter().map(|c| (c.hash, c.data.len() as u64)).collect();
|
||||
let cas_hash = xorb_hash(&hash_and_len);
|
||||
|
||||
// Build the MDBCASInfo struct.
|
||||
|
||||
@@ -2,15 +2,16 @@ use merklehash::{DataHashHexParseError, MerkleHash, xorb_hash};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use wasm_bindgen::prelude::*;
|
||||
|
||||
// un-comment following 2 sections for console_log!() printing support
|
||||
// macro_rules! console_log {
|
||||
// ($($t:tt)*) => (log(&format_args!($($t)*).to_string()))
|
||||
// }
|
||||
|
||||
#[wasm_bindgen]
|
||||
extern "C" {
|
||||
#[wasm_bindgen(js_namespace = console)]
|
||||
fn log(s: &str);
|
||||
}
|
||||
// #[wasm_bindgen]
|
||||
// extern "C" {
|
||||
// #[wasm_bindgen(js_namespace = console)]
|
||||
// fn log(s: &str);
|
||||
// }
|
||||
|
||||
#[derive(Clone, Debug, Serialize, Deserialize)]
|
||||
pub struct JsChunkIn {
|
||||
@@ -79,18 +80,22 @@ impl JsChunker {
|
||||
}
|
||||
}
|
||||
|
||||
fn parse_chunks_in(chunks_array: JsValue) -> Result<Vec<(MerkleHash, u64)>, JsValue> {
|
||||
let js_chunks: Vec<JsChunkIn> =
|
||||
serde_wasm_bindgen::from_value::<Vec<JsChunkIn>>(chunks_array).map_err(|e| JsValue::from(e.to_string()))?;
|
||||
|
||||
js_chunks
|
||||
.into_iter()
|
||||
.map(|jsc| Ok((MerkleHash::from_hex(&jsc.hash)?, jsc.length as u64)))
|
||||
.collect::<Result<_, DataHashHexParseError>>()
|
||||
.map_err(|e| JsValue::from(e.to_string()))
|
||||
}
|
||||
|
||||
/// takes an Array of Objects of the form { "hash": string, "length": number }
|
||||
/// and returns a string of a hash
|
||||
#[wasm_bindgen]
|
||||
pub fn compute_xorb_hash(chunks_array: JsValue) -> Result<String, JsValue> {
|
||||
let js_chunks: Vec<JsChunkIn> =
|
||||
serde_wasm_bindgen::from_value::<Vec<JsChunkIn>>(chunks_array).map_err(|e| JsValue::from(e.to_string()))?;
|
||||
|
||||
let chunks: Vec<(MerkleHash, usize)> = js_chunks
|
||||
.into_iter()
|
||||
.map(|jsc| Ok((MerkleHash::from_hex(&jsc.hash)?, jsc.length as usize)))
|
||||
.collect::<Result<_, DataHashHexParseError>>()
|
||||
.map_err(|e| JsValue::from(e.to_string()))?;
|
||||
let chunks = parse_chunks_in(chunks_array)?;
|
||||
|
||||
Ok(xorb_hash(&chunks).hex())
|
||||
}
|
||||
@@ -99,16 +104,9 @@ pub fn compute_xorb_hash(chunks_array: JsValue) -> Result<String, JsValue> {
|
||||
/// and returns a string of a hash
|
||||
#[wasm_bindgen]
|
||||
pub fn compute_file_hash(chunks_array: JsValue) -> Result<String, JsValue> {
|
||||
let js_chunks =
|
||||
serde_wasm_bindgen::from_value::<Vec<JsChunkIn>>(chunks_array).map_err(|e| JsValue::from(e.to_string()))?;
|
||||
let chunks = parse_chunks_in(chunks_array)?;
|
||||
|
||||
let chunk_list: Vec<(MerkleHash, usize)> = js_chunks
|
||||
.into_iter()
|
||||
.map(|jsc| Ok((MerkleHash::from_hex(&jsc.hash)?, jsc.length as usize)))
|
||||
.collect::<Result<_, DataHashHexParseError>>()
|
||||
.map_err(|e| JsValue::from(e.to_string()))?;
|
||||
|
||||
Ok(merklehash::file_hash(&chunk_list).hex())
|
||||
Ok(merklehash::file_hash(&chunks).hex())
|
||||
}
|
||||
|
||||
/// takes an Array of hashes as strings and returns the verification hash for that range of chunk hashes
|
||||
|
||||
@@ -25,7 +25,7 @@ pub const AGGREGATED_HASHES_MEAN_TREE_BRANCHING_FACTOR: u64 = 4;
|
||||
/// children: This ensures that the graph always has at most 1/2 the number of parents as children. and we don't have
|
||||
/// too wide branches.
|
||||
#[inline]
|
||||
fn next_merge_cut(hashes: &[(MerkleHash, usize)]) -> usize {
|
||||
fn next_merge_cut(hashes: &[(MerkleHash, u64)]) -> usize {
|
||||
if hashes.len() <= 2 {
|
||||
return hashes.len();
|
||||
}
|
||||
@@ -45,7 +45,7 @@ fn next_merge_cut(hashes: &[(MerkleHash, usize)]) -> usize {
|
||||
|
||||
/// Merge the hashes together, including the size information and returning the new (hash, size) pair.
|
||||
#[inline]
|
||||
fn merged_hash_of_sequence(hash: &[(MerkleHash, usize)]) -> (MerkleHash, usize) {
|
||||
fn merged_hash_of_sequence(hash: &[(MerkleHash, u64)]) -> (MerkleHash, u64) {
|
||||
// Use a threadlocal buffer to avoid the overhead of reallocations.
|
||||
thread_local! {
|
||||
static BUFFER: RefCell<String> =
|
||||
@@ -70,7 +70,7 @@ fn merged_hash_of_sequence(hash: &[(MerkleHash, usize)]) -> (MerkleHash, usize)
|
||||
/// Iteratively collapse the list of hashes using the criteria in next_merge_cut
|
||||
/// until only one hash remains; this is the aggregated hash.
|
||||
#[inline]
|
||||
fn aggregated_node_hash(chunks: &[(MerkleHash, usize)]) -> MerkleHash {
|
||||
fn aggregated_node_hash(chunks: &[(MerkleHash, u64)]) -> MerkleHash {
|
||||
if chunks.is_empty() {
|
||||
return MerkleHash::default();
|
||||
}
|
||||
@@ -100,7 +100,7 @@ fn aggregated_node_hash(chunks: &[(MerkleHash, usize)]) -> MerkleHash {
|
||||
|
||||
/// The xorb hash
|
||||
#[inline]
|
||||
pub fn xorb_hash(chunks: &[(MerkleHash, usize)]) -> MerkleHash {
|
||||
pub fn xorb_hash(chunks: &[(MerkleHash, u64)]) -> MerkleHash {
|
||||
if chunks.is_empty() {
|
||||
return MerkleHash::default();
|
||||
}
|
||||
@@ -110,7 +110,7 @@ pub fn xorb_hash(chunks: &[(MerkleHash, usize)]) -> MerkleHash {
|
||||
|
||||
/// The file hash when a salt is needed.
|
||||
#[inline]
|
||||
pub fn file_hash_with_salt(chunks: &[(MerkleHash, usize)], salt: &[u8; 32]) -> MerkleHash {
|
||||
pub fn file_hash_with_salt(chunks: &[(MerkleHash, u64)], salt: &[u8; 32]) -> MerkleHash {
|
||||
if chunks.is_empty() {
|
||||
return MerkleHash::default();
|
||||
}
|
||||
@@ -120,7 +120,7 @@ pub fn file_hash_with_salt(chunks: &[(MerkleHash, usize)], salt: &[u8; 32]) -> M
|
||||
|
||||
/// The file hash calculation from a series of chunks; to be used when there isn't a salt.
|
||||
#[inline]
|
||||
pub fn file_hash(chunks: &[(MerkleHash, usize)]) -> MerkleHash {
|
||||
pub fn file_hash(chunks: &[(MerkleHash, u64)]) -> MerkleHash {
|
||||
file_hash_with_salt(chunks, &[0; 32])
|
||||
}
|
||||
|
||||
@@ -172,7 +172,7 @@ mod tests {
|
||||
}
|
||||
println!("],");
|
||||
|
||||
let hash_list: Vec<_> = v.iter().map(|&hi| (rh(hi), (hi * 100) as usize)).collect();
|
||||
let hash_list: Vec<_> = v.iter().map(|&hi| (rh(hi), (hi * 100))).collect();
|
||||
print!("\"{:?}\",", xorb_hash(&hash_list));
|
||||
|
||||
// Now do a few salts along with the 0 salt to ensure we get good coverage there.
|
||||
|
||||
Reference in New Issue
Block a user