From 60b607be5b2d8934386cae2d1455625a788f1be3 Mon Sep 17 00:00:00 2001 From: Vitaly Goldshteyn Date: Wed, 31 Dec 2025 00:51:16 -0800 Subject: [PATCH] `CRC32` version of `CombineContiguous` for length <= 32. For length in [17, 32] we compute two chains of dependent CRC32 operations to have good entropy in the resulting two 32-bit numbers. 1. x := CRC32(CRC32(state, A), D) 2. y := CRC32(CRC32(bswap(state), C), B) On ARM: CRC32 has 2 cycles latency and throughput equal to 1. Computations will be pipelined without any wait. On x86: CRC32 has 3 cycles latency and throughput equal to 1. There will be 1 extra cycle wait, but we can do `cmp` in parallel. At the end we multiply (mul - x) * (y - mul). `mul` is added to fill the upper 32 bits of the CRC result with good entropy bits. `mul = rotr(kMul, len)` We also mix the length differently: 1. `state + 8 * len` (`lea` instruction), later one or two CRC operations shuffle these bits well into the low 32 bits. 2. `rotr(kMul, len)` is used for filling the high 32 bits before multiplication in `Mix`. This avoids reading from `kStaticRandomData`. For smaller strings we try to minimize binary size and register pressure as much as possible. A CRC instruction fused with the memory read is used. llvm-mca reports 1 cycle lower latency compared to separate `mov` + `crc`. ASM analysis https://godbolt.org/z/e1xrKzhdc: 1. 100+ bytes binary size saving (per inline instance) 2. 25+ instruction saving 3. 2 registers are not used (r8 and r9). Latency results in isolation, without accounting for the comparison, are mixed: 1. latency for 8 bytes in isolation is 1 cycle better: https://godbolt.org/z/zc39eM3K9 2. latency for 1-3 bytes in isolation is 2 cycles better: https://godbolt.org/z/qMKfbv438 3. latency for 16 bytes in isolation is 3 cycles worse: https://godbolt.org/z/vcqr8oGv3 4. 
latency for 32 bytes in isolation is 5 cycles worse: https://godbolt.org/z/nEPP5jP58 PiperOrigin-RevId: 850659551 Change-Id: I02a2434f2d98473b099c171ef1c56adffa821c60 --- absl/hash/internal/hash.h | 111 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 108 insertions(+), 3 deletions(-) diff --git a/absl/hash/internal/hash.h b/absl/hash/internal/hash.h index 02df7faa..37bd39d6 100644 --- a/absl/hash/internal/hash.h +++ b/absl/hash/internal/hash.h @@ -74,6 +74,7 @@ #include #include "absl/base/attributes.h" +#include "absl/base/internal/endian.h" #include "absl/base/internal/unaligned_access.h" #include "absl/base/optimization.h" #include "absl/base/port.h" @@ -93,6 +94,38 @@ #include // NOLINT #endif +// 32-bit builds with SSE 4.2 do not have _mm_crc32_u64, so the +// __x86_64__ condition is necessary. +#if defined(__SSE4_2__) && defined(__x86_64__) + +#include +#define ABSL_HASH_INTERNAL_HAS_CRC32 +#define ABSL_HASH_INTERNAL_CRC32_U64 _mm_crc32_u64 +#define ABSL_HASH_INTERNAL_CRC32_U32 _mm_crc32_u32 +#define ABSL_HASH_INTERNAL_CRC32_U8 _mm_crc32_u8 + +#elif defined(_MSC_VER) && !defined(__clang__) && defined(__AVX__) && defined(_M_X64) + +// MSVC AVX (/arch:AVX) implies SSE 4.2; _M_X64 is required because 32-bit MSVC builds do not have _mm_crc32_u64. +#include +#define ABSL_HASH_INTERNAL_HAS_CRC32 +#define ABSL_HASH_INTERNAL_CRC32_U64 _mm_crc32_u64 +#define ABSL_HASH_INTERNAL_CRC32_U32 _mm_crc32_u32 +#define ABSL_HASH_INTERNAL_CRC32_U8 _mm_crc32_u8 + +#elif defined(__ARM_FEATURE_CRC32) + +#include +#define ABSL_HASH_INTERNAL_HAS_CRC32 +// Casting to uint32_t to be consistent with x86 intrinsic (_mm_crc32_u64 +// accepts crc as 64 bit integer). +#define ABSL_HASH_INTERNAL_CRC32_U64(crc, data) \ + __crc32cd(static_cast(crc), data) +#define ABSL_HASH_INTERNAL_CRC32_U32 __crc32cw +#define ABSL_HASH_INTERNAL_CRC32_U8 __crc32cb + +#endif + namespace absl { ABSL_NAMESPACE_BEGIN @@ -965,18 +998,20 @@ ABSL_ATTRIBUTE_ALWAYS_INLINE inline uint64_t Mix(uint64_t lhs, uint64_t rhs) { return Uint128High64(m) ^ Uint128Low64(m); } -// Reads 8 bytes from p. 
-inline uint64_t Read8(const unsigned char* p) { // Suppress erroneous array bounds errors on GCC. #if defined(__GNUC__) && !defined(__clang__) #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Warray-bounds" #endif +inline uint32_t Read4(const unsigned char* p) { + return absl::base_internal::UnalignedLoad32(p); +} +inline uint64_t Read8(const unsigned char* p) { return absl::base_internal::UnalignedLoad64(p); +} #if defined(__GNUC__) && !defined(__clang__) #pragma GCC diagnostic pop #endif -} // Reads 9 to 16 bytes from p. // The first 8 bytes are in .first, and the rest of the bytes are in .second @@ -1096,6 +1131,70 @@ inline uint64_t CombineContiguousImpl( return CombineLargeContiguousImplOn32BitLengthGt8(state, first, len); } +#ifdef ABSL_HASH_INTERNAL_HAS_CRC32 +inline uint64_t CombineContiguousImpl( + uint64_t state, const unsigned char* first, size_t len, + std::integral_constant /* sizeof_size_t */) { + if (ABSL_PREDICT_FALSE(len > 32)) { + return CombineLargeContiguousImplOn64BitLengthGt32(state, first, len); + } + // `mul` is the salt that is used for final mixing. It is important to fill + // the high 32 bits because CRC wipes out the high 32 bits. + // `rotr` is important to mix `len` into the high 32 bits. + uint64_t mul = absl::rotr(kMul, static_cast(len)); + // Only low 32 bits of each uint64_t are used in CRC32 so we use gbswap_64 to + // move high 32 bits to low 32 bits. It has slightly smaller binary size than + // `>> 32`. `state + 8 * len` is a single instruction on both x86 and ARM, so + // we use it to better mix the length. Although only the low 32 bits of the pair + // elements are used, we use a pair of uint64_t for better generated + // code. + std::pair crcs = {state + 8 * len, + absl::gbswap_64(state)}; + + // All CRC operations here directly read bytes from the memory. + // Single fused instructions are used, like `crc32 rcx, qword ptr [rsi]`. 
+ // On x86, llvm-mca reports latency `R + 2` for such fused instructions, while + // `R + 3` for two separate `mov` + `crc` instructions. `R` is the latency of + // reading the memory. Fused instructions also reduce register pressure, + // allowing surrounding code to be more efficient when this code is inlined. + if (len > 8) { + crcs = {ABSL_HASH_INTERNAL_CRC32_U64(crcs.first, Read8(first)), + ABSL_HASH_INTERNAL_CRC32_U64(crcs.second, Read8(first + len - 8))}; + if (len > 16) { + // We compute the second round of dependent CRC32 operations. + crcs = {ABSL_HASH_INTERNAL_CRC32_U64(crcs.first, Read8(first + len - 16)), + ABSL_HASH_INTERNAL_CRC32_U64(crcs.second, Read8(first + 8))}; + } + } else { + if (len >= 4) { + // We use CRC for 4 bytes to benefit from the fused instruction and better + // hash quality. + // Using `xor` or `add` may reduce latency for this case, but would + // require more registers, more instructions and will have worse hash + // quality. + crcs = {ABSL_HASH_INTERNAL_CRC32_U32(static_cast(crcs.first), + Read4(first)), + ABSL_HASH_INTERNAL_CRC32_U32(static_cast(crcs.second), + Read4(first + len - 4))}; + } else if (len >= 1) { + // We mix three bytes all into different output registers. + // This way, we do not need shifting of these bytes (so they don't overlap + // with each other). + crcs = {ABSL_HASH_INTERNAL_CRC32_U8(static_cast(crcs.first), + first[0]), + ABSL_HASH_INTERNAL_CRC32_U8(static_cast(crcs.second), + first[len - 1])}; + // The middle byte is mixed more weakly. It is a new byte only for len == 3. + // Mixing is independent from CRC operations so it is scheduled ASAP. + mul += first[len / 2]; + } + } + // `mul` is mixed into both sides of `Mix` to guarantee non-zero values for + // both multiplicands. Using Mix instead of just multiplication here improves + // hash quality, especially for short strings. 
+ return Mix(mul - crcs.first, crcs.second - mul); +} +#else inline uint64_t CombineContiguousImpl( uint64_t state, const unsigned char* first, size_t len, std::integral_constant /* sizeof_size_t */) { @@ -1118,6 +1217,7 @@ inline uint64_t CombineContiguousImpl( // to calling CombineLargeContiguousImpl once with 2 * PiecewiseChunkSize(). return CombineLargeContiguousImplOn64BitLengthGt32(state, first, len); } +#endif // ABSL_HASH_INTERNAL_HAS_CRC32 #if defined(ABSL_INTERNAL_LEGACY_HASH_NAMESPACE) && \ ABSL_META_INTERNAL_STD_HASH_SFINAE_FRIENDLY_ @@ -1452,4 +1552,9 @@ H PiecewiseCombiner::finalize(H state) { ABSL_NAMESPACE_END } // namespace absl +#undef ABSL_HASH_INTERNAL_HAS_CRC32 +#undef ABSL_HASH_INTERNAL_CRC32_U64 +#undef ABSL_HASH_INTERNAL_CRC32_U32 +#undef ABSL_HASH_INTERNAL_CRC32_U8 + #endif // ABSL_HASH_INTERNAL_HASH_H_