From 60b607be5b2d8934386cae2d1455625a788f1be3 Mon Sep 17 00:00:00 2001
From: Vitaly Goldshteyn <goldvitaly@google.com>
Date: Wed, 31 Dec 2025 00:51:16 -0800
Subject: [PATCH] `CRC32` version of `CombineContiguous` for length <= 32.

For lengths in [17, 32] we compute two chains of dependent CRC32 operations to have good entropy in the resulting two 32-bit numbers.
1. x := CRC32(CRC32(state + 8 * len, bytes[0, 8)), bytes[len - 16, len - 8))
2. y := CRC32(CRC32(bswap(state), bytes[len - 8, len)), bytes[8, 16))

On ARM:
  CRC32 has 2 cycles latency and throughput equal to 1.
  Computations will be pipelined without any wait.
On x86:
  CRC32 has 3 cycles latency and throughput equal to 1.
  There will be 1 extra cycle wait, but we can do `cmp` in parallel.

At the end we multiply `(mul - x) * (y - mul)`. `mul` is mixed in to fill the upper 32 bits of each CRC result with good entropy bits, where `mul = rotr(kMul, len)`.

We also mix the length differently:
1. `state + 8 * len` (a single `lea` instruction); the one or two CRC operations that follow shuffle these bits well into the low 32 bits.
2. `rotr(kMul, len)` is used for filling the high 32 bits before multiplication in `Mix`. This avoids reading from `kStaticRandomData`.

For smaller strings we try to minimize binary size and register pressure as much as possible.
A CRC instruction fused with the memory read is used; llvm-mca reports 1 cycle lower latency compared to separate `mov` + `crc`.

ASM analysis https://godbolt.org/z/e1xrKzhdc:
1. 100+ bytes binary size saving (per inline instance)
2. 25+ instruction saving
3. 2 registers are not used (r8 and r9).

Latency results in isolation, without accounting for the comparison, are mixed:
1. latency for 8 bytes in isolation is 1 cycle better: https://godbolt.org/z/zc39eM3K9
2. latency for 1-3 bytes in isolation is 2 cycles better: https://godbolt.org/z/qMKfbv438
3. latency for 16 bytes in isolation is 3 cycles worse: https://godbolt.org/z/vcqr8oGv3
4. latency for 32 bytes in isolation is 5 cycles worse:
https://godbolt.org/z/nEPP5jP58

PiperOrigin-RevId: 850659551
Change-Id: I02a2434f2d98473b099c171ef1c56adffa821c60
---
 absl/hash/internal/hash.h | 111 ++++++++++++++++++++++++++++++++++++--
 1 file changed, 108 insertions(+), 3 deletions(-)
diff --git a/absl/hash/internal/hash.h b/absl/hash/internal/hash.h
index 02df7faa..37bd39d6 100644
--- a/absl/hash/internal/hash.h
+++ b/absl/hash/internal/hash.h
@@ -74,6 +74,7 @@
 #include <vector>
 
 #include "absl/base/attributes.h"
+#include "absl/base/internal/endian.h"
 #include "absl/base/internal/unaligned_access.h"
 #include "absl/base/optimization.h"
 #include "absl/base/port.h"
@@ -93,6 +94,38 @@
 #include <filesystem>  // NOLINT
 #endif
 
+// 32-bit x86 builds with SSE 4.2 do not have _mm_crc32_u64, so the 64-bit
+// target checks (__x86_64__ and _M_X64) are necessary.
+#if defined(__SSE4_2__) && defined(__x86_64__)
+
+#include <x86intrin.h>
+#define ABSL_HASH_INTERNAL_HAS_CRC32
+#define ABSL_HASH_INTERNAL_CRC32_U64 _mm_crc32_u64
+#define ABSL_HASH_INTERNAL_CRC32_U32 _mm_crc32_u32
+#define ABSL_HASH_INTERNAL_CRC32_U8 _mm_crc32_u8
+
+#elif defined(_MSC_VER) && !defined(__clang__) && defined(__AVX__) && \
+    defined(_M_X64)
+// MSVC AVX (/arch:AVX) implies SSE 4.2.
+#include <intrin.h>
+#define ABSL_HASH_INTERNAL_HAS_CRC32
+#define ABSL_HASH_INTERNAL_CRC32_U64 _mm_crc32_u64
+#define ABSL_HASH_INTERNAL_CRC32_U32 _mm_crc32_u32
+#define ABSL_HASH_INTERNAL_CRC32_U8 _mm_crc32_u8
+
+#elif defined(__ARM_FEATURE_CRC32)
+
+#include <arm_acle.h>
+#define ABSL_HASH_INTERNAL_HAS_CRC32
+// Casting to uint32_t to be consistent with x86 intrinsic (_mm_crc32_u64
+// accepts crc as 64 bit integer).
+#define ABSL_HASH_INTERNAL_CRC32_U64(crc, data) \
+  __crc32cd(static_cast<uint32_t>(crc), data)
+#define ABSL_HASH_INTERNAL_CRC32_U32 __crc32cw
+#define ABSL_HASH_INTERNAL_CRC32_U8 __crc32cb
+
+#endif
+
 namespace absl {
 ABSL_NAMESPACE_BEGIN
 
@@ -965,18 +998,20 @@ ABSL_ATTRIBUTE_ALWAYS_INLINE inline uint64_t Mix(uint64_t lhs, uint64_t rhs) {
   return Uint128High64(m) ^ Uint128Low64(m);
 }
 
-// Reads 8 bytes from p.
-inline uint64_t Read8(const unsigned char* p) {
 // Suppress erroneous array bounds errors on GCC.
 #if defined(__GNUC__) && !defined(__clang__)
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Warray-bounds"
 #endif
+inline uint32_t Read4(const unsigned char* p) {  // Reads 4 bytes from p.
+  return absl::base_internal::UnalignedLoad32(p);
+}
+inline uint64_t Read8(const unsigned char* p) {  // Reads 8 bytes from p.
   return absl::base_internal::UnalignedLoad64(p);
+}
 #if defined(__GNUC__) && !defined(__clang__)
 #pragma GCC diagnostic pop
 #endif
-}
 
 // Reads 9 to 16 bytes from p.
 // The first 8 bytes are in .first, and the rest of the bytes are in .second
@@ -1096,6 +1131,70 @@ inline uint64_t CombineContiguousImpl(
   return CombineLargeContiguousImplOn32BitLengthGt8(state, first, len);
 }
 
+#ifdef ABSL_HASH_INTERNAL_HAS_CRC32
+// CRC32-based mixing of the `len` bytes at `first` into `state`; this is the
+// overload tagged with sizeof(size_t) == 8. Lengths above 32 are forwarded to
+// CombineLargeContiguousImplOn64BitLengthGt32.
+inline uint64_t CombineContiguousImpl(
+    uint64_t state, const unsigned char* first, size_t len,
+    std::integral_constant<int, 8> /* sizeof_size_t */) {
+  if (ABSL_PREDICT_FALSE(len > 32)) {
+    return CombineLargeContiguousImplOn64BitLengthGt32(state, first, len);
+  }
+  // `mul` is the salt used for the final mixing. CRC wipes out the high 32
+  // bits, so it is important that `mul` fills them; `rotr` by `len` mixes the
+  // length into those high 32 bits.
+  uint64_t mul = absl::rotr(kMul, static_cast<int>(len));
+  // CRC32 consumes only the low 32 bits of each uint64_t, so gbswap_64 is used
+  // to move the high 32 bits of `state` into the low half (slightly smaller
+  // binary size than `>> 32`). `state + 8 * len` is a single instruction on
+  // both x86 and ARM and mixes the length in. pair<uint64_t, uint64_t> is used
+  // (although only the low 32 bits matter) for better generated code.
+  std::pair<uint64_t, uint64_t> crcs = {state + 8 * len,
+                                        absl::gbswap_64(state)};
+
+  // All CRC operations below read their data directly from memory via single
+  // fused instructions such as `crc32 rcx, qword ptr [rsi]`. On x86, llvm-mca
+  // reports latency `R + 2` for the fused form versus `R + 3` for separate
+  // `mov` + `crc` (`R` is the memory read latency). Fusing also lowers
+  // register pressure, which helps surrounding code when this is inlined.
+  if (len > 8) {
+    // First round: bytes [0, 8) and [len - 8, len); they overlap if len < 16.
+    crcs = {ABSL_HASH_INTERNAL_CRC32_U64(crcs.first, Read8(first)),
+            ABSL_HASH_INTERNAL_CRC32_U64(crcs.second, Read8(first + len - 8))};
+    if (len > 16) {
+      // Second round of dependent CRC32s: bytes [len - 16, len - 8), [8, 16).
+      crcs = {ABSL_HASH_INTERNAL_CRC32_U64(crcs.first, Read8(first + len - 16)),
+              ABSL_HASH_INTERNAL_CRC32_U64(crcs.second, Read8(first + 8))};
+    }
+  } else {
+    if (len >= 4) {
+      // CRC is used on the 4-byte reads to benefit from the fused instruction
+      // and better hash quality. `xor` or `add` might reduce latency here but
+      // would need more registers and instructions and give worse hash quality.
+      // The reads cover [0, 4) and [len - 4, len); they overlap if len < 8.
+      crcs = {ABSL_HASH_INTERNAL_CRC32_U32(static_cast<uint32_t>(crcs.first),
+                                           Read4(first)),
+              ABSL_HASH_INTERNAL_CRC32_U32(static_cast<uint32_t>(crcs.second),
+                                           Read4(first + len - 4))};
+    } else if (len >= 1) {
+      // Up to three bytes are mixed, each into a different value (`crcs.first`,
+      // `crcs.second`, `mul`), so no shifting is needed to keep them apart.
+      crcs = {ABSL_HASH_INTERNAL_CRC32_U8(static_cast<uint32_t>(crcs.first),
+                                          first[0]),
+              ABSL_HASH_INTERNAL_CRC32_U8(static_cast<uint32_t>(crcs.second),
+                                          first[len - 1])};
+      // The middle byte gets weaker mixing; it is a new byte only for len == 3.
+      // It does not depend on the CRC operations, so it can be scheduled ASAP.
+      mul += first[len / 2];
+    }
+  }
+  // `mul` is mixed into both sides of `Mix` to guarantee non-zero values for
+  // both multiplicands. Using Mix instead of just multiplication here improves
+  // hash quality, especially for short strings.
+  return Mix(mul - crcs.first, crcs.second - mul);
+}
+#else
 inline uint64_t CombineContiguousImpl(
     uint64_t state, const unsigned char* first, size_t len,
     std::integral_constant<int, 8> /* sizeof_size_t */) {
@@ -1118,6 +1217,7 @@ inline uint64_t CombineContiguousImpl(
   // to calling CombineLargeContiguousImpl once with 2 * PiecewiseChunkSize().
   return CombineLargeContiguousImplOn64BitLengthGt32(state, first, len);
 }
+#endif  // ABSL_HASH_INTERNAL_HAS_CRC32
 
 #if defined(ABSL_INTERNAL_LEGACY_HASH_NAMESPACE) && \
     ABSL_META_INTERNAL_STD_HASH_SFINAE_FRIENDLY_
@@ -1452,4 +1552,9 @@ H PiecewiseCombiner::finalize(H state) {
 ABSL_NAMESPACE_END
 }  // namespace absl
 
+#undef ABSL_HASH_INTERNAL_HAS_CRC32
+#undef ABSL_HASH_INTERNAL_CRC32_U64
+#undef ABSL_HASH_INTERNAL_CRC32_U32
+#undef ABSL_HASH_INTERNAL_CRC32_U8
+
 #endif  // ABSL_HASH_INTERNAL_HASH_H_