CRC32 version of CombineContiguous for length <= 32.

For length in [17, 32] we compute two chains of dependent CRC32 operations to have good entropy in the resulting two 32-bit numbers.
1. x := CRC32(CRC32(state, A), D)
2. y := CRC32(CRC32(bswap(state), C), B)

On ARM:
  CRC32 has 2 cycles latency and throughput equal to 1.
  Computations will be pipelined without any wait.
On x86:
  CRC32 has 3 cycles latency and throughput equal to 1.
  There will be 1 extra cycle wait, but we can do `cmp` in parallel.

At the end we multiply (mul - x) * (y - mul). mul is added to fill upper 32 bits of CRC result with good entropy bits. `mul = rotr(kMul, len)`

We also mix the length differently:
1. `state + 8 * len` (`lea` instruction); later, one or two CRC operations shuffle these bits well into the low 32 bits.
2. `rotr(kMul, len)` is used for filling the high 32 bits before multiplication in `Mix`. This avoids reading from `kStaticRandomData`.

For smaller strings we try to minimize binary size and register pressure as much as possible.
A CRC instruction fused with the memory read is used. llvm-mca reports 1 cycle lower latency compared to separate `mov` + `crc`.

ASM analysis https://godbolt.org/z/e1xrKzhdc:
1. 100+ bytes binary size saving (per inline instance)
2. 25+ instruction saving
3. 2 registers are not used (r8 and r9).

Latency results in isolation (without accounting for the comparison) are mixed:
1. latency for 8 bytes in isolation is 1 cycle better: https://godbolt.org/z/zc39eM3K9
2. latency for 1-3 bytes in isolation is 2 cycles better: https://godbolt.org/z/qMKfbv438
3. latency for 16 bytes in isolation is 3 cycles worse: https://godbolt.org/z/vcqr8oGv3
4. latency for 32 bytes in isolation is 5 cycles worse:
https://godbolt.org/z/nEPP5jP58

PiperOrigin-RevId: 850659551
Change-Id: I02a2434f2d98473b099c171ef1c56adffa821c60
This commit is contained in:
Vitaly Goldshteyn
2025-12-31 00:51:16 -08:00
committed by Copybara-Service
parent 7b40ebf946
commit 60b607be5b

View File

@@ -74,6 +74,7 @@
#include <vector>
#include "absl/base/attributes.h"
#include "absl/base/internal/endian.h"
#include "absl/base/internal/unaligned_access.h"
#include "absl/base/optimization.h"
#include "absl/base/port.h"
@@ -93,6 +94,38 @@
#include <filesystem> // NOLINT
#endif
// Selects hardware CRC32 intrinsics when the target provides them. On a
// supported target, ABSL_HASH_INTERNAL_HAS_CRC32 is defined together with the
// ABSL_HASH_INTERNAL_CRC32_U64 / _U32 / _U8 wrappers, each taking
// (crc, data).
//
// 32-bit builds with SSE 4.2 do not have _mm_crc32_u64, so the
// __x86_64__ condition is necessary.
#if defined(__SSE4_2__) && defined(__x86_64__)
#include <x86intrin.h>
#define ABSL_HASH_INTERNAL_HAS_CRC32
#define ABSL_HASH_INTERNAL_CRC32_U64 _mm_crc32_u64
#define ABSL_HASH_INTERNAL_CRC32_U32 _mm_crc32_u32
#define ABSL_HASH_INTERNAL_CRC32_U8 _mm_crc32_u8
#elif defined(_MSC_VER) && !defined(__clang__) && defined(__AVX__) && \
    defined(_M_X64)
// MSVC AVX (/arch:AVX) implies SSE 4.2. As with the branch above,
// _mm_crc32_u64 only exists for 64-bit targets, so the _M_X64 condition keeps
// 32-bit MSVC builds on the non-CRC32 implementation.
#include <intrin.h>
#define ABSL_HASH_INTERNAL_HAS_CRC32
#define ABSL_HASH_INTERNAL_CRC32_U64 _mm_crc32_u64
#define ABSL_HASH_INTERNAL_CRC32_U32 _mm_crc32_u32
#define ABSL_HASH_INTERNAL_CRC32_U8 _mm_crc32_u8
#elif defined(__ARM_FEATURE_CRC32)
#include <arm_acle.h>
#define ABSL_HASH_INTERNAL_HAS_CRC32
// Casting to uint32_t to be consistent with x86 intrinsic (_mm_crc32_u64
// accepts crc as 64 bit integer).
#define ABSL_HASH_INTERNAL_CRC32_U64(crc, data) \
  __crc32cd(static_cast<uint32_t>(crc), data)
#define ABSL_HASH_INTERNAL_CRC32_U32 __crc32cw
#define ABSL_HASH_INTERNAL_CRC32_U8 __crc32cb
#endif
namespace absl {
ABSL_NAMESPACE_BEGIN
@@ -965,18 +998,20 @@ ABSL_ATTRIBUTE_ALWAYS_INLINE inline uint64_t Mix(uint64_t lhs, uint64_t rhs) {
return Uint128High64(m) ^ Uint128Low64(m);
}
// Reads 8 bytes from p.
inline uint64_t Read8(const unsigned char* p) {
// Suppress erroneous array bounds errors on GCC.
#if defined(__GNUC__) && !defined(__clang__)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Warray-bounds"
#endif
// Reads 4 bytes from p using UnalignedLoad32.
inline uint32_t Read4(const unsigned char* p) {
  const uint32_t loaded = absl::base_internal::UnalignedLoad32(p);
  return loaded;
}
// Reads 8 bytes from p using UnalignedLoad64.
inline uint64_t Read8(const unsigned char* p) {
  const uint64_t loaded = absl::base_internal::UnalignedLoad64(p);
  return loaded;
}
#if defined(__GNUC__) && !defined(__clang__)
#pragma GCC diagnostic pop
#endif
}
// Reads 9 to 16 bytes from p.
// The first 8 bytes are in .first, and the rest of the bytes are in .second
@@ -1096,6 +1131,70 @@ inline uint64_t CombineContiguousImpl(
return CombineLargeContiguousImplOn32BitLengthGt8(state, first, len);
}
#ifdef ABSL_HASH_INTERNAL_HAS_CRC32
inline uint64_t CombineContiguousImpl(
uint64_t state, const unsigned char* first, size_t len,
std::integral_constant<int, 8> /* sizeof_size_t */) {
if (ABSL_PREDICT_FALSE(len > 32)) {
return CombineLargeContiguousImplOn64BitLengthGt32(state, first, len);
}
// `mul` is the salt that is used for final mixing. It is important to fill
// high 32 bits because CRC wipes out high 32 bits.
// `rotr` is important to mix `len` into high 32 bits.
uint64_t mul = absl::rotr(kMul, static_cast<int>(len));
// Only low 32 bits of each uint64_t are used in CRC32 so we use gbswap_64 to
// move high 32 bits to low 32 bits. It has slightly smaller binary size than
// `>> 32`. `state + 8 * len` is a single instruction on both x86 and ARM, so
// we use it to better mix length. Although only the low 32 bits of the pair
// elements are used, we use pair<uint64_t, uint64_t> for better generated
// code.
std::pair<uint64_t, uint64_t> crcs = {state + 8 * len,
absl::gbswap_64(state)};
// All CRC operations here directly read bytes from the memory.
// Single fused instructions are used, like `crc32 rcx, qword ptr [rsi]`.
// On x86, llvm-mca reports latency `R + 2` for such fused instructions, while
// `R + 3` for two separate `mov` + `crc` instructions. `R` is the latency of
// reading the memory. Fused instructions also reduce register pressure
// allowing surrounding code to be more efficient when this code is inlined.
if (len > 8) {
crcs = {ABSL_HASH_INTERNAL_CRC32_U64(crcs.first, Read8(first)),
ABSL_HASH_INTERNAL_CRC32_U64(crcs.second, Read8(first + len - 8))};
if (len > 16) {
// We compute the second round of dependent CRC32 operations.
crcs = {ABSL_HASH_INTERNAL_CRC32_U64(crcs.first, Read8(first + len - 16)),
ABSL_HASH_INTERNAL_CRC32_U64(crcs.second, Read8(first + 8))};
}
} else {
if (len >= 4) {
// We use CRC for 4 bytes to benefit from the fused instruction and better
// hash quality.
// Using `xor` or `add` may reduce latency for this case, but would
// require more registers, more instructions and will have worse hash
// quality.
crcs = {ABSL_HASH_INTERNAL_CRC32_U32(static_cast<uint32_t>(crcs.first),
Read4(first)),
ABSL_HASH_INTERNAL_CRC32_U32(static_cast<uint32_t>(crcs.second),
Read4(first + len - 4))};
} else if (len >= 1) {
// We mix three bytes all into different output registers.
// This way, we do not need shifting of these bytes (so they don't overlap
// with each other).
crcs = {ABSL_HASH_INTERNAL_CRC32_U8(static_cast<uint32_t>(crcs.first),
first[0]),
ABSL_HASH_INTERNAL_CRC32_U8(static_cast<uint32_t>(crcs.second),
first[len - 1])};
// Middle byte is mixed weaker. It is a new byte only for len == 3.
// Mixing is independent from CRC operations so it is scheduled ASAP.
mul += first[len / 2];
}
}
// `mul` is mixed into both sides of `Mix` to guarantee non-zero values for
// both multiplicands. Using Mix instead of just multiplication here improves
// hash quality, especially for short strings.
return Mix(mul - crcs.first, crcs.second - mul);
}
#else
inline uint64_t CombineContiguousImpl(
uint64_t state, const unsigned char* first, size_t len,
std::integral_constant<int, 8> /* sizeof_size_t */) {
@@ -1118,6 +1217,7 @@ inline uint64_t CombineContiguousImpl(
// to calling CombineLargeContiguousImpl once with 2 * PiecewiseChunkSize().
return CombineLargeContiguousImplOn64BitLengthGt32(state, first, len);
}
#endif // ABSL_HASH_INTERNAL_HAS_CRC32
#if defined(ABSL_INTERNAL_LEGACY_HASH_NAMESPACE) && \
ABSL_META_INTERNAL_STD_HASH_SFINAE_FRIENDLY_
@@ -1452,4 +1552,9 @@ H PiecewiseCombiner::finalize(H state) {
ABSL_NAMESPACE_END
} // namespace absl
// Undefine the CRC32 helper macros defined earlier in this header so they are
// no longer defined after this point.
#undef ABSL_HASH_INTERNAL_HAS_CRC32
#undef ABSL_HASH_INTERNAL_CRC32_U64
#undef ABSL_HASH_INTERNAL_CRC32_U32
#undef ABSL_HASH_INTERNAL_CRC32_U8
#endif // ABSL_HASH_INTERNAL_HASH_H_