mirror of
https://github.com/abseil/abseil-cpp.git
synced 2026-06-04 12:07:05 +08:00
CRC32 version of CombineContiguous for length <= 32.
For length in [17, 32] we compute two chains of dependent CRC32 operations to have good entropy in the resulting two 32-bit numbers. 1. x := CRC32(CRC32(state, A), D) 2. y := CRC32(CRC32(bswap(state), C), B) On ARM: CRC32 has 2 cycles latency and throughput equal to 1. Computations will be pipelined without any wait. On x86: CRC32 has 3 cycles latency and throughput equal to 1. There will be 1 extra cycle of waiting, but we can do `cmp` in parallel. At the end we multiply (mul - x) * (y - mul). mul is added to fill the upper 32 bits of the CRC result with good entropy bits. `mul = rotr(kMul, len)` We also mix the length differently: 1. `state + 8 * len` (`lea` instruction); later, one or two CRC operations shuffle these bits well into the low 32 bits. 2. `rotr(kMul, len)` is used for filling the high 32 bits before multiplication in `Mix`. This avoids reading from `kStaticRandomData`. For smaller strings we try to minimize binary size and register pressure as much as possible. A CRC instruction fused with a memory read is used. llvm-mca reports 1 cycle lower latency compared to separate `mov` + `crc`. ASM analysis https://godbolt.org/z/e1xrKzhdc: 1. 100+ bytes binary size saving (per inline instance) 2. 25+ instruction saving 3. 2 registers are not used (r8 and r9). Latency results in isolation, without accounting for the comparison, are mixed. 1. latency for 8 bytes in isolation is 1 cycle better: https://godbolt.org/z/zc39eM3K9 2. latency for 1-3 bytes in isolation is 2 cycles better: https://godbolt.org/z/qMKfbv438 3. latency for 16 bytes in isolation is 3 cycles worse: https://godbolt.org/z/vcqr8oGv3 4. latency for 32 bytes in isolation is 5 cycles worse: https://godbolt.org/z/nEPP5jP58 PiperOrigin-RevId: 850659551 Change-Id: I02a2434f2d98473b099c171ef1c56adffa821c60
This commit is contained in:
committed by
Copybara-Service
parent
7b40ebf946
commit
60b607be5b
@@ -74,6 +74,7 @@
|
||||
#include <vector>
|
||||
|
||||
#include "absl/base/attributes.h"
|
||||
#include "absl/base/internal/endian.h"
|
||||
#include "absl/base/internal/unaligned_access.h"
|
||||
#include "absl/base/optimization.h"
|
||||
#include "absl/base/port.h"
|
||||
@@ -93,6 +94,38 @@
|
||||
#include <filesystem> // NOLINT
|
||||
#endif
|
||||
|
||||
// Hardware CRC32 detection.
// When one of the branches below matches, ABSL_HASH_INTERNAL_HAS_CRC32 is
// defined together with ABSL_HASH_INTERNAL_CRC32_U64 / _U32 / _U8, which map
// onto the platform intrinsics. When no branch matches, none of these macros
// is defined.
//
// 32-bit builds with SSE 4.2 do not have _mm_crc32_u64, so the
|
||||
// __x86_64__ condition is necessary.
|
||||
#if defined(__SSE4_2__) && defined(__x86_64__)
|
||||
|
||||
// x86-64 with SSE 4.2: the wrappers are plain aliases of the _mm_crc32_*
// intrinsics.
#include <x86intrin.h>
|
||||
#define ABSL_HASH_INTERNAL_HAS_CRC32
|
||||
#define ABSL_HASH_INTERNAL_CRC32_U64 _mm_crc32_u64
|
||||
#define ABSL_HASH_INTERNAL_CRC32_U32 _mm_crc32_u32
|
||||
#define ABSL_HASH_INTERNAL_CRC32_U8 _mm_crc32_u8
|
||||
|
||||
#elif defined(_MSC_VER) && !defined(__clang__) && defined(__AVX__)
|
||||
|
||||
// MSVC AVX (/arch:AVX) implies SSE 4.2.
|
||||
#include <intrin.h>
|
||||
#define ABSL_HASH_INTERNAL_HAS_CRC32
|
||||
#define ABSL_HASH_INTERNAL_CRC32_U64 _mm_crc32_u64
|
||||
#define ABSL_HASH_INTERNAL_CRC32_U32 _mm_crc32_u32
|
||||
#define ABSL_HASH_INTERNAL_CRC32_U8 _mm_crc32_u8
|
||||
|
||||
#elif defined(__ARM_FEATURE_CRC32)
|
||||
|
||||
// ARM targets advertising the CRC32 feature: the wrappers map onto the
// __crc32c* intrinsics declared in <arm_acle.h>.
#include <arm_acle.h>
|
||||
#define ABSL_HASH_INTERNAL_HAS_CRC32
|
||||
// Casting to uint32_t to be consistent with x86 intrinsic (_mm_crc32_u64
|
||||
// accepts crc as 64 bit integer).
|
||||
#define ABSL_HASH_INTERNAL_CRC32_U64(crc, data) \
|
||||
__crc32cd(static_cast<uint32_t>(crc), data)
|
||||
#define ABSL_HASH_INTERNAL_CRC32_U32 __crc32cw
|
||||
|
||||
#define ABSL_HASH_INTERNAL_CRC32_U8 __crc32cb
|
||||
|
||||
#endif
|
||||
|
||||
namespace absl {
|
||||
ABSL_NAMESPACE_BEGIN
|
||||
|
||||
@@ -965,18 +998,20 @@ ABSL_ATTRIBUTE_ALWAYS_INLINE inline uint64_t Mix(uint64_t lhs, uint64_t rhs) {
|
||||
return Uint128High64(m) ^ Uint128Low64(m);
|
||||
}
|
||||
|
||||
// Reads 8 bytes from p.
|
||||
inline uint64_t Read8(const unsigned char* p) {
|
||||
// Suppress erroneous array bounds errors on GCC.
|
||||
#if defined(__GNUC__) && !defined(__clang__)
|
||||
#pragma GCC diagnostic push
|
||||
#pragma GCC diagnostic ignored "-Warray-bounds"
|
||||
#endif
|
||||
inline uint32_t Read4(const unsigned char* p) {
|
||||
return absl::base_internal::UnalignedLoad32(p);
|
||||
}
|
||||
inline uint64_t Read8(const unsigned char* p) {
|
||||
return absl::base_internal::UnalignedLoad64(p);
|
||||
}
|
||||
#if defined(__GNUC__) && !defined(__clang__)
|
||||
#pragma GCC diagnostic pop
|
||||
#endif
|
||||
}
|
||||
|
||||
// Reads 9 to 16 bytes from p.
|
||||
// The first 8 bytes are in .first, and the rest of the bytes are in .second
|
||||
@@ -1096,6 +1131,70 @@ inline uint64_t CombineContiguousImpl(
|
||||
return CombineLargeContiguousImplOn32BitLengthGt8(state, first, len);
|
||||
}
|
||||
|
||||
#ifdef ABSL_HASH_INTERNAL_HAS_CRC32
|
||||
inline uint64_t CombineContiguousImpl(
|
||||
uint64_t state, const unsigned char* first, size_t len,
|
||||
std::integral_constant<int, 8> /* sizeof_size_t */) {
|
||||
if (ABSL_PREDICT_FALSE(len > 32)) {
|
||||
return CombineLargeContiguousImplOn64BitLengthGt32(state, first, len);
|
||||
}
|
||||
// `mul` is the salt that is used for final mixing. It is important to fill
|
||||
// high 32 bits because CRC wipes out high 32 bits.
|
||||
// `rotr` is important to mix `len` into high 32 bits.
|
||||
uint64_t mul = absl::rotr(kMul, static_cast<int>(len));
|
||||
// Only low 32 bits of each uint64_t are used in CRC32 so we use gbswap_64 to
|
||||
// move high 32 bits to low 32 bits. It has slightly smaller binary size than
|
||||
// `>> 32`. `state + 8 * len` is a single instruction on both x86 and ARM, so
|
||||
// we use it to better mix length. Although only the low 32 bits of the pair
|
||||
// elements are used, we use pair<uint64_t, uint64_t> for better generated
|
||||
// code.
|
||||
std::pair<uint64_t, uint64_t> crcs = {state + 8 * len,
|
||||
absl::gbswap_64(state)};
|
||||
|
||||
// All CRC operations here directly read bytes from the memory.
|
||||
// Single fused instructions are used, like `crc32 rcx, qword ptr [rsi]`.
|
||||
// On x86, llvm-mca reports latency `R + 2` for such fused instructions, while
|
||||
// `R + 3` for two separate `mov` + `crc` instructions. `R` is the latency of
|
||||
// reading the memory. Fused instructions also reduce register pressure
|
||||
// allowing surrounding code to be more efficient when this code is inlined.
|
||||
if (len > 8) {
|
||||
crcs = {ABSL_HASH_INTERNAL_CRC32_U64(crcs.first, Read8(first)),
|
||||
ABSL_HASH_INTERNAL_CRC32_U64(crcs.second, Read8(first + len - 8))};
|
||||
if (len > 16) {
|
||||
// We compute the second round of dependent CRC32 operations.
|
||||
crcs = {ABSL_HASH_INTERNAL_CRC32_U64(crcs.first, Read8(first + len - 16)),
|
||||
ABSL_HASH_INTERNAL_CRC32_U64(crcs.second, Read8(first + 8))};
|
||||
}
|
||||
} else {
|
||||
if (len >= 4) {
|
||||
// We use CRC for 4 bytes to benefit from the fused instruction and better
|
||||
// hash quality.
|
||||
// Using `xor` or `add` may reduce latency for this case, but would
|
||||
// require more registers, more instructions and will have worse hash
|
||||
// quality.
|
||||
crcs = {ABSL_HASH_INTERNAL_CRC32_U32(static_cast<uint32_t>(crcs.first),
|
||||
Read4(first)),
|
||||
ABSL_HASH_INTERNAL_CRC32_U32(static_cast<uint32_t>(crcs.second),
|
||||
Read4(first + len - 4))};
|
||||
} else if (len >= 1) {
|
||||
// We mix three bytes all into different output registers.
|
||||
// This way, we do not need shifting of these bytes (so they don't overlap
|
||||
// with each other).
|
||||
crcs = {ABSL_HASH_INTERNAL_CRC32_U8(static_cast<uint32_t>(crcs.first),
|
||||
first[0]),
|
||||
ABSL_HASH_INTERNAL_CRC32_U8(static_cast<uint32_t>(crcs.second),
|
||||
first[len - 1])};
|
||||
// Middle byte is mixed weaker. It is a new byte only for len == 3.
|
||||
// Mixing is independent from CRC operations so it is scheduled ASAP.
|
||||
mul += first[len / 2];
|
||||
}
|
||||
}
|
||||
// `mul` is mixed into both sides of `Mix` to guarantee non-zero values for
|
||||
// both multiplicands. Using Mix instead of just multiplication here improves
|
||||
// hash quality, especially for short strings.
|
||||
return Mix(mul - crcs.first, crcs.second - mul);
|
||||
}
|
||||
#else
|
||||
inline uint64_t CombineContiguousImpl(
|
||||
uint64_t state, const unsigned char* first, size_t len,
|
||||
std::integral_constant<int, 8> /* sizeof_size_t */) {
|
||||
@@ -1118,6 +1217,7 @@ inline uint64_t CombineContiguousImpl(
|
||||
// to calling CombineLargeContiguousImpl once with 2 * PiecewiseChunkSize().
|
||||
return CombineLargeContiguousImplOn64BitLengthGt32(state, first, len);
|
||||
}
|
||||
#endif // ABSL_HASH_INTERNAL_HAS_CRC32
|
||||
|
||||
#if defined(ABSL_INTERNAL_LEGACY_HASH_NAMESPACE) && \
|
||||
ABSL_META_INTERNAL_STD_HASH_SFINAE_FRIENDLY_
|
||||
@@ -1452,4 +1552,9 @@ H PiecewiseCombiner::finalize(H state) {
|
||||
ABSL_NAMESPACE_END
|
||||
} // namespace absl
|
||||
|
||||
// Undefine the CRC32 helper macros at the end of the header so they are not
// visible to code that includes it.
#undef ABSL_HASH_INTERNAL_HAS_CRC32
|
||||
#undef ABSL_HASH_INTERNAL_CRC32_U64
|
||||
#undef ABSL_HASH_INTERNAL_CRC32_U32
|
||||
#undef ABSL_HASH_INTERNAL_CRC32_U8
|
||||
|
||||
#endif // ABSL_HASH_INTERNAL_HASH_H_
|
||||
|
||||
Reference in New Issue
Block a user