From 4e5beaf3b8bdfd2f80aba4e448cb1f659b3b7504 Mon Sep 17 00:00:00 2001 From: Evan Brown Date: Mon, 23 Jun 2025 12:52:24 -0700 Subject: [PATCH] Change PrecombineLengthMix to sample data from kStaticRandomData. The idea is based on Carbon hashing's SampleRandomData(). This also fixes an issue that we found when adding debug asserts to detect cases of long probe sequences in absl hash tables. In that case, the keys were length=29 strings with all the entropy in the 2nd and 3rd bytes, and we saw many collisions due to low entropy in the last byte of the hash. It seems that this was caused by the input seed having low entropy, and doing the length mixing this way solves that issue. Also: - Fix a bug in HashtableDebugAccess::GetNumProbes. - Move the DoubleRange test from flat_hash_set_test.cc to hash_test.cc since it's a test for hash quality. - Fix using declaration style in hash_test.cc to comply with Google C++ style guide. PiperOrigin-RevId: 774891965 Change-Id: I22f18a0288e9bfb2734f7b8c6e788b623a1db0f5 --- absl/container/flat_hash_set_test.cc | 15 ------- absl/container/internal/raw_hash_set.h | 2 +- absl/hash/hash_test.cc | 54 ++++++++++++++++++++++++-- absl/hash/internal/hash.cc | 8 ++-- absl/hash/internal/hash.h | 25 +++++------- 5 files changed, 65 insertions(+), 39 deletions(-) diff --git a/absl/container/flat_hash_set_test.cc b/absl/container/flat_hash_set_test.cc index f2c3fbd3..ca069b40 100644 --- a/absl/container/flat_hash_set_test.cc +++ b/absl/container/flat_hash_set_test.cc @@ -397,21 +397,6 @@ TEST(FlatHashSet, IsDefaultHash) { false); } -// Test that we don't cause excessive collisions on the hash table for -// doubles in the range [-1024, 1024]. See cl/773069881 for more information. -TEST(FlatHashSet, DoubleRange) { - using absl::container_internal::hashtable_debug_internal:: - HashtableDebugAccess; - absl::flat_hash_set set; - for (double t = -1024.0; t < 1024.0; t += 1.0) { - set.insert(t); - } - for (double t = -1024.0; t < 1024.0; t += 1.0) { - ASSERT_LT(HashtableDebugAccess::GetNumProbes(set, t), 64) - << t; - } -} - } // namespace } // namespace container_internal ABSL_NAMESPACE_END diff --git a/absl/container/internal/raw_hash_set.h b/absl/container/internal/raw_hash_set.h index 08c5d57a..94a3249a 100644 --- a/absl/container/internal/raw_hash_set.h +++ b/absl/container/internal/raw_hash_set.h @@ -3713,7 +3713,7 @@ struct HashtableDebugAccess> { static size_t GetNumProbes(const Set& set, const typename Set::key_type& key) { - if (set.is_soo()) return 0; + if (set.is_small()) return 0; size_t num_probes = 0; const size_t hash = set.hash_of(key); auto seq = probe(set.common(), hash); diff --git a/absl/hash/hash_test.cc b/absl/hash/hash_test.cc index 85e34c91..0ad92cff 100644 --- a/absl/hash/hash_test.cc +++ b/absl/hash/hash_test.cc @@ -58,17 +58,17 @@ namespace { +using ::absl::Hash; +using ::absl::container_internal::hashtable_debug_internal:: + HashtableDebugAccess; +using ::absl::hash_internal::SpyHashState; using ::absl::hash_test_internal::is_hashable; using ::absl::hash_test_internal::TypeErasedContainer; using ::absl::hash_test_internal::TypeErasedValue; -using ::testing::SizeIs; template using TypeErasedVector = TypeErasedContainer>; -using absl::Hash; -using absl::hash_internal::SpyHashState; - template class HashValueIntTest : public testing::Test { }; @@ -1239,4 +1239,50 @@ TEST(HashOf, DoubleSignCollision) { EXPECT_NE(absl::HashOf(-1.0), absl::HashOf(1.0)); } +// Test that we don't cause excessive collisions on the hash table for +// doubles in the range [-1024, 1024]. See cl/773069881 for more information. +TEST(SwisstableCollisions, DoubleRange) { +#ifdef GOOGLE_UNSUPPORTED_OS_LOONIX + // TODO(b/424834054): make this test pass on Loonix. + GTEST_SKIP() << "Test fails on Loonix."; +#endif + + absl::flat_hash_set set; + for (double t = -1024.0; t < 1024.0; t += 1.0) { + set.insert(t); + ASSERT_LT(HashtableDebugAccess::GetNumProbes(set, t), 64) + << t; + } +} + +// Test that for each pair of adjacent bytes in a string, if there's only +// entropy in those two bytes, then we don't have excessive collisions. +TEST(SwisstableCollisions, LowEntropyStrings) { + if (sizeof(size_t) < 8) { + // TODO(b/424834054): make this test pass on 32-bit platforms. We need to + // make 32-bit Mix() stronger. + GTEST_SKIP() << "Test fails on 32-bit platforms"; + } + + constexpr char kMinChar = 0; + constexpr char kMaxChar = 64; + // These sizes cover the different hashing cases. + for (size_t size : {8u, 16u, 32u, 64u}) { + for (size_t b = 0; b < size - 1; ++b) { + absl::flat_hash_set set; + std::string s(size, '\0'); + for (char c1 = kMinChar; c1 < kMaxChar; ++c1) { + for (char c2 = kMinChar; c2 < kMaxChar; ++c2) { + s[b] = c1; + s[b + 1] = c2; + set.insert(s); + ASSERT_LT(HashtableDebugAccess::GetNumProbes(set, s), + 64) + << size << " " << b; + } + } + } + } +} + } // namespace diff --git a/absl/hash/internal/hash.cc b/absl/hash/internal/hash.cc index 87d2061c..8dcc5bd4 100644 --- a/absl/hash/internal/hash.cc +++ b/absl/hash/internal/hash.cc @@ -100,10 +100,10 @@ uint64_t Mix32Bytes(const uint8_t* ptr, uint64_t current_state) { ABSL_ATTRIBUTE_ALWAYS_INLINE inline uint64_t HashBlockOn32Bit( const unsigned char* data, size_t len, uint64_t state) { // TODO(b/417141985): expose and use CityHash32WithSeed. - return Mix( - PrecombineLengthMix(state, len) ^ - hash_internal::CityHash32(reinterpret_cast(data), len), - kMul); + // Note: we can't use PrecombineLengthMix here because len can be up to 1024. + return Mix((state + len) ^ hash_internal::CityHash32( + reinterpret_cast(data), len), + kMul); } ABSL_ATTRIBUTE_NOINLINE uint64_t diff --git a/absl/hash/internal/hash.h b/absl/hash/internal/hash.h index dc374055..0b4f8beb 100644 --- a/absl/hash/internal/hash.h +++ b/absl/hash/internal/hash.h @@ -935,21 +935,6 @@ hash_range_or_bytes(H hash_state, const T* data, size_t size) { hash_internal::WeaklyMixedInteger{size}); } -// Extremely weak mixture of length that is added to the state before combining -// the data. It is used only for small strings. -inline uint64_t PrecombineLengthMix(uint64_t state, size_t len) { - // The length is always one byte here. We place it to 4th byte for the - // following reasons: - // 1. 4th byte is unused for very short strings 0-3 bytes. - // 2. 4th byte is duplicated for 4 bytes string. - // 3. 4th byte is in the middle and mixed well for 5-8 bytes strings. - // - // There were experiments with adding just `len` here. - // Also seems have slightly better performance overall, that gives collisions - // for small strings. - return state + (uint64_t{len} << 24); -} - inline constexpr uint64_t kMul = uint64_t{0xdcb22ca68cb134ed}; // Random data taken from the hexadecimal digits of Pi's fractional component. @@ -959,6 +944,16 @@ ABSL_CACHELINE_ALIGNED inline constexpr uint64_t kStaticRandomData[] = { 0x082e'fa98'ec4e'6c89, 0x4528'21e6'38d0'1377, }; +// Extremely weak mixture of length that is mixed into the state before +// combining the data. It is used only for small strings. This also ensures that +// we have high entropy in all bits of the state. +inline uint64_t PrecombineLengthMix(uint64_t state, size_t len) { + ABSL_ASSUME(len + sizeof(uint64_t) <= sizeof(kStaticRandomData)); + uint64_t data = absl::base_internal::UnalignedLoad64( + reinterpret_cast(&kStaticRandomData[0]) + len); + return state ^ data; +} + ABSL_ATTRIBUTE_ALWAYS_INLINE inline uint64_t Mix(uint64_t lhs, uint64_t rhs) { // For 32 bit platforms we are trying to use all 64 lower bits. if constexpr (sizeof(size_t) < 8) {