diff --git a/absl/container/flat_hash_set_test.cc b/absl/container/flat_hash_set_test.cc index f2c3fbd3..ca069b40 100644 --- a/absl/container/flat_hash_set_test.cc +++ b/absl/container/flat_hash_set_test.cc @@ -397,21 +397,6 @@ TEST(FlatHashSet, IsDefaultHash) { false); } -// Test that we don't cause excessive collisions on the hash table for -// doubles in the range [-1024, 1024]. See cl/773069881 for more information. -TEST(FlatHashSet, DoubleRange) { - using absl::container_internal::hashtable_debug_internal:: - HashtableDebugAccess; - absl::flat_hash_set set; - for (double t = -1024.0; t < 1024.0; t += 1.0) { - set.insert(t); - } - for (double t = -1024.0; t < 1024.0; t += 1.0) { - ASSERT_LT(HashtableDebugAccess::GetNumProbes(set, t), 64) - << t; - } -} - } // namespace } // namespace container_internal ABSL_NAMESPACE_END diff --git a/absl/container/internal/raw_hash_set.h b/absl/container/internal/raw_hash_set.h index 08c5d57a..94a3249a 100644 --- a/absl/container/internal/raw_hash_set.h +++ b/absl/container/internal/raw_hash_set.h @@ -3713,7 +3713,7 @@ struct HashtableDebugAccess> { static size_t GetNumProbes(const Set& set, const typename Set::key_type& key) { - if (set.is_soo()) return 0; + if (set.is_small()) return 0; size_t num_probes = 0; const size_t hash = set.hash_of(key); auto seq = probe(set.common(), hash); diff --git a/absl/hash/hash_test.cc b/absl/hash/hash_test.cc index 85e34c91..0ad92cff 100644 --- a/absl/hash/hash_test.cc +++ b/absl/hash/hash_test.cc @@ -58,17 +58,17 @@ namespace { +using ::absl::Hash; +using ::absl::container_internal::hashtable_debug_internal:: + HashtableDebugAccess; +using ::absl::hash_internal::SpyHashState; using ::absl::hash_test_internal::is_hashable; using ::absl::hash_test_internal::TypeErasedContainer; using ::absl::hash_test_internal::TypeErasedValue; -using ::testing::SizeIs; template using TypeErasedVector = TypeErasedContainer>; -using absl::Hash; -using absl::hash_internal::SpyHashState; - template class HashValueIntTest : public testing::Test { }; @@ -1239,4 +1239,50 @@ TEST(HashOf, DoubleSignCollision) { EXPECT_NE(absl::HashOf(-1.0), absl::HashOf(1.0)); } +// Test that we don't cause excessive collisions on the hash table for +// doubles in the range [-1024, 1024]. See cl/773069881 for more information. +TEST(SwisstableCollisions, DoubleRange) { +#ifdef GOOGLE_UNSUPPORTED_OS_LOONIX + // TODO(b/424834054): make this test pass on Loonix. + GTEST_SKIP() << "Test fails on Loonix."; +#endif + + absl::flat_hash_set set; + for (double t = -1024.0; t < 1024.0; t += 1.0) { + set.insert(t); + ASSERT_LT(HashtableDebugAccess::GetNumProbes(set, t), 64) + << t; + } +} + +// Test that for each pair of adjacent bytes in a string, if there's only +// entropy in those two bytes, then we don't have excessive collisions. +TEST(SwisstableCollisions, LowEntropyStrings) { + if (sizeof(size_t) < 8) { + // TODO(b/424834054): make this test pass on 32-bit platforms. We need to + // make 32-bit Mix() stronger. + GTEST_SKIP() << "Test fails on 32-bit platforms"; + } + + constexpr char kMinChar = 0; + constexpr char kMaxChar = 64; + // These sizes cover the different hashing cases. + for (size_t size : {8u, 16u, 32u, 64u}) { + for (size_t b = 0; b < size - 1; ++b) { + absl::flat_hash_set set; + std::string s(size, '\0'); + for (char c1 = kMinChar; c1 < kMaxChar; ++c1) { + for (char c2 = kMinChar; c2 < kMaxChar; ++c2) { + s[b] = c1; + s[b + 1] = c2; + set.insert(s); + ASSERT_LT(HashtableDebugAccess::GetNumProbes(set, s), + 64) + << size << " " << b; + } + } + } + } +} + } // namespace diff --git a/absl/hash/internal/hash.cc b/absl/hash/internal/hash.cc index 87d2061c..8dcc5bd4 100644 --- a/absl/hash/internal/hash.cc +++ b/absl/hash/internal/hash.cc @@ -100,10 +100,10 @@ uint64_t Mix32Bytes(const uint8_t* ptr, uint64_t current_state) { ABSL_ATTRIBUTE_ALWAYS_INLINE inline uint64_t HashBlockOn32Bit( const unsigned char* data, size_t len, uint64_t state) { // TODO(b/417141985): expose and use CityHash32WithSeed. - return Mix( - PrecombineLengthMix(state, len) ^ - hash_internal::CityHash32(reinterpret_cast(data), len), - kMul); + // Note: we can't use PrecombineLengthMix here because len can be up to 1024. + return Mix((state + len) ^ hash_internal::CityHash32( + reinterpret_cast(data), len), + kMul); } ABSL_ATTRIBUTE_NOINLINE uint64_t diff --git a/absl/hash/internal/hash.h b/absl/hash/internal/hash.h index dc374055..0b4f8beb 100644 --- a/absl/hash/internal/hash.h +++ b/absl/hash/internal/hash.h @@ -935,21 +935,6 @@ hash_range_or_bytes(H hash_state, const T* data, size_t size) { hash_internal::WeaklyMixedInteger{size}); } -// Extremely weak mixture of length that is added to the state before combining -// the data. It is used only for small strings. -inline uint64_t PrecombineLengthMix(uint64_t state, size_t len) { - // The length is always one byte here. We place it to 4th byte for the - // following reasons: - // 1. 4th byte is unused for very short strings 0-3 bytes. - // 2. 4th byte is duplicated for 4 bytes string. - // 3. 4th byte is in the middle and mixed well for 5-8 bytes strings. - // - // There were experiments with adding just `len` here. - // Also seems have slightly better performance overall, that gives collisions - // for small strings. - return state + (uint64_t{len} << 24); -} - inline constexpr uint64_t kMul = uint64_t{0xdcb22ca68cb134ed}; // Random data taken from the hexadecimal digits of Pi's fractional component. @@ -959,6 +944,16 @@ ABSL_CACHELINE_ALIGNED inline constexpr uint64_t kStaticRandomData[] = { 0x082e'fa98'ec4e'6c89, 0x4528'21e6'38d0'1377, }; +// Extremely weak mixture of length that is mixed into the state before +// combining the data. It is used only for small strings. This also ensures that +// we have high entropy in all bits of the state. +inline uint64_t PrecombineLengthMix(uint64_t state, size_t len) { + ABSL_ASSUME(len + sizeof(uint64_t) <= sizeof(kStaticRandomData)); + uint64_t data = absl::base_internal::UnalignedLoad64( + reinterpret_cast(&kStaticRandomData[0]) + len); + return state ^ data; +} + ABSL_ATTRIBUTE_ALWAYS_INLINE inline uint64_t Mix(uint64_t lhs, uint64_t rhs) { // For 32 bit platforms we are trying to use all 64 lower bits. if constexpr (sizeof(size_t) < 8) {