Optimize CRC-32C extension by zeroes

Optimize multiply() (renamed to MultiplyWithExtraX33()) to eliminate
several instructions that were present only to avoid introducing an
extra factor of x^33 into the multiplication.  It's actually fine to
introduce the extra factor of x^33 as long as it's canceled out with an
extra factor of x^-33 in all the kCRC32CPowers[] entries.

To make this work, the number of bits dropped by ComputeZeroConstant()
had to be increased from 2 to at least 3, since 2^(i + 3 +
kNumDroppedBits) - 33 must be >= 0 for all i including i=0; otherwise
kCRC32CPowers[0] would need a negative power of x.  However, this is
fine since it's more efficient to utilize CRC32_u32() and CRC32_u64()
for bits 2 and 3 anyway.  So, increase kNumDroppedBits to 4.

Add a Python script that generates the updated kCRC32CPowers[].  It
isn't wired up to the build system, but rather is just added so that
kCRC32CPowers[] can be reproduced.

Also add a test which tests ExtendCrc32cByZeroes() with all the length
bits, thus testing all the entries of kCRC32CPowers[].

Note that the kCRC32CPowers[] generation script and new test case are
things we should have had anyway, regardless of the x^33 optimization.

This change slightly improves the performance of Extend() for lengths
greater than or equal to 2048 bytes, and also the performance of
ExtendByZeroes().  It also slightly reduces the binary code size.

Before:
    BM_Calculate/2048                   84.3 ns         84.3 ns      8307735
    BM_Calculate/10000                   376 ns          375 ns      1865976
    BM_Calculate/500000                18538 ns        18531 ns        37813
    BM_ExtendByZeroes/1                 3.55 ns         3.55 ns    197111095
    BM_ExtendByZeroes/10                3.90 ns         3.89 ns    179773877
    BM_ExtendByZeroes/100               6.06 ns         6.06 ns    115242160
    BM_ExtendByZeroes/1000              12.0 ns         12.0 ns     58078004
    BM_ExtendByZeroes/10000             9.97 ns         9.97 ns     70335772
    BM_ExtendByZeroes/100000            12.1 ns         12.1 ns     58157829
    BM_ExtendByZeroes/1000000           14.4 ns         14.4 ns     48527365

After:
    BM_Calculate/2048                   82.8 ns         82.7 ns      8478296
    BM_Calculate/10000                   375 ns          375 ns      1869663
    BM_Calculate/500000                18547 ns        18538 ns        37846
    BM_ExtendByZeroes/1                 2.96 ns         2.96 ns    236772500
    BM_ExtendByZeroes/10                3.85 ns         3.85 ns    182059238
    BM_ExtendByZeroes/100               5.42 ns         5.42 ns    129077546
    BM_ExtendByZeroes/1000              9.43 ns         9.42 ns     74232457
    BM_ExtendByZeroes/10000             8.14 ns         8.14 ns     86244218
    BM_ExtendByZeroes/100000            10.7 ns         10.7 ns     65467391
    BM_ExtendByZeroes/1000000           11.0 ns         11.0 ns     63575936
PiperOrigin-RevId: 786828855
Change-Id: I6208625fd1c35c2c137e756cf5fadc1adccfdd5d
This commit is contained in:
Abseil Team
2025-07-24 14:03:26 -07:00
committed by Copybara-Service
parent 342afd0767
commit 57abc0ee3f
3 changed files with 179 additions and 54 deletions

View File

@@ -15,11 +15,14 @@
#include "absl/crc/crc32c.h" #include "absl/crc/crc32c.h"
#include <algorithm> #include <algorithm>
#include <array>
#include <cstddef> #include <cstddef>
#include <cstdint> #include <cstdint>
#include <cstring> #include <cstring>
#include <limits>
#include <sstream> #include <sstream>
#include <string> #include <string>
#include <tuple>
#include "gtest/gtest.h" #include "gtest/gtest.h"
#include "absl/crc/internal/crc32c.h" #include "absl/crc/internal/crc32c.h"
@@ -101,6 +104,33 @@ TEST(CRC32C, ExtendByZeroes) {
} }
} }
// Verifies ExtendCrc32cByZeroes() over the entire range of size_t lengths, with
// every length bit set in at least one case. ExtendCrc32cByZeroes() relies on
// an array of constants in which each entry is used only when one particular
// bit of the length is set, so exercising every bit checks every entry of that
// array.
TEST(CRC32C, ExtendByZeroesAllLengthBits) {
  using TestCase = std::tuple<uint64_t, absl::crc32c_t>;

  const absl::crc32c_t base_crc{0xc99465aa};
  // Each case pairs a zero-extension length with the CRC expected after
  // extending `base_crc` by that many zero bytes.
  const std::array<TestCase, 5> kTestCases = {{
      {0, absl::crc32c_t(0xc99465aa)},
      {std::numeric_limits<uint32_t>::max(), absl::crc32c_t(0x9b1d5aaa)},
      {0x12345678, absl::crc32c_t(0xcf0e9553)},
      {std::numeric_limits<uint64_t>::max(), absl::crc32c_t(0xf5bff489)},
      {0x12345678abcdefff, absl::crc32c_t(0xaa1ffb0b)},
  }};
  constexpr uint64_t kMaxLength = std::numeric_limits<size_t>::max();

  for (const TestCase& test_case : kTestCases) {
    const uint64_t length = std::get<0>(test_case);
    const absl::crc32c_t expected_value = std::get<1>(test_case);
    SCOPED_TRACE(length);
    // On 32-bit platforms, 64-bit lengths cannot be used or tested, so only
    // lengths that fit in size_t are checked.
    if (length <= kMaxLength) {
      EXPECT_EQ(
          absl::ExtendCrc32cByZeroes(base_crc, static_cast<size_t>(length)),
          expected_value);
    }
  }
}
TEST(CRC32C, UnextendByZeroes) { TEST(CRC32C, UnextendByZeroes) {
constexpr size_t kExtendByValues[] = {2, 200, 20000, 200000, 20000000}; constexpr size_t kExtendByValues[] = {2, 200, 20000, 200000, 20000000};
constexpr size_t kUnextendByValues[] = {0, 100, 10000, 100000, 10000000}; constexpr size_t kUnextendByValues[] = {0, 100, 10000, 100000, 10000000};

View File

@@ -100,54 +100,67 @@ constexpr size_t kMediumCutoff = 2048;
namespace { namespace {
uint32_t multiply(uint32_t a, uint32_t b) { // Does polynomial multiplication a * b * x^33 mod G.
V128 power = V128_From64WithZeroFill(a); //
V128 crc = V128_From64WithZeroFill(b); // One of the multiplicands needs to have an extra factor of x^-33 to cancel out
V128 res = V128_PMulLow(power, crc); // the extra factor of x^33. The extra factor of x^33 comes from:
//
// - x^1 from the carry-less multiplication, due to the
// "least-significant-bit-first" convention of CRC-32C.
//
// - x^32 from using CRC32_u64() to reduce the carry-less product to 32 bits.
//
// Both could be avoided, but at the cost of extra instructions. It's more
// efficient to just drop a factor of x^33 from one of the multiplicands.
uint32_t MultiplyWithExtraX33(uint32_t a, uint32_t b) {
V128 a_vec = V128_From64WithZeroFill(a);
V128 b_vec = V128_From64WithZeroFill(b);
V128 res = V128_PMulLow(a_vec, b_vec);
// Combine crc values. return CRC32_u64(0, static_cast<uint64_t>(V128_Low64(res)));
//
// Adding res to itself is equivalent to multiplying by 2,
// or shifting left by 1. Addition is used as not all compilers
// are able to generate optimal code without this hint.
// https://godbolt.org/z/rr3fMnf39
res = V128_Add64(res, res);
return static_cast<uint32_t>(V128_Extract32<1>(res)) ^
CRC32_u32(0, static_cast<uint32_t>(V128_Low64(res)));
} }
// Powers of crc32c polynomial, for faster ExtendByZeros. // The number of low-order bits that ComputeZeroConstant() drops from the
// Verified against folly: // length, i.e. treats as zeroes
// folly/hash/detail/Crc32CombineDetail.cpp constexpr int kNumDroppedBits = 4;
// Precomputed constants for faster ExtendByZeroes(). This was generated by
// gen_crc32c_consts.py. The entry at index i is x^(2^(i + 3 + kNumDroppedBits)
// - 33) mod G. That is x^-33 times the polynomial by which the CRC value needs
// to be multiplied to extend it by 2^(i + 3 + kNumDroppedBits) zero bits, or
// equivalently 2^(i + kNumDroppedBits) zero bytes. The extra factor of x^-33
// cancels out the extra factor of x^33 that MultiplyWithExtraX33() introduces.
constexpr uint32_t kCRC32CPowers[] = { constexpr uint32_t kCRC32CPowers[] = {
0x82f63b78, 0x6ea2d55c, 0x18b8ea18, 0x510ac59a, 0xb82be955, 0xb8fdb1e7, 0x493c7d27, 0xba4fc28e, 0x9e4addf8, 0x0d3b6092, 0xb9e02b86, 0xdd7e3b0c,
0x88e56f72, 0x74c360a4, 0xe4172b16, 0x0d65762a, 0x35d73a62, 0x28461564, 0x170076fa, 0xa51b6135, 0x82f89c77, 0x54a86326, 0x1dc403cc, 0x5ae703ab,
0xbf455269, 0xe2ea32dc, 0xfe7740e6, 0xf946610b, 0x3c204f8f, 0x538586e3, 0xc5013a36, 0xac2ac6dd, 0x9b4615a9, 0x688d1c61, 0xf6af14e6, 0xb6ffe386,
0x59726915, 0x734d5309, 0xbc1ac763, 0x7d0722cc, 0xd289cabe, 0xe94ca9bc, 0xb717425b, 0x478b0d30, 0x54cc62e5, 0x7b2102ee, 0x8a99adef, 0xa7568c8f,
0x05b74f3f, 0xa51e1f42, 0x40000000, 0x20000000, 0x08000000, 0x00800000, 0xd610d67e, 0x6b086b3f, 0xd94f3c0b, 0xbf818109, 0x780d5a4d, 0x05ec76f1,
0x00008000, 0x82f63b78, 0x6ea2d55c, 0x18b8ea18, 0x510ac59a, 0xb82be955, 0x00000001, 0x493c7d27, 0xba4fc28e, 0x9e4addf8, 0x0d3b6092, 0xb9e02b86,
0xb8fdb1e7, 0x88e56f72, 0x74c360a4, 0xe4172b16, 0x0d65762a, 0x35d73a62, 0xdd7e3b0c, 0x170076fa, 0xa51b6135, 0x82f89c77, 0x54a86326, 0x1dc403cc,
0x28461564, 0xbf455269, 0xe2ea32dc, 0xfe7740e6, 0xf946610b, 0x3c204f8f, 0x5ae703ab, 0xc5013a36, 0xac2ac6dd, 0x9b4615a9, 0x688d1c61, 0xf6af14e6,
0x538586e3, 0x59726915, 0x734d5309, 0xbc1ac763, 0x7d0722cc, 0xd289cabe, 0xb6ffe386, 0xb717425b, 0x478b0d30, 0x54cc62e5, 0x7b2102ee, 0x8a99adef,
0xe94ca9bc, 0x05b74f3f, 0xa51e1f42, 0x40000000, 0x20000000, 0x08000000, 0xa7568c8f, 0xd610d67e, 0x6b086b3f, 0xd94f3c0b, 0xbf818109, 0x780d5a4d,
0x00800000, 0x00008000,
}; };
// There must be an entry for each non-dropped bit in the size_t length.
static_assert(std::size(kCRC32CPowers) >= sizeof(size_t) * 8 - kNumDroppedBits);
} // namespace } // namespace
// Compute a magic constant, so that multiplying by it is the same as // Compute a magic constant, so that multiplying by it is the same as extending
// extending crc by length zeros. // crc by length zeros. The lowest kNumDroppedBits of the length are ignored and
// treated as zeroes; the caller is assumed to handle any nonzero bits there.
#if defined(NDEBUG) && ABSL_HAVE_CPP_ATTRIBUTE(clang::no_sanitize) #if defined(NDEBUG) && ABSL_HAVE_CPP_ATTRIBUTE(clang::no_sanitize)
// The array accesses in this are safe: // The array accesses in this are safe: `length >= size_t{1} <<
// length > 3, so countr_zero(length >> 2) < 62, and length & (length - 1) // kNumDroppedBits`, so `countr_zero(length >> kNumDroppedBits) < sizeof(size_t)
// cannot introduce bits >= 62. // * 8 - kNumDroppedBits`, and `length & (length - 1)` cannot introduce bits
// The compiler cannot prove this, so manually disable bounds checking. // `>= sizeof(size_t) * 8 - kNumDroppedBits`. The compiler cannot prove this, so
// manually disable bounds checking.
[[clang::no_sanitize("array-bounds")]] [[clang::no_sanitize("array-bounds")]]
#endif #endif
uint32_t CRC32AcceleratedX86ARMCombined::ComputeZeroConstant( uint32_t CRC32AcceleratedX86ARMCombined::ComputeZeroConstant(
size_t length) const { size_t length) const {
// Lowest 2 bits are handled separately in ExtendByZeroes length >>= kNumDroppedBits;
length >>= 2;
int index = absl::countr_zero(length); int index = absl::countr_zero(length);
uint32_t prev = kCRC32CPowers[index]; uint32_t prev = kCRC32CPowers[index];
@@ -156,7 +169,7 @@ uint32_t CRC32AcceleratedX86ARMCombined::ComputeZeroConstant(
while (length) { while (length) {
// For each bit of length, extend by 2**n zeros. // For each bit of length, extend by 2**n zeros.
index = absl::countr_zero(length); index = absl::countr_zero(length);
prev = multiply(prev, kCRC32CPowers[index]); prev = MultiplyWithExtraX33(prev, kCRC32CPowers[index]);
length &= length - 1; length &= length - 1;
} }
return prev; return prev;
@@ -166,22 +179,13 @@ void CRC32AcceleratedX86ARMCombined::ExtendByZeroes(uint32_t* crc,
size_t length) const { size_t length) const {
uint32_t val = *crc; uint32_t val = *crc;
// Don't bother with multiplication for small length. // Don't bother with multiplication for small length.
switch (length & 3) { if (length & 1) val = CRC32_u8(val, 0);
case 0: if (length & 2) val = CRC32_u16(val, 0);
break; if (length & 4) val = CRC32_u32(val, 0);
case 1: if (length & 8) val = CRC32_u64(val, 0);
val = CRC32_u8(val, 0); static_assert(kNumDroppedBits == 4);
break; if (length >= size_t{1} << kNumDroppedBits) {
case 2: val = MultiplyWithExtraX33(val, ComputeZeroConstant(length));
val = CRC32_u16(val, 0);
break;
case 3:
val = CRC32_u8(val, 0);
val = CRC32_u16(val, 0);
break;
}
if (length > 3) {
val = multiply(val, ComputeZeroConstant(length));
} }
*crc = val; *crc = val;
} }
@@ -549,14 +553,15 @@ class CRC32AcceleratedX86ARMCombinedMultipleStreams
} }
// Combine all streams into single result. // Combine all streams into single result.
static_assert(64 % (1 << kNumDroppedBits) == 0);
uint32_t magic = ComputeZeroConstant(bs * 64); uint32_t magic = ComputeZeroConstant(bs * 64);
l64 = l64_crc[0]; l64 = l64_crc[0];
for (size_t i = 1; i < num_crc_streams; i++) { for (size_t i = 1; i < num_crc_streams; i++) {
l64 = multiply(static_cast<uint32_t>(l64), magic); l64 = MultiplyWithExtraX33(static_cast<uint32_t>(l64), magic);
l64 ^= l64_crc[i]; l64 ^= l64_crc[i];
} }
for (size_t i = 0; i < num_pclmul_streams; i++) { for (size_t i = 0; i < num_pclmul_streams; i++) {
l64 = multiply(static_cast<uint32_t>(l64), magic); l64 = MultiplyWithExtraX33(static_cast<uint32_t>(l64), magic);
l64 ^= l64_pclmul[i]; l64 ^= l64_pclmul[i];
} }

View File

@@ -0,0 +1,90 @@
#!/usr/bin/env python3
#
# Copyright 2025 The Abseil Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""This script generates kCRC32CPowers[]."""
def poly_mul(a, b):
    """Carry-less (GF(2)) polynomial multiplication: a * b.

    Both polynomials are integers whose bit i is the coefficient of x^i. The
    product is not reduced modulo anything.
    """
    result = 0
    shifted = a  # Always equal to a * x^bit_index.
    for bit_index in range(b.bit_length()):
        if (b >> bit_index) & 1:
            # Adding polynomials over GF(2) is XOR.
            result ^= shifted
        shifted <<= 1
    return result
def poly_div(a, b):
    """Polynomial division over GF(2): floor(a / b).

    Polynomials are integers whose bit i is the coefficient of x^i; both
    arguments are expected to be non-negative.

    Args:
        a: The dividend polynomial.
        b: The divisor polynomial. Must be nonzero.

    Returns:
        The quotient polynomial. The remainder is discarded.

    Raises:
        ZeroDivisionError: If b is zero.
    """
    if b == 0:
        # A zero divisor has bit_length() == 0, so the loop condition below
        # would hold forever without ever changing `a`. Fail loudly instead.
        raise ZeroDivisionError('polynomial division by zero')
    divisor_bits = b.bit_length()
    q = 0
    while a.bit_length() >= divisor_bits:
        # Cancel the leading term of `a` with b * x^shift and record x^shift
        # in the quotient.
        shift = a.bit_length() - divisor_bits
        q ^= 1 << shift
        a ^= b << shift
    return q
def poly_reduce(a, b):
    """Polynomial reduction: the remainder of a divided by b (a mod b)."""
    quotient = poly_div(a, b)
    # Subtraction over GF(2) is XOR, so a - quotient * b is a ^ (quotient * b).
    return a ^ poly_mul(quotient, b)
def poly_exp(a, b, g):
    """Polynomial exponentiation: a^b mod g.

    Computed by recursive square-and-multiply, reducing modulo g after every
    level so intermediate values stay small.

    Args:
        a: The base polynomial.
        b: The integer exponent. Must be non-negative.
        g: The reducing polynomial.

    Returns:
        a^b reduced modulo g.

    Raises:
        ValueError: If b is negative.
    """
    if b < 0:
        # Floor division never takes a negative exponent to the base case
        # (-1 // 2 == -1), so without this check the recursion never ends.
        raise ValueError(f'Exponent must be non-negative, got {b}')
    if b == 0:
        # a^0 is the constant polynomial 1.
        return poly_reduce(1, g)
    c = poly_exp(a, b // 2, g)
    c = poly_mul(c, c)
    if b % 2 != 0:
        c = poly_mul(c, a)
    return poly_reduce(c, g)
def bitreflect(a, num_bits):
    """Reverses the order of the low num_bits bits of the given integer.

    Bit i of the input becomes bit (num_bits - 1 - i) of the result. Raises
    ValueError if the integer needs more than num_bits bits.
    """
    if a.bit_length() > num_bits:
        raise ValueError(f'Integer has more than {num_bits} bits')
    reflected = 0
    for _ in range(num_bits):
        # Move the current lowest input bit onto the low end of the result;
        # bits taken earlier are pushed toward the high end.
        reflected = (reflected << 1) | (a & 1)
        a >>= 1
    return reflected
# Parameters of the generated table. Polynomials are stored as integers whose
# bit i is the coefficient of x^i (the "natural" bit order); entries are only
# bit-reflected at the very end, when they are printed.
#
# NUM_DROPPED_BITS must agree with kNumDroppedBits in the C++ source. With
# LSB_FIRST set, entry i is x^(2^(i + LOG2_BITS_PER_BYTE + NUM_DROPPED_BITS) -
# CRC_BITS - 1), and that exponent must not be negative for i = 0, so
# NUM_DROPPED_BITS has to be at least 3.
G = 0x11EDC6F41 # The CRC-32C reducing polynomial, in the "natural" bit order
CRC_BITS = 32 # The degree of G, i.e. the 32 in "CRC-32C"
LSB_FIRST = True # CRC-32C is a least-significant-bit-first CRC
NUM_SIZE_BITS = 64 # The maximum number of bits in the length (size_t)
NUM_DROPPED_BITS = 4 # The number of bits dropped from the length
LOG2_BITS_PER_BYTE = 3 # log2 of the number of bits in a byte, i.e. log2(8)
X = 2 # The polynomial 'x', in the "natural" bit order
def print_crc32c_powers():
    """Prints the entries of kCRC32CPowers[] to standard output.

    The table has NUM_SIZE_BITS - NUM_DROPPED_BITS entries. Entry i is
    x^(2^(i + LOG2_BITS_PER_BYTE + NUM_DROPPED_BITS) - CRC_BITS - 1) mod G,
    bit-reflected to CRC_BITS bits. See kCRC32CPowers[] in the C++ source for
    why the exponent is lowered by CRC_BITS + 1.

    Entries are written as zero-padded hexadecimal literals, each followed by
    ', ', all on one line with no trailing newline.
    """
    # A least-significant-bit-first CRC drops one more factor of x than the
    # CRC_BITS factors that are always dropped.
    dropped_power = CRC_BITS + (1 if LSB_FIRST else 0)
    hex_digits = 2 * CRC_BITS // 8
    for i in range(NUM_SIZE_BITS - NUM_DROPPED_BITS):
        exponent = 2 ** (i + LOG2_BITS_PER_BYTE + NUM_DROPPED_BITS) - dropped_power
        entry = bitreflect(poly_exp(X, exponent, G), CRC_BITS)
        print(f'0x{entry:0{hex_digits}x}, ', end='')
# Emit the table only when this file is executed as a script, so importing it
# (e.g. to reuse the polynomial helpers) prints nothing.
if __name__ == '__main__':
    print_crc32c_powers()