Optimize CRC-32C extension by zeroes

Optimize multiply() (renamed to MultiplyWithExtraX33()) to eliminate several instructions that were present only to avoid introducing an extra factor of x^33 into the multiplication. It's actually fine to introduce the extra factor of x^33 as long as it's canceled out with an extra factor of x^-33 in all the kCRC32CPowers[] entries. To make this work, the number of bits dropped by ComputeZeroConstant() had to be increased from 2 to at least 3, since 2^(i + 3 + kNumDroppedBits) - 33 must be >= 0 for all i including i=0; otherwise kCRC32CPowers[0] would need a negative power of x. However, this is fine since it's more efficient to utilize CRC32_u32() and CRC32_u64() for bits 2 and 3 anyway. So, increase kNumDroppedBits to 4. Add a Python script that generates the updated kCRC32CPowers[]. It isn't wired up to the build system, but rather is just added so that kCRC32CPowers[] can be reproduced. Also add a test which tests ExtendCrc32cByZeroes() with all the length bits, thus testing all the entries of kCRC32CPowers[]. Note that the kCRC32CPowers[] generation script and new test case are things we should have had anyway, regardless of the x^33 optimization. This change slightly improves the performance of Extend() for lengths greater than or equal to 2048 bytes, and also the performance of ExtendByZeroes(). It also slightly reduces the binary code size. 
Before:

BM_Calculate/2048               84.3 ns    84.3 ns      8307735
BM_Calculate/10000               376 ns     375 ns      1865976
BM_Calculate/500000            18538 ns   18531 ns        37813
BM_ExtendByZeroes/1             3.55 ns    3.55 ns    197111095
BM_ExtendByZeroes/10            3.90 ns    3.89 ns    179773877
BM_ExtendByZeroes/100           6.06 ns    6.06 ns    115242160
BM_ExtendByZeroes/1000          12.0 ns    12.0 ns     58078004
BM_ExtendByZeroes/10000         9.97 ns    9.97 ns     70335772
BM_ExtendByZeroes/100000        12.1 ns    12.1 ns     58157829
BM_ExtendByZeroes/1000000       14.4 ns    14.4 ns     48527365

After:

BM_Calculate/2048               82.8 ns    82.7 ns      8478296
BM_Calculate/10000               375 ns     375 ns      1869663
BM_Calculate/500000            18547 ns   18538 ns        37846
BM_ExtendByZeroes/1             2.96 ns    2.96 ns    236772500
BM_ExtendByZeroes/10            3.85 ns    3.85 ns    182059238
BM_ExtendByZeroes/100           5.42 ns    5.42 ns    129077546
BM_ExtendByZeroes/1000          9.43 ns    9.42 ns     74232457
BM_ExtendByZeroes/10000         8.14 ns    8.14 ns     86244218
BM_ExtendByZeroes/100000        10.7 ns    10.7 ns     65467391
BM_ExtendByZeroes/1000000       11.0 ns    11.0 ns     63575936

PiperOrigin-RevId: 786828855
Change-Id: I6208625fd1c35c2c137e756cf5fadc1adccfdd5d
2026-06-04 12:07:05 +08:00 · 2025-07-24 14:03:26 -07:00
parent 342afd0767
commit 57abc0ee3f
3 changed files with 179 additions and 54 deletions
--- a/absl/crc/crc32c_test.cc
+++ b/absl/crc/crc32c_test.cc
@@ -15,11 +15,14 @@
 #include "absl/crc/crc32c.h"
 #include <algorithm>
 #include <array>
 #include <cstddef>
 #include <cstdint>
 #include <cstring>
 #include <limits>
 #include <sstream>
 #include <string>
 #include <tuple>
 #include "gtest/gtest.h"
 #include "absl/crc/internal/crc32c.h"
@@ -101,6 +104,33 @@ TEST(CRC32C, ExtendByZeroes) {
  }
 }
 // Exercises ExtendCrc32cByZeroes() with lengths spanning the whole size_t
 // range, so that every bit position of the length is set in at least one
 // case. ExtendCrc32cByZeroes() is implemented with an array of constants in
 // which each entry is used only when one particular length bit is set, so
 // setting every bit verifies every entry of that array.
 TEST(CRC32C, ExtendByZeroesAllLengthBits) {
  struct LengthCase {
    uint64_t num_zero_bytes;  // How many zero bytes to extend by.
    absl::crc32c_t want;      // CRC expected after the extension.
  };
  const LengthCase kCases[] = {
      {0, absl::crc32c_t(0xc99465aa)},
      {std::numeric_limits<uint32_t>::max(), absl::crc32c_t(0x9b1d5aaa)},
      {0x12345678, absl::crc32c_t(0xcf0e9553)},
      {std::numeric_limits<uint64_t>::max(), absl::crc32c_t(0xf5bff489)},
      {0x12345678abcdefff, absl::crc32c_t(0xaa1ffb0b)},
  };
  // Every case extends the same starting CRC.
  const absl::crc32c_t base_crc{0xc99465aa};
  const uint64_t kLargestLength = std::numeric_limits<size_t>::max();
  for (const LengthCase &length_case : kCases) {
    SCOPED_TRACE(length_case.num_zero_bytes);
    // On 32-bit platforms a 64-bit length does not fit in size_t, so those
    // cases cannot be used or tested.
    if (length_case.num_zero_bytes > kLargestLength) continue;
    const size_t length = static_cast<size_t>(length_case.num_zero_bytes);
    EXPECT_EQ(absl::ExtendCrc32cByZeroes(base_crc, length), length_case.want);
  }
 }
 TEST(CRC32C, UnextendByZeroes) {
  constexpr size_t kExtendByValues[] = {2, 200, 20000, 200000, 20000000};
  constexpr size_t kUnextendByValues[] = {0, 100, 10000, 100000, 10000000};
--- a/absl/crc/internal/crc_x86_arm_combined.cc
+++ b/absl/crc/internal/crc_x86_arm_combined.cc
@@ -100,54 +100,67 @@ constexpr size_t kMediumCutoff = 2048;
 namespace {
-uint32_t multiply(uint32_t a, uint32_t b) {
+// Does polynomial multiplication a * b * x^33 mod G.
-  V128 power = V128_From64WithZeroFill(a);
+//
-  V128 crc = V128_From64WithZeroFill(b);
+// One of the multiplicands needs to have an extra factor of x^-33 to cancel out
-  V128 res = V128_PMulLow(power, crc);
+// the extra factor of x^33. The extra factor of x^33 comes from:
 //
 // - x^1 from the carry-less multiplication, due to the
 //   "least-significant-bit-first" convention of CRC-32C.
 //
 // - x^32 from using CRC32_u64() to reduce the carry-less product to 32 bits.
 //
 // Both could be avoided, but at the cost of extra instructions. It's more
 // efficient to just drop a factor of x^33 from one of the multiplicands.
 uint32_t MultiplyWithExtraX33(uint32_t a, uint32_t b) {
  V128 a_vec = V128_From64WithZeroFill(a);
  V128 b_vec = V128_From64WithZeroFill(b);
  V128 res = V128_PMulLow(a_vec, b_vec);
-  // Combine crc values.
+  return CRC32_u64(0, static_cast<uint64_t>(V128_Low64(res)));
  //
  // Adding res to itself is equivalent to multiplying by 2,
  // or shifting left by 1. Addition is used as not all compilers
  // are able to generate optimal code without this hint.
  // https://godbolt.org/z/rr3fMnf39
  res = V128_Add64(res, res);
  return static_cast<uint32_t>(V128_Extract32<1>(res)) ^
         CRC32_u32(0, static_cast<uint32_t>(V128_Low64(res)));
 }
-// Powers of crc32c polynomial, for faster ExtendByZeros.
+// The number of low-order bits that ComputeZeroConstant() drops from the
-// Verified against folly:
+// length, i.e. treats as zeroes.
-// folly/hash/detail/Crc32CombineDetail.cpp
+constexpr int kNumDroppedBits = 4;
 // Precomputed constants for faster ExtendByZeroes(). This was generated by
 // gen_crc32c_consts.py. The entry at index i is x^(2^(i + 3 + kNumDroppedBits)
 // - 33) mod G. That is x^-33 times the polynomial by which the CRC value needs
 // to be multiplied to extend it by 2^(i + 3 + kNumDroppedBits) zero bits, or
 // equivalently 2^(i + kNumDroppedBits) zero bytes. The extra factor of x^-33
 // cancels out the extra factor of x^33 that MultiplyWithExtraX33() introduces.
 // 60 entries, six per row. The values repeat with a period of 31 entries
 // (entry i + 31 equals entry i). No entries from the previous table (which was
 // built without the x^-33 factor) may be left at the end: they would not
 // follow the formula that defines this array.
 constexpr uint32_t kCRC32CPowers[] = {
    0x493c7d27, 0xba4fc28e, 0x9e4addf8, 0x0d3b6092, 0xb9e02b86, 0xdd7e3b0c,
    0x170076fa, 0xa51b6135, 0x82f89c77, 0x54a86326, 0x1dc403cc, 0x5ae703ab,
    0xc5013a36, 0xac2ac6dd, 0x9b4615a9, 0x688d1c61, 0xf6af14e6, 0xb6ffe386,
    0xb717425b, 0x478b0d30, 0x54cc62e5, 0x7b2102ee, 0x8a99adef, 0xa7568c8f,
    0xd610d67e, 0x6b086b3f, 0xd94f3c0b, 0xbf818109, 0x780d5a4d, 0x05ec76f1,
    0x00000001, 0x493c7d27, 0xba4fc28e, 0x9e4addf8, 0x0d3b6092, 0xb9e02b86,
    0xdd7e3b0c, 0x170076fa, 0xa51b6135, 0x82f89c77, 0x54a86326, 0x1dc403cc,
    0x5ae703ab, 0xc5013a36, 0xac2ac6dd, 0x9b4615a9, 0x688d1c61, 0xf6af14e6,
    0xb6ffe386, 0xb717425b, 0x478b0d30, 0x54cc62e5, 0x7b2102ee, 0x8a99adef,
    0xa7568c8f, 0xd610d67e, 0x6b086b3f, 0xd94f3c0b, 0xbf818109, 0x780d5a4d,
 };
 // There must be an entry for each non-dropped bit in the size_t length.
 static_assert(std::size(kCRC32CPowers) >= sizeof(size_t) * 8 - kNumDroppedBits);
 }  // namespace
-// Compute a magic constant, so that multiplying by it is the same as
+// Compute a magic constant, so that multiplying by it is the same as extending
-// extending crc by length zeros.
+// crc by `length` zero bytes. The lowest kNumDroppedBits bits of the length are ignored and
 // treated as zeroes; the caller is assumed to handle any nonzero bits there.
 #if defined(NDEBUG) && ABSL_HAVE_CPP_ATTRIBUTE(clang::no_sanitize)
-// The array accesses in this are safe:
+// The array accesses in this are safe: `length >= size_t{1} <<
-// length > 3, so countr_zero(length >> 2) < 62, and length & (length - 1)
+// kNumDroppedBits`, so `countr_zero(length >> kNumDroppedBits) < sizeof(size_t)
-// cannot introduce bits >= 62.
+// * 8 - kNumDroppedBits`, and `length & (length - 1)` cannot introduce bits
-// The compiler cannot prove this, so manually disable bounds checking.
+// `>= sizeof(size_t) * 8 - kNumDroppedBits`. The compiler cannot prove this, so
 // manually disable bounds checking.
 [[clang::no_sanitize("array-bounds")]]
 #endif
 uint32_t CRC32AcceleratedX86ARMCombined::ComputeZeroConstant(
    size_t length) const {
-  // Lowest 2 bits are handled separately in ExtendByZeroes
+  length >>= kNumDroppedBits;
  length >>= 2;
  int index = absl::countr_zero(length);
  uint32_t prev = kCRC32CPowers[index];
@@ -156,7 +169,7 @@ uint32_t CRC32AcceleratedX86ARMCombined::ComputeZeroConstant(
  while (length) {
    // For each bit of length, extend by 2**n zeros.
    index = absl::countr_zero(length);
-    prev = multiply(prev, kCRC32CPowers[index]);
+    prev = MultiplyWithExtraX33(prev, kCRC32CPowers[index]);
    length &= length - 1;
  }
  return prev;
@@ -166,22 +179,13 @@ void CRC32AcceleratedX86ARMCombined::ExtendByZeroes(uint32_t* crc,
                                                    size_t length) const {
  uint32_t val = *crc;
  // Don't bother with multiplication for small length.
-  switch (length & 3) {
+  if (length & 1) val = CRC32_u8(val, 0);
-    case 0:
+  if (length & 2) val = CRC32_u16(val, 0);
-      break;
+  if (length & 4) val = CRC32_u32(val, 0);
-    case 1:
+  if (length & 8) val = CRC32_u64(val, 0);
-      val = CRC32_u8(val, 0);
+  static_assert(kNumDroppedBits == 4);
-      break;
+  if (length >= size_t{1} << kNumDroppedBits) {
-    case 2:
+    val = MultiplyWithExtraX33(val, ComputeZeroConstant(length));
      val = CRC32_u16(val, 0);
      break;
    case 3:
      val = CRC32_u8(val, 0);
      val = CRC32_u16(val, 0);
      break;
  }
  if (length > 3) {
    val = multiply(val, ComputeZeroConstant(length));
  }
  *crc = val;
 }
@@ -549,14 +553,15 @@ class CRC32AcceleratedX86ARMCombinedMultipleStreams
      }
      // Combine all streams into single result.
      static_assert(64 % (1 << kNumDroppedBits) == 0);
      uint32_t magic = ComputeZeroConstant(bs * 64);
      l64 = l64_crc[0];
      for (size_t i = 1; i < num_crc_streams; i++) {
-        l64 = multiply(static_cast<uint32_t>(l64), magic);
+        l64 = MultiplyWithExtraX33(static_cast<uint32_t>(l64), magic);
        l64 ^= l64_crc[i];
      }
      for (size_t i = 0; i < num_pclmul_streams; i++) {
-        l64 = multiply(static_cast<uint32_t>(l64), magic);
+        l64 = MultiplyWithExtraX33(static_cast<uint32_t>(l64), magic);
        l64 ^= l64_pclmul[i];
      }
--- a/absl/crc/internal/gen_crc32c_consts.py
+++ b/absl/crc/internal/gen_crc32c_consts.py
@@ -0,0 +1,90 @@
 #!/usr/bin/env python3
 #
 # Copyright 2025 The Abseil Authors.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #      https://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """This script generates kCRC32CPowers[]."""
 def poly_mul(a, b):
  """Carry-less (XOR-based) product of two polynomials held as integers.

  Args:
    a: The first factor.
    b: The second factor; each set bit i contributes a copy of a shifted
      left by i.

  Returns:
    The XOR of all contributed shifted copies of a, without any reduction.
  """
  acc = 0
  shifted = a  # Always equal to a << bit at the top of each iteration.
  for bit in range(b.bit_length()):
    if (b >> bit) & 1:
      acc ^= shifted
    shifted <<= 1
  return acc
 def poly_div(a, b):
  """Polynomial division: floor(a / b).

  Polynomials are held as integers and combined with XOR, so this is long
  division without carries.

  Args:
    a: The dividend.
    b: The divisor. Must be nonzero.

  Returns:
    The quotient polynomial; the remainder is discarded.

  Raises:
    ZeroDivisionError: If b is 0. Without this check the loop below would
      never terminate, because a.bit_length() >= 0 always holds.
  """
  if b == 0:
    raise ZeroDivisionError('polynomial division by zero')
  q = 0
  divisor_bits = b.bit_length()
  while a.bit_length() >= divisor_bits:
    # Align the divisor's top bit with the dividend's top bit and cancel it.
    shift = a.bit_length() - divisor_bits
    q ^= 1 << shift
    a ^= b << shift
  return q
 def poly_reduce(a, b):
  """Polynomial reduction: the remainder left after dividing a by b."""
  quotient = poly_div(a, b)
  # a == quotient * b + remainder, and subtraction here is XOR, so XOR-ing
  # the product back into a leaves only the remainder.
  return poly_mul(quotient, b) ^ a
 def poly_exp(a, b, g):
  """Polynomial exponentiation: a^b mod g.

  Uses recursive square-and-multiply, reducing modulo g at every level so
  intermediate values stay small.

  Args:
    a: The base polynomial.
    b: The exponent, a non-negative integer.
    g: The modulus polynomial.

  Returns:
    a raised to the power b, reduced modulo g.

  Raises:
    ValueError: If b is negative. b // 2 never reaches the base case for a
      negative b, so the recursion would otherwise not terminate.
  """
  if b < 0:
    raise ValueError('Exponent must be non-negative')
  if b == 0:
    # a^0 is the constant polynomial 1. Using 0 as the base case (instead of
    # 1) makes an exponent of 0 terminate instead of recursing forever, and
    # yields the same results as before for every b >= 1.
    return poly_reduce(1, g)
  c = poly_exp(a, b // 2, g)
  c = poly_mul(c, c)
  if b % 2 != 0:
    c = poly_mul(c, a)
  return poly_reduce(c, g)
 def bitreflect(a, num_bits):
  """Reverses the order of the low num_bits bits of an integer.

  Args:
    a: The integer to reflect.
    num_bits: The width of the field; bit i moves to bit num_bits - 1 - i.

  Returns:
    The reflected integer.

  Raises:
    ValueError: If a needs more than num_bits bits.
  """
  if a.bit_length() > num_bits:
    raise ValueError(f'Integer has more than {num_bits} bits')
  reflected = 0
  for i in range(num_bits):
    # Each source bit lands on a distinct target position, so OR accumulates
    # them without overlap.
    reflected |= ((a >> i) & 1) << (num_bits - 1 - i)
  return reflected
 # Parameters of the generated table. print_crc32c_powers() emits
 # NUM_SIZE_BITS - NUM_DROPPED_BITS entries; these values have to agree with
 # the C++ code that consumes kCRC32CPowers[] (NUM_DROPPED_BITS corresponds to
 # kNumDroppedBits there).
 G = 0x11EDC6F41  # The CRC-32C reducing polynomial, in the "natural" bit order
 CRC_BITS = 32  # The degree of G, i.e. the 32 in "CRC-32C"
 LSB_FIRST = True  # CRC-32C is a least-significant-bit-first CRC
 NUM_SIZE_BITS = 64  # The maximum number of bits in the length (size_t)
 NUM_DROPPED_BITS = 4  # The number of bits dropped from the length
 LOG2_BITS_PER_BYTE = 3  # log2 of the number of bits in a byte, i.e. log2(8)
 X = 2  # The polynomial 'x', in the "natural" bit order
 def print_crc32c_powers():
  """Prints the entries of kCRC32CPowers[] as hex literals.

  One entry is printed for each i in range(NUM_SIZE_BITS - NUM_DROPPED_BITS).
  Entry i is x^(2^(i + LOG2_BITS_PER_BYTE + NUM_DROPPED_BITS) - CRC_BITS - 1)
  mod G, where the "- 1" is applied only because LSB_FIRST is set, and the
  result is bit-reflected to CRC_BITS bits. All entries go to stdout on one
  line, each followed by ', '. See kCRC32CPowers[] in the C++ source for more
  info.
  """
  lsb_first_adjust = 1 if LSB_FIRST else 0
  num_entries = NUM_SIZE_BITS - NUM_DROPPED_BITS
  for i in range(num_entries):
    log2_zero_bits = i + LOG2_BITS_PER_BYTE + NUM_DROPPED_BITS
    exponent = 2**log2_zero_bits - CRC_BITS - lsb_first_adjust
    poly = bitreflect(poly_exp(X, exponent, G), CRC_BITS)
    print(f'0x{poly:0{2*CRC_BITS//8}x}, ', end='')
 # Emit the table only when this file is run as a script, not when imported.
 if __name__ == '__main__':
  print_crc32c_powers()