Add prefetch to crc32

We already prefetch for large inputs; do the same for
medium-sized inputs as well. The change is performance-neutral
in most cases, so this also adds a new benchmark whose working
set is much larger than the cache (working set >> cache size)
to verify that prefetching actually helps. The main benefits
are on AMD with hardware prefetchers turned off:

AMD prefetchers on:
name                           old time/op  new time/op  delta
BM_Calculate/0                 2.43ns ± 1%  2.43ns ± 1%     ~     (p=0.814 n=40+40)
BM_Calculate/1                 2.50ns ± 2%  2.50ns ± 2%     ~     (p=0.745 n=39+39)
BM_Calculate/100               9.17ns ± 1%  9.17ns ± 2%     ~     (p=0.747 n=40+40)
BM_Calculate/10000              474ns ± 1%   474ns ± 2%     ~     (p=0.749 n=40+40)
BM_Calculate/500000            22.8µs ± 1%  22.9µs ± 2%     ~     (p=0.298 n=39+40)
BM_Extend/0                    1.38ns ± 1%  1.38ns ± 1%     ~     (p=0.651 n=40+40)
BM_Extend/1                    1.53ns ± 2%  1.53ns ± 1%     ~     (p=0.957 n=40+39)
BM_Extend/100                  9.48ns ± 1%  9.48ns ± 2%     ~     (p=1.000 n=40+40)
BM_Extend/10000                 474ns ± 2%   474ns ± 1%     ~     (p=0.928 n=40+40)
BM_Extend/500000               22.8µs ± 1%  22.9µs ± 2%     ~     (p=0.331 n=40+40)
BM_Extend/100000000            4.79ms ± 1%  4.79ms ± 1%     ~     (p=0.753 n=38+38)
BM_ExtendCacheMiss/10          25.5ms ± 2%  25.5ms ± 2%     ~     (p=0.988 n=38+40)
BM_ExtendCacheMiss/100         23.1ms ± 2%  23.1ms ± 2%     ~     (p=0.792 n=40+40)
BM_ExtendCacheMiss/1000        37.2ms ± 1%  28.6ms ± 2%  -23.00%  (p=0.000 n=38+40)
BM_ExtendCacheMiss/100000      7.77ms ± 2%  7.74ms ± 2%   -0.45%  (p=0.006 n=40+40)

AMD prefetchers off:
name                           old time/op  new time/op  delta
BM_Calculate/0                 2.43ns ± 2%  2.43ns ± 2%     ~     (p=0.351 n=40+39)
BM_Calculate/1                 2.51ns ± 2%  2.51ns ± 1%     ~     (p=0.535 n=40+40)
BM_Calculate/100               9.18ns ± 2%  9.15ns ± 2%     ~     (p=0.120 n=38+39)
BM_Calculate/10000              475ns ± 2%   475ns ± 2%     ~     (p=0.852 n=40+40)
BM_Calculate/500000            22.9µs ± 2%  22.8µs ± 2%     ~     (p=0.396 n=40+40)
BM_Extend/0                    1.38ns ± 2%  1.38ns ± 2%     ~     (p=0.466 n=40+40)
BM_Extend/1                    1.53ns ± 2%  1.53ns ± 2%     ~     (p=0.914 n=40+39)
BM_Extend/100                  9.49ns ± 2%  9.49ns ± 2%     ~     (p=0.802 n=40+40)
BM_Extend/10000                 475ns ± 2%   474ns ± 1%     ~     (p=0.589 n=40+40)
BM_Extend/500000               22.8µs ± 2%  22.8µs ± 2%     ~     (p=0.872 n=39+40)
BM_Extend/100000000            10.0ms ± 3%  10.0ms ± 4%     ~     (p=0.355 n=40+40)
BM_ExtendCacheMiss/10           196ms ± 2%   196ms ± 2%     ~     (p=0.698 n=40+40)
BM_ExtendCacheMiss/100          129ms ± 1%   129ms ± 1%     ~     (p=0.602 n=36+37)
BM_ExtendCacheMiss/1000        88.6ms ± 1%  57.2ms ± 1%  -35.49%  (p=0.000 n=36+38)
BM_ExtendCacheMiss/100000      14.9ms ± 1%  14.9ms ± 1%     ~     (p=0.888 n=39+40)

Intel Skylake:
BM_Calculate/0                 2.49ns ± 2%  2.44ns ± 4%  -2.15%  (p=0.001 n=31+34)
BM_Calculate/1                 3.04ns ± 2%  2.98ns ± 9%  -1.95%  (p=0.003 n=31+35)
BM_Calculate/100               8.64ns ± 3%  8.53ns ± 5%    ~     (p=0.065 n=31+35)
BM_Calculate/10000              290ns ± 3%   285ns ± 7%  -1.80%  (p=0.004 n=28+34)
BM_Calculate/500000            11.8µs ± 2%  11.6µs ± 8%  -1.59%  (p=0.003 n=26+34)
BM_Extend/0                    1.56ns ± 1%  1.52ns ± 3%  -2.44%  (p=0.000 n=26+35)
BM_Extend/1                    1.88ns ± 3%  1.83ns ± 6%  -2.17%  (p=0.001 n=27+35)
BM_Extend/100                  9.31ns ± 3%  9.13ns ± 7%  -1.92%  (p=0.000 n=33+38)
BM_Extend/10000                 290ns ± 3%   283ns ± 3%  -2.45%  (p=0.000 n=32+38)
BM_Extend/500000               11.8µs ± 2%  11.5µs ± 8%  -1.80%  (p=0.001 n=35+37)
BM_Extend/100000000            6.39ms ±10%  6.11ms ± 8%  -4.34%  (p=0.000 n=40+40)
BM_ExtendCacheMiss/10          36.2ms ± 7%  35.8ms ±14%    ~     (p=0.281 n=33+37)
BM_ExtendCacheMiss/100         26.9ms ±15%  25.9ms ±12%  -3.93%  (p=0.000 n=40+40)
BM_ExtendCacheMiss/1000        23.8ms ± 5%  23.4ms ± 5%  -1.68%  (p=0.001 n=39+40)
BM_ExtendCacheMiss/100000      10.1ms ± 5%  10.0ms ± 4%    ~     (p=0.051 n=39+39)

PiperOrigin-RevId: 495119444
Change-Id: I67bcf3b0282b5e1c43122de2837a24c16b8aded7
This commit is contained in:
Ilya Tokar
2022-12-13 13:57:36 -08:00
committed by Copybara-Service
parent 1887dece5e
commit 4cb6c38936
4 changed files with 31 additions and 1 deletions

View File

@@ -204,6 +204,7 @@ cc_binary(
deps = [
":crc32c",
"//absl/memory",
"//absl/strings",
"@com_github_google_benchmark//:benchmark_main",
],
)

View File

@@ -17,6 +17,7 @@
#include "absl/crc/crc32c.h"
#include "absl/crc/internal/crc32c.h"
#include "absl/memory/memory.h"
#include "absl/strings/string_view.h"
#include "benchmark/benchmark.h"
namespace {
@@ -52,7 +53,27 @@ void BM_Extend(benchmark::State& state) {
benchmark::DoNotOptimize(crc);
}
}
BENCHMARK(BM_Extend)->Arg(0)->Arg(1)->Arg(100)->Arg(10000)->Arg(500000);
BENCHMARK(BM_Extend)->Arg(0)->Arg(1)->Arg(100)->Arg(10000)->Arg(500000)->Arg(
100 * 1000 * 1000);
// Benchmarks absl::ExtendCrc32c() on inputs that are unlikely to be cached.
//
// The working set (300 MB) is chosen to be much larger than the CPU caches so
// that the effect of software prefetching is visible. Each pass walks the
// buffer in strides of 2 * len and checksums the first len bytes of every
// stride, so half of the buffer is skipped between consecutive calls.
//
// state.range(0) is the chunk length in bytes; it must be in [1, total].
void BM_ExtendCacheMiss(benchmark::State& state) {
  constexpr int64_t total = 300 * 1000 * 1000;
  const int64_t len = state.range(0);
  // A non-positive length would never advance the inner loop (or would build a
  // bogus string_view); a length above `total` would leave nothing to measure.
  if (len <= 0 || len > total) {
    state.SkipWithError("chunk length must be in [1, 300000000]");
    return;
  }
  const int64_t stride = len * 2;
  // Number of chunks [i, i + len) with i a multiple of stride that fit
  // entirely inside the buffer.
  const int64_t chunks_per_pass = (total - len) / stride + 1;

  std::string extension = TestString(total);
  absl::crc32c_t base = absl::crc32c_t{0xC99465AA};  // CRC32C of "Hello World"
  for (auto s : state) {
    // Stop before a chunk would run past the end of the buffer.
    for (int64_t i = 0; i + len <= total; i += stride) {
      // Keep the compiler from hoisting or folding the inputs across calls.
      benchmark::DoNotOptimize(base);
      benchmark::DoNotOptimize(extension);
      absl::crc32c_t crc = absl::ExtendCrc32c(
          base, absl::string_view(&extension[static_cast<size_t>(i)],
                                  static_cast<size_t>(len)));
      benchmark::DoNotOptimize(crc);
    }
  }
  // Report only the bytes that were actually checksummed. This equals
  // total / 2 whenever stride divides total evenly.
  state.SetBytesProcessed(static_cast<int64_t>(state.iterations()) *
                          chunks_per_pass * len);
}
BENCHMARK(BM_ExtendCacheMiss)->Arg(10)->Arg(100)->Arg(1000)->Arg(100000);
void BM_ExtendByZeroes(benchmark::State& state) {
absl::crc32c_t base = absl::crc32c_t{0xC99465AA}; // CRC32C of "Hello World"

View File

@@ -29,6 +29,8 @@ namespace crc_internal {
// Prefetch constants used in some Extend() implementations.
//
// Each constant is a look-ahead distance expressed as a multiple of
// ABSL_CACHELINE_SIZE; callers add it to their current read position and
// prefetch the resulting address.
// NOTE(review): the unit is presumably bytes (callers offset byte pointers) --
// confirm against the call sites.

// Default look-ahead distance: four cache lines.
constexpr int kPrefetchHorizon = ABSL_CACHELINE_SIZE * 4; // Prefetch this far
// Shorter prefetch distance for smaller buffers: a single cache line.
constexpr int kPrefetchHorizonMedium = ABSL_CACHELINE_SIZE * 1;
// Compile-time check that the default horizon is at least 64; per the message,
// 64 is the length handled by one iteration of the consuming loop. The message
// says "CRCPrefetchHorizon", but the constant being checked is
// kPrefetchHorizon. Only kPrefetchHorizon is checked, not
// kPrefetchHorizonMedium.
static_assert(kPrefetchHorizon >= 64, "CRCPrefetchHorizon less than loop len");
// We require the Scramble() function:

View File

@@ -429,6 +429,12 @@ class CRC32AcceleratedX86ARMCombinedMultipleStreams
ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2);
ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2);
ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2);
base_internal::PrefetchT0(
reinterpret_cast<const char*>(p + kPrefetchHorizonMedium));
base_internal::PrefetchT0(
reinterpret_cast<const char*>(p1 + kPrefetchHorizonMedium));
base_internal::PrefetchT0(
reinterpret_cast<const char*>(p2 + kPrefetchHorizonMedium));
}
// Don't run crc on last 8 bytes.
ABSL_INTERNAL_STEP8BY3(l64, l641, l642, p, p1, p2);