mirror of
https://github.com/abseil/abseil-cpp.git
synced 2026-06-04 12:07:05 +08:00
Use same element-width for non-temporal loads and stores on Arm
Increase the consistency between _mm_loadu_si128 and _mm_stream_si128 by
using vector loads/stores of 64-bit elements in both. This should have no
impact on existing users. On aarch64 (release build, GCC 15.2),
crc_non_temporal_memcpy.cc.o stays effectively the same, the only change
being as follows:
--- crc_non_temporal_memcpy.cc.o (original)
+++ crc_non_temporal_memcpy.cc.o (patched)
├── objdump --line-numbers --disassemble --demangle --reloc --no-show-raw-insn --section=.text {}
│ @@ -255,15 +255,15 @@
│ add x2, x21, x2
│ mov x0, x21
│ ldp q31, q30, [x0, #32]
│ add x1, x1, #0x40
│ ldp q29, q28, [x0], #64
│ stp q31, q30, [x1, #-32]
│ stp q29, q28, [x1, #-64]
│ - cmp x0, x2
│ + cmp x2, x0
│ b.ne 3b0 <absl::crc_internal::CrcNonTemporalMemcpyEngine::Compute(void*, void const*, unsigned long, absl::crc32c_t) const+0x270> // b.any
│ and x0, x3, #0xffffffffffffffc0
│ and x23, x23, #0x3f
│ dmb ish
│ add x22, x22, x0
│ add x21, x21, x0
│ b 380 <absl::crc_internal::CrcNonTemporalMemcpyEngine::Compute(void*, void const*, unsigned long, absl::crc32c_t) const+0x240>
On big-endian Arm (aarch64_be), this fixes a bug in non_temporal_store_memcpy,
in which each 32-bit half out of a 64-bit parcel of memory was swapped
with the other. For example, the byte sequence 218edf0b 13c68753 would be
copied as 13c68753 218edf0b.
This commit is contained in:
@@ -21,7 +21,7 @@
|
||||
#include <arm_neon.h>
|
||||
|
||||
typedef int64x2_t __m128i; /* 128-bit vector containing integers */
|
||||
#define vreinterpretq_m128i_s32(x) vreinterpretq_s64_s32(x)
|
||||
#define vreinterpretq_m128i_s64(x) (x)
|
||||
#define vreinterpretq_s64_m128i(x) (x)
|
||||
|
||||
// Guarantees that every preceding store is globally visible before any
|
||||
@@ -44,7 +44,7 @@ static inline __attribute__((always_inline)) void _mm_sfence(void) {
|
||||
// https://msdn.microsoft.com/zh-cn/library/f4k12ae8(v=vs.90).aspx
|
||||
static inline __attribute__((always_inline)) __m128i _mm_loadu_si128(
|
||||
const __m128i *p) {
|
||||
return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *)p));
|
||||
return vreinterpretq_m128i_s64(vld1q_s64((const int64_t *)p));
|
||||
}
|
||||
|
||||
// Stores the data in a to the address p without polluting the caches. If the
|
||||
|
||||
Reference in New Issue
Block a user