Use same element-width for non-temporal loads and stores on Arm

Increase the consistency between _mm_loadu_si128 and _mm_stream_si128 by
using vector loads/stores of 64-bit elements in both. This should have no
impact on existing users. On aarch64 (release build, GCC 15.2),
crc_non_temporal_memcpy.cc.o stays effectively the same, the only change
being as follows:

--- crc_non_temporal_memcpy.cc.o (original)
+++ crc_non_temporal_memcpy.cc.o (patched)
├── objdump --line-numbers --disassemble --demangle --reloc --no-show-raw-insn --section=.text {}
│ @@ -255,15 +255,15 @@
│       add     x2, x21, x2
│       mov     x0, x21
│       ldp     q31, q30, [x0, #32]
│       add     x1, x1, #0x40
│       ldp     q29, q28, [x0], #64
│       stp     q31, q30, [x1, #-32]
│       stp     q29, q28, [x1, #-64]
│ -     cmp     x0, x2
│ +     cmp     x2, x0
│       b.ne    3b0 <absl::crc_internal::CrcNonTemporalMemcpyEngine::Compute(void*, void const*, unsigned long, absl::crc32c_t) const+0x270>  // b.any
│       and     x0, x3, #0xffffffffffffffc0
│       and     x23, x23, #0x3f
│       dmb     ish
│       add     x22, x22, x0
│       add     x21, x21, x0
│       b       380 <absl::crc_internal::CrcNonTemporalMemcpyEngine::Compute(void*, void const*, unsigned long, absl::crc32c_t) const+0x240>

On big-endian Arm (aarch64_be), this fixes a bug in non_temporal_store_memcpy,
in which each 32-bit half out of a 64-bit parcel of memory was swapped
with the other. For example, the byte sequence 218edf0b 13c68753 would be
copied as 13c68753 218edf0b.
This commit is contained in:
J. Neuschäfer
2025-10-03 13:52:49 +00:00
parent 0c01ee1895
commit 8f08d4c792

View File

@@ -21,7 +21,7 @@
#include <arm_neon.h>
typedef int64x2_t __m128i; /* 128-bit vector containing integers */
#define vreinterpretq_m128i_s32(x) vreinterpretq_s64_s32(x)
#define vreinterpretq_m128i_s64(x) (x)
#define vreinterpretq_s64_m128i(x) (x)
// Guarantees that every preceding store is globally visible before any
@@ -44,7 +44,7 @@ static inline __attribute__((always_inline)) void _mm_sfence(void) {
// https://msdn.microsoft.com/zh-cn/library/f4k12ae8(v=vs.90).aspx
static inline __attribute__((always_inline)) __m128i _mm_loadu_si128(
const __m128i *p) {
return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *)p));
return vreinterpretq_m128i_s64(vld1q_s64((const int64_t *)p));
}
// Stores the data in a to the address p without polluting the caches. If the