Use same element-width for non-temporal loads and stores on Arm

Increase the consistency between _mm_loadu_si128 and _mm_stream_si128 by using vector loads/stores of 64-bit elements in both. This should have no impact on existing users. On aarch64 (release build, GCC 15.2), crc_non_temporal_memcpy.cc.o stays effectively the same, the only change being as follows: --- crc_non_temporal_memcpy.cc.o (original) +++ crc_non_temporal_memcpy.cc.o (patched) ├── objdump --line-numbers --disassemble --demangle --reloc --no-show-raw-insn --section=.text {} │ @@ -255,15 +255,15 @@ │ add x2, x21, x2 │ mov x0, x21 │ ldp q31, q30, [x0, #32] │ add x1, x1, #0x40 │ ldp q29, q28, [x0], #64 │ stp q31, q30, [x1, #-32] │ stp q29, q28, [x1, #-64] │ - cmp x0, x2 │ + cmp x2, x0 │ b.ne 3b0 <absl::crc_internal::CrcNonTemporalMemcpyEngine::Compute(void*, void const*, unsigned long, absl::crc32c_t) const+0x270> // b.any │ and x0, x3, #0xffffffffffffffc0 │ and x23, x23, #0x3f │ dmb ish │ add x22, x22, x0 │ add x21, x21, x0 │ b 380 <absl::crc_internal::CrcNonTemporalMemcpyEngine::Compute(void*, void const*, unsigned long, absl::crc32c_t) const+0x240> On big-endian Arm (aarch64_be), this fixes a bug in non_temporal_store_memcpy, in which each 32-bit half out of a 64-bit parcel of memory was swapped with the other. For example, the byte sequence 218edf0b 13c68753 would be copied as 13c68753 218edf0b.
2026-06-04 12:07:05 +08:00 · 2025-10-03 13:52:49 +00:00
parent 0c01ee1895
commit 8f08d4c792
1 changed files with 2 additions and 2 deletions
--- a/absl/crc/internal/non_temporal_arm_intrinsics.h
+++ b/absl/crc/internal/non_temporal_arm_intrinsics.h
@@ -21,7 +21,7 @@
 #include <arm_neon.h>

 typedef int64x2_t __m128i; /* 128-bit vector containing integers */
-#define vreinterpretq_m128i_s32(x) vreinterpretq_s64_s32(x)
+#define vreinterpretq_m128i_s64(x) (x)
 #define vreinterpretq_s64_m128i(x) (x)

 // Guarantees that every preceding store is globally visible before any
@@ -44,7 +44,7 @@ static inline __attribute__((always_inline)) void _mm_sfence(void) {
 // https://msdn.microsoft.com/zh-cn/library/f4k12ae8(v=vs.90).aspx
 static inline __attribute__((always_inline)) __m128i _mm_loadu_si128(
    const __m128i *p) {
-  return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *)p));
+  return vreinterpretq_m128i_s64(vld1q_s64((const int64_t *)p));
 }

 // Stores the data in a to the address p without polluting the caches.  If the