https://gcc.gnu.org/bugzilla/show_bug.cgi?id=121315
Bug ID: 121315
Summary: Missed LDP/STP fusion opportunity
Product: gcc
Version: 15.0
Status: UNCONFIRMED
Keywords: missed-optimization
Severity: normal
Priority: P3
Component: target
Assignee: unassigned at gcc dot gnu.org
Reporter: ktkachov at gcc dot gnu.org
CC: acoplan at gcc dot gnu.org
Target Milestone: ---
Target: aarch64
We have some C++ code that implements various reversed memcpy-like operations.
A reproducer is:
#include <cstdlib>
#include <cstdint>
#include <bit>
#include <iostream>
// Maps a size in bytes (Size) to an unsigned integer type of that size.
// Only the primary template is declared here, with no definition; the
// usable specializations (each exposing a nested `type`) are generated by
// the GEN macro.
template <size_t Size>
struct uint_types_by_size;
// GEN(sz, fn) emits two definitions for the sz-bit unsigned integer type:
//   * an overload byteswap_gen(uint<sz>_t v) that returns fn(v), and
//   * the specialization uint_types_by_size<sz / 8>, whose `type` member
//     names uint<sz>_t.
// Comments are kept outside the macro body because every line of it ends
// in a line continuation.
#define GEN(sz, fn) \
static inline uint##sz##_t byteswap_gen(uint##sz##_t v) { \
return fn(v); \
} \
template <> \
struct uint_types_by_size<sz / 8> { \
using type = uint##sz##_t; \
};
// In the 8-bit case fn is the type name uint8_t, so fn(v) is a functional
// cast that hands the single byte back unchanged. The 16/32/64-bit cases
// forward to the corresponding __builtin_bswapNN builtin.
GEN(8, uint8_t)
GEN(64, __builtin_bswap64)
GEN(32, __builtin_bswap32)
GEN(16, __builtin_bswap16)
// The macro is only needed for the four expansions above.
#undef GEN
// True when the compiler-provided __BYTE_ORDER__ macro equals
// __ORDER_LITTLE_ENDIAN__.
constexpr auto kIsLittleEndian = __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__;
// Defined as the plain negation of kIsLittleEndian, so it is true for any
// byte order that is not little-endian, not only for big-endian.
constexpr auto kIsBigEndian = !kIsLittleEndian;
// Byte-order conversion helpers for a value of type T.
// sizeof(T) has to match one of the uint_types_by_size specializations
// generated by GEN (1, 2, 4 or 8 bytes); for any other size only the
// undefined primary template is available and swap() does not compile.
template <typename T>
struct Endian {
// Returns x with its bytes run through byteswap_gen: x is bit_cast to the
// unsigned integer type B of the same size, passed to the byteswap_gen
// overload for B, and the result is bit_cast back to T.
static T swap(T x) {
constexpr auto s = sizeof(T);
using B = typename uint_types_by_size<s>::type;
return std::bit_cast<T>(byteswap_gen(std::bit_cast<B>(x)));
}
// Returns swap(x) when kIsLittleEndian is true, otherwise x unchanged.
static T big(T x) { return kIsLittleEndian ? swap(x) : x; }
// Returns swap(x) when kIsBigEndian is true, otherwise x unchanged.
static T little(T x) { return kIsBigEndian ? swap(x) : x; }
};
// Stores Endian<T>::big(src[i]) into dst[i] for every i in [0, len).
// Element order is preserved; what changes is the byte order inside each
// element, and only when kIsLittleEndian is true (otherwise this is a
// plain element-wise copy).
// The noinline attribute keeps the loop out of its callers - presumably so
// the generated code for this function can be examined on its own.
// NOTE(review): the index is a signed int compared against a size_t bound,
// so i is converted to unsigned for the comparison and would overflow for
// len > INT_MAX. Left untouched here because the assembly quoted in this
// report was produced from exactly this loop.
template <typename T>
void __attribute__((noinline)) copyReverseGeneric(T* dst, T* src, size_t len) {
for(int i = 0; i < len; ++i) {
dst[i] = Endian<T>::big(src[i]);
}
}
// Benchmark driver for copyReverseGeneric.
//
// Allocates two 16-byte-aligned buffers of N ints, fills the source with
// 0..N-1, and then calls copyReverseGeneric on them `iterations` times.
// After every call dst[0] is read into a volatile so the stores performed
// by the copy cannot be discarded.
//
// argc/argv are unused. Returns EXIT_FAILURE if either buffer cannot be
// allocated, 0 otherwise.
int main(int argc, char** argv) {
  constexpr size_t N = 10000;
  constexpr int iterations = 100000;

  // N * sizeof(int) is 40000 bytes, a multiple of the 16-byte alignment.
  int* src = static_cast<int*>(aligned_alloc(16, N * sizeof(int)));
  int* dst = static_cast<int*>(aligned_alloc(16, N * sizeof(int)));
  if (src == nullptr || dst == nullptr) {
    // free(nullptr) is a no-op, so both pointers can be released
    // regardless of which allocation failed.
    free(src);
    free(dst);
    return EXIT_FAILURE;
  }

  for (size_t i = 0; i < N; ++i) {
    // i < N == 10000, so the value always fits in an int.
    src[i] = static_cast<int>(i);
  }

  volatile int sink = 0;  // Prevent compiler optimization
  for (int i = 0; i < iterations; ++i) {
    copyReverseGeneric(dst, src, N);
    // Written as a plain assignment rather than `sink += ...`: compound
    // assignment to a volatile object is deprecated in C++20, the standard
    // this reproducer is compiled with. The accesses are the same: one
    // read of sink followed by one write.
    sink = sink + dst[0];  // Force memory access
  }

  free(src);
  free(dst);
  return 0;
}
Compiled with e.g. -std=c++20 -O3 -mcpu=neoverse-v2 it generates for
copyReverseGeneric:
...
add x5, x1, 16
add x4, x0, 16
mov x3, 40000
.p2align 5,,15
.L3:
ldr q31, [x1, x2]
ldr q30, [x5, x2]
rev32 v31.16b, v31.16b
rev32 v30.16b, v30.16b
str q31, [x0, x2]
str q30, [x4, x2]
add x2, x2, 32
cmp x2, x3
bne .L3
...
The two LDRs and two STRs should be merged into LDP and STP but I guess the
register addressing mode blocks this. Maybe this is something induction variable
selection needs to take into account, or maybe the ldp/stp fusion analysis can
reason about the offset increments?