https://gcc.gnu.org/bugzilla/show_bug.cgi?id=118505
Bug ID: 118505
Summary: [15 regression] aarch64: 25% regression in TSVC s258
since r15-3436-gb2b20b277988ab
Product: gcc
Version: 15.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: rtl-optimization
Assignee: unassigned at gcc dot gnu.org
Reporter: dhruvc at nvidia dot com
Target Milestone: ---
Test case:
===
#define iterations 100000
#define LEN_1D 32000
#define LEN_2D 256
#define ARRAY_ALIGNMENT 64
#include <sys/time.h>
/* Argument bundle taken by the kernel below.  s258 never reads any of
   these fields in this reduced test case.  */
struct args_t
{
/* NOTE(review): presumably start/end timestamps used to time a kernel;
   nothing in this test case sets or reads them -- confirm against TSVC.  */
struct timeval t1;
struct timeval t2;
/* Untyped extra argument; unused in this test case.  */
void *__restrict__ arg_info;
};
/* Element type used by every array in the kernel.  */
typedef float real_t;
/* Global operand arrays, declared with ARRAY_ALIGNMENT-byte alignment.
   The 1-D arrays hold LEN_1D elements, but s258 only touches the first
   LEN_2D of each; aa is LEN_2D x LEN_2D and only its row 0 is read.  */
__attribute__ ((aligned (ARRAY_ALIGNMENT))) real_t a[LEN_1D], b[LEN_1D],
c[LEN_1D], d[LEN_1D], e[LEN_1D], aa[LEN_2D][LEN_2D];
/* Reduced form of the TSVC s258 kernel.

   For each i in [0, LEN_2D) a scalar s is conditionally replaced by
   d[i] * d[i] and then used to compute b[i] and e[i].  Because s is only
   overwritten when a[i] > 0, its value is carried from one i to the next;
   this is the value the compiler either branches on or selects with
   fcsel in the listings of this report.

   func_args is accepted but never used.  Reads a, c, d and aa[0];
   writes b and e.  */
void
s258 (struct args_t *func_args)
{
real_t s;
/* Repeat the whole kernel `iterations' times.  */
for (int nl = 0; nl < iterations; nl++)
{
/* s is reset once per outer iteration, not once per element.  */
s = 0.;
for (int i = 0; i < LEN_2D; ++i)
{
/* Conditional update: when a[i] is not greater than zero, s keeps
   the value left by an earlier i (or the 0 set above).  */
if (a[i] > 0.)
{
s = d[i] * d[i];
}
/* Both stores use the possibly-carried s.  */
b[i] = s * c[i] + d[i];
e[i] = (s + (real_t) 1.) * aa[0][i];
}
}
}
===
Before the patch:
===
s258:
.LFB0:
.cfi_startproc
fmov s31, 1.0e+0
adrp x2, d
adrp x1, a
adrp x6, c
adrp x5, b
adrp x4, e
adrp x3, aa
mov w7, 34464
add x2, x2, :lo12:d
add x1, x1, :lo12:a
add x6, x6, :lo12:c
add x5, x5, :lo12:b
add x4, x4, :lo12:e
add x3, x3, :lo12:aa
movk w7, 0x1, lsl 16
.L7:
movi v30.2s, #0
mov x0, 0
.p2align 5,,15
.L6:
ldr s4, [x1, x0]
fadd s27, s30, s31
ldr s29, [x2, x0]
fcmpe s4, #0.0
bls .L13
fmul s30, s29, s29
ldr s3, [x6, x0]
ldr s1, [x3, x0]
fadd s2, s30, s31
fmadd s3, s3, s30, s29
fmul s1, s1, s2
str s3, [x5, x0]
str s1, [x4, x0]
add x0, x0, 4
cmp x0, 1024
bne .L6
subs w7, w7, #1
bne .L7
.L15:
ret
.p2align 2,,3
.L13:
ldr s0, [x6, x0]
ldr s28, [x3, x0]
fmadd s0, s30, s0, s29
fmul s28, s27, s28
str s28, [x4, x0]
str s0, [x5, x0]
add x0, x0, 4
cmp x0, 1024
bne .L6
subs w7, w7, #1
bne .L7
b .L15
.cfi_endproc
===
After the patch:
===
s258:
.LFB0:
.cfi_startproc
fmov s31, 1.0e+0
adrp x6, d
adrp x5, a
adrp x4, c
adrp x3, b
adrp x2, e
adrp x1, aa
mov w7, 34464
add x6, x6, :lo12:d
add x5, x5, :lo12:a
add x4, x4, :lo12:c
add x3, x3, :lo12:b
add x2, x2, :lo12:e
add x1, x1, :lo12:aa
movk w7, 0x1, lsl 16
.L5:
movi v30.2s, #0
mov x0, 0
.p2align 5,,15
.L4:
ldr s2, [x5, x0]
ldr s29, [x6, x0]
ldr s0, [x4, x0]
fcmpe s2, #0.0
ldr s28, [x1, x0]
fmul s1, s29, s29
fcsel s30, s1, s30, gt
fadd s27, s30, s31
fmadd s0, s30, s0, s29
fmul s28, s27, s28
str s0, [x3, x0]
str s28, [x2, x0]
add x0, x0, 4
cmp x0, 1024
bne .L4
subs w7, w7, #1
bne .L5
ret
.cfi_endproc
===
Compiled on NVIDIA Grace with:
gcc -std=c99 -march=native -Ofast -fstrict-aliasing -fivopts -ftree-vectorize
Run time before the patch: 0.02s
Run time after the patch: 0.025s
The part I find strange is that the faster code (before the patch) has an
extra section (label .L13) containing an extra fmadd, whereas the slower code
(after the patch) replaces the branch with an fcsel. Could the fcsel be
causing the performance hit?