https://gcc.gnu.org/bugzilla/show_bug.cgi?id=125201
Bug ID: 125201
Summary: [17 Regression] 20% slowdown in TSVC s116 since
r17-140-gf8d911e6ae3fc1
Product: gcc
Version: 17.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: tree-optimization
Assignee: unassigned at gcc dot gnu.org
Reporter: dhruvc at gcc dot gnu.org
CC: rguenth at gcc dot gnu.org
Target Milestone: ---
It looks like the loop was not being vectorized before and now is, but the
tbl instruction emitted for the vectorized code appears to cause heavy backend stalls.
On compiler explorer: https://godbolt.org/z/xK43M5K3M
==============================
Flags: -O3 -mcpu=grace
-----
Source:
------
real_t s116(struct args_t * func_args)
{
// linear dependence testing
initialise_arrays(__func__);
gettimeofday(&func_args->t1, NULL);
for (int nl = 0; nl < iterations*10; nl++) {
for (int i = 0; i < LEN_1D - 5; i += 5) {
a[i] = a[i + 1] * a[i];
a[i + 1] = a[i + 2] * a[i + 1];
a[i + 2] = a[i + 3] * a[i + 2];
a[i + 3] = a[i + 4] * a[i + 3];
a[i + 4] = a[i + 5] * a[i + 4];
}
dummy(a, b, c, d, e, aa, bb, cc, 0.);
}
gettimeofday(&func_args->t2, NULL);
return calc_checksum(__func__);
}
==============================
In the SLP dumps (194t.slp1), it looks like a new VEC_PERM_EXPR and a
vector multiply are being generated.
Good:
----
src/tsvc.c:315:18: note: Cost model analysis for part in loop 2:
Vector cost: 28
Scalar cost: 16
GIMPLE loop body:
----------------
<bb 3> [local count: 1063004408]:
# i_46 = PHI <_18(8), 0(5)>
# ivtmp_44 = PHI <ivtmp_43(8), 6399(5)>
# a_I_lsm0.15_24 = PHI <_19(8), _41(5)>
_2 = i_46 + 1;
_3 = a[_2];
_5 = _3 * a_I_lsm0.15_24;
a[i_46] = _5;
_6 = i_46 + 2;
_7 = a[_6];
_9 = _3 * _7;
a[_2] = _9;
_10 = i_46 + 3;
_11 = a[_10];
_13 = _7 * _11;
a[_6] = _13;
_14 = i_46 + 4;
_15 = a[_14];
_17 = _11 * _15;
a[_10] = _17;
_18 = i_46 + 5;
_19 = a[_18];
_21 = _15 * _19;
a[_14] = _21;
ivtmp_43 = ivtmp_44 - 1;
if (ivtmp_43 != 0)
goto <bb 8>; [98.99%]
else
goto <bb 4>; [1.01%]
Bad:
---
src/tsvc.c:315:18: note: Cost model analysis for part in loop 2:
Vector cost: 12
Scalar cost: 16
GIMPLE loop body:
----------------
<bb 3> [local count: 1063004408]:
# i_46 = PHI <_18(8), 0(5)>
# ivtmp_44 = PHI <ivtmp_43(8), 6399(5)>
# a_I_lsm0.15_24 = PHI <_19(8), _41(5)>
_2 = i_46 + 1;
vectp.19_51 = &a[_2];
vect__3.20_4 = MEM <vector(4) float> [(float *)vectp.19_51];
vectp.19_25 = vectp.19_51 + 16;
vectp.19_12 = vectp.19_51 + 4;
vect__3.22_45 = VEC_PERM_EXPR <vect__3.20_4, vect__3.20_4, { 0, 0, 1, 2 }>;
_3 = a[_2];
_5 = _3 * a_I_lsm0.15_24;
_6 = i_46 + 2;
_7 = a[_6];
_9 = _3 * _7;
_10 = i_46 + 3;
_11 = a[_10];
_13 = _7 * _11;
_14 = i_46 + 4;
_15 = a[_14];
_16 = {a_I_lsm0.15_24, _7, _11, _15};
vect__5.23_23 = vect__3.22_45 * _16;
_17 = _11 * _15;
vectp.25_20 = &a[i_46];
MEM <vector(4) float> [(float *)vectp.25_20] = vect__5.23_23;
_18 = i_46 + 5;
_19 = a[_18];
_21 = _15 * _19;
a[_14] = _21;
ivtmp_43 = ivtmp_44 - 1;
if (ivtmp_43 != 0)
goto <bb 8>; [98.99%]
else
goto <bb 4>; [1.01%]
==============================
This results in a tbl instruction and a loop-carried dependency being generated,
which seem to be the cause of the slowdown:
Good:
----
.L2:
ldr s31, [x26]
mov x0, x26
.p2align 5,,15
.L3:
ldp s1, s0, [x0, 4]
ldp s30, s29, [x0, 12]
add x0, x0, 20
fmul s2, s1, s31
ldr s31, [x0]
fmul s1, s1, s0
fmul s0, s0, s30
fmul s30, s30, s29
fmul s29, s29, s31
stp s2, s1, [x0, -20]
stp s0, s30, [x0, -12]
str s29, [x0, -4]
cmp x0, x28
bne .L3
movi v0.2s, #0
mov x7, x25
mov x6, x24
mov x5, x23
mov x4, x22
mov x3, x21
mov x2, x20
mov x1, x19
mov x0, x26
bl dummy
subs w27, w27, #1
bne .L2
Bad:
---
.L2:
ldr s27, [x26]
mov x0, x26
.p2align 5,,15
.L3:
ldp s28, s31, [x0, 8]
ldr q29, [x0, 4]
uzp1 v31.2s, v27.2s, v31.2s
ldp s30, s27, [x0, 16]
add x0, x0, 20
tbl v29.16b, {v29.16b}, v26.16b <--- loop-carried dep from q26
uzp1 v28.2s, v28.2s, v30.2s
fmul s30, s30, s27
zip1 v31.4s, v31.4s, v28.4s
str s30, [x0, -4]
fmul v31.4s, v31.4s, v29.4s
str q31, [x0, -20]
cmp x0, x28
bne .L3
movi v0.2s, #0
mov x7, x25
mov x6, x24
mov x5, x23
mov x4, x22
mov x3, x21
mov x2, x20
mov x1, x19
mov x0, x26
bl dummy
adrp x0, .LANCHOR0+16
subs w27, w27, #1
ldr q26, [x0, #:lo12:.LANCHOR0+16] ---> loop-carried dep to tbl
bne .L2
On compiler explorer: https://godbolt.org/z/WvW5ErqYq
==============================
This seems to lead to a huge increase (roughly 9.5x) in the number of backend stalls:
Good:
----
>> perf stat -e stalled-cycles-backend -- ./good/tsvc/bin/S116/tsvc.exe
Loop Time(sec) Checksum
s116 4.064 32000.000000
Performance counter stats for './good/tsvc/bin/S116/tsvc.exe':
174,173,615 stalled-cycles-backend
Bad:
---
>> perf stat -e stalled-cycles-backend -- ./bad/tsvc/bin/S116/tsvc.exe
Loop Time(sec) Checksum
s116 4.898 32000.000000
Performance counter stats for './bad/tsvc/bin/S116/tsvc.exe':
1,661,928,005 stalled-cycles-backend
==============================