https://gcc.gnu.org/bugzilla/show_bug.cgi?id=121290
Bug ID: 121290 Summary: Regressions in TSVC s119, s3113, s312, s313, s314, s315, s316 since commit 3bf2aa834e1 Product: gcc Version: 16.0 Status: UNCONFIRMED Severity: normal Priority: P3 Component: tree-optimization Assignee: unassigned at gcc dot gnu.org Reporter: soumyaa at gcc dot gnu.org CC: rguenth at gcc dot gnu.org Target Milestone: --- Hi, Commit 3bf2aa834e1 [https://gcc.gnu.org/cgit/gcc/commit/?id=3bf2aa834e1270e3167c9559bef9a8ef1f668604] has led to the following regressions in TSVC kernels: Summary: s3113 [230% Regression] -Ofast -mcpu=neoverse-v2 https://godbolt.org/z/En8rYf7sE s312 [300%] -Ofast -mcpu=neoverse-v2 https://godbolt.org/z/WbT6rfW3h s313 [92%] -Ofast -mcpu=neoverse-v2 https://godbolt.org/z/Kz1vv54hh s314 [288%] -Ofast -mcpu=neoverse-v2 https://godbolt.org/z/9eo6dc9Pv s316 [186%] -Ofast -mcpu=neoverse-v2 https://godbolt.org/z/9Mqzxr9aE s315 [48%] -O3 -mcpu=neoverse-v2 -msve-vector-bits=128 https://godbolt.org/z/8465KKMc4 ------ s3113, s312, s313, s314 and s316 are affected by missing loop unrolling. 
For example, in s3113: #define iterations 100000 #define LEN_1D 32000 float a[LEN_1D]; int main() { for (int i = 0; i < LEN_1D; i++) { a[i] = i; } float max; for (int nl = 0; nl < iterations*4; nl++) { max = fabsf(a[0]); for (int i = 0; i < LEN_1D; i++) { if (fabsf(a[i]) > max) { max = fabsf(a[i]); } } } return max; } Now: ldr s24, [x0], 4 fabs s24, s24 dup v24.4s, v24.s[0] fmaxnm v25.4s, v25.4s, v24.4s cmp x1, x0 bne .L3 add w2, w2, 1 cmp w2, w4 bne .L4 mov w0, 1148846080 dup s25, v25.s[3] fmov s31, w0 fcmpe s25, s31 cset w0, gt ret Before: ldp q2, q1, [x0] ldp q0, q21, [x0, 32] add x0, x0, 64 fabs v2.4s, v2.4s fabs v1.4s, v1.4s fabs v0.4s, v0.4s fabs v21.4s, v21.4s fmaxnm v25.4s, v25.4s, v2.4s fmaxnm v22.4s, v22.4s, v1.4s fmaxnm v23.4s, v23.4s, v0.4s fmaxnm v24.4s, v24.4s, v21.4s cmp x1, x0 bne .L3 subs w2, w2, #1 bne .L4 fmaxnm v22.4s, v25.4s, v22.4s mov w0, 1148846080 fmaxnm v23.4s, v23.4s, v24.4s fmov s31, w0 fmaxnm v23.4s, v22.4s, v23.4s fmaxnmv s23, v23.4s fcmpe s23, s31 cset w0, gt ret s315 is odd; it removes branching by using bit/bsl, but still executes slower: #define iterations 100000 #define LEN_1D 32000 float a[LEN_1D]; int main() { for (int i = 0; i < LEN_1D; i++) { a[i] = (i * 7) % LEN_1D; } float x, chksum; int index; for (int nl = 0; nl < iterations; nl++) { x = a[0]; index = 0; for (int i = 0; i < LEN_1D; ++i) { if (a[i] > x) { x = a[i]; index = i; } } chksum = x + (float) index; } return index + x > 1; } Now: .L4: movi v23.4s, 0 mov v24.16b, v26.16b mov x0, x3 mov v22.16b, v23.16b .L3: ld1r {v1.4s}, [x0], 4 fcmgt v20.4s, v1.4s, v24.4s bit v23.16b, v22.16b, v20.16b bsl v20.16b, v1.16b, v24.16b add v22.4s, v22.4s, v25.4s mov v24.16b, v20.16b cmp x1, x0 bne .L3 add w2, w2, 1 cmp w2, w4 bne .L4 dup s23, v23.s[3] dup s20, v20.s[3] fmov s21, 1.0e+0 scvtf s0, s23 fadd s20, s0, s20 fcmpe s20, s21 cset w0, gt ret Before: .L6: fmov s25, s1 movi v26.2d, #0 mov x0, 0 .L5: ldr s0, [x1, x0, lsl 2] fcmpe s25, s0 bmi .L7 .L3: add x0, x0, 1 cmp x0, x2 bne .L5 
subs w3, w3, #1 bne .L6 scvtf s26, s26 fmov s24, 1.0e+0 fadd s26, s26, s25 fcmpe s26, s24 cset w0, gt ret .L7: fmov s26, w0 fmov s25, s0 b .L3 Thanks, Soumya