https://gcc.gnu.org/bugzilla/show_bug.cgi?id=119442
Bug ID: 119442
Summary: [14/15 Regression] Regression in creating SVE
predicate
Product: gcc
Version: 15.0
Status: UNCONFIRMED
Keywords: aarch64-sve
Severity: normal
Priority: P3
Component: middle-end
Assignee: unassigned at gcc dot gnu.org
Reporter: ktkachov at gcc dot gnu.org
Target Milestone: ---
Target: aarch64
The testcase is nonsense in itself but is heavily reduced from a real
application. I'd appreciate help with crafting a better one:
/* Global accumulator updated by the loop below.  */
float fasten_main_etot_0;
/* Reduced reproducer: adds (float)(l && phphb_nz) to fasten_main_etot_0
   for l = 0 .. 31.  Keep the statements exactly as written; the shape of
   the loop nest is what triggers the reported code generation.  */
void fasten_main() {
/* The outer loop has no increment of its own; l only advances in the
   inner loop, so after the inner loop finishes (l == 32) the outer
   condition l < 2 is false and the function returns.  */
for (int l = 0; l < 2;) {
/* Declared without an initializer and never assigned before being read.  */
int phphb_nz;
for (; l < 32; l++) {
/* The logical AND yields 0 or 1, which is then converted to float.  */
float dslv_e = l && phphb_nz;
fasten_main_etot_0 += dslv_e;
}
}
}
Compiled with -O3 -march=armv8.2-a+sve -msve-vector-bits=128, it tries to create
a governing predicate but ends up scalarising the operation into the following
inefficient code:
// Compiler output for fasten_main (GNU AArch64 syntax, SVE).
// Part of the predicate is built through a 16-byte stack slot using
// scalar integer bit-field inserts rather than predicate instructions.
fasten_main:
sub sp, sp, #16
// Write an all-false predicate to the stack slot.
pfalse p15.b
str p15, [sp, #6, mul vl]
// w0 = 0: source of every bit-field insert below.
mov w0, 0
adrp x1, .LANCHOR0
ptrue p14.b, vl1
// p7 governs the scvtf/fadda instructions further down.
ptrue p7.b, vl16
ptrue p15.s, vl4
eor p15.b, p15/z, p14.b, p15.b
// Read the slot back as a 32-bit integer.
// NOTE(review): with 128-bit vectors a predicate is 2 bytes, so
// "#6, mul vl" should be byte offset 12, i.e. this same slot - confirm.
ldr w2, [sp, 12]
// s31 = current value of the float at .LANCHOR0 (presumably
// fasten_main_etot_0 - the anchor's definition is not shown here).
ldr s31, [x1, #:lo12:.LANCHOR0]
// Insert the low 4 bits of w0 (zero) into w2 at bit positions 0, 4, 8
// and 12, with a zero-extension of w2 between consecutive inserts.
bfi w2, w0, 0, 4
uxtw x2, w2
bfi w2, w0, 4, 4
uxtw x2, w2
bfi w2, w0, 8, 4
uxtw x2, w2
bfi w2, w0, 12, 4
// Store the integer-built bits and reload them as predicate p14.
str w2, [sp, 12]
ldr p14, [sp, #6, mul vl]
and p15.b, p14/z, p15.b, p15.b
// Materialise 1 in the active lanes (0 elsewhere), then convert to float.
mov z30.s, p14/z, #1
mov z0.s, p15/z, #1
scvtf z30.s, p7/m, z30.s
scvtf z0.s, p7/m, z0.s
// Eight in-order reductions into s31: z0 once, then z30 seven times.
fadda s31, p7, s31, z0.s
fadda s31, p7, s31, z30.s
fadda s31, p7, s31, z30.s
fadda s31, p7, s31, z30.s
fadda s31, p7, s31, z30.s
fadda s31, p7, s31, z30.s
fadda s31, p7, s31, z30.s
fadda s31, p7, s31, z30.s
// Write the accumulated sum back and release the stack slot.
str s31, [x1, #:lo12:.LANCHOR0]
add sp, sp, 16
ret
This seems to be a regression from GCC 13, which generates:
// Compiler output for fasten_main (GNU AArch64 syntax, SVE).
// No stack frame is used: sp is never touched and every predicate is
// produced by predicate/compare instructions.
fasten_main:
adrp x0, .LANCHOR0
// z1 = all-zero vector, used as both compare operands below.
mov z1.b, #0
ptrue p0.s, vl4
ptrue p1.b, vl1
eor p1.b, p0/z, p1.b, p0.b
cmpne p1.s, p1/z, z1.s, z1.s
// p0 is redefined here and governs the scvtf/fadda instructions.
ptrue p0.b, vl16
// s0 = current value of the float at .LANCHOR0 (presumably
// fasten_main_etot_0 - the anchor's definition is not shown here).
ldr s0, [x0, #:lo12:.LANCHOR0]
// z2 = copy of z1 (zero) with 1 merged into the lanes selected by p1.
movprfx z2, z1
mov z2.s, p1/m, #1
cmpne p1.s, p0/z, z1.s, z1.s
scvtf z2.s, p0/m, z2.s
mov z1.s, p1/m, #1
// Eight in-order reductions into s0: z2 once, then z1 seven times.
fadda s0, p0, s0, z2.s
scvtf z1.s, p0/m, z1.s
fadda s0, p0, s0, z1.s
fadda s0, p0, s0, z1.s
fadda s0, p0, s0, z1.s
fadda s0, p0, s0, z1.s
fadda s0, p0, s0, z1.s
fadda s0, p0, s0, z1.s
fadda s0, p0, s0, z1.s
// Write the accumulated sum back.
str s0, [x0, #:lo12:.LANCHOR0]
ret
As said, the testcase itself is nonsense (and Clang optimises most of the
function away) but the predicate construction regression is real.
It only triggers for vector-length-specific (VLS) code, i.e. when
-msve-vector-bits is given.