https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105793
Bug ID: 105793
Summary: Missed vectorisation with conditional-select inside loop
Product: gcc
Version: unknown
Status: UNCONFIRMED
Keywords: missed-optimization
Severity: normal
Priority: P3
Component: tree-optimization
Assignee: unassigned at gcc dot gnu.org
Reporter: ktkachov at gcc dot gnu.org
Target Milestone: ---
The code:
/* Number of elements read from the input array. */
#define N 1024
/*
 * Reproducer: a floating-point reduction whose update depends on the element.
 * Walks in[0..N-1]; each element below 10.0f is added to the running total,
 * every other element is subtracted from it. Returns the final total.
 * The parameter n is not referenced anywhere in the body.
 */
float f(const float in[N], unsigned int n) {
float a = 0.0f; /* running total */
for (unsigned i = 0; i < N; ++i) {
float b = in[i];
/* Both branches update the same accumulator; only the sign differs. */
if (b < 10.f)
a += b;
else
a -= b;
}
return a;
}
with -Ofast does not vectorise (on aarch64, for example):
// Scalar GCC output for f (AArch64): one float is processed per iteration.
// Register roles: x0 = cursor into in[], x1 = one-past-the-end pointer,
// s0 = running total a, s1 = current element b, s2 = a - b, s3 = 10.0f.
f:
movi v0.2s, #0          // a = 0.0f
add x1, x0, 4096        // end = in + 4096 bytes (1024 elements of 4 bytes)
fmov s3, 1.0e+1         // s3 = 10.0f, the comparison threshold
.L5:
ldr s1, [x0], 4         // b = *in, then advance the cursor by 4 bytes
fsub s2, s0, s1         // s2 = a - b (else-branch result)
fcmpe s1, s3            // compare b against 10.0f
fadd s0, s0, s1         // s0 = a + b (then-branch result)
fcsel s0, s0, s2, mi    // a = (b < 10.0f) ? a + b : a - b
cmp x1, x0              // reached the end pointer?
bne .L5                 // no: next element
ret                     // result is returned in s0
whereas clang can and does vectorise it. Commenting out the "else a -= b;" branch
allows GCC to vectorise the loop:
// Vectorised GCC output for f (AArch64) once the else-branch is removed:
// four floats are processed per iteration.
// Register roles: x0 = cursor into in[], x1 = one-past-the-end pointer,
// v0 = four partial sums, v2 = four loaded elements, v1 = lane mask and then
// the masked elements, v3 = 10.0f in every lane.
f:
movi v0.4s, 0                   // four partial sums = 0
add x1, x0, 4096                // end = in + 4096 bytes
fmov v3.4s, 1.0e+1              // broadcast 10.0f to all four lanes
.L2:
ldr q2, [x0], 16                // load 4 floats, then advance the cursor by 16 bytes
fcmgt v1.4s, v3.4s, v2.4s       // per-lane mask: all-ones where 10.0f > b, else zero
and v1.16b, v1.16b, v2.16b      // keep b where the mask is set, zero bits (0.0f) elsewhere
fadd v0.4s, v0.4s, v1.4s        // accumulate the selected elements
cmp x1, x0                      // reached the end pointer?
bne .L2                         // no: next group of four
faddp v0.4s, v0.4s, v0.4s       // pairwise add: lanes become sums of adjacent pairs
faddp v0.4s, v0.4s, v0.4s       // pairwise add again: lane 0 holds the total of all four
ret                             // result is returned in s0 (lane 0 of v0)
Examples at https://gcc.godbolt.org/z/qbn6T73qE