| Issue |
182625
|
| Summary |
[AArch64] Suboptimal code for counting true elements in a vector
|
| Labels |
backend:AArch64,
missed-optimization
|
| Assignees |
|
| Reporter |
Kmeakin
|
https://godbolt.org/z/YGn4YxjcP
Consider this function for counting the number of elements in `haystack` that match `needle`:
```c++
#include <arm_neon.h>
auto count_1(uint8x16_t haystack, uint8_t needle) ->uint32_t {
uint8x16_t needles = vdupq_n_u8(needle);
uint8x16_t mask = haystack == needles;
return -vaddvq_u8(mask);
}
```
GCC produces exactly the assembly I wanted:
```asm
count_1(__Uint8x16_t, unsigned char):
dup v31.16b, w0
cmeq v31.16b, v31.16b, v0.16b
addv b31, v31.16b
fmov w0, s31
neg w0, w0
ret
```
But clang produces this monstrosity:
```asm
.LCPI0_0:
.byte 1
.byte 2
.byte 4
.byte 8
.byte 16
.byte 32
.byte 64
.byte 128
.byte 1
.byte 2
.byte 4
.byte 8
.byte 16
.byte 32
.byte 64
.byte 128
count_1(__Uint8x16_t, unsigned char):
dup v1.16b, w0
adrp x9, .LCPI0_0
mov w8, wzr
cmeq v0.16b, v0.16b, v1.16b
ldr q1, [x9, :lo12:.LCPI0_0]
and v0.16b, v0.16b, v1.16b
ext v1.16b, v0.16b, v0.16b, #8
zip1 v0.16b, v0.16b, v1.16b
addv h0, v0.8h
fmov w9, s0
fmov s0, w9
cnt v0.8b, v0.8b
addv b0, v0.8b
fmov w9, s0
neg w9, w9
sub w0, w8, w9, uxtb
ret
```
Restricting the return type to 8-bits for some reason allows LLVM to remove the table lookup, but still produces lots of unnecessary shifts:
```c++
auto count_2(uint8x16_t haystack, uint8_t needle) -> uint8_t {
uint8x16_t needles = vdupq_n_u8(needle);
uint8x16_t mask = haystack == needles;
return -vaddvq_u8(mask);
}
```
```asm
count_2(__Uint8x16_t, unsigned char):
dup v1.16b, w0
cmeq v0.16b, v0.16b, v1.16b
sshll2 v1.8h, v0.16b, #0
sshll v0.8h, v0.8b, #0
saddl2 v2.4s, v0.8h, v1.8h
saddl v0.4s, v0.4h, v1.4h
add v0.4s, v0.4s, v2.4s
addv s0, v0.4s
fmov w8, s0
neg w0, w8
ret
```
I can make LLVM emit the optimal assembly if I write the LLVM-IR myself:
https://godbolt.org/z/bW1rhn884
```llvm
define noundef i8 @count_optimal(<16 x i8> noundef %haystack, i8 noundef %needle) {
%single_needle = insertelement <16 x i8> poison, i8 %needle, i64 0
%splatted_needles = shufflevector <16 x i8> %single_needle, <16 x i8> poison, <16 x i32> zeroinitializer
%cmp = icmp eq <16 x i8> %haystack, %splatted_needles
%ext = sext <16 x i1> %cmp to <16 x i8>
%sum = call i8 @llvm.vector.reduce.add(<16 x i8> %ext)
%neg = sub i8 0, %sum
ret i8 %neg
}
```
_______________________________________________
llvm-bugs mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-bugs