Issue 182625
Summary [AArch64] Suboptimal code for counting true elements in a vector
Labels backend:AArch64, missed-optimization
Assignees
Reporter Kmeakin
    https://godbolt.org/z/YGn4YxjcP

Consider this function for counting the number of elements in `haystack` that match `needle`:
```c++
#include <arm_neon.h>

auto count_1(uint8x16_t haystack, uint8_t needle) -> uint32_t {
    uint8x16_t needles = vdupq_n_u8(needle);
 uint8x16_t mask = haystack == needles;
    return -vaddvq_u8(mask);
}
```

GCC produces exactly the assembly I wanted:
```asm
count_1(__Uint8x16_t, unsigned char):
        dup v31.16b, w0
        cmeq    v31.16b, v31.16b, v0.16b
        addv    b31, v31.16b
        fmov    w0, s31
        neg     w0, w0
 ret
```

But clang produces this monstrosity:
```asm
.LCPI0_0:
 .byte   1
        .byte   2
        .byte   4
        .byte   8
 .byte   16
        .byte   32
        .byte   64
        .byte   128
 .byte   1
        .byte   2
        .byte   4
        .byte   8
 .byte   16
        .byte   32
        .byte   64
        .byte 128
count_1(__Uint8x16_t, unsigned char):
        dup     v1.16b, w0
 adrp    x9, .LCPI0_0
        mov     w8, wzr
        cmeq    v0.16b, v0.16b, v1.16b
        ldr     q1, [x9, :lo12:.LCPI0_0]
        and v0.16b, v0.16b, v1.16b
        ext     v1.16b, v0.16b, v0.16b, #8
 zip1    v0.16b, v0.16b, v1.16b
        addv    h0, v0.8h
        fmov w9, s0
        fmov    s0, w9
        cnt     v0.8b, v0.8b
        addv b0, v0.8b
        fmov    w9, s0
        neg     w9, w9
        sub w0, w8, w9, uxtb
        ret
```

Restricting the return type to 8 bits for some reason allows LLVM to remove the table lookup, but it still produces lots of unnecessary shifts:
```c++
auto count_2(uint8x16_t haystack, uint8_t needle) -> uint8_t {
    uint8x16_t needles = vdupq_n_u8(needle);
 uint8x16_t mask = haystack == needles;
    return -vaddvq_u8(mask);
}
```

```asm
count_2(__Uint8x16_t, unsigned char):
 dup     v1.16b, w0
        cmeq    v0.16b, v0.16b, v1.16b
 sshll2  v1.8h, v0.16b, #0
        sshll   v0.8h, v0.8b, #0
        saddl2 v2.4s, v0.8h, v1.8h
        saddl   v0.4s, v0.4h, v1.4h
        add v0.4s, v0.4s, v2.4s
        addv    s0, v0.4s
        fmov    w8, s0
 neg     w0, w8
        ret
```

I can make LLVM emit the optimal assembly if I write the LLVM-IR myself:
https://godbolt.org/z/bW1rhn884
```llvm
define noundef i8 @count_optimal(<16 x i8> noundef %haystack, i8 noundef %needle) {
 %single_needle = insertelement <16 x i8> poison, i8 %needle, i64 0
 %splatted_needles = shufflevector <16 x i8> %single_needle, <16 x i8> poison, <16 x i32> zeroinitializer
  %cmp = icmp eq <16 x i8> %haystack, %splatted_needles
  %ext = sext <16 x i1> %cmp to <16 x i8>
  %sum = call i8 @llvm.vector.reduce.add(<16 x i8> %ext)
  %neg = sub i8 0, %sum
  ret i8 %neg
}
```
_______________________________________________
llvm-bugs mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-bugs

Reply via email to