| Issue |
181490
|
| Summary |
[AArch64] Manually written widening arithmetic on adjacent lanes is not recognized
|
| Labels |
new issue
|
| Assignees |
|
| Reporter |
folkertdev
|
I'd expect all three of the functions below to optimize to the same single `uaddlp` instruction.
https://godbolt.org/z/ETqGa8Ynh
```llvm
define <8 x i16> @vpaddlq_u8_v1(<16 x i8> %a) unnamed_addr {
start:
%_0 = tail call <8 x i16> @llvm.aarch64.neon.uaddlp.v8i16.v16i8(<16 x i8> %a)
ret <8 x i16> %_0
}
define <8 x i16> @vpaddlq_u8_v2_widen_shuffle_add(<16 x i8> %a) unnamed_addr {
start:
%0 = zext <16 x i8> %a to <16 x i16>
%1 = shufflevector <16 x i16> %0, <16 x i16> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
%2 = shufflevector <16 x i16> %0, <16 x i16> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
%3 = add nuw nsw <8 x i16> %1, %2
ret <8 x i16> %3
}
define range(i16 0, 511) <8 x i16> @vpaddlq_u8_v3_shuffle_widen_add(<16 x i8> %a) unnamed_addr {
start:
%0 = shufflevector <16 x i8> %a, <16 x i8> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
%1 = shufflevector <16 x i8> %a, <16 x i8> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
%2 = zext <8 x i8> %0 to <8 x i16>
%3 = zext <8 x i8> %1 to <8 x i16>
%4 = add nuw nsw <8 x i16> %2, %3
ret <8 x i16> %4
}
```
Instead, we get:
```asm
vpaddlq_u8_v1:
uaddlp v0.8h, v0.16b
ret
vpaddlq_u8_v2_widen_shuffle_add:
mov v1.16b, v0.16b
fmov d0, d1
mov d2, v1.d[1]
ushll v0.8h, v0.8b, #0
ushll2 v1.8h, v1.16b, #0
addp v0.8h, v0.8h, v1.8h
ret
vpaddlq_u8_v3_shuffle_widen_add:
mov v1.16b, v0.16b
uzp1 v3.16b, v1.16b, v2.16b
fmov d0, d3
mov d3, v3.d[1]
uzp2 v2.16b, v1.16b, v2.16b
fmov d1, d2
mov d2, v2.d[1]
uaddl v0.8h, v0.8b, v1.8b
ret
```
I believe the same thing happens for many other (arithmetic) operations, so ideally there would be a general solution rather than a fix for this one case.
_______________________________________________
llvm-bugs mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-bugs