| Issue | 55438 |
| Summary | [AArch64][SVE] Bad code generation of llvm.fmuladd.* for SVE |
| Labels | new issue |
| Assignees | |
| Reporter | jsetoain |
I'm trying to compile code that does this:
`c += a * broadcast(b[0])`
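In C, the pattern is essentially the following (a minimal sketch; the function and argument names are mine, not from the original code):
```
/* Sketch of the source pattern: every lane of c accumulates the
 * matching lane of a times the first element of b. */
void madd_splat(float *c, const float *a, const float *b, int n) {
    for (int i = 0; i < n; ++i)
        c[i] += a[i] * b[0];
}
```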
For fixed-length vectors, if I use this:
```
declare <4 x float> @llvm.fmuladd.v4f32(<4 x float>, <4 x float>, <4 x float>) #3
define void @fp_convert_combine_crash(float *%arg0, float *%arg1, float* %arg2) #0 {
%ap = bitcast float* %arg0 to <4 x float>*
%bp = bitcast float* %arg1 to <4 x float>*
%cp = bitcast float* %arg2 to <4 x float>*
%a = load <4 x float>, <4 x float>* %ap
%b = load <4 x float>, <4 x float>* %bp
%c = load <4 x float>, <4 x float>* %cp
%b0splat = shufflevector <4 x float> %b, <4 x float> undef, <4 x i32> zeroinitializer
%mad = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %a, <4 x float> %b0splat, <4 x float> %c) #3
store <4 x float> %mad, <4 x float>* %cp
ret void
}
attributes #0 = { "target-features"="+neon" }  ; assumed; the report does not define #0
attributes #3 = { nocallback nofree nosync nounwind readnone speculatable willreturn }
```
And, when I compile it, I get this assembly:
```
ldr q0, [x1] // Load b
ldr q1, [x0] // Load a
ldr q2, [x2] // Load c
fmla v2.4s, v1.4s, v0.s[0] // mad = c + a * splat(b[0])
str q2, [x2] // store mad in c
ret
```
This looks good to me, although one might argue that it should not need to load the whole vector `b` (a single `ldr s0, [x1]` would suffice) when it is only going to splat its first element.
But if I write a scalable version of the same code:
```
declare <vscale x 4 x float> @llvm.fmuladd.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>) #3
define void @fp_convert_combine_crash(float *%arg0, float *%arg1, float* %arg2) #0 {
%ap = bitcast float* %arg0 to <vscale x 4 x float>*
%bp = bitcast float* %arg1 to <vscale x 4 x float>*
%cp = bitcast float* %arg2 to <vscale x 4 x float>*
%a = load <vscale x 4 x float>, <vscale x 4 x float>* %ap
%b = load <vscale x 4 x float>, <vscale x 4 x float>* %bp
%c = load <vscale x 4 x float>, <vscale x 4 x float>* %cp
%b0splat = shufflevector <vscale x 4 x float> %b, <vscale x 4 x float> undef, <vscale x 4 x i32> zeroinitializer
%mad = call <vscale x 4 x float> @llvm.fmuladd.nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b0splat, <vscale x 4 x float> %c) #3
store <vscale x 4 x float> %mad, <vscale x 4 x float>* %cp
ret void
}
attributes #0 = { "target-features"="+sve" }  ; assumed; SVE must be enabled for scalable vectors
attributes #3 = { nocallback nofree nosync nounwind readnone speculatable willreturn }
```
I obtain this:
```
ptrue p0.s
ld1w { z0.s }, p0/z, [x0] // Load a
ld1w { z1.s }, p0/z, [x1] // Load b
ld1w { z2.s }, p0/z, [x2] // Load c
mov z1.s, s1 // Splat b[0]
fmad z0.s, p0/m, z1.s, z2.s // mad = a * splat(b[0]) + c
st1w { z0.s }, p0, [x2] // store mad in c
ret
```
At first glance the operand order looks wrong, but FMAD computes `Zdn = Zdn * Zm + Za`, so the result here is `a * splat(b[0]) + c`, which matches the intrinsic. The real problem is that instead of using the indexed form of FMLA it materializes the splat in a register, which might have performance implications.
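For reference, the indexed form is reachable from C through the ACLE intrinsics. This is a sketch under my assumptions (not code from the report), where `svmla_lane_f32` maps onto FMLA (indexed):
```
/* Sketch: svmla_lane_f32 computes vc + va * vb[0] via FMLA (indexed),
 * so no separate splat of b[0] is needed. */
#include <arm_sve.h>

void madd_splat_sve(float *c, const float *a, const float *b) {
    svbool_t pg = svptrue_b32();
    svfloat32_t va = svld1_f32(pg, a);
    svfloat32_t vb = svld1_f32(pg, b);
    svfloat32_t vc = svld1_f32(pg, c);
    vc = svmla_lane_f32(vc, va, vb, 0); /* lane index must be a constant */
    svst1_f32(pg, c, vc);
}
```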
Additionally, if I try to use vscale_range to generate SVE from fixed-length vector code:
```
declare <8 x float> @llvm.fmuladd.v8f32(<8 x float>, <8 x float>, <8 x float>) #3
define void @fp_convert_combine_crash(float *%arg0, float *%arg1, float* %arg2) #0 {
%ap = bitcast float* %arg0 to <8 x float>*
%bp = bitcast float* %arg1 to <8 x float>*
%cp = bitcast float* %arg2 to <8 x float>*
%a = load <8 x float>, <8 x float>* %ap
%b = load <8 x float>, <8 x float>* %bp
%c = load <8 x float>, <8 x float>* %cp
%b0splat = shufflevector <8 x float> %b, <8 x float> undef, <8 x i32> zeroinitializer
%mad = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> %a, <8 x float> %b0splat, <8 x float> %c) #3
store <8 x float> %mad, <8 x float>* %cp
ret void
}
attributes #0 = { vscale_range(2,2) "target-features"="+sve" }
attributes #3 = { nocallback nofree nosync nounwind readnone speculatable willreturn }
```
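For context, IR like this typically comes from fixed-width vector source. The following is a sketch under my assumptions (the exact flags are a guess), compiled with something like `clang -O2 --target=aarch64-linux-gnu -march=armv8-a+sve -msve-vector-bits=256`, where `-msve-vector-bits=256` is what attaches the `vscale_range(2,2)` attribute:
```
/* Sketch: GNU vector extensions; with clang's default FP contraction,
 * va * vsplat + vc in a single expression becomes llvm.fmuladd.v8f32. */
typedef float v8f32 __attribute__((vector_size(32)));

void madd_splat_fixed(float *a, float *b, float *c) {
    v8f32 va = *(v8f32 *)a;
    v8f32 vb = *(v8f32 *)b;
    v8f32 vc = *(v8f32 *)c;
    /* Broadcast lane 0 of vb across all eight lanes. */
    v8f32 vsplat = __builtin_shufflevector(vb, vb, 0, 0, 0, 0, 0, 0, 0, 0);
    *(v8f32 *)c = va * vsplat + vc;
}
```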
Compiling that IR, I get this:
```
stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
mov x29, sp
sub x9, sp, #48
and sp, x9, #0xffffffffffffffe0
ptrue p0.s
ld1w { z0.s }, p0/z, [x0] // Load a
ld1w { z1.s }, p0/z, [x1] // Load b
ld1w { z2.s }, p0/z, [x2] // Load c
stp s1, s1, [sp, #24] // Splat b[0] to the stack
stp s1, s1, [sp, #16] // Splat b[0] to the stack
stp s1, s1, [sp, #8] // Splat b[0] to the stack
stp s1, s1, [sp] // Splat b[0] to the stack
ld1w { z1.s }, p0/z, [sp] // Load splatted b[0] from the stack
fmad z0.s, p0/m, z1.s, z2.s // mad = a * splat(b[0]) + c
st1w { z0.s }, p0, [x2] // Store mad in c
mov sp, x29
ldp x29, x30, [sp], #16 // 16-byte Folded Reload
ret
```
This introduces an additional performance issue: the splat goes through the stack instead of simply using `mov z1.s, s1`, or something to that effect.
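A sketch of the register-only splat, assuming the ACLE route (again not code from the report): `svdup_lane_f32` with a constant index lowers to `DUP Zd.S, Zn.S[0]`, which is exactly the `mov z1.s, s1` alias:
```
/* Sketch: splat lane 0 of an SVE vector entirely in registers,
 * avoiding the round-trip through the stack. */
#include <arm_sve.h>

svfloat32_t splat_lane0(svfloat32_t vb) {
    return svdup_lane_f32(vb, 0); /* DUP Zd.S, Zn.S[0], aliased as mov */
}
```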