| Issue |
169289
|
| Summary |
LoopVectorizer: incorrect FP operation reordering
|
| Labels |
new issue
|
| Assignees |
|
| Reporter |
xortator
|
Repro: https://godbolt.org/z/dbhsnTWG9
LLVM should respect the overflow and precision effects of floating-point operations. For example, the (good) test from the repro above
```
define float @test_single(float %a, float %b) {
%sum = fadd float %a, %b
%res = fsub float %sum, %a
ret float %res
}
```
does not get instcombined into `ret float %b`, because `a + b` could overflow to infinity, in which case the result would be infinity or NaN (depending on what `%a` is). This behavior must be preserved.
However, the Loop Vectorizer seems to ignore these semantics.
`opt -passes=loop-vectorize -force-vector-width=2`
on the test
```
define float @test_vector(ptr %pa, ptr %pb, i32 %length) {
entry:
%should_execute = icmp ne i32 %length, 0
br i1 %should_execute, label %loop, label %empty
loop:
%iv = phi i32 [0, %entry], [%iv.next, %loop]
%sum = phi float [0.0, %entry], [%sum.next, %loop]
%a.gep = getelementptr float, ptr %pa, i32 %iv
%b.gep = getelementptr float, ptr %pb, i32 %iv
%a = load float, ptr %a.gep, align 4
%b = load float, ptr %b.gep, align 4
%mul = fmul float %a, %b
%sum.next = fsub float %sum, %mul
%iv.next = add nuw nsw i32 %iv, 1
%loop.cond = icmp ult i32 %iv.next, %length
br i1 %loop.cond, label %loop, label %done
done:
ret float %sum.next
empty:
ret float 0.0
}
```
leads to the classical 2-accumulator + add-reduce vectorization:
```
define float @test_vector(ptr %pa, ptr %pb, i32 %length) {
entry:
%should_execute = icmp ne i32 %length, 0
br i1 %should_execute, label %loop.preheader, label %empty
loop.preheader:
%min.iters.check = icmp ult i32 %length, 2
br i1 %min.iters.check, label %scalar.ph, label %vector.ph
vector.ph:
%n.mod.vf = urem i32 %length, 2
%n.vec = sub i32 %length, %n.mod.vf
br label %vector.body
vector.body:
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.phi = phi <2 x float> [ <float 0.000000e+00, float -0.000000e+00>, %vector.ph ], [ %3, %vector.body ]
%0 = getelementptr float, ptr %pa, i32 %index
%1 = getelementptr float, ptr %pb, i32 %index
%wide.load = load <2 x float>, ptr %0, align 4
%wide.load1 = load <2 x float>, ptr %1, align 4
%2 = fmul <2 x float> %wide.load, %wide.load1
%3 = fsub <2 x float> %vec.phi, %2
%index.next = add nuw i32 %index, 2
%4 = icmp eq i32 %index.next, %n.vec
br i1 %4, label %middle.block, label %vector.body
middle.block:
%5 = call float @llvm.vector.reduce.fadd.v2f32(float -0.000000e+00, <2 x float> %3)
%cmp.n = icmp eq i32 %length, %n.vec
br i1 %cmp.n, label %done, label %scalar.ph
scalar.ph:
%bc.resume.val = phi i32 [ %n.vec, %middle.block ], [ 0, %loop.preheader ]
%bc.merge.rdx = phi float [ %5, %middle.block ], [ 0.000000e+00, %loop.preheader ]
br label %loop
loop:
%iv = phi i32 [ %iv.next, %loop ], [ %bc.resume.val, %scalar.ph ]
%sum = phi float [ %sum.next, %loop ], [ %bc.merge.rdx, %scalar.ph ]
%a.gep = getelementptr float, ptr %pa, i32 %iv
%b.gep = getelementptr float, ptr %pb, i32 %iv
%a = load float, ptr %a.gep, align 4
%b = load float, ptr %b.gep, align 4
%mul = fmul float %a, %b
%sum.next = fsub float %sum, %mul
%iv.next = add nuw nsw i32 %iv, 1
%loop.cond = icmp ult i32 %iv.next, %length
br i1 %loop.cond, label %loop, label %done
done:
%sum.next.lcssa = phi float [ %sum.next, %loop ], [ %5, %middle.block ]
ret float %sum.next.lcssa
empty:
ret float 0.000000e+00
}
declare float @llvm.vector.reduce.fadd.v2f32(float, <2 x float>) #0
attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
```
This is downright broken in multiple ways. For example, given some huge `x`, let array `a` be `{x, x, x, ..., x}` and array `b` be `{1, -1, 1, -1, ..., 1, -1}`.
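Because the loop subtracts each product, the accumulator for the even-indexed elements can overflow to negative infinity and the accumulator for the odd-indexed elements to positive infinity, so the final reduction yields NaN, while the original scalar loop returns `0`.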
It seems that the Loop Vectorizer doesn't respect these semantics.
_______________________________________________
llvm-bugs mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-bugs