[llvm-bugs] [Bug 169289] LoopVectorizer: incorrect FP operation reordering

LLVM Bugs via llvm-bugs Fri, 28 Nov 2025 20:22:42 -0800

Issue	169289
Summary	LoopVectorizer: incorrect FP operation reordering
Labels	new issue
Assignees
Reporter	xortator

    Repro: https://godbolt.org/z/dbhsnTWG9

LLVM should respect overflow and precision effects for floating point. For example, the (good) test from the repro above
```
; Computes (%a + %b) - %a using plain fadd/fsub.
; Neither instruction carries fast-math flags, so the intermediate sum is
; observable and the expression is not simply %b.
define float @test_single(float %a, float %b) {
    ; Intermediate sum %a + %b.
    %sum = fadd float %a, %b
    ; Subtract %a back out of the intermediate sum.
    %res = fsub float %sum, %a
    ret float %res
}


```
does not get instcombined into `ret float %b`, because `(a + b)` could overflow to infinity, in which case the result would be infinity or NaN (depending on what `%a` is) rather than `%b`. This behavior should be preserved.

However, the Loop Vectorizer seems to ignore these semantics.
`opt -passes=loop-vectorize -force-vector-width=2`
on test
```
; Scalar reduction loop: starting from 0.0, repeatedly computes
;   sum = sum - pa[i] * pb[i]
; for i = 0 .. %length-1 and returns the final value.
; Returns 0.0 directly when %length is 0.
; The fmul/fsub carry no fast-math flags.
define float @test_vector(ptr %pa, ptr %pb, i32 %length) {
entry:
  ; Skip the loop entirely for a zero trip count.
  %should_execute = icmp ne i32 %length, 0
  br i1 %should_execute, label %loop, label %empty

loop:
  ; %iv is the element index; %sum is the running accumulator.
  %iv = phi i32 [0, %entry], [%iv.next, %loop]
  %sum = phi float [0.0, %entry], [%sum.next, %loop]
  %a.gep = getelementptr float, ptr %pa, i32 %iv
  %b.gep = getelementptr float, ptr %pb, i32 %iv
  %a = load float, ptr %a.gep, align 4
  %b = load float, ptr %b.gep, align 4
  ; One reduction step: subtract the product from the accumulator.
  %mul = fmul float %a, %b
  %sum.next = fsub float %sum, %mul
  %iv.next = add nuw nsw i32 %iv, 1
  ; Continue while the next index is still below %length.
  %loop.cond = icmp ult i32 %iv.next, %length
  br i1 %loop.cond, label %loop, label %done

done:
  ret float %sum.next

empty:
  ret float 0.0
}
```

leads to classical 2-accumulator + add reduce vectorization:
```
; Output of the loop vectorizer for @test_vector with a vector width of 2.
; The reduction is split into two per-lane accumulators (%vec.phi) that are
; combined in middle.block with llvm.vector.reduce.fadd; the original scalar
; loop is kept to handle the remaining iterations.
define float @test_vector(ptr %pa, ptr %pb, i32 %length) {
entry:
  %should_execute = icmp ne i32 %length, 0
  br i1 %should_execute, label %loop.preheader, label %empty

loop.preheader:
  ; Fewer than 2 iterations: bypass the vector loop and run the scalar loop only.
  %min.iters.check = icmp ult i32 %length, 2
  br i1 %min.iters.check, label %scalar.ph, label %vector.ph

vector.ph:
  ; %n.vec = %length rounded down to a multiple of 2 (vector loop trip bound).
 %n.mod.vf = urem i32 %length, 2
  %n.vec = sub i32 %length, %n.mod.vf
  br label %vector.body

vector.body:
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  ; Two independent accumulators: lane 0 starts at 0.0, lane 1 at -0.0.
  ; Since %index advances by 2, lane 0 accumulates elements %index and
  ; lane 1 accumulates elements %index+1 of each iteration.
  %vec.phi = phi <2 x float> [ <float 0.000000e+00, float -0.000000e+00>, %vector.ph ], [ %3, %vector.body ]
  %0 = getelementptr float, ptr %pa, i32 %index
  %1 = getelementptr float, ptr %pb, i32 %index
  %wide.load = load <2 x float>, ptr %0, align 4
 %wide.load1 = load <2 x float>, ptr %1, align 4
  %2 = fmul <2 x float> %wide.load, %wide.load1
  ; Per-lane reduction step: each lane subtracts only its own product.
  %3 = fsub <2 x float> %vec.phi, %2
  %index.next = add nuw i32 %index, 2
  %4 = icmp eq i32 %index.next, %n.vec
  br i1 %4, label %middle.block, label %vector.body

middle.block:
  ; Combine the two lane accumulators into one scalar with an fadd reduction
  ; (start value -0.0).
  %5 = call float @llvm.vector.reduce.fadd.v2f32(float -0.000000e+00, <2 x float> %3)
  ; If the vector loop covered every iteration, skip the scalar remainder.
  %cmp.n = icmp eq i32 %length, %n.vec
  br i1 %cmp.n, label %done, label %scalar.ph

scalar.ph:
  ; Resume values for the scalar loop: continue from %n.vec with the reduced
  ; sum when coming from middle.block, or start from 0 / 0.0 when the vector
  ; loop was bypassed.
  %bc.resume.val = phi i32 [ %n.vec, %middle.block ], [ 0, %loop.preheader ]
  %bc.merge.rdx = phi float [ %5, %middle.block ], [ 0.000000e+00, %loop.preheader ]
  br label %loop

loop:
  ; Original scalar loop body, now started from the resume values above.
  %iv = phi i32 [ %iv.next, %loop ], [ %bc.resume.val, %scalar.ph ]
  %sum = phi float [ %sum.next, %loop ], [ %bc.merge.rdx, %scalar.ph ]
  %a.gep = getelementptr float, ptr %pa, i32 %iv
  %b.gep = getelementptr float, ptr %pb, i32 %iv
 %a = load float, ptr %a.gep, align 4
  %b = load float, ptr %b.gep, align 4
  %mul = fmul float %a, %b
  %sum.next = fsub float %sum, %mul
 %iv.next = add nuw nsw i32 %iv, 1
  %loop.cond = icmp ult i32 %iv.next, %length
  br i1 %loop.cond, label %loop, label %done

done:
  ; Result comes from the scalar loop or directly from the vector reduction.
 %sum.next.lcssa = phi float [ %sum.next, %loop ], [ %5, %middle.block ]
 ret float %sum.next.lcssa

empty:
  ret float 0.000000e+00
}

declare float @llvm.vector.reduce.fadd.v2f32(float, <2 x float>) #0

attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
```

This is downright broken in multiple ways. For example, take some huge `x`, and let array `a` be `{x, x, x, ..., x}` and array `b` be `{1, -1, 1, -1, ... 1, -1}`.

The sum of the even elements might go to infinity, the sum of the odd elements goes to negative infinity, and the result would be NaN, while the original answer was `0`.

It seems that the Loop Vectorizer doesn't respect these semantics.

_______________________________________________
llvm-bugs mailing list
llvm-bugs@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-bugs

[llvm-bugs] [Bug 169289] LoopVectorizer: incorrect FP operation reordering

Reply via email to