Issue 114891
Summary Loop Vectorizer chooses small vectorization factor VF when known trip count isn't a multiple of it.
Labels new issue
Assignees
Reporter MatzeB
    I am looking into vectorization for AVX512 in a case where the loop trip count isn't a multiple of the ideal VF of 16.

A simplified version of the problem looks like this[1]:
```
/*
 * Reproducer: adds 56 floats from `values` into `dest`, element by element,
 * going through a local stack buffer.
 *
 * dest   - the first 56 floats are read, then overwritten with the sums.
 * values - read-only addends; the first 56 floats are read.
 *
 * The loop trip count is the compile-time constant 56 (3 * 16 + 8): a
 * multiple of 8 but not of 16.
 */
void foosum56(float *dest, const float *values) {
 float buf[56];
  /* Work on a local copy of dest. */
  memcpy(buf, dest, sizeof(buf));
  /* Uncomment the pragma below to request a vectorization width of 16. */
// #pragma clang loop vectorize_width(16)
  for (int i = 0; i < 56; i++) {
    buf[i] += values[i];
  }
  /* Write the sums back to dest. */
  memcpy(dest, buf, sizeof(buf));
}
```

Compiling with something like `clang -O3 -S -o - -mavx512f test.c`, the loop vectorizer chooses a vectorization factor of 8 and the resulting code uses ymm registers:

```
...
 vmovups (%rsi), %ymm0
        vmovups 32(%rsi), %ymm1
        vmovups 64(%rsi), %ymm2
        vmovups 96(%rsi), %ymm3
        vaddps  (%rdi), %ymm0, %ymm0
        vaddps  32(%rdi), %ymm1, %ymm1
        vaddps 64(%rdi), %ymm2, %ymm2
        vaddps  96(%rdi), %ymm3, %ymm3
 vmovups 128(%rsi), %ymm4
        vaddps  128(%rdi), %ymm4, %ymm4
 vmovups 160(%rsi), %ymm5
        vaddps  160(%rdi), %ymm5, %ymm5
 vmovups 192(%rsi), %ymm6
        vaddps  192(%rdi), %ymm6, %ymm6
 vmovups %ymm0, (%rdi)
        vmovups %ymm1, 32(%rdi)
 vmovups %ymm2, 64(%rdi)
        vmovups %ymm3, 96(%rdi)
        vmovups %ymm4, 128(%rdi)
        vmovups %ymm5, 160(%rdi)
        vmovups %ymm6, 192(%rdi)
...
```

- Ideally, though, we would use zmm registers/operations for the first 48 elements (three 16-wide operations) and a ymm register/operation for the remaining 8 elements.
- Manually adding `#pragma clang loop vectorize_width(16)` produces a poor result too: while it does nicely use zmm for the first 48 elements, the remaining 8 elements are scalarized...

This still reproduces on LLVM trunk (on 8b55162e195783dd27e1c69fb4d97971ef76725b from Oct 29).

Filing this to document the issue while I am trying to figure out how this situation could be improved...


[1] Simplified LLVM IR:
```
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
target triple = "x86_64-redhat-linux-gnu"

; IR form of the reproducer: copies 56 floats (224 bytes) from %dest into a
; stack buffer, adds %values[i] to each element in a scalar loop, and copies
; the buffer back to %dest.
; NOTE(review): the function is named @foosum120 although the buffer holds 56
; elements and the C version is called foosum56 -- presumably a leftover name
; from another variant of the test; confirm.
define void @foosum120(ptr %dest, ptr readonly %values) #0 {
entry:
  %buf = alloca [56 x float], align 16
  ; buf = first 224 bytes of dest
  call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(224) %buf, ptr noundef nonnull align 4 dereferenceable(224) %dest, i64 224, i1 false)
  br label %for.body

; Loop body: buf[i] = values[i] + buf[i], for i = 0 .. 55.
for.body:
 %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
 %arrayidx = getelementptr inbounds float, ptr %values, i64 %indvars.iv
 %0 = load float, ptr %arrayidx, align 4
  %arrayidx2 = getelementptr inbounds [56 x float], ptr %buf, i64 0, i64 %indvars.iv
  %1 = load float, ptr %arrayidx2, align 4
  %add = fadd float %0, %1
  store float %add, ptr %arrayidx2, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  ; Exit once the incremented induction variable reaches the constant 56.
  %exitcond.not = icmp eq i64 %indvars.iv.next, 56
  ; The commented-out `!llvm.loop !7` on the branch below would attach the
  ; llvm.loop.vectorize.width = 16 hint defined by !7/!8 in this module.
  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body ; , !llvm.loop !7

for.cond.cleanup:
  ; dest = buf (224 bytes)
  call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 4 dereferenceable(224) %dest, ptr noundef nonnull align 16 dereferenceable(224) %buf, i64 224, i1 false)
  ret void
}

declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg) #2

attributes #0 = { nofree norecurse nosync nounwind memory(argmem: readwrite) uwtable "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+avx,+avx2,+avx512f,+cmov,+crc32,+cx8,+evex512,+f16c,+fma,+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave" "tune-cpu"="generic" }
attributes #2 = { mustprogress nocallback nofree nounwind willreturn memory(argmem: readwrite) }

!7 = distinct !{!7, !8}
!8 = !{!"llvm.loop.vectorize.width", i32 16}
```
_______________________________________________
llvm-bugs mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-bugs

Reply via email to