Issue 179950
Summary [CodeGenPrepare] Failure to hoist bitcast to legal type causes register splitting
Labels missed-optimization
Assignees
Reporter RKSimon
    ```ll
define i8 @src(ptr %a0, i8 %a1) {
entry:
  %src256 = load i256, ptr %a0, align 1
  %iszero = icmp eq i256 %src256, 0
  br i1 %iszero, label %exit, label %reduction
reduction:
  %src256.bitcast = bitcast i256 %src256 to <32 x i8>
  %red = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> %src256.bitcast)
  br label %exit
exit:
  %result = phi i8 [ 0, %entry ], [ %red, %reduction ]
  ret i8 %result
}

define i8 @dst(ptr %a0, i8 %a1) {
entry:
  %src256 = load i256, ptr %a0, align 1
  %src256.bitcast = bitcast i256 %src256 to <32 x i8>
  %iszero = icmp eq i256 %src256, 0
  br i1 %iszero, label %exit, label %reduction
reduction:
  %red = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> %src256.bitcast)
  br label %exit
exit:
  %result = phi i8 [ 0, %entry ], [ %red, %reduction ]
  ret i8 %result
}
```
https://rust.godbolt.org/z/Tv3MTjvnM

i256 is not a legal x86 type, so will get split into i64 CopyToReg/CopyFromReg across blocks and then put back together, causing a lot of spilling and build_vector noise.

But the only uses of the i256 is a legal <32 x i8> type - if we'd bitcasted to this in the entry block then we wouldn't need to split at all.

```s
src:
        vmovdqu (%rdi), %ymm0
        vptest  %ymm0, %ymm0
        je      .LBB0_1
        movq    (%rdi), %rax
        vmovd   %eax, %xmm0
        movq    %rax, %rcx
        movq    %rax, %rdx
        movq    %rax, %rsi
        movq    %rax, %r8
        movl    %eax, %r9d
        movl    %eax, %r10d
        shrl    $8, %eax
        shrl    $16, %r10d
        shrl    $24, %r9d
        shrq    $32, %r8
        shrq    $40, %rsi
        shrq    $48, %rdx
        shrq    $56, %rcx
        vpinsrb $1, %eax, %xmm0, %xmm0
        movq    8(%rdi), %rax
        vpinsrb $2, %r10d, %xmm0, %xmm0
        vpinsrb $3, %r9d, %xmm0, %xmm0
        vpinsrb $4, %r8d, %xmm0, %xmm0
        vpinsrb $5, %esi, %xmm0, %xmm0
        vpinsrb $6, %edx, %xmm0, %xmm0
        vpinsrb $7, %ecx, %xmm0, %xmm0
        movl    %eax, %ecx
        shrl    $8, %ecx
        vpinsrb $8, %eax, %xmm0, %xmm0
        vpinsrb $9, %ecx, %xmm0, %xmm0
        movl    %eax, %ecx
        shrl    $16, %ecx
        vpinsrb $10, %ecx, %xmm0, %xmm0
        movl    %eax, %ecx
        shrl    $24, %ecx
        vpinsrb $11, %ecx, %xmm0, %xmm0
        movq    %rax, %rcx
        shrq    $32, %rcx
        vpinsrb $12, %ecx, %xmm0, %xmm0
        movq    %rax, %rcx
        shrq    $40, %rcx
        vpinsrb $13, %ecx, %xmm0, %xmm0
        movq    %rax, %rcx
        shrq    $56, %rax
        shrq    $48, %rcx
        vpinsrb $14, %ecx, %xmm0, %xmm0
        movq    16(%rdi), %rcx
        vpinsrb $15, %eax, %xmm0, %xmm0
        vmovd   %ecx, %xmm1
        movl    %ecx, %eax
        shrl    $8, %eax
        vpinsrb $1, %eax, %xmm1, %xmm1
        movl    %ecx, %eax
        shrl    $16, %eax
        vpinsrb $2, %eax, %xmm1, %xmm1
        movl    %ecx, %eax
        shrl    $24, %eax
        vpinsrb $3, %eax, %xmm1, %xmm1
        movq    %rcx, %rax
        shrq    $32, %rax
        vpinsrb $4, %eax, %xmm1, %xmm1
        movq    %rcx, %rax
        shrq    $40, %rax
        vpinsrb $5, %eax, %xmm1, %xmm1
        movq    %rcx, %rax
        shrq    $56, %rcx
        shrq    $48, %rax
        vpinsrb $6, %eax, %xmm1, %xmm1
        movq    24(%rdi), %rax
        vpinsrb $7, %ecx, %xmm1, %xmm1
        movl    %eax, %ecx
        vpinsrb $8, %eax, %xmm1, %xmm1
        shrl    $8, %ecx
        vpinsrb $9, %ecx, %xmm1, %xmm1
        movl    %eax, %ecx
        shrl    $16, %ecx
        vpinsrb $10, %ecx, %xmm1, %xmm1
        movl    %eax, %ecx
        shrl    $24, %ecx
        vpinsrb $11, %ecx, %xmm1, %xmm1
        movq    %rax, %rcx
        shrq    $32, %rcx
        vpinsrb $12, %ecx, %xmm1, %xmm1
        movq    %rax, %rcx
        shrq    $40, %rcx
        vpinsrb $13, %ecx, %xmm1, %xmm1
        movq    %rax, %rcx
        shrq    $56, %rax
        shrq    $48, %rcx
        vpinsrb $14, %ecx, %xmm1, %xmm1
        vpinsrb $15, %eax, %xmm1, %xmm1
        vpmaxub %xmm1, %xmm0, %xmm0
        vpcmpeqd        %xmm1, %xmm1, %xmm1
        vpxor   %xmm1, %xmm0, %xmm0
        vpsrlw  $8, %xmm0, %xmm1
        vpminub %xmm1, %xmm0, %xmm0
        vphminposuw     %xmm0, %xmm0
        vmovd   %xmm0, %eax
        notb    %al
        vzeroupper
        retq
.LBB0_1:
        xorl    %eax, %eax
        vzeroupper
        retq

dst:
        vmovdqu (%rdi), %ymm0
        xorl    %ecx, %ecx
        vpcmpeqd        %xmm1, %xmm1, %xmm1
        vptest  %ymm0, %ymm0
        vpmaxub 16(%rdi), %xmm0, %xmm0
        vpxor   %xmm1, %xmm0, %xmm0
        vpsrlw  $8, %xmm0, %xmm1
        vpminub %xmm1, %xmm0, %xmm0
        vphminposuw     %xmm0, %xmm0
        vmovd   %xmm0, %eax
        notb    %al
        movzbl  %al, %eax
        cmovel  %ecx, %eax
        vzeroupper
        retq
```

In this case it's even worse, because the `icmp eq i256` will be performed using a vector type in lowering :(

(Not sure if this belongs in CodeGenPrepare, but it already performs the opposite fold — sinking extension sources into user blocks to avoid splitting.)
_______________________________________________
llvm-bugs mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-bugs

Reply via email to