https://gcc.gnu.org/bugzilla/show_bug.cgi?id=101611

            Bug ID: 101611
           Summary: AVX2 vector arithmetic shift lowered to scalar
                    unnecessarily
           Product: gcc
           Version: 12.0
            Status: UNCONFIRMED
          Keywords: missed-optimization
          Severity: enhancement
          Priority: P3
         Component: target
          Assignee: unassigned at gcc dot gnu.org
          Reporter: glisse at gcc dot gnu.org
  Target Milestone: ---
            Target: x86_64-*-*

Stealing the example from PR 56873

#define SIZE 32
typedef long long veci __attribute__((vector_size(SIZE)));

veci f(veci a, veci b){
  return a>>b;
}

but compiling with -O3 -mavx2 this time, gcc produces scalar code

        vmovq   %xmm1, %rcx
        vmovq   %xmm0, %rax
        vpextrq $1, %xmm0, %rsi
        sarq    %cl, %rax
        vextracti128    $0x1, %ymm0, %xmm0
        vpextrq $1, %xmm1, %rcx
        vextracti128    $0x1, %ymm1, %xmm1
        movq    %rax, %rdx
        sarq    %cl, %rsi
        vmovq   %xmm0, %rax
        vmovq   %xmm1, %rcx
        vmovq   %rdx, %xmm5
        sarq    %cl, %rax
        vpextrq $1, %xmm1, %rcx
        movq    %rax, %rdi
        vpextrq $1, %xmm0, %rax
        vpinsrq $1, %rsi, %xmm5, %xmm0
        sarq    %cl, %rax
        vmovq   %rdi, %xmm4
        vpinsrq $1, %rax, %xmm4, %xmm1
        vinserti128     $0x1, %xmm1, %ymm0, %ymm0
        ret

while clang outputs much shorter vector code

        vpbroadcastq    .LCPI0_0(%rip), %ymm2   # ymm2 = [0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000] (sign-bit mask, 9223372036854775808 in each lane)
        vpsrlvq %ymm1, %ymm2, %ymm2
        vpsrlvq %ymm1, %ymm0, %ymm0
        vpxor   %ymm2, %ymm0, %ymm0
        vpsubq  %ymm2, %ymm0, %ymm0
        retq

Reply via email to