[Bug tree-optimization/92645] Hand written vector code is 450 times slower when compiled with GCC compared to Clang

rguenth at gcc dot gnu.org Wed, 27 Nov 2019 06:58:46 -0800

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92645


--- Comment #13 from Richard Biener <rguenth at gcc dot gnu.org> ---
So with all tricks I arrive at the following for the reduced testcase

# f -- GCC-generated AVX2 code for the reduced testcase (GAS, AT&T syntax).
# Registers read on entry: %rdi (store base), %rsi (load base), %rdx (loop
# count), %ecx (32-bit value broadcast below).
# NOTE(review): this matches the SysV AMD64 order f(dst, src, count, value);
# the C prototype is not part of this listing -- confirm against the testcase.
# Scratch memory is addressed at -32(%rsp), i.e. below the stack pointer;
# presumably relies on the SysV red zone (the function makes no calls).
# The repeated "store zero ymm / store xmm / vpmovzxbw reload" sequences are
# the stack spills discussed in the text that follows the listing.
f:
.LFB2:
        .cfi_startproc
        pushq   %rbp
        .cfi_def_cfa_offset 16
        .cfi_offset 6, -16
        movl    %ecx, %r9d
        vpxor   %xmm4, %xmm4, %xmm4     # ymm4 = 0 (VEX 128-bit op also clears the upper half)
        movl    $255, %eax
        shrl    $24, %r9d               # r9d = top byte of %ecx
        subl    %r9d, %eax              # eax = 255 - (ecx >> 24)
        movq    %rsp, %rbp
        .cfi_def_cfa_register 6
        andq    $-32, %rsp              # 32-byte align %rsp so the aligned ymm stores below are legal
        movl    %eax, %r8d              # keep a copy of 255 - (ecx >> 24)
        vmovdqa %ymm4, -32(%rsp)        # spill #1: zero the whole 32-byte slot ...
        vmovd   %ecx, %xmm4
        shrl    $7, %eax
        vpshufd $0, %xmm4, %xmm4        # broadcast %ecx to all four dwords of xmm4
        addl    %r8d, %eax              # eax = t + (t >> 7), with t = 255 - (ecx >> 24)
        vmovaps %xmm4, -32(%rsp)        # ... overwrite its low 16 bytes with the broadcast ...
        vpmovzxbw       -32(%rsp), %ymm2        # ... and reload: 16 bytes zero-extended to 16 words
        vmovd   %eax, %xmm0
        vpbroadcastb    %xmm0, %xmm0    # low byte of %eax replicated into all 16 bytes of xmm0
        vpsllw  $8, %ymm2, %ymm2        # each word of ymm2 <<= 8
        vpaddw  .LC1(%rip), %ymm2, %ymm2        # add per-word constant .LC1 (defined outside this listing)
        testq   %rdx, %rdx
        je      .L10                    # count == 0: skip the loop entirely
        vpxor   %xmm5, %xmm5, %xmm5
        vmovdqa .LC0(%rip), %xmm3       # xmm3 = .LC0, used as the AND mask before packing (value not shown here)
        xorl    %eax, %eax              # rax = byte offset into both buffers, starts at 0
        vmovdqa %ymm5, -32(%rsp)        # spill #2: same zero/store/reload pattern ...
        vmovaps %xmm0, -32(%rsp)
        vpmovzxbw       -32(%rsp), %ymm4        # ymm4 = broadcast byte widened to 16 words (loop-invariant multiplier)
        .p2align 4,,10
        .p2align 3
# Loop body: 16 bytes are loaded and stored per iteration while %rdx drops by 4.
# NOTE(review): presumably %rdx counts 4-byte elements -- confirm. The loop
# exits only when %rdx reaches exactly zero.
.L7:
        vmovdqu (%rsi,%rax), %xmm6      # load 16 bytes from rsi + rax (unaligned load)
        vpxor   %xmm5, %xmm5, %xmm5     # spill #3, repeated every iteration: re-zero ymm5 ...
        vmovdqa %ymm5, -32(%rsp)        # ... zero the slot ...
        vmovaps %xmm6, -32(%rsp)        # ... store the loaded bytes ...
        vpmovzxbw       -32(%rsp), %ymm0        # ... and reload them widened to 16 words
        vpmullw %ymm4, %ymm0, %ymm0     # words *= multiplier (low 16 bits of each product)
        vpaddw  %ymm2, %ymm0, %ymm0     # words += the ymm2 term computed before the loop
        vpsrlw  $8, %ymm0, %ymm0        # words >>= 8 (logical)
        vmovdqa %xmm0, %xmm1            # xmm1 = low 8 words
        vextracti128    $0x1, %ymm0, %xmm0      # xmm0 = high 8 words
        vpand   %xmm1, %xmm3, %xmm1     # mask both halves with .LC0
        vpand   %xmm0, %xmm3, %xmm0
        vpackuswb       %xmm0, %xmm1, %xmm0     # pack 16 words back to 16 bytes (unsigned saturation)
        vmovups %xmm0, (%rdi,%rax)      # store 16 bytes to rdi + rax (unaligned store)
        addq    $16, %rax
        subq    $4, %rdx
        jne     .L7
.L10:
        vzeroupper                      # clear upper ymm halves before returning
        leave                           # restore %rsp from %rbp and pop the saved %rbp
        .cfi_def_cfa 7, 8
        ret
        .cfi_endproc
.LFE2:
        .size   f, .-f

where the stack spills still look bad - somehow we don't like

  _60 = BIT_INSERT_EXPR <{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, _4, 0>;
  _61 = [vec_unpack_lo_expr] _60;

which is "widening" _4 to double vector size when we know we'll just need
the lowpart for the VEC_UNPACK_LO_EXPR.  This _should_ translate to
a mov %xmm, %ymm but somehow it doesn't.

A small testcase for that is the zxt() function in the reduced testcase.
Using an undef SSA name in place of the { 0, ... } vector doesn't help
either.  A simple VIEW_CONVERT isn't valid (it changes size).

[Bug tree-optimization/92645] Hand written vector code is 450 times slower when compiled with GCC compared to Clang

Reply via email to