https://gcc.gnu.org/bugzilla/show_bug.cgi?id=64909

            Bug ID: 64909
           Summary: [4.8/5 regression] Missed vectorization
           Product: gcc
           Version: 5.0
            Status: UNCONFIRMED
          Severity: enhancement
          Priority: P3
         Component: middle-end
          Assignee: unassigned at gcc dot gnu.org
          Reporter: hubicka at gcc dot gnu.org

Hi,
the following loop (taken from Firefox's Unicode handling code):
unsigned short a[32];
unsigned int b[32];
t()
{
  int i;
  for (i=0;i<12;i++)
    b[i]=a[i];
}

is compiled by clang to:
t:                                      # @t
        .cfi_startproc
# BB#0:
        vpmovzxwd       a(%rip), %xmm0
        vmovdqa .LCPI0_0(%rip), %xmm1   # xmm1 = [65535,65535,65535,65535]
        vpand   %xmm1, %xmm0, %xmm0
        vmovdqa %xmm0, b(%rip)
        vpmovzxwd       a+8(%rip), %xmm0
        vpand   %xmm1, %xmm0, %xmm0
        vmovdqa %xmm0, b+16(%rip)
        vpmovzxwd       a+16(%rip), %xmm0
        vpand   %xmm1, %xmm0, %xmm0
        vmovdqa %xmm0, b+32(%rip)
        retq

GCC 4.7 does:
t:
.LFB0:
        .cfi_startproc
        movzwl  a+16(%rip), %eax
        vmovaps a(%rip), %xmm0
        vpmovzxwd       %xmm0, %xmm1
        vpsrldq $8, %xmm0, %xmm0
        vpmovzxwd       %xmm0, %xmm0
        movl    %eax, b+32(%rip)
        movzwl  a+18(%rip), %eax
        vmovaps %xmm1, b(%rip)
        vmovaps %xmm0, b+16(%rip)
        movl    %eax, b+36(%rip)
        movzwl  a+20(%rip), %eax
        movl    %eax, b+40(%rip)
        movzwl  a+22(%rip), %eax
        movl    %eax, b+44(%rip)
        ret

while GCC 4.8 and mainline completely unroll the loop and leave it as scalar code (it is never vectorized):

t:
.LFB0:
        .cfi_startproc
        movzwl  a(%rip), %eax
        movl    %eax, b(%rip)
        movzwl  a+2(%rip), %eax
        movl    %eax, b+4(%rip)
        movzwl  a+4(%rip), %eax
        movl    %eax, b+8(%rip)
        movzwl  a+6(%rip), %eax
        movl    %eax, b+12(%rip)
        movzwl  a+8(%rip), %eax
        movl    %eax, b+16(%rip)
        movzwl  a+10(%rip), %eax
        movl    %eax, b+20(%rip)
        movzwl  a+12(%rip), %eax
        movl    %eax, b+24(%rip)
        movzwl  a+14(%rip), %eax
        movl    %eax, b+28(%rip)
        movzwl  a+16(%rip), %eax
        movl    %eax, b+32(%rip)
        movzwl  a+18(%rip), %eax
        movl    %eax, b+36(%rip)
        movzwl  a+20(%rip), %eax
        movl    %eax, b+40(%rip)
        movzwl  a+22(%rip), %eax
        movl    %eax, b+44(%rip)
        ret

Reply via email to