https://gcc.gnu.org/bugzilla/show_bug.cgi?id=80634

            Bug ID: 80634
           Summary: strangely missed vectorization optimizations
           Product: gcc
           Version: 6.3.1
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: other
          Assignee: unassigned at gcc dot gnu.org
          Reporter: steven at uplinklabs dot net
  Target Milestone: ---

Created attachment 41322
  --> https://gcc.gnu.org/bugzilla/attachment.cgi?id=41322&action=edit
gcc 6.3.1 outputs for ELEMS=1 through ELEMS=32

(Not sure which component is the correct one for this issue.)

I've noticed that the vectorizer makes some strange choices occasionally, and
will turn some straightforward code into a large branchy code sequence. Take
this, for example:

void saxpy(float a, float * restrict x, float * restrict y)
{
        for (int i = 0; i < ELEMS; ++i)
                y[i] = a*x[i] + y[i];
}

If I use the flags "-O3 -march=haswell" (or "-O3 -xAVX2" on ICC) and use
varying definitions of ELEMS, I will sometimes get odd results with GCC 6.3.1.

Here's -DELEMS=6 with GCC 6.3.1:

saxpy:
        vshufps $0, %xmm0, %xmm0, %xmm1
        vmovups (%rsi), %xmm2
        vfmadd132ps     (%rdi), %xmm2, %xmm1
        vmovss  20(%rsi), %xmm3
        vmovups %xmm1, (%rsi)
        vmovss  16(%rdi), %xmm1
        vfmadd213ss     16(%rsi), %xmm0, %xmm1
        vfmadd132ss     20(%rdi), %xmm3, %xmm0
        vmovss  %xmm1, 16(%rsi)
        vmovss  %xmm0, 20(%rsi)
        ret

Seems reasonable.


Here's -DELEMS=7 with GCC 6.3.1:

saxpy:
        movq    %rsi, %rax
        shrq    $2, %rax
        negq    %rax
        andl    $3, %eax
        je      .L7
        vmovss  (%rdi), %xmm1
        vfmadd213ss     (%rsi), %xmm0, %xmm1
        vmovss  %xmm1, (%rsi)
        cmpl    $1, %eax
        je      .L8
        vmovss  4(%rdi), %xmm1
        vfmadd213ss     4(%rsi), %xmm0, %xmm1
        vmovss  %xmm1, 4(%rsi)
        cmpl    $3, %eax
        jne     .L9
        vmovss  8(%rdi), %xmm1
        vfmadd213ss     8(%rsi), %xmm0, %xmm1
        movl    $4, %r8d
        movl    $3, %edx
        vmovss  %xmm1, 8(%rsi)
.L3:
        movl    $7, %ecx
        movl    %eax, %r9d
        subl    %eax, %ecx
.L2:
        leaq    (%rsi,%r9,4), %rax
        vshufps $0, %xmm0, %xmm0, %xmm1
        vmovaps (%rax), %xmm3
        vfmadd132ps     (%rdi,%r9,4), %xmm3, %xmm1
        vmovaps %xmm1, (%rax)
        leal    4(%rdx), %eax
        cmpl    $4, %ecx
        je      .L19
        cltq
        addl    $5, %edx
        leaq    (%rsi,%rax,4), %rcx
        vmovss  (%rdi,%rax,4), %xmm1
        vfmadd213ss     (%rcx), %xmm0, %xmm1
        vmovss  %xmm1, (%rcx)
        cmpl    $5, %r8d
        je      .L17
        movslq  %edx, %rdx
        leaq    (%rsi,%rdx,4), %rax
        vmovss  (%rdi,%rdx,4), %xmm1
        vfmadd213ss     (%rax), %xmm0, %xmm1
        vmovss  %xmm1, (%rax)
        cmpl    $6, %r8d
        je      .L17
        vmovss  24(%rsi), %xmm2
        vfmadd132ss     24(%rdi), %xmm2, %xmm0
        vmovss  %xmm0, 24(%rsi)
        ret
.L17:
        ret
.L7:
        movl    $7, %ecx
        xorl    %r9d, %r9d
        movl    $7, %r8d
        xorl    %edx, %edx
        jmp     .L2
.L19:
        ret
.L8:
        movl    $6, %r8d
        movl    $1, %edx
        jmp     .L3
.L9:
        movl    $5, %r8d
        movl    $2, %edx
        jmp     .L3


This might be explained away by ELEMS=7 being an odd number just short of a
power of two, but ICC does an apparently better job (one packed FMA plus three
scalar FMAs):

saxpy:

        vbroadcastss %xmm0, %xmm2
        vmovups   (%rdi), %xmm1
        vmovss    16(%rdi), %xmm3
        vmovss    20(%rdi), %xmm4
        vmovss    24(%rdi), %xmm5
        vfmadd213ps (%rsi), %xmm1, %xmm2
        vfmadd213ss 16(%rsi), %xmm0, %xmm3
        vfmadd213ss 20(%rsi), %xmm0, %xmm4
        vfmadd213ss 24(%rsi), %xmm5, %xmm0
        vmovups   %xmm2, (%rsi)
        vmovss    %xmm3, 16(%rsi)
        vmovss    %xmm4, 20(%rsi)
        vmovss    %xmm0, 24(%rsi)
        ret

The results from GCC 6.3.1 for ELEMS values 8 through 14 look fine (short
branchless code sequences similar to what ICC emits), but the long branchy
code sequences return for what seems to be *any* ELEMS value of 15 or above.

It even misses the opportunity with ELEMS=16 to just do two packed FMAs with
YMM registers:

saxpy:
        movq    %rsi, %rax
        shrq    $2, %rax
        negq    %rax
        andl    $7, %eax
        je      .L7
        vmovss  (%rdi), %xmm1
        vfmadd213ss     (%rsi), %xmm0, %xmm1
        vmovss  %xmm1, (%rsi)
        cmpl    $1, %eax
        je      .L8
        vmovss  4(%rdi), %xmm1
        vfmadd213ss     4(%rsi), %xmm0, %xmm1
        vmovss  %xmm1, 4(%rsi)
        cmpl    $2, %eax
        je      .L9
        vmovss  8(%rdi), %xmm1
        vfmadd213ss     8(%rsi), %xmm0, %xmm1
        vmovss  %xmm1, 8(%rsi)
        cmpl    $3, %eax
        je      .L10
        vmovss  12(%rdi), %xmm1
        vfmadd213ss     12(%rsi), %xmm0, %xmm1
        vmovss  %xmm1, 12(%rsi)
        cmpl    $4, %eax
        je      .L11
        vmovss  16(%rdi), %xmm1
        vfmadd213ss     16(%rsi), %xmm0, %xmm1
        vmovss  %xmm1, 16(%rsi)
        cmpl    $5, %eax
        je      .L12
        vmovss  20(%rdi), %xmm1
        vfmadd213ss     20(%rsi), %xmm0, %xmm1
        vmovss  %xmm1, 20(%rsi)
        cmpl    $7, %eax
        jne     .L13
        vmovss  24(%rdi), %xmm1
        vfmadd213ss     24(%rsi), %xmm0, %xmm1
        movl    $9, %r9d
        movl    $7, %r10d
        vmovss  %xmm1, 24(%rsi)
.L3:
        movl    $16, %ecx
        movl    %eax, %edx
        movl    $8, %r8d
        movl    $1, %r11d
        subl    %eax, %ecx
.L2:
        salq    $2, %rdx
        vbroadcastss    %xmm0, %ymm1
        leaq    (%rdi,%rdx), %rax
        addq    %rsi, %rdx
        vmovups (%rax), %ymm2
        vfmadd213ps     (%rdx), %ymm1, %ymm2
        vmovaps %ymm2, (%rdx)
        cmpl    $2, %r11d
        jne     .L4
        vmovaps 32(%rdx), %ymm4
        vfmadd132ps     32(%rax), %ymm4, %ymm1
        vmovaps %ymm1, 32(%rdx)
.L4:
        movl    %r9d, %edx
        leal    (%r8,%r10), %eax
        subl    %r8d, %edx
        cmpl    %r8d, %ecx
        je      .L29
        movslq  %eax, %r8
        leaq    (%rsi,%r8,4), %rcx
        vmovss  (%rdi,%r8,4), %xmm1
        vfmadd213ss     (%rcx), %xmm0, %xmm1
        vmovss  %xmm1, (%rcx)
        leal    1(%rax), %ecx
        cmpl    $1, %edx
        je      .L29
        movslq  %ecx, %rcx
        leaq    (%rsi,%rcx,4), %r8
        vmovss  (%rdi,%rcx,4), %xmm1
        leal    2(%rax), %ecx
        vfmadd213ss     (%r8), %xmm0, %xmm1
        vmovss  %xmm1, (%r8)
        cmpl    $2, %edx
        je      .L29
        movslq  %ecx, %rcx
        leaq    (%rsi,%rcx,4), %r8
        vmovss  (%rdi,%rcx,4), %xmm1
        leal    3(%rax), %ecx
        vfmadd213ss     (%r8), %xmm0, %xmm1
        vmovss  %xmm1, (%r8)
        cmpl    $3, %edx
        je      .L29
        movslq  %ecx, %rcx
        leaq    (%rsi,%rcx,4), %r8
        vmovss  (%rdi,%rcx,4), %xmm1
        leal    4(%rax), %ecx
        vfmadd213ss     (%r8), %xmm0, %xmm1
        vmovss  %xmm1, (%r8)
        cmpl    $4, %edx
        je      .L29
        movslq  %ecx, %rcx
        leaq    (%rsi,%rcx,4), %r8
        vmovss  (%rdi,%rcx,4), %xmm1
        leal    5(%rax), %ecx
        vfmadd213ss     (%r8), %xmm0, %xmm1
        vmovss  %xmm1, (%r8)
        cmpl    $5, %edx
        je      .L29
        movslq  %ecx, %rcx
        addl    $6, %eax
        leaq    (%rsi,%rcx,4), %r8
        vmovss  (%rdi,%rcx,4), %xmm1
        vfmadd213ss     (%r8), %xmm0, %xmm1
        vmovss  %xmm1, (%r8)
        cmpl    $6, %edx
        je      .L29
        cltq
        leaq    (%rsi,%rax,4), %rdx
        vmovss  (%rdx), %xmm3
        vfmadd132ss     (%rdi,%rax,4), %xmm3, %xmm0
        vmovss  %xmm0, (%rdx)
.L29:
        vzeroupper
        ret
.L7:
        movl    $16, %r8d
        movl    $16, %ecx
        xorl    %edx, %edx
        xorl    %r10d, %r10d
        movl    $2, %r11d
        movl    $16, %r9d
        jmp     .L2
.L13:
        movl    $10, %r9d
        movl    $6, %r10d
        jmp     .L3
.L8:
        movl    $15, %r9d
        movl    $1, %r10d
        jmp     .L3
.L9:
        movl    $14, %r9d
        movl    $2, %r10d
        jmp     .L3
.L10:
        movl    $13, %r9d
        movl    $3, %r10d
        jmp     .L3
.L11:
        movl    $12, %r9d
        movl    $4, %r10d
        jmp     .L3
.L12:
        movl    $11, %r9d
        movl    $5, %r10d
        jmp     .L3


ICC gets ELEMS=16 right:

saxpy:

        vmovups   (%rdi), %ymm1
        vmovups   32(%rdi), %ymm2
        vbroadcastss %xmm0, %ymm3
        vfmadd213ps (%rsi), %ymm3, %ymm1
        vfmadd213ps 32(%rsi), %ymm2, %ymm3
        vmovups   %ymm1, (%rsi)
        vmovups   %ymm3, 32(%rsi)
        vzeroupper 
        ret 


I'll attach the code outputs for ELEMS values 1 through 32 using GCC 6.3.1 and
ICC 17.0.1.

Reply via email to