https://gcc.gnu.org/bugzilla/show_bug.cgi?id=79491
Bug ID: 79491
Summary: Possibly inefficient code for the inner product of two vectors
Product: gcc
Version: 7.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: c
Assignee: unassigned at gcc dot gnu.org
Reporter: drraph at gmail dot com
Target Milestone: ---

Consider:

float f(float x[], float y[]) {
    float p = 0;
    for (int i = 0; i < 64; i++)
        p += x[i] * y[i];
    return p;
}

Using gcc 7 (snapshot) with -Ofast -march=core-avx2 you get:

f:
        mov     rax, rdi
        shr     rax, 2
        neg     rax
        and     eax, 7
        je      .L6
        vmovss  xmm0, DWORD PTR [rdi]
        vmulss  xmm1, xmm0, DWORD PTR [rsi]
        cmp     eax, 1
        je      .L7
        vmovss  xmm4, DWORD PTR [rdi+4]
        vfmadd231ss     xmm1, xmm4, DWORD PTR [rsi+4]
        cmp     eax, 2
        je      .L8
        vmovss  xmm3, DWORD PTR [rdi+8]
        vfmadd231ss     xmm1, xmm3, DWORD PTR [rsi+8]
        cmp     eax, 3
        je      .L9
        vmovss  xmm2, DWORD PTR [rdi+12]
        vfmadd231ss     xmm1, xmm2, DWORD PTR [rsi+12]
        cmp     eax, 4
        je      .L10
        vmovss  xmm3, DWORD PTR [rdi+16]
        vfmadd231ss     xmm1, xmm3, DWORD PTR [rsi+16]
        cmp     eax, 5
        je      .L11
        vmovss  xmm7, DWORD PTR [rdi+20]
        vfmadd231ss     xmm1, xmm7, DWORD PTR [rsi+20]
        cmp     eax, 7
        jne     .L12
        vmovss  xmm4, DWORD PTR [rsi+24]
        vfmadd231ss     xmm1, xmm4, DWORD PTR [rdi+24]
        mov     r9d, 57
        mov     r10d, 7
.L2:
        mov     ecx, 64
        sub     ecx, eax
        mov     eax, eax
        sal     rax, 2
        mov     r8d, ecx
        lea     rdx, [rdi+rax]
        add     rax, rsi
        shr     r8d, 3
        vmovups ymm0, YMMWORD PTR [rax+32]
        vmulps  ymm0, ymm0, YMMWORD PTR [rdx+32]
        vmovaps ymm3, YMMWORD PTR [rdx]
        vfmadd231ps     ymm0, ymm3, YMMWORD PTR [rax]
        vmovaps ymm4, YMMWORD PTR [rdx+64]
        vfmadd231ps     ymm0, ymm4, YMMWORD PTR [rax+64]
        vmovaps ymm5, YMMWORD PTR [rdx+96]
        vfmadd231ps     ymm0, ymm5, YMMWORD PTR [rax+96]
        vmovaps ymm6, YMMWORD PTR [rdx+128]
        vmovaps ymm7, YMMWORD PTR [rdx+160]
        vfmadd231ps     ymm0, ymm6, YMMWORD PTR [rax+128]
        vmovaps ymm3, YMMWORD PTR [rdx+192]
        vfmadd231ps     ymm0, ymm7, YMMWORD PTR [rax+160]
        vfmadd231ps     ymm0, ymm3, YMMWORD PTR [rax+192]
        cmp     r8d, 8
        jne     .L4
        vmovaps ymm4, YMMWORD PTR [rdx+224]
        vfmadd231ps     ymm0, ymm4, YMMWORD PTR [rax+224]
.L4:
        vhaddps ymm0, ymm0, ymm0
        mov     r8d, ecx
        mov     edx, r9d
        and     r8d, -8
        lea     eax, [r8+r10]
        sub     edx, r8d
        vhaddps ymm2, ymm0, ymm0
        vperm2f128      ymm0, ymm2, ymm2, 1
        vaddps  ymm0, ymm0, ymm2
        vaddss  xmm0, xmm0, xmm1
        cmp     ecx, r8d
        je      .L31
        movsx   rcx, eax
        vmovss  xmm5, DWORD PTR [rdi+rcx*4]
        vfmadd231ss     xmm0, xmm5, DWORD PTR [rsi+rcx*4]
        lea     ecx, [rax+1]
        cmp     edx, 1
        je      .L31
        movsx   rcx, ecx
        vmovss  xmm6, DWORD PTR [rdi+rcx*4]
        vfmadd231ss     xmm0, xmm6, DWORD PTR [rsi+rcx*4]
        lea     ecx, [rax+2]
        cmp     edx, 2
        je      .L31
        movsx   rcx, ecx
        vmovss  xmm7, DWORD PTR [rdi+rcx*4]
        vfmadd231ss     xmm0, xmm7, DWORD PTR [rsi+rcx*4]
        lea     ecx, [rax+3]
        cmp     edx, 3
        je      .L31
        movsx   rcx, ecx
        vmovss  xmm2, DWORD PTR [rdi+rcx*4]
        vfmadd231ss     xmm0, xmm2, DWORD PTR [rsi+rcx*4]
        lea     ecx, [rax+4]
        cmp     edx, 4
        je      .L31
        movsx   rcx, ecx
        vmovss  xmm7, DWORD PTR [rdi+rcx*4]
        vfmadd231ss     xmm0, xmm7, DWORD PTR [rsi+rcx*4]
        lea     ecx, [rax+5]
        cmp     edx, 5
        je      .L31
        movsx   rcx, ecx
        add     eax, 6
        vmovss  xmm5, DWORD PTR [rdi+rcx*4]
        vfmadd231ss     xmm0, xmm5, DWORD PTR [rsi+rcx*4]
        cmp     edx, 6
        je      .L31
        cdqe
        vmovss  xmm6, DWORD PTR [rdi+rax*4]
        vfmadd231ss     xmm0, xmm6, DWORD PTR [rsi+rax*4]
.L31:
        vzeroupper
        ret
.L10:
        mov     r9d, 60
        mov     r10d, 4
        jmp     .L2
.L7:
        mov     r9d, 63
        mov     r10d, 1
        jmp     .L2
.L6:
        mov     r9d, 64
        xor     r10d, r10d
        vxorps  xmm1, xmm1, xmm1
        jmp     .L2
.L8:
        mov     r9d, 62
        mov     r10d, 2
        jmp     .L2
.L9:
        mov     r9d, 61
        mov     r10d, 3
        jmp     .L2
.L11:
        mov     r9d, 59
        mov     r10d, 5
        jmp     .L2
.L12:
        mov     r9d, 58
        mov     r10d, 6
        jmp     .L2

However, the output from clang trunk seems more efficient:

f:                                      # @f
        vmovups ymm0, ymmword ptr [rsi]
        vmovups ymm1, ymmword ptr [rsi + 32]
        vmovups ymm2, ymmword ptr [rsi + 64]
        vmovups ymm3, ymmword ptr [rsi + 96]
        vmulps  ymm0, ymm0, ymmword ptr [rdi]
        vfmadd231ps     ymm0, ymm1, ymmword ptr [rdi + 32]
        vfmadd231ps     ymm0, ymm2, ymmword ptr [rdi + 64]
        vfmadd231ps     ymm0, ymm3, ymmword ptr [rdi + 96]
        vmovups ymm1, ymmword ptr [rsi + 128]
        vfmadd132ps     ymm1, ymm0, ymmword ptr [rdi + 128]
        vmovups ymm0, ymmword ptr [rsi + 160]
        vfmadd132ps     ymm0, ymm1, ymmword ptr [rdi + 160]
        vmovups ymm1, ymmword ptr [rsi + 192]
        vfmadd132ps     ymm1, ymm0, ymmword ptr [rdi + 192]
        vmovups ymm0, ymmword ptr [rsi + 224]
        vfmadd132ps     ymm0, ymm1, ymmword ptr [rdi + 224]
        vextractf128    xmm1, ymm0, 1
        vaddps  ymm0, ymm0, ymm1
        vpermilpd       xmm1, xmm0, 1   # xmm1 = xmm0[1,0]
        vaddps  ymm0, ymm0, ymm1
        vhaddps ymm0, ymm0, ymm0
        vzeroupper
        ret

It seems that gcc is going to considerable lengths to align the data (a runtime check on the address of x, plus fully unrolled scalar peel code before and after the vector loop), which may not be worth the cost: clang simply uses unaligned loads throughout and avoids all of the branching.
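If the caller can actually guarantee alignment, one source-level workaround is to state it explicitly with GCC's __builtin_assume_aligned builtin, which should let the vectorizer drop the peeling code. A minimal sketch (f_aligned is an illustrative variant, not part of this report, and it assumes both arrays really are 32-byte aligned, e.g. from aligned_alloc(32, ...)):

/* Sketch only: passing pointers that are not in fact 32-byte
   aligned here is undefined behavior. */
float f_aligned(float x[], float y[])
{
    const float *xa = __builtin_assume_aligned(x, 32);
    const float *ya = __builtin_assume_aligned(y, 32);
    float p = 0;
    for (int i = 0; i < 64; i++)
        p += xa[i] * ya[i];   /* vectorizable with aligned loads */
    return p;
}

That said, the general case reported here is about arrays of unknown alignment, where the question stands: whether the runtime peeling is worth it versus unaligned vector loads as clang emits.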