https://bugs.llvm.org/show_bug.cgi?id=48046

            Bug ID: 48046
           Summary: Better codegen for strided load
           Product: libraries
           Version: trunk
          Hardware: PC
                OS: Windows NT
            Status: NEW
          Severity: enhancement
          Priority: P
         Component: Backend: X86
          Assignee: [email protected]
          Reporter: [email protected]
                CC: [email protected], [email protected],
                    [email protected], [email protected],
                    [email protected]

/*
 * Reproducer: copy N floats out of x, starting at index p and stepping by
 * 3 elements per iteration, into consecutive slots of y.
 */
#define N 4

/* x is the strided source (4*N floats); y is the contiguous destination (N floats). */
float x[4*N], y[N];

/*
 * p: index of the first element of x to read. It is not range-checked here;
 *    the largest index read is p + 3*(N-1).
 */
void foo (int p)
{
  int i;
  /* The trip count is the compile-time constant N; reads from x are 3 floats
     apart, writes to y are adjacent. */
  for (i = 0; i < N; i++)
    y[i] = x[p + 3*i];
}

Clang -O3 -mavx2:
# Annotation (added in review, not part of the compiler output):
#   The loop is fully unrolled into four independent scalar copies. Each
#   vmovss load reads one float from x at byte offsets 0, 12, 24 and 36
#   relative to x + 4*rax, and is followed immediately by a scalar vmovss
#   store to y+0, y+4, y+8 and y+12. The elements are never combined into
#   a vector, so the sequence contains four loads and four stores.
#   rax is the sign-extended edi (presumably the int argument p under the
#   SysV AMD64 convention).
foo(int):                                # @foo(int)
        movsxd  rax, edi
        vmovss  xmm0, dword ptr [4*rax + x]     # xmm0 = mem[0],zero,zero,zero
        vmovss  dword ptr [rip + y], xmm0
        vmovss  xmm0, dword ptr [4*rax + x+12]  # xmm0 = mem[0],zero,zero,zero
        vmovss  dword ptr [rip + y+4], xmm0
        vmovss  xmm0, dword ptr [4*rax + x+24]  # xmm0 = mem[0],zero,zero,zero
        vmovss  dword ptr [rip + y+8], xmm0
        vmovss  xmm0, dword ptr [4*rax + x+36]  # xmm0 = mem[0],zero,zero,zero
        vmovss  dword ptr [rip + y+12], xmm0
        ret

ICC -O3 -mavx2:
# Annotation (added in review, not part of the compiler output):
#   All four elements are gathered into one register and written with a
#   single 16-byte store:
#     xmm16 = {x[p+3]}, xmm17 = {x[p]}                  (scalar loads)
#     xmm1  = {x[p+3], x[p+9]}, xmm0 = {x[p], x[p+6]}   (vinsertps imm 16
#             places the memory dword into element 1)
#     xmm2  = {x[p], x[p+3], x[p+6], x[p+9]}            (vunpcklps interleaves
#             the low two elements of xmm0 and xmm1)
#   vmovups then stores xmm2 to y.
#   The trailing "#line.column" comments are emitted by the compiler.
#   NOTE(review): xmm16/xmm17 can only be encoded with EVEX (AVX-512VL), so
#   this listing does not look like plain AVX2 code -- confirm the flags
#   actually used against the Godbolt link.
#   NOTE(review): the bare "x:" / "y:" labels below have no data directives;
#   presumably they were filtered out of the listing.
foo(int):
        movsxd    rdi, edi                                      #6.1
        vmovss    xmm16, DWORD PTR [12+x+rdi*4]                 #10.12
        vmovss    xmm17, DWORD PTR [x+rdi*4]                    #10.12
        vinsertps xmm1, xmm16, DWORD PTR [36+x+rdi*4], 16       #10.12
        vinsertps xmm0, xmm17, DWORD PTR [24+x+rdi*4], 16       #10.12
        vunpcklps xmm2, xmm0, xmm1                              #10.12
        vmovups   XMMWORD PTR y[rip], xmm2                      #10.5
        ret                                                     #11.1
x:
y:

GCC -O3 -mavx2:
# Annotation (added in review, not part of the compiler output):
#   The four elements are handled as two pairs. For each pair, vmovss loads
#   the first float into element 0 of xmm0 and vinsertps (imm 0x10) places
#   the second float into element 1; vmovlps then stores the low 64 bits.
#     pair 1: {x[p],   x[p+3]} -> y[0..1]   (store at y)
#     pair 2: {x[p+6], x[p+9]} -> y[2..3]   (store at y+8)
#   This gives four loads but only two 8-byte stores.
#   NOTE(review): the bare "y:" label below has no data directive;
#   presumably it was filtered out of the listing.
foo(int):
        movsx   rdi, edi
        vmovss  xmm0, DWORD PTR x[0+rdi*4]
        vinsertps       xmm0, xmm0, DWORD PTR x[12+rdi*4], 0x10
        vmovlps QWORD PTR y[rip], xmm0
        vmovss  xmm0, DWORD PTR x[24+rdi*4]
        vinsertps       xmm0, xmm0, DWORD PTR x[36+rdi*4], 0x10
        vmovlps QWORD PTR y[rip+8], xmm0
        ret
y:

GCC's output has the best (lowest) Block RThroughput value: 2.5.

https://godbolt.org/z/Pc1TWz

-- 
You are receiving this mail because:
You are on the CC list for the bug.
_______________________________________________
llvm-bugs mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-bugs

Reply via email to