https://gcc.gnu.org/bugzilla/show_bug.cgi?id=84106

            Bug ID: 84106
           Summary: gcc is not able to vectorize code for 1D array, but
                    does so for 2D array of the same size
           Product: gcc
           Version: 8.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: c
          Assignee: unassigned at gcc dot gnu.org
          Reporter: bugzi...@poradnik-webmastera.com
  Target Milestone: ---

[code]
/* Edge length of the square arrays; every array below holds N*N ints in total. */
#define N 9

/* 2D arrays used by test1: a1 is read, a2 is written. */
int a1[N][N];
int a2[N][N];

/* 1D arrays with the same element count (N*N) used by test2: b1 is read, b2 is written. */
int b1[N*N];
int b2[N*N];

/*
 * Copies every element of the global 2D array a1 into the same position of a2,
 * using a nested loop over N rows (i) and N columns (j).
 * Takes no arguments and returns nothing; its only effect is the writes to a2.
 *
 * This is the 2D variant of the reproducer: in the accompanying output listing
 * it is compiled to a sequence of YMM vector loads/stores plus one scalar
 * DWORD move.
 */
void test1()
{
    for (int i = 0; i < N; ++i)
    {
        for (int j = 0; j < N; ++j)
        {
            a2[i][j] = a1[i][j];
        }
    }
}

/*
 * Copies all N*N elements of the global 1D array b1 into b2 with a single
 * flat loop. Takes no arguments and returns nothing; its only effect is the
 * writes to b2.
 *
 * This is the 1D variant of the reproducer: it moves the same number of ints
 * as test1, but in the accompanying output listing it is compiled to
 * "rep movsq" plus one scalar DWORD move rather than vector moves.
 */
void test2()
{
    for (int i = 0; i < N*N; ++i)
    {
        b2[i] = b1[i];
    }
}
[/code]

This code, compiled using gcc 8.0 (trunk) with "-O3 -mavx2", produces the
following result. For some reason gcc is not able to vectorize the code for the
test2 function. I also tried adding "__attribute__((aligned(32)))" to all
arrays, but it did not help.

Similar code is also generated when compiling with "-O3 -mavx512f -mavx512vl
-mavx512bw -mavx512dq -mavx512cd" - gcc still generates code which uses YMM
registers, instead of ZMM ones.

[out]
test1():
  vmovdqa ymm0, YMMWORD PTR a1[rip]
  vmovdqa ymm1, YMMWORD PTR a1[rip+32]
  vmovdqa ymm2, YMMWORD PTR a1[rip+64]
  vmovdqa ymm3, YMMWORD PTR a1[rip+96]
  vmovdqa YMMWORD PTR a2[rip], ymm0
  vmovdqa ymm4, YMMWORD PTR a1[rip+128]
  vmovdqa ymm5, YMMWORD PTR a1[rip+160]
  vmovdqa YMMWORD PTR a2[rip+32], ymm1
  vmovdqa ymm6, YMMWORD PTR a1[rip+192]
  vmovdqa ymm7, YMMWORD PTR a1[rip+224]
  vmovdqa ymm0, YMMWORD PTR a1[rip+256]
  vmovdqa ymm1, YMMWORD PTR a1[rip+288]
  vmovdqa YMMWORD PTR a2[rip+64], ymm2
  mov eax, DWORD PTR a1[rip+320]
  vmovdqa YMMWORD PTR a2[rip+96], ymm3
  vmovdqa YMMWORD PTR a2[rip+128], ymm4
  vmovdqa YMMWORD PTR a2[rip+160], ymm5
  vmovdqa YMMWORD PTR a2[rip+192], ymm6
  vmovdqa YMMWORD PTR a2[rip+224], ymm7
  vmovdqa YMMWORD PTR a2[rip+256], ymm0
  vmovdqa YMMWORD PTR a2[rip+288], ymm1
  mov DWORD PTR a2[rip+320], eax
  vzeroupper
  ret
test2():
  mov esi, OFFSET FLAT:b1
  mov edi, OFFSET FLAT:b2
  mov ecx, 40
  rep movsq
  mov eax, DWORD PTR [rsi]
  mov DWORD PTR [rdi], eax
  ret
b2:
  .zero 324
b1:
  .zero 324
a2:
  .zero 324
a1:
  .zero 324
[/out]

Reply via email to