https://gcc.gnu.org/bugzilla/show_bug.cgi?id=94872

            Bug ID: 94872
           Summary: Failure to optimize shuffle from u32 array into u64
                    array properly
           Product: gcc
           Version: 10.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: tree-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: gabravier at gmail dot com
  Target Milestone: ---

union u64Elems
{
    uint64_t as_u64;
    int32_t as_i32[2];
};

uint64_t f(u64Elems m1, u64Elems m2)
{
    u64Elems res;
    res.as_i32[0] = m1.as_i32[1];
    res.as_i32[1] = m2.as_i32[1];

    return res.as_u64;
}

With -O3, LLVM outputs:

f(u64Elems, u64Elems): # @f(u64Elems, u64Elems)
  shr rdi, 32
  movabs rax, -4294967296
  and rax, rsi
  or rax, rdi
  ret

GCC outputs:

f(u64Elems, u64Elems):
  sar rdi, 32
  sar rsi, 32
  movd xmm0, edi
  movd xmm1, esi
  punpckldq xmm0, xmm1
  movq rax, xmm0
  ret

With -mno-sse, it's even worse:

f(u64Elems, u64Elems):
  sar rdi, 32
  sar rsi, 32
  mov QWORD PTR [rsp-8], 0
  mov DWORD PTR [rsp-8], edi
  mov rax, QWORD PTR [rsp-8]
  mov QWORD PTR [rsp-16], rax
  mov DWORD PTR [rsp-12], esi
  mov rax, QWORD PTR [rsp-16]
  ret

Looking at the final tree representation :

f (union u64Elems m1, union u64Elems m2)
{
  int _1;
  int _2;
  long unsigned int _4;
  vector(2) int _8;

  <bb 2> [local count: 1073741824]:
  # DEBUG BEGIN_STMT
  # DEBUG BEGIN_STMT
  _1 = m1.as_i32[1];
  # DEBUG BEGIN_STMT
  _2 = m2.as_i32[1];
  _8 = {_1, _2};
  # DEBUG BEGIN_STMT
  _4 = VIEW_CONVERT_EXPR<long unsigned int>(_8);
  return _4;

}

It looks to me like the conversion to vector(2) is the culprit here; it is not
being optimized properly.

Reply via email to