https://gcc.gnu.org/bugzilla/show_bug.cgi?id=91975

            Bug ID: 91975
           Summary: worse code for small array copy using pointer
                    arithmetic than array indexing
           Product: gcc
           Version: 9.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: tree-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: msebor at gcc dot gnu.org
  Target Milestone: ---

When the contents of one small array is copied into another, GCC with -O2
generates more or less optimal code depending on whether the source array is
statically initialized, and also depending on whether the copying takes place
using pointer arithmetic and dereferencing or using array indexing.  Some
copies are transformed into series of assignments while others remain as loops,
while others still into a single MEM_REF statement.

The emitted assembly is also more or less efficient.

-O3 is required to get the same optimally efficient code.

Clang emits optimally efficient code for all forms at -O2.

It seems that since GCC can make some of the functions below optimally
efficient it should be able to do it for all of them.

$ cat b.c && gcc -DT=int -O2 -S -fdump-tree-optimized=/dev/stdout b.c
const T a[] = { 0, 1, 2, 3, 4, 5, 6, 7 };
T b[sizeof a / sizeof *a];

void f0 (void)   // near optimal
{
  const T *s = a;
  T *d = b;
  for (unsigned i = 0; i != sizeof a / sizeof *a; ++i)
    d[i] = s[i];
}

void g0 (void)   // suboptimal
{
  const T *s = a;
  T *d = b;
  for (unsigned i = 0; i != sizeof a / sizeof *a; ++i)
    *d++ = *s++;
}

extern const T c[sizeof a / sizeof *a];

void f1 (void)   // optimal
{
  const T *s = c;
  T *d = b;
  for (unsigned i = 0; i != sizeof a / sizeof *a; ++i)
    d[i] = s[i];
}

void g1 (void)   // optimal
{
  const T *s = c;
  T *d = b;
  for (unsigned i = 0; i != sizeof a / sizeof *a; ++i)
    *d++ = *s++;
}


;; Function f0 (f0, funcdef_no=0, decl_uid=1926, cgraph_uid=1, symbol_order=2)

f0 ()
{
  <bb 2> [local count: 119292717]:
  MEM <unsigned long> [(int *)&b] = 4294967296;
  MEM <unsigned long> [(int *)&b + 8B] = 12884901890;
  MEM <unsigned long> [(int *)&b + 16B] = 21474836484;
  MEM <unsigned long> [(int *)&b + 24B] = 30064771078;
  return;

}



;; Function g0 (g0, funcdef_no=1, decl_uid=1935, cgraph_uid=2, symbol_order=3)

g0 ()
{
  sizetype ivtmp.25;
  int prephitmp_4;
  int pretmp_5;

  <bb 2> [local count: 119292716]:

  <bb 3> [local count: 954449108]:
  # prephitmp_4 = PHI <0(2), pretmp_5(4)>
  # ivtmp.25_2 = PHI <0(2), ivtmp.25_15(4)>
  MEM[symbol: b, index: ivtmp.25_2, offset: 0B] = prephitmp_4;
  ivtmp.25_15 = ivtmp.25_2 + 4;
  if (ivtmp.25_15 != 32)
    goto <bb 4>; [87.50%]
  else
    goto <bb 5>; [12.50%]

  <bb 4> [local count: 835156388]:
  pretmp_5 = MEM[symbol: a, index: ivtmp.25_15, offset: 0B];
  goto <bb 3>; [100.00%]

  <bb 5> [local count: 119292717]:
  return;

}



;; Function f1 (f1, funcdef_no=2, decl_uid=1945, cgraph_uid=3, symbol_order=4)

f1 ()
{
  <bb 2> [local count: 119292717]:
  MEM <unsigned char[32]> [(char * {ref-all})&b] = MEM <unsigned char[32]>
[(char * {ref-all})&c];
  return;

}



;; Function g1 (g1, funcdef_no=3, decl_uid=1954, cgraph_uid=4, symbol_order=5)

g1 ()
{
  <bb 2> [local count: 119292717]:
  MEM <unsigned char[32]> [(char * {ref-all})&b] = MEM <unsigned char[32]>
[(char * {ref-all})&c];
  return;

}

Reply via email to