The loop in the following code

$ cat stride.c
/* Strided copy: for every i in [0, n), store b[i*stride_b] into
   a[i*stride_a].  Both strides are measured in float elements, since
   they scale an index into a float pointer.  The loop counter runs
   upward from 0 and the exit test compares it against n.  */
void foo(float *a, float *b, int n, int stride_a, int stride_b)
{
  int i;
  for (i=0; i<n; i++)
    {
      a[i*stride_a] = b[i*stride_b];
    }
}

is translated with "gcc -O3 -fdump-tree-optimized -S stride.c" into

<L0>:;
  *(float *) ivtmp.14 = *(float *) ivtmp.12;
  i = i + 1;
  ivtmp.12 = ivtmp.12 + ivtmp.18;
  ivtmp.14 = ivtmp.14 + ivtmp.17;
  if (n != i) goto <L0>; else goto <L2>;

and then into the following assembly (on i686-pc-linux-gnu):

.L4:
        movl    (%ecx), %eax
        incl    %ebx
        addl    %edi, %ecx
        movl    %eax, (%edx)
        addl    %esi, %edx
        cmpl    %ebx, 16(%ebp)
        jne     .L4

The code

$ cat stride2.c
/* Strided copy with the loop counter running downward: i goes from n
   down to 1, and the element index used is n-i, which takes the values
   0, 1, ..., n-1 in that order.  Each iteration stores
   b[(n-i)*stride_b] into a[(n-i)*stride_a]; the exit test compares the
   counter against the constant 0 rather than against n.  */
void foo(float *a, float *b, int n, int stride_a, int stride_b)
{
  int i;
  for (i=n; i>0; i--)
    {
      a[(n-i)*stride_a] = b[(n-i)*stride_b];
    }
}

is translated to

<L0>:;
  *(float *) ivtmp.16 = *(float *) ivtmp.14;
  i = i - 1;
  ivtmp.14 = ivtmp.14 + ivtmp.20;
  ivtmp.16 = ivtmp.16 + ivtmp.19;
  if (i != 0) goto <L0>; else goto <L2>;

and further into the assembly

.L4:
        movl    (%ebx), %eax
        addl    %edi, %ebx
        movl    %eax, (%ecx)
        addl    %esi, %ecx
        decl    %edx
        jne     .L4

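which saves one instruction and one load from memory per iteration:
counting down to zero lets the "jne" use the flags set by "decl", so the
"cmpl %ebx, 16(%ebp)" that reloads n from the stack in the first version
is no longer needed.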

-- 
           Summary: Reverse loop order for increased efficiency
           Product: gcc
           Version: 4.1.0
            Status: UNCONFIRMED
          Keywords: missed-optimization
          Severity: enhancement
          Priority: P2
         Component: tree-optimization
        AssignedTo: unassigned at gcc dot gnu dot org
        ReportedBy: tkoenig at gcc dot gnu dot org
                CC: gcc-bugs at gcc dot gnu dot org


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=22041

Reply via email to