Compile the following code with options -Os -mthumb -march=armv5te

/*
 * Overlays a float and an int in the same storage so that a float value
 * can be read or modified through its integer member.
 * NOTE(review): presumably relies on float and int having the same size
 * on the target -- confirm.
 */
union FloatIntUnion {
        float  fFloat;      /* the value viewed as a float */
        int fSignBitInt;    /* the same storage viewed as an int */
};

/*
 * Returns the float obtained by adding 1 to the integer view of x.
 *
 * x is stored into the union's float member, the int member sharing that
 * storage is incremented by one, and the float member is read back.
 */
static inline float fast_inc(float x) {
      union FloatIntUnion data;
      data.fFloat = x;
      /* Integer add on the overlaid storage, not a floating-point add. */
      data.fSignBitInt += 1;
      return data.fFloat;
}

/*
 * External functions used by time_math(). Only the declarations appear in
 * this test case; the definitions are not shown here.
 */
extern int MyConvert(float);    /* takes a float, returns an int */
extern float dumm();            /* supplies the starting float value for each loop */
/*
 * Test case: accumulates float-to-int results in two similar loops.
 *
 * Each loop fetches a start value from dumm() and runs `repeat` (100)
 * iterations. The first loop converts f with an (int) cast four times per
 * iteration; the second calls MyConvert(f) three times per iteration.
 * After every conversion f is advanced with fast_inc().
 *
 * Returns the sum of all conversion results from both loops.
 */
int time_math() {
      int i;
      int sum = 0;
      const int repeat = 100;
      float f;

      /* Loop 1: conversion through the (int) cast. */
      f = dumm();
      /* i counts down from repeat - 1 to 0, i.e. `repeat` iterations. */
      for (i = repeat - 1; i >= 0; --i) {
          sum += (int)f; f = fast_inc(f);
          sum += (int)f; f = fast_inc(f);
          sum += (int)f; f = fast_inc(f);
          sum += (int)f; f = fast_inc(f);
      }

      /* Loop 2: same accumulation pattern, but through the external MyConvert(). */
      f = dumm();
      for (i = repeat - 1; i >= 0; --i) {
        sum += MyConvert(f); f = fast_inc(f);
        sum += MyConvert(f); f = fast_inc(f);
        sum += MyConvert(f); f = fast_inc(f);
      }
      return sum;
}

Gcc generates:

        push    {r4, r5, r6, r7, lr}
        sub     sp, sp, #12
        bl      dumm
        mov     r4, #0
        mov     r6, #99
        add     r5, r0, #0
.L2:
        add     r0, r5, #0
        bl      __aeabi_f2iz
        add     r5, r5, #1
        add     r4, r0, r4
        add     r0, r5, #0
        bl      __aeabi_f2iz
        add     r5, r5, #1
        add     r4, r4, r0
        add     r0, r5, #0
        bl      __aeabi_f2iz
        add     r5, r5, #1
        add     r4, r4, r0
        add     r0, r5, #0
        bl      __aeabi_f2iz
        add     r5, r5, #1
        add     r4, r4, r0
        sub     r6, r6, #1
        bcs     .L2
        bl      dumm
        mov     r6, #99
        add     r5, r0, #0
.L3:
        add     r0, r5, #0
        bl      MyConvert
        add     r5, r5, #1
        str     r0, [sp, #4]
        add     r0, r5, #0
        bl      MyConvert
        add     r5, r5, #1
        mov     r7, r0
        add     r0, r5, #0
        bl      MyConvert
        ldr     r3, [sp, #4]
        add     r5, r5, #1
        add     r7, r7, r3
        add     r7, r7, r0
        add     r4, r4, r7
        sub     r6, r6, #1
        bcs     .L3
        add     sp, sp, #12
        mov     r0, r4
        @ sp needed for prologue
        pop     {r4, r5, r6, r7, pc}

The source code contains two similar loops, but the code generated for them is
quite different. The code for the first loop is as expected: after each
conversion, the returned value is accumulated immediately. The code for the
second loop is much worse: after each call, the returned value is saved to a
different place, and only after all the calls in one loop iteration have
completed are the saved results added together. The code for the second loop is
larger and slower, and it even causes a register spill.

The intermediate representations of the two loops start to diverge at the
reassoc1 pass (dump file float2int.c.078t.reassoc1). I don't know why GCC
transforms the two loops differently in this pass.


-- 
           Summary: inefficient code to accumulate function return values
           Product: gcc
           Version: 4.5.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: target
        AssignedTo: unassigned at gcc dot gnu dot org
        ReportedBy: carrot at google dot com
 GCC build triplet: i686-linux
  GCC host triplet: i686-linux
GCC target triplet: arm-eabi


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=40783

Reply via email to