When the attached test case is compiled with options -mthumb -Os -fpic, gcc generates:

goo:
        push    {r3, r4, r5, lr}
        ldr     r4, .L7
        ldr     r3, .L7+4        // A
.LPIC0:
        add     r4, pc
        ldr     r3, [r4, r3]     // B
        ldr     r3, [r3]
        mov     r5, r0
        ldr     r0, [r3]
        cmp     r0, #0
        beq     .L2
        mov     r1, r5
        bl      foo
.L2:
        ldr     r3, [r5]
        mov     r0, #0
        cmp     r3, #0
        beq     .L3
        ldr     r2, .L7+4         // C
        ldr     r2, [r4, r2]      // D
        ldr     r2, [r2, #4]
        ldr     r2, [r2]
        cmp     r3, r2
        beq     .L3
        ldr     r0, [r3]
.L3:
        @ sp needed for prologue
        pop     {r3, r4, r5}
        pop     {r1}
        bx      r1
.L7:
        .word   _GLOBAL_OFFSET_TABLE_-(.LPIC0+4)
        .word   gObj(GOT)


Notice that instructions A, B, C and D load the address of the global variable
gObj twice: once with A and B, and again with C and D.

When compiled with options -mthumb -O2 -fpic, gcc generates:

goo:
        push    {r4, r5, r6, lr}
        ldr     r4, .L8
        ldr     r5, .L8+4      // E
.LPIC0:
        add     r4, pc
        ldr     r3, [r4, r5]   // F
        ldr     r3, [r3]
        mov     r6, r0
        ldr     r0, [r3]
        cmp     r0, #0
        bne     .L7
.L2:
        ldr     r3, [r6]
        mov     r0, #0
        cmp     r3, #0
        beq     .L3
        ldr     r2, [r4, r5]     // G
        ldr     r2, [r2, #4]     // H
        ldr     r2, [r2]
        cmp     r2, r3
        beq     .L3
        ldr     r0, [r3]
.L3:
        @ sp needed for prologue
        pop     {r4, r5, r6}
        pop     {r1}
        bx      r1
.L7:
        mov     r1, r6
        bl      foo
        b       .L2
.L9:
        .align  2
.L8:
        .word   _GLOBAL_OFFSET_TABLE_-(.LPIC0+4)
        .word   gObj(GOT)

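Instructions E, F and G do the same thing, but with one fewer memory load
instruction: the GOT offset loaded by E stays in r5 and is reused by G instead
of being reloaded from the literal pool. The -O2 code uses the same number of
instructions. -Os should perform the same optimization.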

Actually, the -O2 result is still not optimal. If the result of F were stored
in r4, instruction H could use r4 directly, so G could also be removed.


-- 
           Summary: redundant memory load
           Product: gcc
           Version: 4.5.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: target
        AssignedTo: unassigned at gcc dot gnu dot org
        ReportedBy: carrot at google dot com
 GCC build triplet: i686-linux
  GCC host triplet: i686-linux
GCC target triplet: arm-eabi


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=42495

Reply via email to