The compiler creates extremely bad code for the ARM target.
Consider the following source file:

--- SNIP ---
/* Destination buffer for the copy loop in testme(): 100 unsigned ints. */
unsigned int code_in_ram[100];

/*
 * Reduced test case: copies unsigned-int-sized words from the address of
 * _ram_erase_sector_start up to (but not including) the address of
 * _ram_erase_sector_end into code_in_ram.
 *
 * NOTE(review): the two extern symbols are defined outside this snippet,
 * presumably by a linker script -- confirm.  The loop does not check the
 * copied length against the 100-element size of code_in_ram.
 *
 * This code is kept exactly as reported because the generated assembly
 * depends on its precise shape.
 */
void testme(void)
{
  unsigned int *p_rom, *p_ram, *p_end, len;

  /* Only the addresses of these symbols are used, never their values. */
  extern unsigned int _ram_erase_sector_start;
  extern unsigned int _ram_erase_sector_end;


  p_ram = code_in_ram;
  /* p_rom is assigned again in the for-initializer below. */
  p_rom = &_ram_erase_sector_start;
  /* Number of words between the two symbols; len is not used afterwards. */
  len = ((unsigned int)&_ram_erase_sector_end 
         - (unsigned int)&_ram_erase_sector_start) / sizeof(unsigned int);

  /* Word-by-word copy; both pointers are post-incremented each iteration. */
  for (p_rom = &_ram_erase_sector_start, p_end = &_ram_erase_sector_end;
       p_rom < p_end;) {
    *p_ram++ = *p_rom++;
  }
}
--- SNIP ---

Compiled with arm-elf-gcc -mcpu=arm7tdmi -S -Os testme.c, we get the following
code:

--- SNIP ---
        .file   "testme.c"
        .text
        .align  2
        .global testme
        .type   testme, %function
testme:
        @ args = 0, pretend = 0, frame = 0
        @ frame_needed = 0, uses_anonymous_args = 0
        @ link register save eliminated.
        ldr     r1, .L6
        ldr     r2, .L6+4
        @ lr needed for prologue
        b       .L2
.L3:
        ldr     r3, [r1], #4
        str     r3, [r2, #-4]
.L2:
        ldr     r3, .L6+8
        cmp     r1, r3
        add     r2, r2, #4
        bcc     .L3
        bx      lr
.L7:
        .align  2
.L6:
        .word   _ram_erase_sector_start
        .word   code_in_ram
        .word   _ram_erase_sector_end
        .size   testme, .-testme
        .comm   code_in_ram,400,4
        .ident  "GCC: (GNU) 4.1.0"
--- SNIP ---

Even a cursory examination reveals that it would be a lot better to write:

    ldr r1, .L6
    ldr r2, .L6+4
    ldr r0, .L6+8
    b   .L2

.L3:
    ldr r3, [r1], #4
    str r3, [r2], #4
.L2:
    cmp r1, r0
    bcc .L3
    bx  lr

This code would be one instruction shorter overall, and two instructions
shorter in the loop. The way
gcc-4.1.0 refuses to use post-indexed addressing for the store is especially
bizarre, since it does use
post-indexed addressing for the preceding load. Gcc 3.4.3 does not exhibit
this behaviour; it compiles
the above code to:

   ldr r2, .L6
   ldr r0, .L6+4
   cmp r2,r0
   ldr r1, .L6
   movcs pc,lr

.L4:
   ldr r2,[r2],#4
   cmp r2, r0
   str r3,[r1],#4
   bcc .L4
   mov pc,lr

While not perfect either, this also only has 4 instructions in the loop.


-- 
           Summary: ARM optimizer produces severely suboptimal code
           Product: gcc
           Version: 4.1.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: c
        AssignedTo: unassigned at gcc dot gnu dot org
        ReportedBy: Eric dot Doenges at betty-tv dot com
  GCC host triplet: powerpc-apple-darwin8.5.0
GCC target triplet: arm-elf-unknown


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=27016

Reply via email to