The following code:

struct A {
  int count;
  int *data;
};

void func(int, int);

void test (struct A* p, const void **ptrArray, int count) {
  int i, j;
  for (i = 0; i < p->count; i++) {
    for (j = 0; j < count; j++) {
      func (p->data[i], p->data[i + 1]);
    }
  }
}

is compiled to 50 bytes by GCC 4.2.1 and to 56 bytes by GCC 4.4.0 (and also by GCC 4.3.1) on ARM in Thumb mode.

GCC 4.2.1 (with -march=armv5te -mthumb -mthumb-interwork -fpic -Os):

test:
        push {r4, r5, r6, r7, lr}
        sub sp, sp, #12
        mov r7, r0
        mov r5, #0
        str r2, [sp, #4]
        b .L2
.L3:
        ldr r3, [r7, #4]
        add r4, r4, #1
        ldr r0, [r3, r6]
        add r3, r6, r3
        ldr r1, [r3, #4]
        bl func
.L5:
        ldr r3, [sp, #4]
        cmp r4, r3
        blt .L3
        add r5, r5, #1
.L2:
        ldr r3, [r7]
        cmp r5, r3
        bge .L6
        lsl r6, r5, #2
        mov r4, #0
        b .L5
.L6:
        add sp, sp, #12
        @ sp needed for prologue
        pop {r4, r5, r6, r7, pc}

GCC 4.4.0 (with the same options):

test:
        push {r4, r5, r6, r7, lr}
        sub sp, sp, #12
        mov r4, r0
        str r2, [sp, #4]
        mov r7, #4 // doesn't exist in 4.2.1
        mov r5, #0
        b .L2
.L3:
        ldr r3, [r4, #4]
        ldr r2, [sp]
        ldr r1, [r3, r7]
        ldr r0, [r3, r2]
        bl func
        add r6, r6, #1
.L5:
        ldr r3, [sp, #4]
        cmp r6, r3
        blt .L3
        add r5, r5, #1
        add r7, r7, #4 // doesn't exist in 4.2.1
.L2:
        ldr r3, [r4]
        cmp r5, r3
        bge .L6
        lsl r2, r5, #2
        str r2, [sp] // doesn't exist in 4.2.1
        mov r6, #0
        b .L5
.L6:
        add sp, sp, #12
        @ sp needed for prologue
        pop {r4, r5, r6, r7, pc}

Changing -Os to -O2 produces even worse code: 50 -> 64 bytes with GCC 4.2.1 and 56 -> 74 bytes with GCC 4.4.0, so the gap between the two compilers grows from +6 to +10 bytes.

Bisection on trunk shows that the change was introduced by http://gcc.gnu.org/viewcvs?view=rev&revision=125755, which was the merge of the pointer_plus branch (therefore adding Andrew Pinski to the CC list).

It also reproduces on x86.

GCC 4.2.4 with -m32 -O2:

test:
        pushl %ebp
        movl %esp, %ebp
        pushl %edi
        pushl %esi
        pushl %ebx
        subl $12, %esp
        movl 8(%ebp), %edi
        movl $0, -16(%ebp)
        movl (%edi), %edx
        testl %edx, %edx
        jle .L8
.L4:
        movl 16(%ebp), %eax
        testl %eax, %eax
        jle .L6
        movl -16(%ebp), %esi
        xorl %ebx, %ebx
        sall $2, %esi
        .p2align 4,,7
.L5:
        movl 4(%edi), %eax
        addl $1, %ebx
        movl 4(%esi,%eax), %edx
        movl %edx, 4(%esp)
        movl (%eax,%esi), %eax
        movl %eax, (%esp)
        call func
        cmpl 16(%ebp), %ebx
        jne .L5
.L6:
        addl $1, -16(%ebp)
        movl -16(%ebp), %eax
        cmpl %eax, (%edi)
        jg .L4
.L8:
        addl $12, %esp
        popl %ebx
        popl %esi
        popl %edi
        popl %ebp
        ret

GCC 4.4.0 (with the same options):

test:
        pushl %ebp
        movl %esp, %ebp
        pushl %edi
        pushl %esi
        movl $4, %esi
        pushl %ebx
        subl $44, %esp
        movl 8(%ebp), %edi
        movl $0, -28(%ebp)
        movl (%edi), %edx
        testl %edx, %edx
        jle .L6
        .p2align 4,,7
        .p2align 3
.L3:
        movl 16(%ebp), %eax
        testl %eax, %eax
        jle .L5
        movl -28(%ebp), %ecx
        movl %edi, %eax
        xorl %ebx, %ebx
        sall $2, %ecx
        movl %ecx, %edi
        movl %eax, %ecx
        .p2align 4,,7
        .p2align 3
.L4:
        movl 4(%ecx), %eax
        addl $1, %ebx
        movl %ecx, -32(%ebp)
        movl (%eax,%esi), %edx
        movl %edx, 4(%esp)
        movl (%eax,%edi), %eax
        movl %eax, (%esp)
        call func
        movl -32(%ebp), %ecx
        cmpl %ebx, 16(%ebp)
        jg .L4
        movl %ecx, %edi
.L5:
        addl $1, -28(%ebp)
        addl $4, %esi
        movl -28(%ebp), %eax
        cmpl %eax, (%edi)
        jg .L3
.L6:
        addl $44, %esp
        popl %ebx
        popl %esi
        popl %edi
        popl %ebp
        ret

Some statistics on the instructions used (1.s is the GCC 4.2.4 output, 2.s is the GCC 4.4.0 output):

$ cat 1.s|grep -v '[.:]'|awk '{print $1}'|sort|uniq -c|sort -g
      1 call
      1 ret
      1 sall
      1 subl
      1 xorl
      2 cmpl
      2 testl
      3 addl
      4 popl
      4 pushl
     12 movl

$ cat 2.s|grep -v '[.:]'|awk '{print $1}'|sort|uniq -c|sort -g
      1 call
      1 ret
      1 sall
      1 subl
      1 xorl
      2 cmpl
      2 testl
      4 addl
      4 popl
      4 pushl
     19 movl

Going from 12 to 19 movl instructions is not very good.

--
           Summary: [4.3/4.4/4.5 regression] suboptimal code for two simple loops
           Product: gcc
           Version: 4.4.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: regression
        AssignedTo: unassigned at gcc dot gnu dot org
        ReportedBy: alexvod at google dot com
 GCC build triplet: x86_64-unknown-linux-gnu
  GCC host triplet: x86_64-unknown-linux-gnu
GCC target triplet: x86_64-unknown-linux-gnu

http://gcc.gnu.org/bugzilla/show_bug.cgi?id=39838