Using while(__builtin_expect(len != 0, 1)) as the loop condition (see sum.c below) produces a slower loop than the same condition without the __builtin_expect(). The generated code enters the loop with a jmp, which makes an extra testl instruction necessary inside the loop.
__builtin_expect(..., 1) and __builtin_expect(..., 0) produce the same code in this case; maybe that is relevant.

-fprofile-arcs/-fbranch-probabilities are OK: they produce the same code as without __builtin_expect().

bash$ cat sum.c
#ifdef E
#define EXPECT(c) __builtin_expect(c, E)
#else
#define EXPECT(c) (c)
#endif

unsigned __attribute__((fastcall, noinline))
Sum(const unsigned *data, unsigned len)
{
    unsigned s = 0;
    for (; EXPECT(len != 0); len--)
        s += *data++;
    return s;
}

#ifdef TEST
unsigned data[65536];
int main(void)
{
    unsigned i;
    for (i = 0; i < 1000; i++)
        Sum(data, sizeof (data) / sizeof (*data));
    return 0;
}
#endif /* TEST */

bash$ : without __builtin_expect:
bash$ gcc -S -O -fomit-frame-pointer sum.c -o- | sed -ne '/Sum:/,/ret/p'
Sum:
        movl    $0, %eax
        testl   %edx, %edx
        je      .L4
        movl    $0, %eax
.L5:
        addl    (%ecx), %eax
        addl    $4, %ecx
        decl    %edx
        jne     .L5
.L4:
        ret

bash$ : with __builtin_expect:
bash$ gcc -DE=1 -S -O -fomit-frame-pointer sum.c -o- | sed -ne '/Sum:/,/ret/p'
Sum:
        movl    $0, %eax
        jmp     .L2
.L3:
        addl    -4(%ecx), %eax
        decl    %edx
.L2:
        addl    $4, %ecx
        testl   %edx, %edx
        jne     .L3
        ret

bash$ : time without __builtin_expect:
bash$ gcc -DTEST -O -fomit-frame-pointer sum.c && time ./a.out
real    0m0.172s
user    0m0.168s
sys     0m0.001s

bash$ : time with __builtin_expect:
bash$ gcc -DE=1 -DTEST -O -fomit-frame-pointer sum.c && time ./a.out
real    0m0.254s
user    0m0.252s
sys     0m0.001s

--
           Summary: while(__builtin_expect()) pessimizes loop
           Product: gcc
           Version: 4.1.1
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: other
        AssignedTo: unassigned at gcc dot gnu dot org
        ReportedBy: h dot b dot furuseth at usit dot uio dot no
 GCC build triplet: i686-pc-linux-gnu
  GCC host triplet: i686-pc-linux-gnu
GCC target triplet: i686-pc-linux-gnu

http://gcc.gnu.org/bugzilla/show_bug.cgi?id=30055