This is not a correctness bug, this is a performance problem. It appears constant propagation is inconsistent, causing a routine to be inlined and simplified some places and not others. Code of the form
a(..., ~0x3) for i=1..n a(..., ~0) a(..., 0x0fffffff) inlines and simplifies the call with ~0 as an arg, but not the others. Unfortunately, a() if inlined and simplified would reduce to about 4 machine instructions, but the out-of-line call is about 80 static instructions with a path length of about 20 instructions including an indirect branch. For small values of 'n', the out-of-line call dominates execution time of this routine. Following is complete source code (no headers) and the output of g++ -v. Note that many small source changes cause the symptom to go away, such as manually eliminating dead code (that otherwise GCC determines is dead and eliminates). This "smells" like there is an optimization threshold and beyond some level of source complexity the threshold is not reached. #define CHAR_BIT (8) template<typename T, typename size_t> size_t bitsizeof() { return CHAR_BIT * sizeof(T); } template<typename T> unsigned bitsizeof() { return bitsizeof<T, unsigned>(); } template<typename word_t> word_t ALLONES() { return ~static_cast<word_t>(0); } template<typename word_t, typename offset_t, typename bitlen_t> static bool is_ltoh(bool might_overlap, const word_t *dst, offset_t doff, const word_t *src, offset_t soff, bitlen_t bitlen) { if (!might_overlap) return true; if (dst <= src) { return true; } else /* (dst > src) */ { const unsigned WORDSIZE = bitsizeof<word_t>(); const word_t *srclast = src + (soff + bitlen) / WORDSIZE; return srclast < dst; } } typedef unsigned char op_t; const op_t SRC = 0xC; const op_t DST = 0xA; const op_t SET = 0xF; template<typename word_t> static word_t switch_case(op_t op, word_t d, word_t s, word_t dst_mask) { switch(op) { case 0 & 0xF: d &= ~dst_mask; break; // 0 case ~(DST|SRC) & 0xF: d ^= ((~s | d) & dst_mask); break; // 1 case DST&~SRC & 0xF: d ^= (( s & d) & dst_mask); break; // 2 case ~SRC & 0xF: d ^= ((~s ^ d) & dst_mask); break; // 3 case ~DST&SRC & 0xF: d ^= (( s | d) & dst_mask); break; // 4 case ~DST & 0xF: d ^= dst_mask; break; // 5 case (DST^SRC) & 0xF: d ^= ( s & dst_mask); break; // 6 case ~(DST&SRC) & 0xF: d ^= (( s | ~d) & dst_mask); break; // 7 case DST&SRC & 0xF: d ^= ((~s & d) & dst_mask); break; // 8 case ~(DST^SRC) & 0xF: d ^= (~s & dst_mask); break; // 9 case DST & 0xF: break; // 10 case (DST|~SRC) & 0xF: d |= (~s & dst_mask); break; // 11 case SRC & 0xF: d ^= (( s ^ d) & dst_mask); break; // 12 case (~DST|SRC) & 0xF: d ^= (~(s & d) & dst_mask); break; // 13 case (DST|SRC) & 0xF: d |= (s & dst_mask); break; // 14 case SET & 0xF: d |= dst_mask; break; // 15 default: ; // NOT REACHED } return d; } template<typename word_t, typename offset_t, typename bitlen_t> static void ltoh_MtoN_down(op_t op, word_t *di, const word_t *si, word_t lo_mask, word_t hi_mask, unsigned ts, bitlen_t n1, bool more_src_words) { unsigned bs = bitsizeof<word_t>() - ts; word_t top = si[1]; word_t s = (top << ts) | (si[0] >> bs); di[0] = switch_case(op, di[0], s, lo_mask); for (bitlen_t i=1; i<n1; ++i) { word_t bot = top; top = si[i+1]; s = top << ts | bot >> bs; di[i] = switch_case(op, di[i], s, ALLONES<word_t>()); } s = top >> bs; if (more_src_words) s |= (si[n1+1] << ts); di[n1] = switch_case(op, di[n1], s, hi_mask); } template<typename word_t, typename offset_t, typename bitlen_t> static void htol_MtoN_down(op_t op, word_t *di, const word_t *si, word_t lo_mask, word_t hi_mask, unsigned ts, bitlen_t n1, bool one_more_src_word) { unsigned bs = bitsizeof<word_t>() - ts; word_t bot = si[n1]; word_t s = bot >> bs; if (one_more_src_word) s |= si[n1+1] << ts; di[n1] = switch_case(op, di[n1], s, hi_mask); for (offset_t i=n1-1; i>0; --i) { word_t top = bot; bot = si[i]; s = top << ts | bot >> bs; di[i] = switch_case(op, di[i], s, ALLONES<word_t>()); } s = (bot << ts) | (si[0] >> bs); di[0] = switch_case(op, di[0], s, lo_mask); } template<typename word_t, typename offset_t, typename bitlen_t> void mem(word_t *di, offset_t doff, const word_t *si, offset_t soff, bitlen_t bitlen, op_t op, bool might_overlap=true) { const offset_t WS = bitsizeof<word_t,offset_t>(); const word_t WORDMASK = WS - 1; di += doff / WS; doff %= WS; si += soff / WS; soff %= WS; const word_t ONES = ALLONES<word_t>(); word_t lo_mask = ONES << doff; word_t hi_mask = ONES >> (WORDMASK - ((doff + bitlen - 1) & WORDMASK)); bool one_word = doff + static_cast<offset_t>(bitlen) <= WS; if (soff == doff) { } else if (soff < doff) { } else /* if (soff > doff) */ { unsigned ts = WS - (soff - doff); if (one_word) { } else { offset_t n1dst = (doff+bitlen-1)/WS; offset_t n1src = (soff+bitlen-1)/WS; if (is_ltoh(might_overlap, di, doff, si, soff, bitlen)) ltoh_MtoN_down<word_t,offset_t,bitlen_t>(op, di, si, lo_mask, hi_mask, ts, n1dst, n1src != n1dst); else htol_MtoN_down<word_t,offset_t,bitlen_t>(op, di, si, lo_mask, hi_mask, ts, n1dst, n1src != n1dst); } } } void slow(unsigned *di, const unsigned *si) { mem<unsigned>(di, 2, si, 10, 2042, SRC, false); } int main(int argc, char **argv) {;} $ g++ -v -O4 -Wall -Werror x.cc -o x Using built-in specs. Target: i486-linux-gnu Configured with: ../src/configure -v --with-pkgversion='Ubuntu 4.4.1-4ubuntu8' --with-bugurl=file:///usr/share/doc/gcc-4.4/README.Bugs --enable-languages=c,c++,fortran,objc,obj-c++ --prefix=/usr --enable-shared --enable-multiarch --enable-linker-build-id --with-system-zlib --libexecdir=/usr/lib --without-included-gettext --enable-threads=posix --with-gxx-include-dir=/usr/include/c++/4.4 --program-suffix=-4.4 --enable-nls --enable-clocale=gnu --enable-libstdcxx-debug --enable-objc-gc --enable-targets=all --disable-werror --with-arch-32=i486 --with-tune=generic --enable-checking=release --build=i486-linux-gnu --host=i486-linux-gnu --target=i486-linux-gnu Thread model: posix gcc version 4.4.1 (Ubuntu 4.4.1-4ubuntu8) COLLECT_GCC_OPTIONS='-v' '-O4' '-Wall' '-Werror' '-o' 'x' '-shared-libgcc' '-mtune=generic' '-march=i486' /usr/lib/gcc/i486-linux-gnu/4.4.1/cc1plus -quiet -v -D_GNU_SOURCE x.cc -D_FORTIFY_SOURCE=2 -quiet -dumpbase x.cc -mtune=generic -march=i486 -auxbase x -O4 -Wall -Werror -version -fstack-protector -o /tmp/ccdsdx0h.s ignoring nonexistent directory "/usr/local/include/i486-linux-gnu" ignoring nonexistent directory "/usr/lib/gcc/i486-linux-gnu/4.4.1/../../../../i486-linux-gnu/include" ignoring nonexistent directory "/usr/include/i486-linux-gnu" #include "..." search starts here: #include <...> search starts here: /usr/include/c++/4.4 /usr/include/c++/4.4/i486-linux-gnu /usr/include/c++/4.4/backward /usr/local/include /usr/lib/gcc/i486-linux-gnu/4.4.1/include /usr/lib/gcc/i486-linux-gnu/4.4.1/include-fixed /usr/include End of search list. GNU C++ (Ubuntu 4.4.1-4ubuntu8) version 4.4.1 (i486-linux-gnu) compiled by GNU C version 4.4.1, GMP version 4.3.1, MPFR version 2.4.1-p2. GGC heuristics: --param ggc-min-expand=100 --param ggc-min-heapsize=131072 Compiler executable checksum: d2bfb4b66d7bb5211c5fc338cae297a2 COLLECT_GCC_OPTIONS='-v' '-O4' '-Wall' '-Werror' '-o' 'x' '-shared-libgcc' '-mtune=generic' '-march=i486' as -V -Qy -o /tmp/cc6Uy9PA.o /tmp/ccdsdx0h.s GNU assembler version 2.20 (i486-linux-gnu) using BFD version (GNU Binutils for Ubuntu) 2.20 COMPILER_PATH=/usr/lib/gcc/i486-linux-gnu/4.4.1/:/usr/lib/gcc/i486-linux-gnu/4.4.1/:/usr/lib/gcc/i486-linux-gnu/:/usr/lib/gcc/i486-linux-gnu/4.4.1/:/usr/lib/gcc/i486-linux-gnu/:/usr/lib/gcc/i486-linux-gnu/4.4.1/:/usr/lib/gcc/i486-linux-gnu/ LIBRARY_PATH=/usr/lib/gcc/i486-linux-gnu/4.4.1/:/usr/lib/gcc/i486-linux-gnu/4.4.1/:/usr/lib/gcc/i486-linux-gnu/4.4.1/../../../../lib/:/lib/../lib/:/usr/lib/../lib/:/usr/lib/gcc/i486-linux-gnu/4.4.1/../../../:/lib/:/usr/lib/:/usr/lib/i486-linux-gnu/ COLLECT_GCC_OPTIONS='-v' '-O4' '-Wall' '-Werror' '-o' 'x' '-shared-libgcc' '-mtune=generic' '-march=i486' /usr/lib/gcc/i486-linux-gnu/4.4.1/collect2 --build-id --eh-frame-hdr -m elf_i386 --hash-style=both -dynamic-linker /lib/ld-linux.so.2 -o x -z relro /usr/lib/gcc/i486-linux-gnu/4.4.1/../../../../lib/crt1.o /usr/lib/gcc/i486-linux-gnu/4.4.1/../../../../lib/crti.o /usr/lib/gcc/i486-linux-gnu/4.4.1/crtbegin.o -L/usr/lib/gcc/i486-linux-gnu/4.4.1 -L/usr/lib/gcc/i486-linux-gnu/4.4.1 -L/usr/lib/gcc/i486-linux-gnu/4.4.1/../../../../lib -L/lib/../lib -L/usr/lib/../lib -L/usr/lib/gcc/i486-linux-gnu/4.4.1/../../.. -L/usr/lib/i486-linux-gnu /tmp/cc6Uy9PA.o -lstdc++ -lm -lgcc_s -lgcc -lc -lgcc_s -lgcc /usr/lib/gcc/i486-linux-gnu/4.4.1/crtend.o /usr/lib/gcc/i486-linux-gnu/4.4.1/../../../../lib/crtn.o (This is a standard Ubuntu build of GCC.) $ objdump -D x > x.dis The code of interest is for "slow()": For the inner loop (80485be..80485d7), "switch_case()" is inlined and simplifies. Since the mask is ~0, it reduces to nothing. For the leading and trailing calls, the relevant values are pushed and it calls switch_case() out-of-line. If the source is simplified (e.g., by replacing the unused "else" call to "htol..." with a semicolon) the routine is inlined and reduces to three instructions at each site. Where the inner loop is called just a few times (which is common) the leading/trailing cost dominates. 080484b0 <_Z11switch_caseIjET_hS0_S0_S0_>: 80484b0: 55 push %ebp 80484b1: 3c 0f cmp $0xf,%al 80484b3: 89 e5 mov %esp,%ebp 80484b5: 53 push %ebx 80484b6: 8b 5d 08 mov 0x8(%ebp),%ebx 80484b9: 77 15 ja 80484d0 <_Z11switch_caseIjET_hS0_S0_S0_+0x20> 80484bb: 0f b6 c0 movzbl %al,%eax 80484be: ff 24 85 e0 86 04 08 jmp *0x80486e0(,%eax,4) 80484c5: 8d 76 00 lea 0x0(%esi),%esi 80484c8: f7 d3 not %ebx 80484ca: 21 da and %ebx,%edx 80484cc: 8d 74 26 00 lea 0x0(%esi,%eiz,1),%esi 80484d0: 89 d0 mov %edx,%eax 80484d2: 5b pop %ebx 80484d3: 5d pop %ebp 80484d4: c3 ret 80484d5: 8d 76 00 lea 0x0(%esi),%esi 80484d8: f7 d1 not %ecx 80484da: 21 d9 and %ebx,%ecx 80484dc: 31 ca xor %ecx,%edx 80484de: 89 d0 mov %edx,%eax 80484e0: 5b pop %ebx 80484e1: 5d pop %ebp 80484e2: c3 ret 80484e3: 90 nop 80484e4: 8d 74 26 00 lea 0x0(%esi,%eiz,1),%esi 80484e8: 89 d0 mov %edx,%eax 80484ea: f7 d0 not %eax 80484ec: 09 c8 or %ecx,%eax 80484ee: 21 d8 and %ebx,%eax 80484f0: 31 c2 xor %eax,%edx 80484f2: eb dc jmp 80484d0 <_Z11switch_caseIjET_hS0_S0_S0_+0x20> 80484f4: 8d 74 26 00 lea 0x0(%esi,%eiz,1),%esi 80484f8: f7 d1 not %ecx 80484fa: 21 d9 and %ebx,%ecx 80484fc: 21 d1 and %edx,%ecx 80484fe: 31 ca xor %ecx,%edx 8048500: eb ce jmp 80484d0 <_Z11switch_caseIjET_hS0_S0_S0_+0x20> 8048502: 8d b6 00 00 00 00 lea 0x0(%esi),%esi 8048508: 09 da or %ebx,%edx 804850a: eb c4 jmp 80484d0 <_Z11switch_caseIjET_hS0_S0_S0_+0x20> 804850c: 8d 74 26 00 lea 0x0(%esi,%eiz,1),%esi 8048510: f7 d1 not %ecx 8048512: 21 d9 and %ebx,%ecx 8048514: 09 ca or %ecx,%edx 8048516: eb b8 jmp 80484d0 <_Z11switch_caseIjET_hS0_S0_S0_+0x20> 8048518: 31 d1 xor %edx,%ecx 804851a: 21 d9 and %ebx,%ecx 804851c: 31 ca xor %ecx,%edx 804851e: eb b0 jmp 80484d0 <_Z11switch_caseIjET_hS0_S0_S0_+0x20> 8048520: 21 d1 and %edx,%ecx 8048522: f7 d1 not %ecx 8048524: 21 d9 and %ebx,%ecx 8048526: 31 ca xor %ecx,%edx 8048528: eb a6 jmp 80484d0 <_Z11switch_caseIjET_hS0_S0_S0_+0x20> 804852a: 8d b6 00 00 00 00 lea 0x0(%esi),%esi 8048530: 21 d9 and %ebx,%ecx 8048532: 09 ca or %ecx,%edx 8048534: eb 9a jmp 80484d0 <_Z11switch_caseIjET_hS0_S0_S0_+0x20> 8048536: 66 90 xchg %ax,%ax 8048538: f7 d1 not %ecx 804853a: 09 d1 or %edx,%ecx 804853c: 21 d9 and %ebx,%ecx 804853e: 31 ca xor %ecx,%edx 8048540: eb 8e jmp 80484d0 <_Z11switch_caseIjET_hS0_S0_S0_+0x20> 8048542: 8d b6 00 00 00 00 lea 0x0(%esi),%esi 8048548: 21 d1 and %edx,%ecx 804854a: 21 d9 and %ebx,%ecx 804854c: 31 ca xor %ecx,%edx 804854e: eb 80 jmp 80484d0 <_Z11switch_caseIjET_hS0_S0_S0_+0x20> 8048550: f7 d1 not %ecx 8048552: 31 d1 xor %edx,%ecx 8048554: 21 d9 and %ebx,%ecx 8048556: 31 ca xor %ecx,%edx 8048558: e9 73 ff ff ff jmp 80484d0 <_Z11switch_caseIjET_hS0_S0_S0_+0x20> 804855d: 8d 76 00 lea 0x0(%esi),%esi 8048560: 09 d1 or %edx,%ecx 8048562: 21 d9 and %ebx,%ecx 8048564: 31 ca xor %ecx,%edx 8048566: e9 65 ff ff ff jmp 80484d0 <_Z11switch_caseIjET_hS0_S0_S0_+0x20> 804856b: 90 nop 804856c: 8d 74 26 00 lea 0x0(%esi,%eiz,1),%esi 8048570: 31 da xor %ebx,%edx 8048572: e9 59 ff ff ff jmp 80484d0 <_Z11switch_caseIjET_hS0_S0_S0_+0x20> 8048577: 89 f6 mov %esi,%esi 8048579: 8d bc 27 00 00 00 00 lea 0x0(%edi,%eiz,1),%edi 08048580 <_Z4slowPjPKj>: 8048580: 55 push %ebp 8048581: 89 e5 mov %esp,%ebp 8048583: 83 ec 10 sub $0x10,%esp 8048586: 89 7d fc mov %edi,-0x4(%ebp) 8048589: 8b 7d 0c mov 0xc(%ebp),%edi 804858c: 89 75 f8 mov %esi,-0x8(%ebp) 804858f: 8b 75 08 mov 0x8(%ebp),%esi 8048592: 89 5d f4 mov %ebx,-0xc(%ebp) 8048595: 8b 5f 04 mov 0x4(%edi),%ebx 8048598: 8b 07 mov (%edi),%eax 804859a: c7 04 24 fc ff ff ff movl $0xfffffffc,(%esp) 80485a1: 8b 16 mov (%esi),%edx 80485a3: 89 d9 mov %ebx,%ecx 80485a5: c1 e8 08 shr $0x8,%eax 80485a8: c1 e1 18 shl $0x18,%ecx 80485ab: 09 c1 or %eax,%ecx 80485ad: b8 0c 00 00 00 mov $0xc,%eax 80485b2: e8 f9 fe ff ff call 80484b0 <_Z11switch_caseIjET_hS0_S0_S0_> 80485b7: 89 06 mov %eax,(%esi) 80485b9: b8 01 00 00 00 mov $0x1,%eax 80485be: 8b 54 87 04 mov 0x4(%edi,%eax,4),%edx 80485c2: c1 eb 08 shr $0x8,%ebx 80485c5: 89 d1 mov %edx,%ecx 80485c7: c1 e1 18 shl $0x18,%ecx 80485ca: 09 cb or %ecx,%ebx 80485cc: 89 1c 86 mov %ebx,(%esi,%eax,4) 80485cf: 83 c0 01 add $0x1,%eax 80485d2: 89 d3 mov %edx,%ebx 80485d4: 83 f8 3f cmp $0x3f,%eax 80485d7: 75 e5 jne 80485be <_Z4slowPjPKj+0x3e> 80485d9: 8b 87 00 01 00 00 mov 0x100(%edi),%eax 80485df: 89 d1 mov %edx,%ecx 80485e1: c7 04 24 ff ff ff 0f movl $0xfffffff,(%esp) 80485e8: 8b 96 fc 00 00 00 mov 0xfc(%esi),%edx 80485ee: c1 e9 08 shr $0x8,%ecx 80485f1: c1 e0 18 shl $0x18,%eax 80485f4: 09 c1 or %eax,%ecx 80485f6: b8 0c 00 00 00 mov $0xc,%eax 80485fb: e8 b0 fe ff ff call 80484b0 <_Z11switch_caseIjET_hS0_S0_S0_> 8048600: 89 86 fc 00 00 00 mov %eax,0xfc(%esi) 8048606: 8b 5d f4 mov -0xc(%ebp),%ebx 8048609: 8b 75 f8 mov -0x8(%ebp),%esi 804860c: 8b 7d fc mov -0x4(%ebp),%edi 804860f: 89 ec mov %ebp,%esp 8048611: 5d pop %ebp 8048612: c3 ret -- Summary: missed optimization leads to several times slower code Product: gcc Version: 4.4.1 Status: UNCONFIRMED Severity: minor Priority: P3 Component: c++ AssignedTo: unassigned at gcc dot gnu dot org ReportedBy: gb-0001 at xsim dot com GCC build triplet: i486-linux-gnu GCC host triplet: i486-linux-gnu GCC target triplet: i486-linux-gnu http://gcc.gnu.org/bugzilla/show_bug.cgi?id=42209