------- Comment #5 from ubizjak at gmail dot com 2008-03-21 10:17 ------- Inner loop, generated with -O2 -mmmx -fno-strict-aliasing, gcc version 4.0.2 20051125 (Red Hat 4.0.2-8):

.L45:
        movq    (%ebx), %mm0
        psubw   (%ecx), %mm0
        movq    %mm0, %mm1
        psraw   $15, %mm1
        pxor    %mm1, %mm0
        psubw   %mm1, %mm0
        movq    %mm0, %mm1
        punpckhwd       %mm2, %mm1
        punpcklwd       %mm0, %mm0
        psrad   $16, %mm0
        paddd   %mm1, %mm0
        movl    %eax, -56(%ebp)
        movl    %edx, -52(%ebp)
        movq    -56(%ebp), %mm1
        paddd   %mm1, %mm0
        movq    %mm0, -56(%ebp)
        movl    -56(%ebp), %eax
        movl    -52(%ebp), %edx
        movl    %eax, -24(%ebp)
        movl    %edx, -20(%ebp)
        addl    $4, %esi
        addl    $8, %ebx
        addl    $8, %ecx
        cmpl    %esi, %edi
        jg      .L45

time ./a.out 144
real 0m4.587s
user 0m4.584s
sys 0m0.004s

Inner loop, generated with -O2 -mmmx -fno-strict-aliasing, gcc version 4.4.0 20080318 (experimental) [trunk revision 133304] (GCC) (this one has improved MMX move instructions):

.L23:
        movq    (%ecx,%eax,2), %mm0
        psubw   (%edx,%eax,2), %mm0
        addl    $4, %eax
        cmpl    %eax, %ebx
        movq    %mm0, %mm1
        psraw   $15, %mm1
        pxor    %mm1, %mm0
        psubw   %mm1, %mm0
        movq    %mm0, %mm1
        punpcklwd       %mm0, %mm0
        punpckhwd       %mm3, %mm1
        psrad   $16, %mm0
        paddd   %mm1, %mm0
        paddd   %mm0, %mm2
        movq    %mm2, -24(%ebp)
        jg      .L23

time ./a.out 144
real 0m0.755s
user 0m0.752s
sys 0m0.000s

Current mainline is _SIX_ times faster. Unfortunately, there are no plans to backport this functionality to anything older than 4.4, so fixed for 4.4.

--
ubizjak at gmail dot com changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
             Status|WAITING                     |RESOLVED
         Resolution|                            |FIXED
   Target Milestone|---                         |4.4.0

http://gcc.gnu.org/bugzilla/show_bug.cgi?id=21395