------- Comment #5 from eric-bugs at omnifarious dot org 2009-05-20 19:39 ------- This code:
#include <stdint.h>
#include <stddef.h>

inline uint64_t byteswap_64(const uint64_t x)
{
   return __builtin_bswap64(x);
}

inline uint32_t byteswap_32(const uint32_t x)
{
   return __builtin_bswap32(x);
}

extern void random_function(uint32_t a, uint64_t b, uint32_t c, uint64_t d);

void swapping(const uint32_t x32, const uint64_t x64)
{
   random_function(byteswap_32(x32), byteswap_64(x64),
                   byteswap_32(byteswap_32(x32)),
                   byteswap_64(byteswap_64(x64)));
}

void swaparray(uint64_t outvals[], char outtop[], const uint64_t invals[],
               const size_t size)
{
   size_t i = 0;
   for (i = 0; i < size; ++i) {
      outvals[i] = byteswap_64(invals[i]);
      outtop[i] = (byteswap_64(invals[i]) >> 56) & 0xffull;
   }
}

results in this assembly:

.globl swaparray
        .type   swaparray, @function
swaparray:
.LFB5:
        testq   %rcx, %rcx
        je      .L8
        xorl    %r8d, %r8d
        .p2align 4,,7
        .p2align 3
.L7:
        movq    (%rdx,%r8,8), %rax
        bswap   %rax
        movq    %rax, (%rdi,%r8,8)
        movq    (%rdx,%r8,8), %rax
        bswap   %rax
        shrq    $56, %rax
        movb    %al, (%rsi,%r8)
        incq    %r8
        cmpq    %r8, %rcx
        ja      .L7
.L8:
        rep
        ret
.LFE5:
        .size   swaparray, .-swaparray
        .p2align 4,,15
.globl swapping
        .type   swapping, @function
swapping:
.LFB4:
        bswap   %rsi
        bswap   %edi
        movq    %rsi, %rcx
        movl    %edi, %edx
        bswap   %rcx
        bswap   %edx
        jmp     random_function
.LFE4:
        .size   swapping, .-swapping

when compiled with gcc -O3 -mtune=native -march=native on an Opteron system.

Notice that in swapping, bswap is applied twice to each value (four bswap
instructions plus two moves), where two move instructions and two bswap
instructions would suffice: the third and fourth arguments are simply the
original x32 and x64. The optimizer is apparently unaware that bswap is its
own inverse.

In swaparray, the repeated bswap of invals[i] is not subject to an obvious CSE
optimization, nor is it realized that the latter line might be more efficiently
implemented by movb %al, (%rsi,%r8) before the bswap operation, since the top
byte of the swapped value is the low byte of the original value.

--

eric-bugs at omnifarious dot org changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
            Summary|gcc needs byte swap builtins|gcc byte swap builtins
                   |                            |inadequately optimized

http://gcc.gnu.org/bugzilla/show_bug.cgi?id=40210