------- Comment #5 from eric-bugs at omnifarious dot org  2009-05-20 19:39 
-------
This code:

#include <stdint.h>
#include <stddef.h>

inline uint64_t byteswap_64(const uint64_t x)
{
   return __builtin_bswap64(x);
}

inline uint32_t byteswap_32(const uint32_t x)
{
   return __builtin_bswap32(x);
}

extern void random_function(uint32_t a, uint64_t b, uint32_t c, uint64_t d);

void swapping(const uint32_t x32, const uint64_t x64)
{
   random_function(byteswap_32(x32), byteswap_64(x64),
byteswap_32(byteswap_32(x32)), byteswap_64(byteswap_64(x64)));
}

void swaparray(uint64_t outvals[], char outtop[], const uint64_t invals[],
const size_t size)
{
   size_t i = 0;
   for (i = 0; i < size; ++i) {
      outvals[i] = byteswap_64(invals[i]);
      outtop[i] = (byteswap_64(invals[i]) >> 56) & 0xffull;
   }
}

results in this assembly:

.globl swaparray
        .type   swaparray, @function
swaparray:
.LFB5:
        testq   %rcx, %rcx
        je      .L8
        xorl    %r8d, %r8d
        .p2align 4,,7
        .p2align 3
.L7:
        movq    (%rdx,%r8,8), %rax
        bswap   %rax
        movq    %rax, (%rdi,%r8,8)
        movq    (%rdx,%r8,8), %rax
        bswap   %rax
        shrq    $56, %rax
        movb    %al, (%rsi,%r8)
        incq    %r8
        cmpq    %r8, %rcx
        ja      .L7
.L8:
        rep
        ret
.LFE5:
        .size   swaparray, .-swaparray
        .p2align 4,,15
.globl swapping
        .type   swapping, @function
swapping:
.LFB4:
        bswap   %rsi
        bswap   %edi
        movq    %rsi, %rcx
        movl    %edi, %edx
        bswap   %rcx
        bswap   %edx
        jmp     random_function
.LFE4:
        .size   swapping, .-swapping

when compiled with gcc -O3 -mtune=native -march=native on an Opteron system.

Notice that in swapping, bswap is applied twice to each doubly-swapped
argument instead of the double swap being folded away: since bswap is its own
inverse, byteswap(byteswap(x)) is just x, so two move instructions plus the two
genuine bswaps would suffice.  The optimizer is apparently unaware that bswap
is an involution.

In swaparray, the repeated byteswap_64(invals[i]) is not common-subexpression
eliminated (the value is reloaded and byte-swapped a second time).  Nor does
the compiler notice that the second statement could be implemented more
cheaply as movb   %al, (%rsi,%r8) issued before the bswap: the top byte of the
swapped value is simply the bottom byte of the original.


-- 

eric-bugs at omnifarious dot org changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
            Summary|gcc needs byte swap builtins|gcc byte swap builtins
                   |                            |inadequately optimized


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=40210

Reply via email to