libgcc2.c defines __bswapsi2() as follows:

typedef int SItype __attribute__ ((mode (SI)));

/* Reverse the byte order of U: each mask isolates one byte of the value
   and the accompanying shift moves it to the mirrored byte position.  */
SItype
__bswapsi2 (SItype u)
{
  return ((((u) & 0xff000000) >> 24)    /* byte 3 -> byte 0 */
   | (((u) & 0x00ff0000) >>  8)         /* byte 2 -> byte 1 */
   | (((u) & 0x0000ff00) <<  8)         /* byte 1 -> byte 2 */
   /* NOTE(review): the masked operand below has signed type, so when
      bit 7 of U is set the shift by 24 moves a 1 into the sign bit.  */
   | (((u) & 0x000000ff) << 24));       /* byte 0 -> byte 3 */
}

JFTR: if (u & 0x80) == 0x80, (u & 0xff) << 24 exhibits undefined behaviour,
      but that's another story.

For i386 and AMD64 processors GCC optimises the above code properly and
generates a BSWAP or MOVBE instruction.
What about processors without such an instruction?
Does GCC generate (unoptimised) code there, similar to the following i386
assembly, using 4 loads, 4 shifts, 2 ands plus 3 ors?

gcc -m32 -o- -O1 -S bswapsi2.c

__bswapsi2:
        movl    4(%esp), %eax           # %eax = u (argument on the stack)
        movl    %eax, %edx
        shrl    $24, %edx               # %edx = u >> 24 (logical shift)
        movl    %eax, %ecx
        sall    $24, %ecx               # %ecx = u << 24
        orl     %ecx, %edx              # %edx = (u >> 24) | (u << 24)
        movl    %eax, %ecx
        sarl    $8, %ecx                # %ecx = u >> 8 (arithmetic shift)
        andl    $65280, %ecx            # keep 0x0000ff00
        orl     %ecx, %edx
        sall    $8, %eax                # %eax = u << 8
        andl    $16711680, %eax         # keep 0x00ff0000
        orl     %edx, %eax              # %eax = combined result
        ret

Or is GCC able to optimise this to code similar to the following i386
assembly, using 2 loads, 2 rotates, 2 ands plus 1 or, i.e. halving the
number of instructions, if the target processor has rotate instructions?

__bswapsi2:
        movl    4(%esp), %eax           # %eax = u (argument on the stack)
        movl    %eax, %edx
        andl    $-16711936, %edx        # %edx = u & 0xff00ff00 (bytes 3 and 1)
        roll    $8, %edx                # byte 3 -> byte 0, byte 1 -> byte 2
        andl    $16711935, %eax         # %eax = u & 0x00ff00ff (bytes 2 and 0)
        rorl    $8, %eax                # byte 2 -> byte 1, byte 0 -> byte 3
        orl     %edx, %eax              # %eax = byte-reversed u
        ret

If not: shouldn't __bswapsi2() better be implemented as follows?

/* Rotate V left by W bit positions.  Only the low five bits of W are
   used, so every W (zero, negative, 32 or more) is accepted; a negative
   count rotates right by the same amount.  Written for a 32-bit
   unsigned.  */
unsigned __rotlsi3 (unsigned v, int w)
{
  /* Negate the count in unsigned arithmetic: -w on a signed int
     overflows for w == INT_MIN, whereas unsigned negation wraps and
     yields the same low five bits for every other count.  */
  unsigned n = (unsigned) w;

  return (v << (31 & n))
       | (v >> (31 & -n));
}

/* Rotate V right by W bit positions.  Only the low five bits of W are
   used, so every W (zero, negative, 32 or more) is accepted; a negative
   count rotates left by the same amount.  Written for a 32-bit
   unsigned.  */
unsigned __rotrsi3 (unsigned v, int w)
{
  /* Negate the count in unsigned arithmetic: -w on a signed int
     overflows for w == INT_MIN, whereas unsigned negation wraps and
     yields the same low five bits for every other count.  */
  unsigned n = (unsigned) w;

  return (v >> (31 & n))
       | (v << (31 & -n));
}

/* Reverse the byte order of U using two masked rotations: bytes 3 and 1
   are rotated left by 8 (landing in bytes 0 and 2), bytes 2 and 0 are
   rotated right by 8 (landing in bytes 1 and 3), and the two halves are
   merged.  */
int __bswapsi2 (int u) // should better be unsigned __bswapsi2 (unsigned u)!
{
  unsigned bytes31 = u & 0xff00ff00;
  unsigned bytes20 = u & 0x00ff00ff;
  unsigned swapped = __rotlsi3 (bytes31, 8) | __rotrsi3 (bytes20, 8);

  return swapped;
}

Stefan KanthaK

PS: reimplementing __bswapdi2() is left as an exercise to the reader.

PPS: the following implementation (wrong, because the cast is commented
     out) exhibits 4 bugs in the optimiser and register allocator:
     #1) failure to generate a second bswap from the high dword of the
         argument loaded into %edx;
     #2) superfluous pushl/popl of otherwise unused %esi;
     #3) unmotivated use of %edi instead of %edx;
     #4) use of movl/sarl to produce the sign of %eax in %edx.
     The first bug results in 6 (out of 18) instructions instead of 1,
     the last 3 bugs result in 6 (out of 18) superfluous instructions!

typedef int DItype __attribute__ ((mode (DI)));

/* Byte-swap the 64-bit value U: swap the low half with __bswapsi2 and
   shift it into the high half, then OR in the swapped high half.
   NOTE: intentionally left wrong.  With the (unsigned) cast commented
   out, the second __bswapsi2 result is a signed value that is widened
   to DItype with its sign before the OR.  The compiler listing that
   follows was produced from exactly this code, so it must not be
   "fixed" here.  */
DItype __bswapdi2 (DItype u)
{
  return ((DItype) __bswapsi2 (u) << 32)        /* swapped low half -> high half */
 | /* (unsigned) */ __bswapsi2 (u >> 32);       /* swapped high half, sign-extended */
}

gcc -m32 -o- -O3 -S bswapdi2.c

__bswapdi2:
        pushl   %edi            # Oops: not needed any more!
        pushl   %esi            # Ouch: superfluous!
        movl    16(%esp), %edx  # %edx = high dword of u
        movl    12(%esp), %ecx  # %ecx = low dword of u
        popl    %esi            # Ouch: superfluous!
        movl    %edx, %eax      # %eax = copy of the high dword
        andl    $16711935, %edx # %edx &= 0x00ff00ff
        andl    $-16711936, %eax # %eax &= 0xff00ff00
        rorl    $8, %edx
        bswap   %ecx            # byte-swapped low dword (high dword of the result)
        roll    $8, %eax
        orl     %edx, %eax      # %eax = byte-swapped high dword (low dword of the result)
        movl    %eax, %edi      # Oops: %edx should be used here instead of %edi
        sarl    $31, %edi       # Oops: cltd should be used here instead of movl plus sarl
        orl     %edi, %ecx      # Oops: orl %ecx, %edx should be used here
        popl    %edi            # Oops: not needed any more!
        movl    %ecx, %edx      # Oops: not needed any more!
        ret

Reply via email to