libgcc2.c defines __bswapsi2() as follows:
/* Integer type in GCC machine mode SI; the masks below treat it as 32 bits wide.  */
typedef int SItype __attribute__ ((mode (SI)));
/* Return U with the order of its four bytes reversed.
   Each term isolates one byte with a mask and shifts it to the mirrored
   position; the four terms are then OR-ed together.  */
SItype
__bswapsi2 (SItype u)
{
return ((((u) & 0xff000000) >> 24) /* bits 31..24 -> bits 7..0 */
| (((u) & 0x00ff0000) >> 8) /* bits 23..16 -> bits 15..8 */
| (((u) & 0x0000ff00) << 8) /* bits 15..8 -> bits 23..16 */
/* bits 7..0 -> bits 31..24.  NOTE: the masked value has the signed type of U,
   so this shift overflows (undefined behaviour) when bit 7 of U is set.  */
| (((u) & 0x000000ff) << 24));
}
Just for the record: if (u & 0x80) == 0x80, the expression (u & 0xff) << 24
shifts a 1 bit into the sign bit of a signed int and therefore exhibits
undefined behaviour, but that's another story.
For i386 and AMD64 processors GCC optimises the above code properly and
generates a BSWAP or MOVBE instruction.
What about processors without such an instruction?
Does GCC generate (unoptimised) code there, similar to the following i386
assembly, using 4 moves (1 load plus 3 register copies), 4 shifts, 2 ands
plus 3 ors?
gcc -m32 -o- -O1 -S bswapsi2.c
# Compiler output (gcc -m32 -O1) for the shift-and-mask C version of __bswapsi2().
# The argument u is read from 4(%esp); the final value is left in %eax.
__bswapsi2:
movl 4(%esp), %eax # %eax = u
movl %eax, %edx
shrl $24, %edx # %edx = u >> 24 (logical shift)
movl %eax, %ecx
sall $24, %ecx # %ecx = u << 24
orl %ecx, %edx
movl %eax, %ecx
sarl $8, %ecx # arithmetic shift: u >> 8 ...
andl $65280, %ecx # ... masked with 0x0000ff00
orl %ecx, %edx
sall $8, %eax # u << 8 ...
andl $16711680, %eax # ... masked with 0x00ff0000
orl %edx, %eax # combine all four bytes
ret
Or is GCC able to optimise this to code similar to the following i386
assembly, using 2 moves (1 load plus 1 register copy), 2 rotates, 2 ands
plus 1 or, i.e. roughly halving the number of instructions (7 instead of 13,
not counting the ret), if the target processor has rotate instructions?
# Byte swap of a 32-bit value with two masks and two 8-bit rotates:
#   result = rol(u & 0xff00ff00, 8) | ror(u & 0x00ff00ff, 8)
# The argument u is read from 4(%esp); the final value is left in %eax.
__bswapsi2:
movl 4(%esp), %eax # %eax = u
movl %eax, %edx
andl $-16711936, %edx # %edx = u & 0xff00ff00 (bytes 3 and 1)
roll $8, %edx # byte 3 -> byte 0, byte 1 -> byte 2
andl $16711935, %eax # %eax = u & 0x00ff00ff (bytes 2 and 0)
rorl $8, %eax # byte 0 -> byte 3, byte 2 -> byte 1
orl %edx, %eax
ret
If not: shouldn't __bswapsi2() better be implemented as follows?
/* Rotate V left by W bit positions (assumes unsigned is 32 bits wide).
   Both shift counts are masked to 0..31, so the count is effectively taken
   modulo 32, a negative W rotates right by -W, and a count of 0 returns V
   unchanged without ever shifting by the full width of the type.  */
unsigned __rotlsi3 (unsigned v, int w)
{
/* Negate in unsigned arithmetic: a plain -w overflows for w == INT_MIN.  */
return (v << (31 & w))
| (v >> (31 & -(unsigned) w));
}
/* Rotate V right by W bit positions (assumes unsigned is 32 bits wide).
   Both shift counts are masked to 0..31, so the count is effectively taken
   modulo 32, a negative W rotates left by -W, and a count of 0 returns V
   unchanged without ever shifting by the full width of the type.  */
unsigned __rotrsi3 (unsigned v, int w)
{
/* Negate in unsigned arithmetic: a plain -w overflows for w == INT_MIN.  */
return (v >> (31 & w))
| (v << (31 & -(unsigned) w));
}
/* Reverse the byte order of U using two masks and two 8-bit rotates:
   bytes 3 and 1 are rotated left by 8, bytes 2 and 0 are rotated right by 8,
   and the two halves are OR-ed together.  */
int __bswapsi2 (int u) // should better be unsigned __bswapsi2 (unsigned u)!
{
const unsigned odd_bytes = u & 0xff00ff00;	/* bytes 3 and 1 */
const unsigned even_bytes = u & 0x00ff00ff;	/* bytes 2 and 0 */
const unsigned swapped = __rotrsi3 (even_bytes, 8) | __rotlsi3 (odd_bytes, 8);
return swapped;
}
Stefan KanthaK
PS: reimplementing __bswapdi2() is left as an exercise to the reader.
PPS: the following implementation (which is wrong because the cast is
commented out) exhibits 4 bugs in the optimiser and register allocator:
#1) failure to generate a second bswap from the high dword of the
argument loaded into %edx;
#2) superfluous pushl/popl of otherwise unused %esi;
#3) unmotivated use of %edi instead of %edx;
#4) use of movl/sarl to produce the sign of %eax in %edx.
The first bug results in 6 (out of 18) instructions instead of 1,
the last 3 bugs result in 6 (out of 18) superfluous instructions!
/* Integer type in GCC machine mode DI; the code below treats it as 64 bits wide.  */
typedef int DItype __attribute__ ((mode (DI)));
/* Deliberately faulty 64-bit byte swap built from two __bswapsi2() calls.
   It is kept in exactly this form because the compiler output under
   discussion was generated from this source.  */
DItype __bswapdi2 (DItype u)
{
/* Byte-swapped low half of U, moved into the upper 32 bits.  */
return ((DItype) __bswapsi2 (u) << 32)
/* Byte-swapped high half of U.  With the cast commented out, the signed
   32-bit result is sign-extended to DItype before the OR, so a negative
   value sets all upper 32 bits -- this is what makes the result wrong.  */
| /* (unsigned) */ __bswapsi2 (u >> 32);
}
gcc -m32 -o- -O3 -S bswapdi2.c
__bswapdi2:
pushl %edi # Oops: not needed any more!
pushl %esi # Ouch: superfluous!
movl 16(%esp), %edx
movl 12(%esp), %ecx
popl %esi # Ouch: superfluous!
movl %edx, %eax
andl $16711935, %edx
andl $-16711936, %eax
rorl $8, %edx
bswap %ecx
roll $8, %eax
orl %edx, %eax
movl %eax, %edi # Oops: %edx should be used here instead of %edi
sarl $31, %edi # Oops: cltd should be used here instead of movl plus sarl
orl %edi, %ecx # Oops: orl %ecx, %edx should be used here
popl %edi # Oops: not needed any more!
movl %ecx, %edx # Oops: not needed any more!
ret