libgcc2.c defines __bswapsi2() as follows: typedef int SItype __attribute__ ((mode (SI)));
SItype __bswapsi2 (SItype u)
{
  return ((((u) & 0xff000000) >> 24)
          | (((u) & 0x00ff0000) >> 8)
          | (((u) & 0x0000ff00) << 8)
          | (((u) & 0x000000ff) << 24));
}

Just for the record: if (u & 0x80) == 0x80, (u & 0xff) << 24 exhibits undefined behaviour (the left shift of a signed int overflows into the sign bit), but that's another story.

For i386 and AMD64 processors GCC optimises the above code properly and generates a BSWAP or MOVBE instruction.

What about processors without such an instruction? Does GCC generate (unoptimised) code there, similar to the following i386 assembly, using 4 loads, 4 shifts, 2 ands plus 3 ors?

gcc -m32 -o- -O1 -S bswapsi2.c

__bswapsi2:
    movl 4(%esp), %eax
    movl %eax, %edx
    shrl $24, %edx
    movl %eax, %ecx
    sall $24, %ecx
    orl %ecx, %edx
    movl %eax, %ecx
    sarl $8, %ecx
    andl $65280, %ecx
    orl %ecx, %edx
    sall $8, %eax
    andl $16711680, %eax
    orl %edx, %eax
    ret

Or is GCC able to optimise this to code similar to the following i386 assembly, using 2 loads, 2 rotates, 2 ands plus 1 or, i.e. halving the number of instructions, if the target processor has rotate instructions?

__bswapsi2:
    movl 4(%esp), %eax
    movl %eax, %edx
    andl $-16711936, %edx
    roll $8, %edx
    andl $16711935, %eax
    rorl $8, %eax
    orl %edx, %eax
    ret

If not: shouldn't __bswapsi2() rather be implemented as follows?

unsigned __rotlsi3 (unsigned v, int w)
{
  return (v << (31 & w)) | (v >> (31 & -w));
}

unsigned __rotrsi3 (unsigned v, int w)
{
  return (v >> (31 & w)) | (v << (31 & -w));
}

int __bswapsi2 (int u) // should better be unsigned __bswapsi2 (unsigned u)!
{
  return __rotlsi3 (u & 0xff00ff00, 8) | __rotrsi3 (u & 0x00ff00ff, 8);
}

Stefan KanthaK

PS: reimplementing __bswapdi2() is left as an exercise for the reader.
PPS: the following implementation (which is wrong because the cast is commented out, so the int result of the second __bswapsi2() call is sign-extended when widened to DItype) exhibits 4 bugs in the optimiser and register allocator:
#1) failure to generate a second bswap from the high dword of the argument loaded into %edx;
#2) superfluous pushl/popl of otherwise unused %esi;
#3) unmotivated use of %edi instead of %edx;
#4) use of movl/sarl to produce the sign of %eax in %edx.
The first bug results in 6 (out of 18) instructions instead of 1; the last 3 bugs result in 6 (out of 18) superfluous instructions!

typedef int DItype __attribute__ ((mode (DI)));

DItype __bswapdi2 (DItype u)
{
  return ((DItype) __bswapsi2 (u) << 32) | /* (unsigned) */ __bswapsi2 (u >> 32);
}

gcc -m32 -o- -O3 -S bswapdi2.c

__bswapdi2:
    pushl %edi  # Oops: not needed any more!
    pushl %esi  # Ouch: superfluous!
    movl 16(%esp), %edx
    movl 12(%esp), %ecx
    popl %esi  # Ouch: superfluous!
    movl %edx, %eax
    andl $16711935, %edx
    andl $-16711936, %eax
    rorl $8, %edx
    bswap %ecx
    roll $8, %eax
    orl %edx, %eax
    movl %eax, %edi  # Oops: %edx should be used here instead of %edi
    sarl $31, %edi  # Oops: cltd should be used here instead of movl plus sarl
    orl %edi, %ecx  # Oops: orl %ecx, %edx should be used here
    popl %edi  # Oops: not needed any more!
    movl %ecx, %edx  # Oops: not needed any more!
    ret