From one of the test cases from http://embed.cs.utah.edu/embarrassing/dec_09/harvest/gcc-head_llvm-gcc-head/
/* Reduced test case (CIL-processed apr__SHA512_Final from APR's random/unix/sha2.c).
   The function contains an open-coded 64-bit byte swap that GCC should collapse into
   a bswap instruction; per this report it does on x86_64 but not with -m32.
   NOTE(review): the code assumes an ILP32 pointer model — pointers are compared via
   (unsigned int) casts and size_t is retypedef'd as unsigned int — so it is only
   meaningful as a 32-bit compile-time test case, not as portable C. */
typedef unsigned int size_t; typedef unsigned char apr_byte_t; typedef unsigned long long apr_uint64_t; struct _SHA512_CTX { apr_uint64_t state[8]; apr_uint64_t bitcount[2]; apr_byte_t buffer[128]; }; typedef struct _SHA512_CTX SHA512_CTX; typedef apr_byte_t sha2_byte; typedef apr_uint64_t sha2_word64; /* compiler builtin: unsigned int __builtin_object_size(void * , int ) ; */ /* compiler builtin: void *__builtin___memset_chk(void * , int , unsigned int , unsigned int ) ; */ extern __attribute__ ((__nothrow__, __noreturn__)) void __assert_fail (char const *__assertion, char const *__file, unsigned int __line, char const *__function); void apr__SHA512_Final (apr_byte_t * digest, SHA512_CTX * context); void apr__SHA512_Last (SHA512_CTX * context); void apr__SHA512_Last (SHA512_CTX * context);
/* Finalization: writes the eight 64-bit state words into `digest` with their byte
   order reversed (via the open-coded swap below), then scrubs part of `context`. */
void apr__SHA512_Final (apr_byte_t * digest, SHA512_CTX * context) { sha2_word64 *d; int j; sha2_word64 tmp; sha2_word64 *tmp___0; void __attribute__ ((__artificial__)) * _cil_inline_tmp_228; void *_cil_inline_tmp_229; int _cil_inline_tmp_230; size_t _cil_inline_tmp_231; unsigned int _cil_inline_tmp_232; void *_cil_inline_tmp_233; { d = (sha2_word64 *) digest;
/* assert(context != NULL), expressed through 32-bit integer casts of the pointers. */
if (!((unsigned int) context != (unsigned int) ((SHA512_CTX *) 0))) { __assert_fail ("context != (SHA512_CTX*)0", "random/unix/sha2.c", 870U, "apr__SHA512_Final"); } if ((unsigned int) digest != (unsigned int) ((sha2_byte *) 0)) { apr__SHA512_Last (context); j = 0; while (j < 8) { tmp = context->state[j];
/* Open-coded 64-bit byte swap: swap the 32-bit halves, then the bytes within each
   16-bit lane, then the 16-bit lanes within each 32-bit half. The decimal constants
   are the complements of the hex masks: 71777214294589695ULL == 0x00ff00ff00ff00ffULL
   and 281470681808895ULL == 0x0000ffff0000ffffULL. This three-step mask/shift sequence
   is the pattern the report says is recognized as bswap on 64-bit but not with -m32. */
tmp = (tmp >> 32) | (tmp << 32); tmp = ((tmp & 0xff00ff00ff00ff00ULL) >> 8) | ((tmp & 71777214294589695ULL) << 8); context->state[j] = ((tmp & 0xffff0000ffff0000ULL) >> 16) | ((tmp & 281470681808895ULL) << 16); tmp___0 = d; d++; *tmp___0 = context->state[j]; j++; } }
/* CIL-inlined memset(context, 0, 4U) via __builtin___memset_chk.
   NOTE(review): only the first 4 bytes of *context are zeroed here — presumably an
   artifact of the automated test-case reduction rather than the original sha2.c
   cleanup, which would scrub the whole structure — TODO confirm against upstream. */
_cil_inline_tmp_229 = (void *) context; _cil_inline_tmp_230 = 0; _cil_inline_tmp_231 = 4U; _cil_inline_tmp_232 = __builtin_object_size ((void *) context, 0); _cil_inline_tmp_233 = __builtin___memset_chk ((void *) context, 0, 4U, _cil_inline_tmp_232); 
_cil_inline_tmp_228 = (void __attribute__ ((__artificial__)) *) _cil_inline_tmp_233; return; } } /* Checksum = D12A74A3 */ The 64bit gcc generates reasonable code (on par with LLVM with -fno-reoder-blocks in code size). That includes using bswap for the open coded endian conversions But with -m32 the code is much larger, mostly because bswap is never generated. I played around with various options including -march=core -mtune=core2 -O2 etc. but no bswap to be seen. -- Summary: bswap optimization does not work for 32bit (but for 64bit) Product: gcc Version: unknown Status: UNCONFIRMED Severity: enhancement Priority: P3 Component: rtl-optimization AssignedTo: unassigned at gcc dot gnu dot org ReportedBy: andi-gcc at firstfloor dot org GCC host triplet: x86_64-linux GCC target triplet: x86_64-linux http://gcc.gnu.org/bugzilla/show_bug.cgi?id=42589