From one of the test cases from
http://embed.cs.utah.edu/embarrassing/dec_09/harvest/gcc-head_llvm-gcc-head/

typedef unsigned int size_t;
typedef unsigned char apr_byte_t;
typedef unsigned long long apr_uint64_t;
struct _SHA512_CTX
{
  apr_uint64_t state[8];
  apr_uint64_t bitcount[2];
  apr_byte_t buffer[128];
};
typedef struct _SHA512_CTX SHA512_CTX;
typedef apr_byte_t sha2_byte;
typedef apr_uint64_t sha2_word64;
/* compiler builtin: 
   unsigned int __builtin_object_size(void * , int  ) ;  */
/* compiler builtin: 
   void *__builtin___memset_chk(void * , int  , unsigned int  , unsigned int  ) ;  */
extern __attribute__ ((__nothrow__, __noreturn__))
     void __assert_fail (char const *__assertion,
                         char const *__file,
                         unsigned int __line, char const *__function);
     void apr__SHA512_Final (apr_byte_t * digest, SHA512_CTX * context);
     void apr__SHA512_Last (SHA512_CTX * context);
     void apr__SHA512_Last (SHA512_CTX * context);
     void apr__SHA512_Final (apr_byte_t * digest, SHA512_CTX * context)
{
  sha2_word64 *d;
  int j;
  sha2_word64 tmp;
  sha2_word64 *tmp___0;
  void __attribute__ ((__artificial__)) * _cil_inline_tmp_228;
  void *_cil_inline_tmp_229;
  int _cil_inline_tmp_230;
  size_t _cil_inline_tmp_231;
  unsigned int _cil_inline_tmp_232;
  void *_cil_inline_tmp_233;

  {
    d = (sha2_word64 *) digest;
    if (!((unsigned int) context != (unsigned int) ((SHA512_CTX *) 0)))
      {
        __assert_fail ("context != (SHA512_CTX*)0", "random/unix/sha2.c",
                       870U, "apr__SHA512_Final");
      }
    if ((unsigned int) digest != (unsigned int) ((sha2_byte *) 0))
      {
        apr__SHA512_Last (context);
        j = 0;
        while (j < 8)
          {
            tmp = context->state[j];
            tmp = (tmp >> 32) | (tmp << 32);
            tmp =
              ((tmp & 0xff00ff00ff00ff00ULL) >> 8) |
              ((tmp & 71777214294589695ULL) << 8);
            context->state[j] =
              ((tmp & 0xffff0000ffff0000ULL) >> 16) |
              ((tmp & 281470681808895ULL) << 16);
            tmp___0 = d;
            d++;
            *tmp___0 = context->state[j];
            j++;
          }
      }
    _cil_inline_tmp_229 = (void *) context;
    _cil_inline_tmp_230 = 0;
    _cil_inline_tmp_231 = 4U;
    _cil_inline_tmp_232 = __builtin_object_size ((void *) context, 0);
    _cil_inline_tmp_233 =
      __builtin___memset_chk ((void *) context, 0, 4U, _cil_inline_tmp_232);
    _cil_inline_tmp_228 =
      (void __attribute__ ((__artificial__)) *) _cil_inline_tmp_233;
    return;
  }
}

/* Checksum = D12A74A3 */


The 64-bit gcc generates reasonable code (on par with LLVM with
-fno-reorder-blocks in code size). That includes using bswap for the
open-coded endian conversions.
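
To make the pattern explicit, here is the endian conversion in isolation (a
minimal sketch, not part of the test case; swap64 is a name chosen here for
illustration). The three shift-and-mask steps in apr__SHA512_Final are exactly
a full 64-bit byte reversal, which the compiler can in principle collapse into
a single bswap instruction:

#include <stdint.h>

/* Full 64-bit byte reversal, written the same open-coded way as in
   apr__SHA512_Final above. */
uint64_t
swap64 (uint64_t x)
{
  x = (x >> 32) | (x << 32);                     /* swap the 32-bit halves */
  x = ((x & 0xff00ff00ff00ff00ULL) >> 8)
      | ((x & 0x00ff00ff00ff00ffULL) << 8);      /* swap adjacent bytes */
  return ((x & 0xffff0000ffff0000ULL) >> 16)
         | ((x & 0x0000ffff0000ffffULL) << 16);  /* swap 16-bit units */
}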

But with -m32 the code is much larger, mostly because bswap is never
generated. I played around with various options, including -march=core2
-mtune=core2 -O2, etc., but no bswap to be seen.
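
For reference, one quick way to check (a sketch; assumes the test case above
is saved as sha2.c, and the output file names are illustrative):

  gcc -m64 -O2 -S sha2.c -o sha2-64.s
  gcc -m32 -O2 -S sha2.c -o sha2-32.s
  grep bswap sha2-64.s sha2-32.s

The grep should show bswap instructions in the 64-bit output; for the 32-bit
output it comes up empty, which is the problem reported here.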


-- 
           Summary: bswap optimization does not work for 32-bit (but does
                    for 64-bit)
           Product: gcc
           Version: unknown
            Status: UNCONFIRMED
          Severity: enhancement
          Priority: P3
         Component: rtl-optimization
        AssignedTo: unassigned at gcc dot gnu dot org
        ReportedBy: andi-gcc at firstfloor dot org
  GCC host triplet: x86_64-linux
GCC target triplet: x86_64-linux


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=42589
