I've committed a patch to libbacktrace that speeds up decompression by a few percent by fetching the input 32 bits at a time rather than one byte at a time. Bootstrapped and ran the libbacktrace and Go tests on x86_64-pc-linux-gnu. Committed to mainline.
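To sketch the idea outside the patch (the names bitbuf and refill below are illustrative, not from libbacktrace): the pending bits live in a 64-bit accumulator that is topped up 32 bits at a time. The sketch uses memcpy in place of the aligned load the patch arranges, and omits the end-of-input check for brevity:

#include <stdint.h>
#include <string.h>

/* Illustrative bit buffer; not libbacktrace's actual types.  */
struct bitbuf
{
  const unsigned char *pin;	/* Next input byte.  */
  uint64_t val;			/* Pending bits, low bits first.  */
  unsigned int bits;		/* Number of valid bits in val.  */
};

/* Ensure at least 15 bits (the longest Huffman code) are pending.
   Bounds checking omitted; the real code fails if fewer than four
   input bytes remain.  */
static void
refill (struct bitbuf *b)
{
  if (b->bits < 15)
    {
      uint32_t next;

      /* memcpy is the portable stand-in for the aligned 32-bit load
	 the patch uses after aligning the input pointer.  */
      memcpy (&next, b->pin, 4);
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
      /* The DEFLATE bit stream is little-endian.  */
      next = __builtin_bswap32 (next);
#endif
      b->val |= (uint64_t) next << b->bits;
      b->bits += 32;
      b->pin += 4;
    }
}

Since the fetch only runs when fewer than 15 bits are pending, adding 32 keeps the total under 47 bits. That is why val has to grow to uint64_t: the old uint32_t buffer could only accept 16 new bits per refill.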
Ian

2017-10-05  Ian Lance Taylor  <i...@golang.org>

	* elf.c (elf_zlib_fetch): Change pval argument to uint64_t *.
	Read a four byte integer.
	(elf_zlib_inflate): Change val to uint64_t.  Align pin to a
	32-bit boundary before ever calling elf_zlib_fetch.
	* ztest.c (test_large): Simplify print statements a bit.
Index: elf.c
===================================================================
--- elf.c	(revision 253376)
+++ elf.c	(working copy)
@@ -1031,11 +1031,12 @@ elf_zlib_failed(void)
 
 static int
 elf_zlib_fetch (const unsigned char **ppin, const unsigned char *pinend,
-		uint32_t *pval, unsigned int *pbits)
+		uint64_t *pval, unsigned int *pbits)
 {
   unsigned int bits;
   const unsigned char *pin;
-  uint32_t val;
+  uint64_t val;
+  uint32_t next;
 
   bits = *pbits;
   if (bits >= 15)
@@ -1043,20 +1044,25 @@ elf_zlib_fetch (const unsigned char **pp
   pin = *ppin;
   val = *pval;
 
-  if (unlikely (pinend - pin < 2))
+  if (unlikely (pinend - pin < 4))
     {
      elf_zlib_failed ();
      return 0;
    }
-  val |= pin[0] << bits;
-  val |= pin[1] << (bits + 8);
-  bits += 16;
-  pin += 2;
-
-  /* We will need the next two bytes soon.  We ask for high temporal
-     locality because we will need the whole cache line soon.  */
-  __builtin_prefetch (pin, 0, 3);
-  __builtin_prefetch (pin + 1, 0, 3);
+
+  /* We've ensured that PIN is aligned.  */
+  next = *(const uint32_t *)pin;
+
+#if __BYTE_ORDER == __ORDER_BIG_ENDIAN
+  next = __builtin_bswap32 (next);
+#endif
+
+  val |= (uint64_t)next << bits;
+  bits += 32;
+  pin += 4;
+
+  /* We will need the next four bytes soon.  */
+  __builtin_prefetch (pin, 0, 0);
 
   *ppin = pin;
   *pval = val;
@@ -1566,7 +1572,7 @@ elf_zlib_inflate (const unsigned char *p
   poutend = pout + sout;
   while ((pinend - pin) > 4)
     {
-      uint32_t val;
+      uint64_t val;
      unsigned int bits;
      int last;
 
@@ -1601,10 +1607,19 @@ elf_zlib_inflate (const unsigned char *p
	}
      pin += 2;
 
-      /* Read blocks until one is marked last.  */
+      /* Align PIN to a 32-bit boundary.  */
 
      val = 0;
      bits = 0;
+      while ((((uintptr_t) pin) & 3) != 0)
+	{
+	  val |= (uint64_t)*pin << bits;
+	  bits += 8;
+	  ++pin;
+	}
+
+      /* Read blocks until one is marked last.  */
+
      last = 0;
 
      while (!last)
@@ -1671,6 +1686,14 @@ elf_zlib_inflate (const unsigned char *p
	      pout += len;
	      pin += len;
 
+	      /* Align PIN.  */
+	      while ((((uintptr_t) pin) & 3) != 0)
+		{
+		  val |= (uint64_t)*pin << bits;
+		  bits += 8;
+		  ++pin;
+		}
+
	      /* Go around to read the next block.  */
	      continue;
	    }
Index: ztest.c
===================================================================
--- ztest.c	(revision 253377)
+++ ztest.c	(working copy)
@@ -432,9 +432,9 @@ test_large (struct backtrace_state *stat
   ctime = average_time (ctimes, trials);
   ztime = average_time (ztimes, trials);
 
-  printf ("backtrace time: %zu ns\n", ctime);
-  printf ("zlib time:    : %zu ns\n", ztime);
-  printf ("percentage    : %g\n", (double) ztime / (double) ctime);
+  printf ("backtrace: %zu ns\n", ctime);
+  printf ("zlib     : %zu ns\n", ztime);
+  printf ("ratio    : %g\n", (double) ztime / (double) ctime);
 
   return;
 
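For anyone reading the patch without the surrounding code: the alignment loops added to elf_zlib_inflate amount to the following, written here as a hypothetical standalone helper in the same pointer-triple style as elf_zlib_fetch:

#include <stdint.h>

/* Buffer whole bytes until *ppin reaches a 32-bit boundary, so that
   every later fetch can use an aligned four byte load.  Starting from
   an empty buffer, at most three bytes (24 bits) accumulate, leaving
   room in the 64-bit accumulator for a further 32-bit refill.  */
static void
align_input (const unsigned char **ppin, uint64_t *pval,
	     unsigned int *pbits)
{
  const unsigned char *pin = *ppin;

  while ((((uintptr_t) pin) & 3) != 0)
    {
      *pval |= (uint64_t) *pin << *pbits;
      *pbits += 8;
      ++pin;
    }
  *ppin = pin;
}

The loop appears twice in the patch: once before the block loop starts, and again after copying an uncompressed block, since a stored block can advance pin by an arbitrary number of bytes and leave it misaligned.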