https://gcc.gnu.org/bugzilla/show_bug.cgi?id=119193
Bug ID: 119193
Summary: Suboptimal packing codegen
Product: gcc
Version: 15.0
Status: UNCONFIRMED
Keywords: missed-optimization
Severity: normal
Priority: P3
Component: tree-optimization
Assignee: unassigned at gcc dot gnu.org
Reporter: ktkachov at gcc dot gnu.org
Target Milestone: ---
Target: aarch64
Example source:
#include <stdint.h>
/* Pack the low byte of each of eight 64-bit inputs into a single 64-bit
 * word: argument N lands in byte lane N (a -> bits 0-7, ..., h -> bits
 * 56-63).  h carries no explicit 0xFF mask; its bits above 7 are shifted
 * past bit 63 and discarded, so the result is the same as if masked.
 * (Original report's macro was broken by mail line-wrapping — the
 * backslash continuations on the interior lines were lost; restored here.
 * Arguments are parenthesized per standard macro hygiene.) */
#define PACK_8_TO_64( a, b, c, d, e, f, g, h )                        \
  (((uint64_t)(a) & 0xFF)            | ((uint64_t)((b) & 0xFF) << 8)  | \
   ((uint64_t)((c) & 0xFF) << 16)    | ((uint64_t)((d) & 0xFF) << 24) | \
   ((uint64_t)((e) & 0xFF) << 32)    | ((uint64_t)((f) & 0xFF) << 40) | \
   ((uint64_t)((g) & 0xFF) << 48)    | ((uint64_t)(h) << 56))

/* Test case for the missed optimization: GCC emits a chain of
 * ubfiz/orr instructions where Clang uses bfi (bitfield insert). */
uint64_t
pack (uint64_t a, uint64_t b, uint64_t c, uint64_t d, uint64_t e, uint64_t f,
      uint64_t g, uint64_t h)
{
  return PACK_8_TO_64 (a, b, c, d, e, f, g, h);
}
GCC for aarch64 at -O2 generates:
pack:
ubfiz x5, x5, 40, 8
ubfiz x6, x6, 48, 8
ubfiz x4, x4, 32, 8
ubfiz x1, x1, 8, 8
orr x4, x4, x5
orr x7, x6, x7, lsl 56
lsl w3, w3, 24
orr x4, x4, x1
ubfiz x2, x2, 16, 8
orr x7, x7, x3
and x0, x0, 255
orr x4, x4, x2
orr x0, x7, x0
orr x0, x4, x0
ret
but Clang does better by using the bitfield insert instructions:
pack:
and x8, x0, #0xff
bfi x8, x1, #8, #8
bfi x8, x2, #16, #8
bfi x8, x3, #24, #8
bfi x8, x4, #32, #8
bfi x8, x5, #40, #8
bfi x8, x6, #48, #8
orr x0, x8, x7, lsl #56
ret