https://gcc.gnu.org/bugzilla/show_bug.cgi?id=91753
Bug ID: 91753
Summary: Bad register allocation of multi-register types
Product: gcc
Version: 10.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: middle-end
Assignee: unassigned at gcc dot gnu.org
Reporter: wilco at gcc dot gnu.org
Target Milestone: ---

The following example shows that register allocation of types which require multiple registers is quite non-optimal:

#include <stdint.h>
#include <arm_neon.h>

void neon_transform_nada(const uint8x16x4_t table, uint8_t * values, int volume) {
  uint8x16_t x1 = vld1q_u8(values + 0);
  uint8x16_t x2 = vld1q_u8(values + 16);
  uint8x16_t x3 = vld1q_u8(values + 16*2);
  uint8x16_t x4 = vld1q_u8(values + 16*3);
  for(int i = 0; i < volume; i++) {
    x1 = vqtbx4q_u8(x1, table,x1);
    x2 = vqtbx4q_u8(x2, table,x2);
    x3 = vqtbx4q_u8(x3, table,x3);
    x4 = vqtbx4q_u8(x4, table,x4);
  }
  vst1q_u8(values + 0, x1);
  vst1q_u8(values + 16, x2);
  vst1q_u8(values + 16*2, x3);
  vst1q_u8(values + 16*3, x4);
}

With -O2/-O3:

neon_transform_nada:
        cmp     w1, 0
        ldp     q31, q30, [x0]
        ldp     q29, q28, [x0, 32]
        ble     .L2
        mov     v27.16b, v1.16b
        mov     w2, 0
        mov     v26.16b, v3.16b
        mov     v25.16b, v0.16b
        mov     v24.16b, v2.16b
        .p2align 3,,7
.L3:
        mov     v0.16b, v25.16b
        add     w2, w2, 1
        mov     v20.16b, v25.16b
        cmp     w1, w2
        mov     v16.16b, v25.16b
        mov     v4.16b, v25.16b
        mov     v1.16b, v27.16b
        mov     v21.16b, v27.16b
        mov     v17.16b, v27.16b
        mov     v5.16b, v27.16b
        mov     v2.16b, v24.16b
        mov     v22.16b, v24.16b
        mov     v18.16b, v24.16b
        mov     v6.16b, v24.16b
        mov     v3.16b, v26.16b
        mov     v23.16b, v26.16b
        mov     v19.16b, v26.16b
        mov     v7.16b, v26.16b
        tbx     v31.16b, {v0.16b - v3.16b}, v31.16b
        tbx     v30.16b, {v20.16b - v23.16b}, v30.16b
        tbx     v29.16b, {v16.16b - v19.16b}, v29.16b
        tbx     v28.16b, {v4.16b - v7.16b}, v28.16b
        bne     .L3
.L2:
        stp     q31, q30, [x0]
        stp     q29, q28, [x0, 32]
        ret

With -O1 it looks a lot better but there are still 4 redundant moves:

neon_transform_nada:
        ldr     q19, [x0]
        ldr     q18, [x0, 16]
        ldr     q17, [x0, 32]
        ldr     q16, [x0, 48]
        cmp     w1, 0
        ble     .L2
        mov     w2, 0
.L3:
        mov     v4.16b, v0.16b
        mov     v5.16b, v1.16b
        mov     v6.16b, v2.16b
        mov     v7.16b, v3.16b
        tbx     v19.16b, {v4.16b - v7.16b}, v19.16b
        tbx     v18.16b, {v4.16b - v7.16b}, v18.16b
        tbx     v17.16b, {v4.16b - v7.16b}, v17.16b
        tbx     v16.16b, {v4.16b - v7.16b}, v16.16b
        add     w2, w2, 1
        cmp     w1, w2
        bne     .L3
.L2:
        str     q19, [x0]
        str     q18, [x0, 16]
        str     q17, [x0, 32]
        str     q16, [x0, 48]
        ret