https://gcc.gnu.org/bugzilla/show_bug.cgi?id=86145
Bug ID: 86145 Summary: Inefficient homogeneous struct return Product: gcc Version: unknown Status: UNCONFIRMED Keywords: missed-optimization Severity: normal Priority: P3 Component: middle-end Assignee: unassigned at gcc dot gnu.org Reporter: ktkachov at gcc dot gnu.org Target Milestone: --- GCC currently does a suboptimal job of returning structs. Consider the testcases: typedef struct { double x, y; } D2; typedef struct { double x, y, a; } D3; typedef struct { double x, y, a, b; } D4; typedef struct { float x, y; } F2; typedef struct { float x, y, a; } F3; typedef struct { float x, y, a, b; } F4; D2 f1(double x, double y) { D2 s = { x, y }; //this is actually optimal! return s; } D2 f1a(D2 *p) { return p[2]; } D3 f1b(D3 *p) { return p[2]; } D4 f1c(D4 *p) { return p[1]; } F2 f2(float x, float y) { F2 s = { x, y }; return s; } F2 f2a(F2 *p) { return p[3]; } F3 f2b(F3 *p) { return p[4]; } F4 f2c(F4 *p) { return p[1]; } For aarch64 we generate: f1: ret f1a: ldp x1, x0, [x0, 32] fmov d1, x0 fmov d0, x1 ret f1b: sub sp, sp, #64 ldr x0, [x0, 64] str x0, [sp, 56] ldp d0, d1, [sp, 40] ldr d2, [sp, 56] add sp, sp, 64 ret f1c: sub sp, sp, #64 ldp d0, d1, [sp, 32] ldp d2, d3, [sp, 48] add sp, sp, 64 ret f2: fmov x1, d0 mov x0, 0 bfi x0, x1, 0, 32 fmov x1, d1 bfi x0, x1, 32, 32 lsr x1, x0, 32 lsr w0, w0, 0 fmov s1, w1 fmov s0, w0 ret f2a: ldr x0, [x0, 24] lsr x1, x0, 32 lsr w0, w0, 0 fmov s1, w1 fmov s0, w0 ret f2b: sub sp, sp, #32 ldr x1, [x0, 48] ldr w0, [x0, 56] str x1, [sp, 16] str w0, [sp, 24] ldp s0, s1, [sp, 16] ldr s2, [sp, 24] add sp, sp, 32 ret f2c: ldp x1, x0, [x0, 16] lsr x3, x1, 32 lsr x2, x0, 32 fmov s1, w3 fmov s3, w2 lsr w1, w1, 0 lsr w0, w0, 0 fmov s0, w1 fmov s2, w0 ret This also appears on x86: f1: ret f1a: movq 32(%rdi), %rdx movq 40(%rdi), %rax movq %rdx, -8(%rsp) movsd -8(%rsp), %xmm0 movq %rax, -8(%rsp) movsd -8(%rsp), %xmm1 ret f1b: movdqu 48(%rsi), %xmm0 movq 64(%rsi), %rdx movq %rdi, %rax movups %xmm0, (%rdi) movq %rdx, 16(%rdi) ret f1c: movdqu 32(%rsi), %xmm0 movdqu 48(%rsi), %xmm1 movq %rdi, %rax movups %xmm0, (%rdi) movups %xmm1, 16(%rdi) ret f2: movd %xmm1, %eax salq $32, %rax movq %rax, %rdx movd %xmm0, %eax orq %rdx, %rax movq %rax, -8(%rsp) movq -8(%rsp), %xmm0 ret f2a: movq 24(%rdi), %rax movq %rax, -8(%rsp) movq -8(%rsp), %xmm0 ret f2b: movq 48(%rdi), %rax movl 56(%rdi), %edx movq %rax, -48(%rsp) movq -48(%rsp), %xmm0 movl %edx, -12(%rsp) movss -12(%rsp), %xmm1 ret f2c: movq 16(%rdi), %rdx movq 24(%rdi), %rax movq %rdx, -8(%rsp) movq -8(%rsp), %xmm0 movq %rax, -8(%rsp) movq -8(%rsp), %xmm1 ret The compiler does the structure load as an opaque TImode (or wider) move and then tries to unpack with subregs later on. Can we get the expander to expand struct components more intelligently?