https://gcc.gnu.org/bugzilla/show_bug.cgi?id=86145

            Bug ID: 86145
           Summary: Inefficient homogeneous struct return
           Product: gcc
           Version: unknown
            Status: UNCONFIRMED
          Keywords: missed-optimization
          Severity: normal
          Priority: P3
         Component: middle-end
          Assignee: unassigned at gcc dot gnu.org
          Reporter: ktkachov at gcc dot gnu.org
  Target Milestone: ---

GCC currently does a suboptimal job of returning structs.
Consider the testcases:

typedef struct { double x, y; } D2;
typedef struct { double x, y, a; } D3;
typedef struct { double x, y, a, b; } D4;
typedef struct { float x, y; } F2;
typedef struct { float x, y, a; } F3;
typedef struct { float x, y, a, b; } F4;

D2 f1(double x, double y)
{
  D2 s = { x, y }; //this is actually optimal!
  return s;
}

D2 f1a(D2 *p)
{
  return p[2];
}

D3 f1b(D3 *p)
{
  return p[2];
}

D4 f1c(D4 *p)
{
  return p[1];
}

F2 f2(float x, float y)
{
  F2 s = { x, y };
  return s;
}

F2 f2a(F2 *p)
{
  return p[3];
}

F3 f2b(F3 *p)
{
  return p[4];
}

F4 f2c(F4 *p)
{
  return p[1];
}

For aarch64 we generate:

f1:
        ret

f1a:
        ldp     x1, x0, [x0, 32]
        fmov    d1, x0
        fmov    d0, x1
        ret

f1b:
        sub     sp, sp, #64
        ldr     x0, [x0, 64]
        str     x0, [sp, 56]
        ldp     d0, d1, [sp, 40]
        ldr     d2, [sp, 56]
        add     sp, sp, 64
        ret

f1c:
        sub     sp, sp, #64
        ldp     d0, d1, [sp, 32]
        ldp     d2, d3, [sp, 48]
        add     sp, sp, 64
        ret

f2:
        fmov    x1, d0
        mov     x0, 0
        bfi     x0, x1, 0, 32
        fmov    x1, d1
        bfi     x0, x1, 32, 32
        lsr     x1, x0, 32
        lsr     w0, w0, 0
        fmov    s1, w1
        fmov    s0, w0
        ret

f2a:
        ldr     x0, [x0, 24]
        lsr     x1, x0, 32
        lsr     w0, w0, 0
        fmov    s1, w1
        fmov    s0, w0
        ret

f2b:
        sub     sp, sp, #32
        ldr     x1, [x0, 48]
        ldr     w0, [x0, 56]
        str     x1, [sp, 16]
        str     w0, [sp, 24]
        ldp     s0, s1, [sp, 16]
        ldr     s2, [sp, 24]
        add     sp, sp, 32
        ret

f2c:
        ldp     x1, x0, [x0, 16]
        lsr     x3, x1, 32
        lsr     x2, x0, 32
        fmov    s1, w3
        fmov    s3, w2
        lsr     w1, w1, 0
        lsr     w0, w0, 0
        fmov    s0, w1
        fmov    s2, w0
        ret

This also appears on x86-64:

f1:
        ret

f1a:
        movq    32(%rdi), %rdx
        movq    40(%rdi), %rax
        movq    %rdx, -8(%rsp)
        movsd   -8(%rsp), %xmm0
        movq    %rax, -8(%rsp)
        movsd   -8(%rsp), %xmm1
        ret

f1b:
        movdqu  48(%rsi), %xmm0
        movq    64(%rsi), %rdx
        movq    %rdi, %rax
        movups  %xmm0, (%rdi)
        movq    %rdx, 16(%rdi)
        ret

f1c:
        movdqu  32(%rsi), %xmm0
        movdqu  48(%rsi), %xmm1
        movq    %rdi, %rax
        movups  %xmm0, (%rdi)
        movups  %xmm1, 16(%rdi)
        ret

f2:
        movd    %xmm1, %eax
        salq    $32, %rax
        movq    %rax, %rdx
        movd    %xmm0, %eax
        orq     %rdx, %rax
        movq    %rax, -8(%rsp)
        movq    -8(%rsp), %xmm0
        ret

f2a:
        movq    24(%rdi), %rax
        movq    %rax, -8(%rsp)
        movq    -8(%rsp), %xmm0
        ret

f2b:
        movq    48(%rdi), %rax
        movl    56(%rdi), %edx
        movq    %rax, -48(%rsp)
        movq    -48(%rsp), %xmm0
        movl    %edx, -12(%rsp)
        movss   -12(%rsp), %xmm1
        ret

f2c:
        movq    16(%rdi), %rdx
        movq    24(%rdi), %rax
        movq    %rdx, -8(%rsp)
        movq    -8(%rsp), %xmm0
        movq    %rax, -8(%rsp)
        movq    -8(%rsp), %xmm1
        ret


The compiler does the structure load as an opaque TImode (or wider) move and
then tries to unpack with subregs later on. Can we get the expander to expand
struct components more intelligently?

Reply via email to