https://gcc.gnu.org/bugzilla/show_bug.cgi?id=88963

--- Comment #9 from Devin Hussey <husseydevin at gmail dot com> ---
(In reply to Andrew Pinski from comment #6)
> Try using 128 (or 256) and you might see that aarch64 falls down similarly.

yup. Oof.

// test -- AArch64 (GNU as syntax) compiler output quoted as evidence in the
// bug report; kept verbatim, comments only.
//
// What the listing does, as far as the instructions show:
//   1. memcpy 128 bytes from the incoming x1 pointer to a stack buffer.
//   2. memcpy 128 bytes from the incoming x2 pointer to a second stack buffer.
//   3. Reload both buffers as 8 + 8 Q registers, add them lane-wise as
//      4 x 32-bit elements, and store the sums to a third stack buffer.
//   4. memcpy that 128-byte result buffer to the incoming x0 pointer.
// NOTE(review): the C source is not shown here; 128 matches the size
// suggested in the quoted comment above -- confirm against the bug report.
//
// Stack frame (560 bytes, offsets from sp):
//     0 ..  15   saved x29, x30
//    16 ..  31   saved x19, x20
//    32 ..  39   saved x21
//    48 .. 175   result buffer (sums)
//   176 .. 303   copy of the data pointed to by incoming x1
//   304 .. 431   copy of the data pointed to by incoming x2
//   432 .. 559   never referenced in this listing
//
// Register roles across the calls:
//   x19 = 128 (byte count reused for all three memcpy calls)
//   x20 = incoming x0 (destination of the final memcpy)
//   x21 = incoming x2 (source of the second memcpy)
test:
        // Prologue: allocate the frame and save FP/LR plus x19-x21.
        sub     sp, sp, #560
        stp     x29, x30, [sp]
        mov     x29, sp
        stp     x19, x20, [sp, 16]
        mov     x19, 128                // byte count, kept in a callee-saved reg
        mov     x20, x0                 // preserve incoming x0 across the calls
        add     x0, sp, 176             // dest = first stack buffer
        str     x21, [sp, 32]
        mov     x21, x2                 // preserve incoming x2 across the first call
        mov     x2, x19
        // memcpy(sp + 176, <incoming x1>, 128); x1 is still untouched here.
        bl      memcpy
        mov     x2, x19
        mov     x1, x21
        add     x0, sp, 304             // dest = second stack buffer
        // memcpy(sp + 304, <incoming x2>, 128)
        bl      memcpy
        // Reload the first buffer into q7..q0 and the second into q23..q16.
        // The x2/x1/x0 writes interleaved below set up the final memcpy call.
        ldr     q7, [sp, 176]
        mov     x2, x19                 // final memcpy: size = 128
        ldr     q6, [sp, 192]
        add     x1, sp, 48              // final memcpy: src = result buffer
        ldr     q5, [sp, 208]
        mov     x0, x20                 // final memcpy: dest = incoming x0
        ldr     q4, [sp, 224]
        ldr     q3, [sp, 240]
        ldr     q2, [sp, 256]
        ldr     q1, [sp, 272]
        ldr     q0, [sp, 288]
        ldr     q23, [sp, 304]
        ldr     q22, [sp, 320]
        ldr     q21, [sp, 336]
        ldr     q20, [sp, 352]
        ldr     q19, [sp, 368]
        ldr     q18, [sp, 384]
        ldr     q17, [sp, 400]
        ldr     q16, [sp, 416]
        // Eight lane-wise adds (4 x 32-bit lanes each); each sum is stored to
        // the result buffer at sp + 48 .. sp + 175.
        add     v7.4s, v7.4s, v23.4s
        add     v6.4s, v6.4s, v22.4s
        add     v5.4s, v5.4s, v21.4s
        add     v4.4s, v4.4s, v20.4s
        add     v3.4s, v3.4s, v19.4s
        str     q7, [sp, 48]
        add     v2.4s, v2.4s, v18.4s
        str     q6, [sp, 64]
        add     v1.4s, v1.4s, v17.4s
        str     q5, [sp, 80]
        add     v0.4s, v0.4s, v16.4s
        str     q4, [sp, 96]
        str     q3, [sp, 112]
        str     q2, [sp, 128]
        str     q1, [sp, 144]
        str     q0, [sp, 160]
        // memcpy(<incoming x0>, sp + 48, 128): copy the sums to the caller's buffer.
        bl      memcpy
        // Epilogue: restore saved registers and release the frame.
        ldp     x29, x30, [sp]
        ldp     x19, x20, [sp, 16]
        ldr     x21, [sp, 32]
        add     sp, sp, 560
        ret

Reply via email to