https://gcc.gnu.org/bugzilla/show_bug.cgi?id=88837

            Bug ID: 88837
           Summary: [SVE] Poor vector construction code in VL-specific
                    mode
           Product: gcc
           Version: unknown
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: target
          Assignee: unassigned at gcc dot gnu.org
          Reporter: rsandifo at gcc dot gnu.org
  Target Milestone: ---

The reduction testcases in gcc.target/aarch64/sve/slp_5.c require an initial
vector in which all elements except the first two are zero.  For the default
VL-agnostic mode we generate reasonable code, e.g.:

vec_slp_int32_t:
.LFB4:
        .cfi_startproc
        ldp     s2, s1, [x1]
        cmp     w2, 0
        ble     .L19
        mov     x3, 0
        sbfiz   x2, x2, 1, 32
        mov     z0.b, #0
        whilelo p0.s, xzr, x2
        insr    z0.s, s1
        ptrue   p1.s, all
        insr    z0.s, s2
        .p2align 3,,7
.L20:
        ld1w    z1.s, p0/z, [x0, x3, lsl 2]
        incw    x3
        add     z0.s, p0/m, z0.s, z1.s
        whilelo p0.s, x3, x2
        bne     .L20

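But with -msve-vector-bits=256 the code is much worse.  Instead of using two
INSRs, the initial vector is built by storing a zero vector to the stack,
inserting the two elements into its low 64 bits with BFI in general registers,
and then reloading the whole vector: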

vec_slp_int32_t:
.LFB4:
        .cfi_startproc
        ldp     w5, w4, [x1]
        cmp     w2, 0
        ble     .L31
        sub     sp, sp, #32
        .cfi_def_cfa_offset 32
        mov     z0.b, #0
        str     z0, [sp]
        mov     x3, 0
        sbfiz   x2, x2, 1, 32
        whilelo p0.s, xzr, x2
        ldr     x6, [sp]
        bfi     x6, x5, 0, 32
        mov     x5, x6
        bfi     x5, x4, 32, 32
        str     x5, [sp]
        ldr     z0, [sp]
        .p2align 3,,7
.L28:
        ld1w    z1.s, p0/z, [x0, x3, lsl 2]
        add     x3, x3, 8
        add     z0.s, p0/m, z0.s, z1.s
        whilelo p0.s, x3, x2
        bne     .L28

We should try to optimise this, probably by implementing vec_init_optab for
fixed-length vectors.

Reply via email to