https://gcc.gnu.org/bugzilla/show_bug.cgi?id=125392

            Bug ID: 125392
           Summary: inefficient SVE vectorization when loop contains
                    hazards
           Product: gcc
           Version: 16.0
            Status: UNCONFIRMED
          Keywords: aarch64-sve
          Severity: normal
          Priority: P3
         Component: tree-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: tnfchris at gcc dot gnu.org
            Blocks: 53947
  Target Milestone: ---
            Target: aarch64*

Consider the following loop:

#include <stdint.h>

void f (uint8_t* restrict a, uint8_t *b, uint8_t* c, int n) {
  for (int i = 0; i < n; i++) {
    c[i] = a[i] + b[i];
  }
}

When compiled with -O3 -march=armv9-a, we generate a runtime hazard check
between b and c:

f:
        cmp     w3, 0
        ble     .L1
        mov     x4, 0
        whilewr p15.b, x1, x2
        b.nlast .L3
        whilelo p7.b, wzr, w3
.L4:
        ld1b    z31.b, p7/z, [x0, x4]
        ld1b    z30.b, p7/z, [x1, x4]
        add     z30.b, z30.b, z31.b
        st1b    z30.b, p7, [x2, x4]
        incb    x4
        whilelo p7.b, w4, w3
        b.any   .L4
.L1:
        ret
.L3:
        sxtw    x3, w3
.L5:
        ldrb    w5, [x1, x4]
        ldrb    w6, [x0, x4]
        add     w5, w5, w6
        strb    w5, [x2, x4]
        add     x4, x4, 1
        cmp     x3, x4
        bne     .L5
        ret

Notice that this uses the SVE2 `whilewr` instruction to check for
write-after-read hazards.  Note that there is a corresponding instruction
(`whilerw`) for read-after-write hazards as well.

The current codegen branches to the scalar loop if there is any hazard at all.
While valid, this is inefficient: the instructions return a predicate covering
the number of elements that are safe to process in each iteration of the loop
body.

The above can instead be vectorized by rewriting the current code

f:
        cmp     w3, 0
        ble     .L1
        mov     x4, 0
        whilewr p15.b, x1, x2
        b.nlast .L3
        whilelo p7.b, wzr, w3
.L4:
        ld1b    z31.b, p7/z, [x0, x4]
        ld1b    z30.b, p7/z, [x1, x4]
        add     z30.b, z30.b, z31.b
        st1b    z30.b, p7, [x2, x4]
        incb    x4
        whilelo p7.b, w4, w3
        b.any   .L4
.L1:
        ret
.L3:
        sxtw    x3, w3
.L5:
        ldrb    w5, [x1, x4]
        ldrb    w6, [x0, x4]
        add     w5, w5, w6
        strb    w5, [x2, x4]
        add     x4, x4, 1
        cmp     x3, x4
        bne     .L5
        ret

as

f:
        cmp     w3, 0
        ble     .L1
        mov     x4, 0
        whilewr p15.b, x1, x2
        b.none .L3
        whilelo p7.b, wzr, w3
.L4:
        and     p7.b, p7/z, p7.b, p15.b
        ld1b    z31.b, p7/z, [x0, x4]
        ld1b    z30.b, p7/z, [x1, x4]
        add     z30.b, z30.b, z31.b
        st1b    z30.b, p7, [x2, x4]
        incp    x4, p7
        whilelo p7.b, w4, w3
        b.any   .L4
.L1:
        ret
.L3:
        sxtw    x3, w3
.L5:
        ldrb    w5, [x1, x4]
        ldrb    w6, [x0, x4]
        add     w5, w5, w6
        strb    w5, [x2, x4]
        add     x4, x4, 1
        cmp     x3, x4
        bne     .L5
        ret

Note that the scalar loop is still required: the mask can be empty, and we
need to be able to make forward progress in that case.
This does add a new predicate instruction on the critical path, so we may
instead consider versioning the loop:

f:
        cmp     w3, 0
        ble     .L1
        mov     x4, 0
        whilewr p15.b, x1, x2
        b.none  .L7
        b.nlast .L3
        whilelo p7.b, wzr, w3
.L4:
        ld1b    z31.b, p7/z, [x0, x4]
        ld1b    z30.b, p7/z, [x1, x4]
        add     z30.b, z30.b, z31.b
        st1b    z30.b, p7, [x2, x4]
        incb    x4
        whilelo p7.b, w4, w3
        b.any   .L4
.L1:
        ret
.L3:
        whilelo p7.b, wzr, w3
.L5:
        and     p7.b, p7/z, p7.b, p15.b
        ld1b    z31.b, p7/z, [x0, x4]
        ld1b    z30.b, p7/z, [x1, x4]
        add     z30.b, z30.b, z31.b
        st1b    z30.b, p7, [x2, x4]
        incp    x4, p7
        whilelo p7.b, w4, w3
        b.any   .L5
.L6:
        ret
.L7:
        sxtw    x3, w3
.L8:
        ldrb    w5, [x1, x4]
        ldrb    w6, [x0, x4]
        add     w5, w5, w6
        strb    w5, [x2, x4]
        add     x4, x4, 1
        cmp     x3, x4
        bne     .L8
        ret

so that when there is no hazard at all we don't take the extra dependency on
the predicate `and`.  This is probably only useful at -O3.


Referenced Bugs:

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=53947
[Bug 53947] [meta-bug] vectorizer missed-optimizations

Reply via email to