[10/11 Regression] aarch64, SVE2: Wrong code since r10-5853-g0a09a948 (wrong pattern for BCAX)

acoplan at gcc dot gnu.org via Gcc-bugs Thu, 05 Nov 2020 05:35:40 -0800

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=97730


            Bug ID: 97730
           Summary: [10/11 Regression] aarch64, SVE2: Wrong code since
                    r10-5853-g0a09a948 (wrong pattern for BCAX)
           Product: gcc
           Version: 11.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: target
          Assignee: unassigned at gcc dot gnu.org
          Reporter: acoplan at gcc dot gnu.org
  Target Milestone: ---

AArch64 GCC miscompiles the following testcase:

unsigned b = 0xce8e5a48, c = 0xb849691a;
unsigned a[8080];
int main() {
  a[0] = b;
  c = c;
  unsigned f = 0xb1e8;
  for (int h = 0; h < 5; h++)
    a[h] = (b & c) ^ f;
  if (a[0] != 0x8808f9e0)
    __builtin_abort();
}

at -O1 -ftree-vectorize -march=armv8.2-a+sve2 since
r10-5853-g0a09a9483825233f16e5b26bb0ffee76752339fc. Below is the generated code
from a trunk build, with some relevant lines annotated:

        .arch armv8.2-a+crc+sve2
        .file   "test.c"
        .text
        .align  2
        .global main
        .type   main, %function
main:
        adrp    x0, .LANCHOR0
        add     x3, x0, :lo12:.LANCHOR0
        ldr     w0, [x0, #:lo12:.LANCHOR0] // w0 <- b
        adrp    x2, a
        add     x1, x2, :lo12:a
        str     w0, [x2, #:lo12:a]         // a[0] <- w0
        mov     w2, 5
        whilelo p0.s, wzr, w2
        ptrue   p1.b, all
        ld1rw   z2.s, p1/z, [x3, 4]        // z2 <- {c, c, ... }
        mov     z1.s, w0                   // z1 <- {b, b, ... }
        mov     w0, 45544                  // w0 <- f (= 0xb1e8)
        mov     z0.s, w0                   // z0 <- {f, f, ... }
        bcax    z0.d, z0.d, z2.d, z1.d     // z0 ^= (z2 & ~z1)
        st1w    z0.s, p0, [x1]             // a[0, 1, ...] <- z0
        cntw    x0
        whilelo p0.s, w0, w2
        b.none  .L2
        incb    x1
        st1w    z0.s, p0, [x1]
.L2:
        adrp    x0, a
        ldr     w1, [x0, #:lo12:a]
        mov     w0, 63968
        movk    w0, 0x8808, lsl 16
        cmp     w1, w0
        bne     .L7
        mov     w0, 0
        ret
.L7:
        stp     x29, x30, [sp, -16]!
        mov     x29, sp
        bl      abort
        .size   main, .-main
        .global a
        .global c
        .global b
        .data
        .align  2
        .set    .LANCHOR0,. + 0
        .type   b, %object
        .size   b, 4
b:
        .word   -829531576
        .type   c, %object
        .size   c, 4
c:
        .word   -1203148518
        .bss
        .align  3
        .type   a, %object
        .size   a, 32320
a:
        .zero   32320
        .ident  "GCC: (unknown) 11.0.0 20201105 (experimental)"

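The problem appears to be that the instruction:
  bcax    z0.d, z0.d, z2.d, z1.d
computes (~b & c) ^ f instead of (b & c) ^ f: BCAX inverts its last source
operand (here z1 = b) before ANDing it with z2 = c and XORing the result into
z0 = f.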

Looking at the SVE2 pattern for bcax (aarch64-sve2.md), it looks like we're
missing a (not ...) around one of the operands of the (and ...) rtx:

;; Unpredicated exclusive OR of AND.
(define_insn "@aarch64_sve2_bcax<mode>"
  [(set (match_operand:SVE_FULL_I 0 "register_operand" "=w, ?&w")
        (xor:SVE_FULL_I
          (and:SVE_FULL_I
            (match_operand:SVE_FULL_I 2 "register_operand" "w, w")
            (match_operand:SVE_FULL_I 3 "register_operand" "w, w"))
          (match_operand:SVE_FULL_I 1 "register_operand" "0, w")))]
  "TARGET_SVE2"
  "@
  bcax\t%0.d, %0.d, %2.d, %3.d
  movprfx\t%0, %1\;bcax\t%0.d, %0.d, %2.d, %3.d"
  [(set_attr "movprfx" "*,yes")]
)

Comparing this to the corresponding pattern for AdvSIMD bcax (aarch64-simd.md)
makes this clear:

(define_insn "bcaxq<mode>4"
  [(set (match_operand:VQ_I 0 "register_operand" "=w")
        (xor:VQ_I
         (and:VQ_I
          (not:VQ_I (match_operand:VQ_I 3 "register_operand" "w"))
          (match_operand:VQ_I 2 "register_operand" "w"))
         (match_operand:VQ_I 1 "register_operand" "w")))]
  "TARGET_SIMD && TARGET_SHA3"
  "bcax\\t%0.16b, %1.16b, %2.16b, %3.16b"
  [(set_attr "type" "crypto_sha3")]
)

Indeed, changing the source file to print the value of a[0], we get 0x304190fa,
which is the result of computing (~b & c) ^ f.

[Bug target/97730] New: [10/11 Regression] aarch64, SVE2: Wrong code since r10-5853-g0a09a948 (wrong pattern for BCAX)

Reply via email to