https://gcc.gnu.org/bugzilla/show_bug.cgi?id=125055

            Bug ID: 125055
           Summary: Wrong code vectorizing negation with -O2 -ftrapv
           Product: gcc
           Version: 17.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: tree-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: acoplan at gcc dot gnu.org
  Target Milestone: ---

Consider the following testcase:

int a[8];
int b[8];
void foo (void) {
    for (unsigned i = 0; i != 8; ++i)
        b[i] = -a[i];
}

When compiled with -O2 -ftrapv on aarch64, this gives:

foo:
        adrp    x0, .LANCHOR0
        add     x0, x0, :lo12:.LANCHOR0
        ldp     q30, q31, [x0, 32]
        neg     v30.4s, v30.4s
        neg     v31.4s, v31.4s
        stp     q30, q31, [x0]
        ret

but this is wrong: if any a[i] is INT_MIN, the negation overflows, so the
program should trap due to the presence of -ftrapv.  If we add
-fno-tree-vectorize, we instead get:

foo:
        stp     x29, x30, [sp, -48]!
        mov     x29, sp
        stp     x19, x20, [sp, 16]
        adrp    x20, .LANCHOR0
        add     x20, x20, :lo12:.LANCHOR0
        str     x21, [sp, 32]
        add     x21, x20, 32
        mov     x19, 0
.L2:
        ldr     w0, [x19, x21]
        bl      __negvsi2
        str     w0, [x20, x19]
        add     x19, x19, 4
        cmp     x19, 32
        bne     .L2
        ldr     x21, [sp, 32]
        ldp     x19, x20, [sp, 16]
        ldp     x29, x30, [sp], 48
        ret

which looks correct; it calls a libgcc helper to handle the possible trap.  I
think there are many related testcases where -ftrapv isn't respected by the
vectorizer, e.g. even a simple addition loop such as:

int a[8];
int b[8];
void foo (void) {
    for (unsigned i = 0; i != 8; ++i)
      a[i] += b[i];       
}

is incorrectly vectorized in the presence of -ftrapv: the addition can of
course overflow, but we don't check for it, instead generating:

foo:
        adrp    x0, .LANCHOR0
        add     x0, x0, :lo12:.LANCHOR0
        ldp     q28, q31, [x0]
        ldp     q29, q30, [x0, 32]
        add     v28.4s, v29.4s, v28.4s
        add     v30.4s, v31.4s, v30.4s
        stp     q28, q30, [x0]
        ret

Reply via email to