Hi! The following testcase is miscompiled on x86_64. The problem is that simplify_merge_mask optimization throws away an inner VEC_MERGE when there is an outer one with the same mask. This can be done only if the change doesn't have observable side-effects. The code already uses side_effects_p tests in various spots, that is needed, but as this testcase shows, not sufficient. Another issue is if there is a MEM load or store and not MEM_NOTRAP_P, as the testcase shows. And another problem can be vector integer division by zero (I think only mips has such insn), or various floating point operations if we care about floating point exceptions.
While fixing this, I've found that may_trap_p_1 doesn't really support vector operations very much, vector floating point arithmetics can cause exceptions like scalar floating point arithmetics; on the other side, the 4 VEC_* codes can't trap themselves, though their operands could. Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk? 2019-02-23 Jakub Jelinek <ja...@redhat.com> PR rtl-optimization/89445 * simplify-rtx.c (simplify_ternary_operation): Don't use simplify_merge_mask on operands that may trap. * rtlanal.c (may_trap_p_1): Use FLOAT_MODE_P instead of SCALAR_FLOAT_MODE_P checks. For integral division by zero, if second operand is CONST_VECTOR, check if any element could be zero. Don't expect traps for VEC_{MERGE,SELECT,CONCAT,DUPLICATE} unless their operands can trap. * gcc.target/i386/avx512f-pr89445.c: New test. --- gcc/simplify-rtx.c.jj 2019-01-10 11:43:14.390377646 +0100 +++ gcc/simplify-rtx.c 2019-02-22 19:01:08.977661098 +0100 @@ -6073,8 +6073,10 @@ simplify_ternary_operation (enum rtx_cod if (!side_effects_p (op2)) { - rtx top0 = simplify_merge_mask (op0, op2, 0); - rtx top1 = simplify_merge_mask (op1, op2, 1); + rtx top0 + = may_trap_p (op0) ? NULL_RTX : simplify_merge_mask (op0, op2, 0); + rtx top1 + = may_trap_p (op1) ? NULL_RTX : simplify_merge_mask (op1, op2, 1); if (top0 || top1) return simplify_gen_ternary (code, mode, mode, top0 ? top0 : op0, --- gcc/rtlanal.c.jj 2019-02-20 10:00:49.279492877 +0100 +++ gcc/rtlanal.c 2019-02-22 19:03:02.478790634 +0100 @@ -2846,10 +2846,28 @@ may_trap_p_1 (const_rtx x, unsigned flag case UMOD: if (HONOR_SNANS (x)) return 1; - if (SCALAR_FLOAT_MODE_P (GET_MODE (x))) + if (FLOAT_MODE_P (GET_MODE (x))) return flag_trapping_math; if (!CONSTANT_P (XEXP (x, 1)) || (XEXP (x, 1) == const0_rtx)) return 1; + if (GET_CODE (XEXP (x, 1)) == CONST_VECTOR) + { + /* For CONST_VECTOR, return 1 if any element is or might be zero. */ + unsigned int n_elts; + rtx op = XEXP (x, 1); + if (!GET_MODE_NUNITS (GET_MODE (op)).is_constant (&n_elts)) + { + if (!CONST_VECTOR_DUPLICATE_P (op)) + return 1; + for (unsigned i = 0; i < (unsigned int) XVECLEN (op, 0); i++) + if (CONST_VECTOR_ENCODED_ELT (op, i) == const0_rtx) + return 1; + } + else + for (unsigned i = 0; i < n_elts; i++) + if (CONST_VECTOR_ELT (op, i) == const0_rtx) + return 1; + } break; case EXPR_LIST: @@ -2898,12 +2916,16 @@ may_trap_p_1 (const_rtx x, unsigned flag case NEG: case ABS: case SUBREG: + case VEC_MERGE: + case VEC_SELECT: + case VEC_CONCAT: + case VEC_DUPLICATE: /* These operations don't trap even with floating point. */ break; default: /* Any floating arithmetic may trap. */ - if (SCALAR_FLOAT_MODE_P (GET_MODE (x)) && flag_trapping_math) + if (FLOAT_MODE_P (GET_MODE (x)) && flag_trapping_math) return 1; } --- gcc/testsuite/gcc.target/i386/avx512f-pr89445.c.jj 2019-02-22 19:19:17.709898754 +0100 +++ gcc/testsuite/gcc.target/i386/avx512f-pr89445.c 2019-02-22 19:18:58.115216531 +0100 @@ -0,0 +1,54 @@ +/* PR rtl-optimization/89445 */ +/* { dg-do run { target { avx512f && mmap } } } */ +/* { dg-options "-O2 -mavx512f" } */ + +#include "avx512f-check.h" + +#include <sys/mman.h> +#ifndef MAP_ANONYMOUS +#define MAP_ANONYMOUS MAP_ANON +#endif +#ifndef MAP_ANON +#define MAP_ANON 0 +#endif +#ifndef MAP_FAILED +#define MAP_FAILED ((void *)-1) +#endif + +__attribute__ ((noipa)) +void daxpy (unsigned long n, double a, double const *__restrict x, + double *__restrict y) +{ + const __m512d v_a = _mm512_broadcastsd_pd (_mm_set_sd (a)); + const __mmask16 final = (1U << (n % 8u)) - 1; + __mmask16 mask = 65535u; + unsigned long i; + for (i = 0; i < n * sizeof (double); i += 8 * sizeof (double)) + { + if (i + 8 * sizeof (double) > n * sizeof (double)) + mask = final; + __m512d v_x = _mm512_maskz_loadu_pd (mask, (char const *) x + i); + __m512d v_y = _mm512_maskz_loadu_pd (mask, (char const *) y + i); + __m512d tmp = _mm512_fmadd_pd (v_x, v_a, v_y); + _mm512_mask_storeu_pd ((char *) y + i, mask, tmp); + } +} + +static const double x[] = { 1, 2, 3, 4 }; + +static void +avx512f_test (void) +{ + char *ptr + = (char *) mmap (NULL, 2 * 4096, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (ptr == MAP_FAILED) + return; + + munmap (ptr + 4096, 4096); + double *y = (double *) (ptr + 4096 - sizeof (x)); + __builtin_memcpy (y, x, sizeof (x)); + daxpy (sizeof (x) / sizeof (x[0]), 1.0, x, y); + if (y[0] != 2.0 || y[1] != 4.0 || y[2] != 6.0 || y[3] != 8.0) + abort (); +} Jakub