[PATCH] Limit simplify_merge_mask optimization to cases where it can't trap (PR rtl-optimization/89445)

Jakub Jelinek Fri, 22 Feb 2019 16:29:27 -0800

Hi!

The following testcase is miscompiled on x86_64.  The problem is that
simplify_merge_mask optimization throws away an inner VEC_MERGE when there
is an outer one with the same mask.  This can be done only if the change
doesn't have observable side-effects.  The code already uses side_effects_p
tests in various spots, that is needed, but as this testcase shows, not
sufficient.  Another issue is if there is a MEM load or store and not
MEM_NOTRAP_P, as the testcase shows.  And another problem can be vector
integer division by zero (I think only mips has such insn), or various
floating point operations if we care about floating point exceptions.


While fixing this, I've found that may_trap_p_1 doesn't really support
vector operations very much, vector floating point arithmetics can cause
exceptions like scalar floating point arithmetics; on the other side, the
4 VEC_* codes can't trap themselves, though their operands could.

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

2019-02-23  Jakub Jelinek  <ja...@redhat.com>

        PR rtl-optimization/89445
        * simplify-rtx.c (simplify_ternary_operation): Don't use
        simplify_merge_mask on operands that may trap.
        * rtlanal.c (may_trap_p_1): Use FLOAT_MODE_P instead of
        SCALAR_FLOAT_MODE_P checks.  For integral division by zero, if
        second operand is CONST_VECTOR, check if any element could be zero.
        Don't expect traps for VEC_{MERGE,SELECT,CONCAT,DUPLICATE} unless
        their operands can trap.

        * gcc.target/i386/avx512f-pr89445.c: New test.

--- gcc/simplify-rtx.c.jj       2019-01-10 11:43:14.390377646 +0100
+++ gcc/simplify-rtx.c  2019-02-22 19:01:08.977661098 +0100
@@ -6073,8 +6073,10 @@ simplify_ternary_operation (enum rtx_cod
 
       if (!side_effects_p (op2))
        {
-         rtx top0 = simplify_merge_mask (op0, op2, 0);
-         rtx top1 = simplify_merge_mask (op1, op2, 1);
+         rtx top0
+           = may_trap_p (op0) ? NULL_RTX : simplify_merge_mask (op0, op2, 0);
+         rtx top1
+           = may_trap_p (op1) ? NULL_RTX : simplify_merge_mask (op1, op2, 1);
          if (top0 || top1)
            return simplify_gen_ternary (code, mode, mode,
                                         top0 ? top0 : op0,
--- gcc/rtlanal.c.jj    2019-02-20 10:00:49.279492877 +0100
+++ gcc/rtlanal.c       2019-02-22 19:03:02.478790634 +0100
@@ -2846,10 +2846,28 @@ may_trap_p_1 (const_rtx x, unsigned flag
     case UMOD:
       if (HONOR_SNANS (x))
        return 1;
-      if (SCALAR_FLOAT_MODE_P (GET_MODE (x)))
+      if (FLOAT_MODE_P (GET_MODE (x)))
        return flag_trapping_math;
       if (!CONSTANT_P (XEXP (x, 1)) || (XEXP (x, 1) == const0_rtx))
        return 1;
+      if (GET_CODE (XEXP (x, 1)) == CONST_VECTOR)
+       {
+         /* For CONST_VECTOR, return 1 if any element is or might be zero.  */
+         unsigned int n_elts;
+         rtx op = XEXP (x, 1);
+         if (!GET_MODE_NUNITS (GET_MODE (op)).is_constant (&n_elts))
+           {
+             if (!CONST_VECTOR_DUPLICATE_P (op))
+               return 1;
+             for (unsigned i = 0; i < (unsigned int) XVECLEN (op, 0); i++)
+               if (CONST_VECTOR_ENCODED_ELT (op, i) == const0_rtx)
+                 return 1;
+           }
+         else
+           for (unsigned i = 0; i < n_elts; i++)
+             if (CONST_VECTOR_ELT (op, i) == const0_rtx)
+               return 1;
+       }
       break;
 
     case EXPR_LIST:
@@ -2898,12 +2916,16 @@ may_trap_p_1 (const_rtx x, unsigned flag
     case NEG:
     case ABS:
     case SUBREG:
+    case VEC_MERGE:
+    case VEC_SELECT:
+    case VEC_CONCAT:
+    case VEC_DUPLICATE:
       /* These operations don't trap even with floating point.  */
       break;
 
     default:
       /* Any floating arithmetic may trap.  */
-      if (SCALAR_FLOAT_MODE_P (GET_MODE (x)) && flag_trapping_math)
+      if (FLOAT_MODE_P (GET_MODE (x)) && flag_trapping_math)
        return 1;
     }
 
--- gcc/testsuite/gcc.target/i386/avx512f-pr89445.c.jj  2019-02-22 
19:19:17.709898754 +0100
+++ gcc/testsuite/gcc.target/i386/avx512f-pr89445.c     2019-02-22 
19:18:58.115216531 +0100
@@ -0,0 +1,54 @@
+/* PR rtl-optimization/89445 */
+/* { dg-do run { target { avx512f && mmap } } } */
+/* { dg-options "-O2 -mavx512f" } */
+
+#include "avx512f-check.h"
+
+#include <sys/mman.h>
+#ifndef MAP_ANONYMOUS
+#define MAP_ANONYMOUS MAP_ANON
+#endif
+#ifndef MAP_ANON
+#define MAP_ANON 0
+#endif
+#ifndef MAP_FAILED
+#define MAP_FAILED ((void *)-1)
+#endif
+
+__attribute__ ((noipa))
+void daxpy (unsigned long n, double a, double const *__restrict x,
+           double *__restrict y)
+{
+  const __m512d v_a = _mm512_broadcastsd_pd (_mm_set_sd (a));
+  const __mmask16 final = (1U << (n % 8u)) - 1;
+  __mmask16 mask = 65535u;
+  unsigned long i;
+  for (i = 0; i < n * sizeof (double); i += 8 * sizeof (double))
+    {
+      if (i + 8 * sizeof (double) > n * sizeof (double))
+       mask = final;
+      __m512d v_x = _mm512_maskz_loadu_pd (mask, (char const *) x + i);
+      __m512d v_y = _mm512_maskz_loadu_pd (mask, (char const *) y + i);
+      __m512d tmp = _mm512_fmadd_pd (v_x, v_a, v_y);
+      _mm512_mask_storeu_pd ((char *) y + i, mask, tmp);
+    }
+}
+
+static const double x[] = { 1, 2, 3, 4 };
+
+static void
+avx512f_test (void)
+{
+  char *ptr
+    = (char *) mmap (NULL, 2 * 4096, PROT_READ | PROT_WRITE,
+                    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+  if (ptr == MAP_FAILED)
+    return;
+
+  munmap (ptr + 4096, 4096);
+  double *y = (double *) (ptr + 4096 - sizeof (x));
+  __builtin_memcpy (y, x, sizeof (x));
+  daxpy (sizeof (x) / sizeof (x[0]), 1.0, x, y);
+  if (y[0] != 2.0 || y[1] != 4.0 || y[2] != 6.0 || y[3] != 8.0)
+    abort ();
+}

        Jakub

[PATCH] Limit simplify_merge_mask optimization to cases where it can't trap (PR rtl-optimization/89445)

Reply via email to