Hi!

This patch makes veclower2 attempt to emit integer division/modulus of
vectors by constants using vector multiplication, shifts or masking.

It is somewhat similar to vect_recog_divmod_pattern, but it has to
analyze everything first, to see whether all the divisions or modulos
can be done using the same sequence of vector insns, and then it emits
vector insns instead of the scalar ones the pattern recognizer adds.
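
For reference, here is a scalar sketch of the strategies involved
(an illustration only, not the emitted gimple; the pass emits the
corresponding vector insns elementwise, and the multiplier below is
the standard magic constant for unsigned 32-bit division by 3):

unsigned int
udiv3 (unsigned int x)
{
  /* x / 3 via highpart multiply plus shift.  */
  return ((unsigned long long) x * 0xAAAAAAABULL) >> 33;
}

int
sdiv8 (int x)
{
  /* Signed x / 8 rounding towards zero: add the op0 < 0 ? mask : 0
     addend first (relies on arithmetic >> of negative values, which
     GCC guarantees).  */
  return (x + ((x >> 31) & 7)) >> 3;
}

unsigned int
umod8 (unsigned int x)
{
  /* Unsigned x % 8 via masking.  */
  return x & 7;
}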

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

The testcase was additionally eyeballed even for -mavx2, which, unlike
-mavx, has vector >> vector shifts.
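
E.g. the { 1, 4, 2, 8 } divisor from the testcase below needs a
different shift count for each element, so the pow2 division path
requires a vector >> vector shift (hypothetical example; with -mavx2
this should become a single vpsrlvd, while plain -mavx can only shift
all elements by the same scalar count):

typedef unsigned int UV __attribute__((vector_size (16)));

UV
udivpow2 (UV x)
{
  return x / (UV) { 1, 4, 2, 8 };
}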

2012-06-27  Jakub Jelinek  <ja...@redhat.com>

        PR tree-optimization/53645
        * tree-vect-generic.c (add_rshift): New function.
        (expand_vector_divmod): New function.
        (expand_vector_operation): Use it for vector integer
        TRUNC_{DIV,MOD}_EXPR by VECTOR_CST.
        * tree-vect-patterns.c (vect_recog_divmod_pattern): Replace
        unused lgup variable with dummy_int.

        * gcc.c-torture/execute/pr53645.c: New test.

--- gcc/tree-vect-generic.c.jj  2012-06-26 10:00:42.935832834 +0200
+++ gcc/tree-vect-generic.c     2012-06-27 10:15:20.534103045 +0200
@@ -391,6 +391,515 @@ expand_vector_comparison (gimple_stmt_it
   return t;
 }
 
+/* Helper function of expand_vector_divmod.  Gimplify a RSHIFT_EXPR in type
+   of OP0 with shift counts in SHIFTCNTS array and return the temporary holding
+   the result if successful, otherwise return NULL_TREE.  */
+static tree
+add_rshift (gimple_stmt_iterator *gsi, tree type, tree op0, int *shiftcnts)
+{
+  optab op;
+  unsigned int i, nunits = TYPE_VECTOR_SUBPARTS (type);
+  bool scalar_shift = true;
+
+  for (i = 1; i < nunits; i++)
+    {
+      if (shiftcnts[i] != shiftcnts[0])
+       scalar_shift = false;
+    }
+
+  if (scalar_shift && shiftcnts[0] == 0)
+    return op0;
+
+  if (scalar_shift)
+    {
+      op = optab_for_tree_code (RSHIFT_EXPR, type, optab_scalar);
+      if (op != NULL
+         && optab_handler (op, TYPE_MODE (type)) != CODE_FOR_nothing)
+       return gimplify_build2 (gsi, RSHIFT_EXPR, type, op0,
+                               build_int_cst (NULL_TREE, shiftcnts[0]));
+    }
+
+  op = optab_for_tree_code (RSHIFT_EXPR, type, optab_vector);
+  if (op != NULL
+      && optab_handler (op, TYPE_MODE (type)) != CODE_FOR_nothing)
+    {
+      tree *vec = XALLOCAVEC (tree, nunits);
+      for (i = 0; i < nunits; i++)
+       vec[i] = build_int_cst (TREE_TYPE (type), shiftcnts[i]);
+      return gimplify_build2 (gsi, RSHIFT_EXPR, type, op0,
+                             build_vector (type, vec));
+    }
+
+  return NULL_TREE;
+}
+
+/* Try to expand integer vector division by constant using
+   widening multiply, shifts and additions.  */
+static tree
+expand_vector_divmod (gimple_stmt_iterator *gsi, tree type, tree op0,
+                     tree op1, enum tree_code code)
+{
+  bool use_pow2 = true;
+  bool has_vector_shift = true;
+  int mode = -1, this_mode;
+  int pre_shift = -1, post_shift;
+  unsigned int nunits = TYPE_VECTOR_SUBPARTS (type);
+  int *shifts = XALLOCAVEC (int, nunits * 4);
+  int *pre_shifts = shifts + nunits;
+  int *post_shifts = pre_shifts + nunits;
+  int *shift_temps = post_shifts + nunits;
+  unsigned HOST_WIDE_INT *mulc = XALLOCAVEC (unsigned HOST_WIDE_INT, nunits);
+  int prec = TYPE_PRECISION (TREE_TYPE (type));
+  int dummy_int;
+  unsigned int i, unsignedp = TYPE_UNSIGNED (TREE_TYPE (type));
+  unsigned HOST_WIDE_INT mask = GET_MODE_MASK (TYPE_MODE (TREE_TYPE (type)));
+  optab op;
+  tree *vec;
+  unsigned char *sel;
+  tree cur_op, mhi, mlo, mulcst, perm_mask, wider_type, tem;
+
+  if (prec > HOST_BITS_PER_WIDE_INT)
+    return NULL_TREE;
+
+  op = optab_for_tree_code (RSHIFT_EXPR, type, optab_vector);
+  if (op == NULL
+      || optab_handler (op, TYPE_MODE (type)) == CODE_FOR_nothing)
+    has_vector_shift = false;
+
+  /* Analysis phase.  Determine whether all op1 elements are powers
+     of two and whether it is possible to expand everything using
+     shifts (or, for remainder, using masking).  Additionally compute
+     the multiplicative constants and pre and post shifts if the
+     division is to be expanded using widening or high part
+     multiplication plus shifts.  */
+  for (i = 0; i < nunits; i++)
+    {
+      tree cst = VECTOR_CST_ELT (op1, i);
+      unsigned HOST_WIDE_INT ml;
+
+      if (!host_integerp (cst, unsignedp) || integer_zerop (cst))
+       return NULL_TREE;
+      pre_shifts[i] = 0;
+      post_shifts[i] = 0;
+      mulc[i] = 0;
+      if (use_pow2
+         && (!integer_pow2p (cst) || tree_int_cst_sgn (cst) != 1))
+       use_pow2 = false;
+      if (use_pow2)
+       {
+         shifts[i] = tree_log2 (cst);
+         if (shifts[i] != shifts[0]
+             && code == TRUNC_DIV_EXPR
+             && !has_vector_shift)
+           use_pow2 = false;
+       }
+      if (mode == -2)
+       continue;
+      if (unsignedp)
+       {
+         unsigned HOST_WIDE_INT mh;
+         unsigned HOST_WIDE_INT d = tree_low_cst (cst, 1) & mask;
+
+         if (d >= ((unsigned HOST_WIDE_INT) 1 << (prec - 1)))
+           /* FIXME: Can transform this into op0 >= op1 ? 1 : 0.  */
+           return NULL_TREE;
+
+         if (d <= 1)
+           {
+             mode = -2;
+             continue;
+           }
+
+         /* Find a suitable multiplier and right shift count
+            instead of multiplying with D.  */
+         mh = choose_multiplier (d, prec, prec, &ml, &post_shift, &dummy_int);
+
+         /* If the suggested multiplier is more than SIZE bits, we can
+            do better for even divisors, using an initial right shift.  */
+         if ((mh != 0 && (d & 1) == 0)
+             || (!has_vector_shift && pre_shift != -1))
+           {
+             if (has_vector_shift)
+               pre_shift = floor_log2 (d & -d);
+             else if (pre_shift == -1)
+               {
+                 unsigned int j;
+                 for (j = 0; j < nunits; j++)
+                   {
+                     tree cst2 = VECTOR_CST_ELT (op1, j);
+                     unsigned HOST_WIDE_INT d2;
+                     int this_pre_shift;
+
+                     if (!host_integerp (cst2, 1))
+                       return NULL_TREE;
+                     d2 = tree_low_cst (cst2, 1) & mask;
+                     if (d2 == 0)
+                       return NULL_TREE;
+                     this_pre_shift = floor_log2 (d2 & -d2);
+                     if (pre_shift == -1 || this_pre_shift < pre_shift)
+                       pre_shift = this_pre_shift;
+                   }
+                 if (i != 0 && pre_shift != 0)
+                   {
+                     /* Restart.  */
+                     i = -1U;
+                     mode = -1;
+                     continue;
+                   }
+               }
+             if (pre_shift != 0)
+               {
+                 if ((d >> pre_shift) <= 1)
+                   {
+                     mode = -2;
+                     continue;
+                   }
+                 mh = choose_multiplier (d >> pre_shift, prec,
+                                         prec - pre_shift,
+                                         &ml, &post_shift, &dummy_int);
+                 gcc_assert (!mh);
+                 pre_shifts[i] = pre_shift;
+               }
+           }
+         if (!mh)
+           this_mode = 0;
+         else
+           this_mode = 1;
+       }
+      else
+       {
+         HOST_WIDE_INT d = tree_low_cst (cst, 0);
+         unsigned HOST_WIDE_INT abs_d;
+
+         if (d == -1)
+           return NULL_TREE;
+
+         /* Since d might be INT_MIN, we have to cast to
+            unsigned HOST_WIDE_INT before negating to avoid
+            undefined signed overflow.  */
+         abs_d = (d >= 0
+                 ? (unsigned HOST_WIDE_INT) d
+                 : - (unsigned HOST_WIDE_INT) d);
+
+         /* n rem d = n rem -d */
+         if (code == TRUNC_MOD_EXPR && d < 0)
+           d = abs_d;
+         else if (abs_d == (unsigned HOST_WIDE_INT) 1 << (prec - 1))
+           {
+             /* This case is not handled correctly below.  */
+             mode = -2;
+             continue;
+           }
+         if (abs_d <= 1)
+           {
+             mode = -2;
+             continue;
+           }
+
+         choose_multiplier (abs_d, prec, prec - 1, &ml,
+                            &post_shift, &dummy_int);
+         if (ml >= (unsigned HOST_WIDE_INT) 1 << (prec - 1))
+           {
+             this_mode = 4 + (d < 0);
+             ml |= (~(unsigned HOST_WIDE_INT) 0) << (prec - 1);
+           }
+         else
+           this_mode = 2 + (d < 0);
+       }
+      mulc[i] = ml;
+      post_shifts[i] = post_shift;
+      if ((i && !has_vector_shift && post_shifts[0] != post_shift)
+         || post_shift >= prec
+         || pre_shifts[i] >= prec)
+       this_mode = -2;
+
+      if (i == 0)
+       mode = this_mode;
+      else if (mode != this_mode)
+       mode = -2;
+    }
+
+  vec = XALLOCAVEC (tree, nunits);
+
+  if (use_pow2)
+    {
+      tree addend = NULL_TREE;
+      if (!unsignedp)
+       {
+         tree uns_type;
+
+         /* Both division and remainder sequences need
+            op0 < 0 ? mask : 0 computed.  It can be either computed as
+            (type) (((uns_type) (op0 >> (prec - 1))) >> (prec - shifts[i]))
+            if none of the shifts is 0, or as the conditional.  */
+         for (i = 0; i < nunits; i++)
+           if (shifts[i] == 0)
+             break;
+         uns_type
+           = build_vector_type (build_nonstandard_integer_type (prec, 1),
+                                nunits);
+         if (i == nunits && TYPE_MODE (uns_type) == TYPE_MODE (type))
+           {
+             for (i = 0; i < nunits; i++)
+               shift_temps[i] = prec - 1;
+             cur_op = add_rshift (gsi, type, op0, shift_temps);
+             if (cur_op != NULL_TREE)
+               {
+                 cur_op = gimplify_build1 (gsi, VIEW_CONVERT_EXPR,
+                                           uns_type, cur_op);
+                 for (i = 0; i < nunits; i++)
+                   shift_temps[i] = prec - shifts[i];
+                 cur_op = add_rshift (gsi, uns_type, cur_op, shift_temps);
+                 if (cur_op != NULL_TREE)
+                   addend = gimplify_build1 (gsi, VIEW_CONVERT_EXPR,
+                                             type, cur_op);
+               }
+           }
+         if (addend == NULL_TREE
+             && expand_vec_cond_expr_p (type, type))
+           {
+             tree zero, cst, cond;
+             gimple stmt;
+
+             zero = build_zero_cst (type);
+             cond = build2 (LT_EXPR, type, op0, zero);
+             for (i = 0; i < nunits; i++)
+               vec[i] = build_int_cst (TREE_TYPE (type),
+                                       ((unsigned HOST_WIDE_INT) 1
+                                        << shifts[i]) - 1);
+             cst = build_vector (type, vec);
+             addend = create_tmp_reg (type, NULL);
+             add_referenced_var (addend);
+             addend = make_ssa_name (addend, NULL);
+             stmt = gimple_build_assign_with_ops3 (VEC_COND_EXPR, addend,
+                                                   cond, cst, zero);
+             gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
+           }
+       }
+      if (code == TRUNC_DIV_EXPR)
+       {
+         if (unsignedp)
+           {
+             /* q = op0 >> shift;  */
+             cur_op = add_rshift (gsi, type, op0, shifts);
+             if (cur_op != NULL_TREE)
+               return cur_op;
+           }
+         else if (addend != NULL_TREE)
+           {
+             /* t1 = op0 + addend;
+                q = t1 >> shift;  */
+             op = optab_for_tree_code (PLUS_EXPR, type, optab_default);
+             if (op != NULL
+                 && optab_handler (op, TYPE_MODE (type)) != CODE_FOR_nothing)
+               {
+                 cur_op = gimplify_build2 (gsi, PLUS_EXPR, type, op0, addend);
+                 cur_op = add_rshift (gsi, type, cur_op, shifts);
+                 if (cur_op != NULL_TREE)
+                   return cur_op;
+               }
+           }
+       }
+      else
+       {
+         tree mask;
+         for (i = 0; i < nunits; i++)
+           vec[i] = build_int_cst (TREE_TYPE (type),
+                                   ((unsigned HOST_WIDE_INT) 1
+                                    << shifts[i]) - 1);
+         mask = build_vector (type, vec);
+         op = optab_for_tree_code (BIT_AND_EXPR, type, optab_default);
+         if (op != NULL
+             && optab_handler (op, TYPE_MODE (type)) != CODE_FOR_nothing)
+           {
+             if (unsignedp)
+               /* r = op0 & mask;  */
+               return gimplify_build2 (gsi, BIT_AND_EXPR, type, op0, mask);
+             else if (addend != NULL_TREE)
+               {
+                 /* t1 = op0 + addend;
+                    t2 = t1 & mask;
+                    r = t2 - addend;  */
+                 op = optab_for_tree_code (PLUS_EXPR, type, optab_default);
+                 if (op != NULL
+                     && optab_handler (op, TYPE_MODE (type))
+                        != CODE_FOR_nothing)
+                   {
+                     cur_op = gimplify_build2 (gsi, PLUS_EXPR, type, op0,
+                                               addend);
+                     cur_op = gimplify_build2 (gsi, BIT_AND_EXPR, type,
+                                               cur_op, mask);
+                     op = optab_for_tree_code (MINUS_EXPR, type,
+                                               optab_default);
+                     if (op != NULL
+                         && optab_handler (op, TYPE_MODE (type))
+                            != CODE_FOR_nothing)
+                       return gimplify_build2 (gsi, MINUS_EXPR, type,
+                                               cur_op, addend);
+                   }
+               }
+           }
+       }
+    }
+
+  if (mode == -2 || BYTES_BIG_ENDIAN != WORDS_BIG_ENDIAN)
+    return NULL_TREE;
+
+  op = optab_for_tree_code (VEC_WIDEN_MULT_LO_EXPR, type, optab_default);
+  if (op == NULL
+      || optab_handler (op, TYPE_MODE (type)) == CODE_FOR_nothing)
+    return NULL_TREE;
+  op = optab_for_tree_code (VEC_WIDEN_MULT_HI_EXPR, type, optab_default);
+  if (op == NULL
+      || optab_handler (op, TYPE_MODE (type)) == CODE_FOR_nothing)
+    return NULL_TREE;
+  sel = XALLOCAVEC (unsigned char, nunits);
+  for (i = 0; i < nunits; i++)
+    sel[i] = 2 * i + (BYTES_BIG_ENDIAN ? 0 : 1);
+  if (!can_vec_perm_p (TYPE_MODE (type), false, sel))
+    return NULL_TREE;
+  wider_type
+    = build_vector_type (build_nonstandard_integer_type (prec * 2, unsignedp),
+                        nunits / 2);
+  if (GET_MODE_CLASS (TYPE_MODE (wider_type)) != MODE_VECTOR_INT
+      || GET_MODE_BITSIZE (TYPE_MODE (wider_type))
+        != GET_MODE_BITSIZE (TYPE_MODE (type)))
+    return NULL_TREE;
+
+  cur_op = op0;
+
+  switch (mode)
+    {
+    case 0:
+      gcc_assert (unsignedp);
+      /* t1 = oprnd0 >> pre_shift;
+        t2 = (type) (t1 w* ml >> prec);
+        q = t2 >> post_shift;  */
+      cur_op = add_rshift (gsi, type, cur_op, pre_shifts);
+      if (cur_op == NULL_TREE)
+       return NULL_TREE;
+      break;
+    case 1:
+      gcc_assert (unsignedp);
+      for (i = 0; i < nunits; i++)
+       {
+         shift_temps[i] = 1;
+         post_shifts[i]--;
+       }
+      break;
+    case 2:
+    case 3:
+    case 4:
+    case 5:
+      gcc_assert (!unsignedp);
+      for (i = 0; i < nunits; i++)
+       shift_temps[i] = prec - 1;
+      break;
+    default:
+      return NULL_TREE;
+    }
+
+  for (i = 0; i < nunits; i++)
+    vec[i] = build_int_cst (TREE_TYPE (type), mulc[i]);
+  mulcst = build_vector (type, vec);
+  for (i = 0; i < nunits; i++)
+    vec[i] = build_int_cst (TREE_TYPE (type), sel[i]);
+  perm_mask = build_vector (type, vec);
+  mhi = gimplify_build2 (gsi, VEC_WIDEN_MULT_HI_EXPR, wider_type,
+                        cur_op, mulcst);
+  mhi = gimplify_build1 (gsi, VIEW_CONVERT_EXPR, type, mhi);
+  mlo = gimplify_build2 (gsi, VEC_WIDEN_MULT_LO_EXPR, wider_type,
+                        cur_op, mulcst);
+  mlo = gimplify_build1 (gsi, VIEW_CONVERT_EXPR, type, mlo);
+  if (BYTES_BIG_ENDIAN)
+    cur_op = gimplify_build3 (gsi, VEC_PERM_EXPR, type, mhi, mlo, perm_mask);
+  else
+    cur_op = gimplify_build3 (gsi, VEC_PERM_EXPR, type, mlo, mhi, perm_mask);
+
+  switch (mode)
+    {
+    case 0:
+      /* t1 = oprnd0 >> pre_shift;
+        t2 = (type) (t1 w* ml >> prec);
+        q = t2 >> post_shift;  */
+      cur_op = add_rshift (gsi, type, cur_op, post_shifts);
+      break;
+    case 1:
+      /* t1 = (type) (oprnd0 w* ml >> prec);
+        t2 = oprnd0 - t1;
+        t3 = t2 >> 1;
+        t4 = t1 + t3;
+        q = t4 >> (post_shift - 1);  */
+      op = optab_for_tree_code (MINUS_EXPR, type, optab_default);
+      if (op == NULL
+         || optab_handler (op, TYPE_MODE (type)) == CODE_FOR_nothing)
+       return NULL_TREE;
+      tem = gimplify_build2 (gsi, MINUS_EXPR, type, op0, cur_op);
+      tem = add_rshift (gsi, type, tem, shift_temps);
+      op = optab_for_tree_code (PLUS_EXPR, type, optab_default);
+      if (op == NULL
+         || optab_handler (op, TYPE_MODE (type)) == CODE_FOR_nothing)
+       return NULL_TREE;
+      tem = gimplify_build2 (gsi, PLUS_EXPR, type, cur_op, tem);
+      cur_op = add_rshift (gsi, type, tem, post_shifts);
+      if (cur_op == NULL_TREE)
+       return NULL_TREE;
+      break;
+    case 2:
+    case 3:
+    case 4:
+    case 5:
+      /* t1 = (type) (oprnd0 w* ml >> prec);
+        t2 = t1; [ iff (mode & 2) != 0 ]
+        t2 = t1 + oprnd0; [ iff (mode & 2) == 0 ]
+        t3 = t2 >> post_shift;
+        t4 = oprnd0 >> (prec - 1);
+        q = t3 - t4; [ iff (mode & 1) == 0 ]
+        q = t4 - t3; [ iff (mode & 1) != 0 ]  */
+      if ((mode & 2) == 0)
+       {
+         op = optab_for_tree_code (PLUS_EXPR, type, optab_default);
+         if (op == NULL
+             || optab_handler (op, TYPE_MODE (type)) == CODE_FOR_nothing)
+           return NULL_TREE;
+         cur_op = gimplify_build2 (gsi, PLUS_EXPR, type, cur_op, op0);
+       }
+      cur_op = add_rshift (gsi, type, cur_op, post_shifts);
+      if (cur_op == NULL_TREE)
+       return NULL_TREE;
+      tem = add_rshift (gsi, type, op0, shift_temps);
+      if (tem == NULL_TREE)
+       return NULL_TREE;
+      op = optab_for_tree_code (MINUS_EXPR, type, optab_default);
+      if (op == NULL
+         || optab_handler (op, TYPE_MODE (type)) == CODE_FOR_nothing)
+       return NULL_TREE;
+      if ((mode & 1) == 0)
+       cur_op = gimplify_build2 (gsi, MINUS_EXPR, type, cur_op, tem);
+      else
+       cur_op = gimplify_build2 (gsi, MINUS_EXPR, type, tem, cur_op);
+      break;
+    default:
+      gcc_unreachable ();
+    }
+
+  if (code == TRUNC_DIV_EXPR)
+    return cur_op;
+
+  /* We divided.  Now finish by:
+     t1 = q * oprnd1;
+     r = oprnd0 - t1;  */
+  op = optab_for_tree_code (MULT_EXPR, type, optab_default);
+  if (op == NULL
+      || optab_handler (op, TYPE_MODE (type)) == CODE_FOR_nothing)
+    return NULL_TREE;
+  tem = gimplify_build2 (gsi, MULT_EXPR, type, cur_op, op1);
+  op = optab_for_tree_code (MINUS_EXPR, type, optab_default);
+  if (op == NULL
+      || optab_handler (op, TYPE_MODE (type)) == CODE_FOR_nothing)
+    return NULL_TREE;
+  return gimplify_build2 (gsi, MINUS_EXPR, type, op0, tem);
+}
+
 static tree
expand_vector_operation (gimple_stmt_iterator *gsi, tree type, tree compute_type,
                         gimple assign, enum tree_code code)
@@ -454,6 +963,23 @@ expand_vector_operation (gimple_stmt_ite
 
          return expand_vector_comparison (gsi, type, rhs1, rhs2, code);
        }
+
+      case TRUNC_DIV_EXPR:
+      case TRUNC_MOD_EXPR:
+       {
+         tree rhs1 = gimple_assign_rhs1 (assign);
+         tree rhs2 = gimple_assign_rhs2 (assign);
+         tree ret;
+
+         if (!VECTOR_INTEGER_TYPE_P (type) || TREE_CODE (rhs2) != VECTOR_CST)
+           break;
+
+         ret = expand_vector_divmod (gsi, type, rhs1, rhs2, code);
+         if (ret != NULL_TREE)
+           return ret;
+         break;
+       }
+
       default:
        break;
       }
--- gcc/tree-vect-patterns.c.jj 2012-06-25 08:38:28.000000000 +0200
+++ gcc/tree-vect-patterns.c    2012-06-26 12:09:08.197243961 +0200
@@ -1971,7 +1971,7 @@ vect_recog_divmod_pattern (VEC (gimple,
   else
     {
       unsigned HOST_WIDE_INT ml;
-      int lgup, post_shift;
+      int post_shift;
       HOST_WIDE_INT d = tree_low_cst (oprnd1, 0);
       unsigned HOST_WIDE_INT abs_d;
       bool add = false;
@@ -2008,7 +2008,7 @@ vect_recog_divmod_pattern (VEC (gimple,
        /* This case is not handled correctly below.  */
        return NULL;
 
-      choose_multiplier (abs_d, prec, prec - 1, &ml, &post_shift, &lgup);
+      choose_multiplier (abs_d, prec, prec - 1, &ml, &post_shift, &dummy_int);
       if (ml >= (unsigned HOST_WIDE_INT) 1 << (prec - 1))
        {
          add = true;
--- gcc/testsuite/gcc.c-torture/execute/pr53645.c.jj	2012-06-27 10:21:02.012160537 +0200
+++ gcc/testsuite/gcc.c-torture/execute/pr53645.c	2012-06-27 10:20:32.000000000 +0200
@@ -0,0 +1,96 @@
+/* PR tree-optimization/53645 */
+
+typedef unsigned int UV __attribute__((vector_size (16)));
+typedef int SV __attribute__((vector_size (16)));
+extern void abort (void);
+
+#define TEST(a, b, c, d) \
+__attribute__((noinline)) void         \
+uq##a##b##c##d (UV *x, UV *y)          \
+{                                      \
+  *x = *y / ((UV) { a, b, c, d });     \
+}                                      \
+                                       \
+__attribute__((noinline)) void         \
+ur##a##b##c##d (UV *x, UV *y)          \
+{                                      \
+  *x = *y % ((UV) { a, b, c, d });     \
+}                                      \
+                                       \
+__attribute__((noinline)) void         \
+sq##a##b##c##d (SV *x, SV *y)          \
+{                                      \
+  *x = *y / ((SV) { a, b, c, d });     \
+}                                      \
+                                       \
+__attribute__((noinline)) void         \
+sr##a##b##c##d (SV *x, SV *y)          \
+{                                      \
+  *x = *y % ((SV) { a, b, c, d });     \
+}
+
+#define TESTS \
+TEST (4, 4, 4, 4)                      \
+TEST (1, 4, 2, 8)                      \
+TEST (3, 3, 3, 3)                      \
+TEST (6, 5, 6, 5)                      \
+TEST (14, 14, 14, 6)                   \
+TEST (7, 7, 7, 7)                      \
+
+TESTS
+
+UV u[] =
+  { ((UV) { 73U, 65531U, 0U, 174U }),
+    ((UV) { 1U, 8173U, ~0U, ~0U - 63 }) };
+SV s[] =
+  { ((SV) { 73, -9123, 32761, 8191 }),
+    ((SV) { 9903, -1, -7323, 0 }) };
+
+int
+main ()
+{
+  UV ur, ur2;
+  SV sr, sr2;
+  int i;
+#undef TEST
+#define TEST(a, b, c, d)                               \
+    uq##a##b##c##d (&ur, u + i);                       \
+    if (ur[0] != u[i][0] / a || ur[3] != u[i][3] / d)  \
+     abort ();                                         \
+    asm volatile ("" : : "r" (&ur) : "memory");                \
+    if (ur[2] != u[i][2] / c || ur[1] != u[i][1] / b)  \
+     abort ();                                         \
+    asm volatile ("" : : "r" (&ur) : "memory");                \
+    ur##a##b##c##d (&ur, u + i);                       \
+    if (ur[0] != u[i][0] % a || ur[3] != u[i][3] % d)  \
+     abort ();                                         \
+    asm volatile ("" : : "r" (&ur) : "memory");                \
+    if (ur[2] != u[i][2] % c || ur[1] != u[i][1] % b)  \
+     abort ();                                         \
+    asm volatile ("" : : "r" (&ur) : "memory");
+  for (i = 0; i < sizeof (u) / sizeof (u[0]); i++)
+    {
+      TESTS
+    }
+#undef TEST
+#define TEST(a, b, c, d)                               \
+    sq##a##b##c##d (&sr, s + i);                       \
+    if (sr[0] != s[i][0] / a || sr[3] != s[i][3] / d)  \
+     abort ();                                         \
+    asm volatile ("" : : "r" (&sr) : "memory");                \
+    if (sr[2] != s[i][2] / c || sr[1] != s[i][1] / b)  \
+     abort ();                                         \
+    asm volatile ("" : : "r" (&sr) : "memory");                \
+    sr##a##b##c##d (&sr, s + i);                       \
+    if (sr[0] != s[i][0] % a || sr[3] != s[i][3] % d)  \
+     abort ();                                         \
+    asm volatile ("" : : "r" (&sr) : "memory");                \
+    if (sr[2] != s[i][2] % c || sr[1] != s[i][1] % b)  \
+     abort ();                                         \
+    asm volatile ("" : : "r" (&sr) : "memory");
+  for (i = 0; i < sizeof (s) / sizeof (s[0]); i++)
+    {
+      TESTS
+    }
+  return 0;
+}

        Jakub
