Hi all,

This is a respin of https://gcc.gnu.org/ml/gcc-patches/2016-06/msg00952.html
following feedback.
I've changed the code to cast the operand to an unsigned type before
applying the multiplication algorithm and to cast the result back to the
signed type at the end.  Whether to perform the cast is now determined by
the new function cast_mult_synth_to_unsigned, in which I've implemented
the cases that Marc mentioned in [1].  Please do let me know if there are
any other cases that need to be handled.
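
For illustration, consider a multiplication by -5 (a sketch only; the
actual steps depend on what choose_mult_variant picks).  -5 is
synthesized as -(4 + 1), i.e. shift, add, negate, and the final negate
is one of the cases where we now cast, so that the intermediate
arithmetic wraps instead of overflowing:

  long long
  mult_by_minus_5 (long long x)
  {
    unsigned long long ux = (unsigned long long) x; /* Cast to unsigned.  */
    unsigned long long t = (ux << 2) + ux;          /* ux * 5, wraps safely.  */
    return (long long) -t;                          /* Negate, cast back.  */
  }

The pass of course emits the equivalent GIMPLE on the vector type; the
scalar function above is just to show the shape of the sequence.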

I've added a couple of TODO notes in the places that would need to be
extended to handle shifts as a series of additions for targets that
support vector addition but not vector shifts [2]; a rough sketch of the
idea follows.
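
In case it helps review, here's what that TODO would look like
(hypothetical, not part of this patch): a left shift by K can be
synthesized with K vector additions by repeated doubling, shown here
using GCC's generic vector extensions:

  typedef int v4si __attribute__ ((vector_size (16)));

  v4si
  shift_left_by_k (v4si x, int k)
  {
    v4si t = x;
    for (int i = 0; i < k; i++)
      t = t + t;   /* Each addition doubles T, i.e. shifts left by one.  */
    return t;
  }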

tree-vect-patterns.c already includes expmed.h (it must have gained that
include at some point since I first wrote the patch last November...),
so it picks up the definition of choose_mult_variant that patch 1/2
moves there.

Bootstrapped and tested on aarch64, arm, x86_64.
Does this look better?

Thanks,
Kyrill

[1] https://gcc.gnu.org/ml/gcc-patches/2016-06/msg01023.html
[2] https://gcc.gnu.org/ml/gcc-patches/2016-06/msg00967.html

2016-06-15  Kyrylo Tkachov  <kyrylo.tkac...@arm.com>

    PR target/65951
    * tree-vect-patterns.c (target_has_vecop_for_code): New function.
    (cast_mult_synth_to_unsigned): Likewise.
    (target_supports_mult_synth_alg): Likewise.
    (apply_binop_and_append_stmt): Likewise.
    (vect_synth_mult_by_constant): Likewise.
    (vect_recog_mult_pattern): Use above functions to synthesize vector
    multiplication by integer constants.

2016-06-15  Kyrylo Tkachov  <kyrylo.tkac...@arm.com>

    * gcc.dg/vect/vect-mult-const-pattern-1.c: New test.
    * gcc.dg/vect/vect-mult-const-pattern-2.c: Likewise.
diff --git a/gcc/testsuite/gcc.dg/vect/vect-mult-const-pattern-1.c b/gcc/testsuite/gcc.dg/vect/vect-mult-const-pattern-1.c
new file mode 100644
index 0000000000000000000000000000000000000000..e5dba82d7fa955a6a37a0eabf980127e464ac77b
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-mult-const-pattern-1.c
@@ -0,0 +1,41 @@
+/* { dg-require-effective-target vect_int } */
+/* { dg-require-effective-target vect_shift } */
+
+#include <stdarg.h>
+#include "tree-vect.h"
+
+#define N 256
+
+__attribute__ ((noinline)) void
+foo (long long *arr)
+{
+  for (int i = 0; i < N; i++)
+    arr[i] *= 123;
+}
+
+int
+main (void)
+{
+  check_vect ();
+  long long data[N];
+  int i;
+
+  for (i = 0; i < N; i++)
+    {
+      data[i] = i;
+      __asm__ volatile ("");
+    }
+
+  foo (data);
+  for (i = 0; i < N; i++)
+    {
+      if (data[i] / 123 != i)
+      __builtin_abort ();
+      __asm__ volatile ("");
+    }
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "vect_recog_mult_pattern: detected" 2 "vect"  { target aarch64*-*-* } } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect"  { target aarch64*-*-* } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-mult-const-pattern-2.c b/gcc/testsuite/gcc.dg/vect/vect-mult-const-pattern-2.c
new file mode 100644
index 0000000000000000000000000000000000000000..83019c96910b866e364a7c2e00261a1ded13cb53
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-mult-const-pattern-2.c
@@ -0,0 +1,41 @@
+/* { dg-require-effective-target vect_int } */
+/* { dg-require-effective-target vect_shift } */
+
+#include <stdarg.h>
+#include "tree-vect.h"
+
+#define N 256
+
+__attribute__ ((noinline)) void
+foo (long long *arr)
+{
+  for (int i = 0; i < N; i++)
+    arr[i] *= -19594LL;
+}
+
+int
+main (void)
+{
+  check_vect ();
+  long long data[N];
+  int i;
+
+  for (i = 0; i < N; i++)
+    {
+      data[i] = i;
+      __asm__ volatile ("");
+    }
+
+  foo (data);
+  for (i = 0; i < N; i++)
+    {
+      if (data[i] / -19594LL != i)
+      __builtin_abort ();
+      __asm__ volatile ("");
+    }
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "vect_recog_mult_pattern: detected" 2 "vect"  { target aarch64*-*-* } } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect"  { target aarch64*-*-* } } } */
diff --git a/gcc/tree-vect-patterns.c b/gcc/tree-vect-patterns.c
index 8a2221f935063002ecd02d2b20af5cb4bd7d9fee..698a022d4b80795a0b5a22a9c433fe482fa1bdaa 100644
--- a/gcc/tree-vect-patterns.c
+++ b/gcc/tree-vect-patterns.c
@@ -2131,32 +2131,281 @@ vect_recog_vector_vector_shift_pattern (vec<gimple *> *stmts,
   return pattern_stmt;
 }
 
-/* Detect multiplication by constant which are postive or negatives of power 2,
-   and convert them to shift patterns.
+/* Return true iff the target has a vector optab implementing the operation
+   CODE on type VECTYPE.  */
 
-   Mult with constants that are postive power of two.
-   type a_t;
-   type b_t
-   S1: b_t = a_t * n
+static bool
+target_has_vecop_for_code (tree_code code, tree vectype)
+{
+  optab voptab = optab_for_tree_code (code, vectype, optab_vector);
+  return voptab
+	 && optab_handler (voptab, TYPE_MODE (vectype)) != CODE_FOR_nothing;
+}
 
-   or
+/* Return true iff the multiplication by constant CST on type SCALTYPE,
+   synthesized according to the algorithm ALG and variant VAR, requires
+   casting the operand to an unsigned type to avoid signed overflow.  */
+
+static bool
+cast_mult_synth_to_unsigned (struct algorithm *alg, mult_variant var,
+			      tree scaltype, tree cst)
+{
+  if (TYPE_OVERFLOW_WRAPS (scaltype))
+    return false;
+
+  widest_int val = wi::to_widest (cst);
+  /* If multiplying by a negative divisor of the minimum value we risk
+     signed overflow.  */
+  if (wi::neg_p (val)
+      && wi::multiple_of_p (wi::to_widest (TYPE_MIN_VALUE (scaltype)),
+			     val, SIGNED))
+    return true;
+
+  /* If the final step is a subtraction we should cast.  */
+  if (var == negate_variant)
+    return true;
+
+  if (var == add_variant)
+    return false;
+
+  alg_code acode = alg->op[alg->ops - 1];
+  if (acode == alg_sub_t_m2
+      || acode == alg_sub_t2_m
+      || acode == alg_sub_factor)
+    return true;
+
+  return false;
+}
+
+/* Verify that the target has optabs of the vector form of SCALTYPE to perform
+   all the steps needed by the multiplication-by-immediate synthesis algorithm
+   described by ALG and VAR.  Return true iff that is the case.  */
+
+static bool
+target_supports_mult_synth_alg (struct algorithm *alg, mult_variant var,
+				 tree scaltype)
+{
+  if (alg->op[0] != alg_zero && alg->op[0] != alg_m)
+    return false;
+
+  tree vectype = get_vectype_for_scalar_type (scaltype);
+
+  if (!vectype)
+    return false;
+
+  /* All synthesis algorithms require shifts, so bail out early if the
+     target cannot vectorize them.
+     TODO: If the target doesn't support vector shifts but supports vector
+     addition we could synthesize shifts that way.  vect_synth_mult_by_constant
+     would need to be updated to do that.  */
+  if (!vect_supportable_shift (LSHIFT_EXPR, scaltype))
+    return false;
+
+  bool supports_vminus = target_has_vecop_for_code (MINUS_EXPR, vectype);
+  bool supports_vplus = target_has_vecop_for_code (PLUS_EXPR, vectype);
+
+  if (var == negate_variant
+      && !target_has_vecop_for_code (NEGATE_EXPR, vectype))
+    return false;
+
+  if (var == add_variant && !supports_vplus)
+    return false;
+
+  for (int i = 1; i < alg->ops; i++)
+    {
+      switch (alg->op[i])
+	{
+	case alg_shift:
+	  break;
+	case alg_add_t_m2:
+	case alg_add_t2_m:
+	case alg_add_factor:
+	  if (!supports_vplus)
+	    return false;
+	  break;
+	case alg_sub_t_m2:
+	case alg_sub_t2_m:
+	case alg_sub_factor:
+	  if (!supports_vminus)
+	    return false;
+	  break;
+	case alg_unknown:
+	case alg_m:
+	case alg_zero:
+	case alg_impossible:
+	  return false;
+	default:
+	  gcc_unreachable ();
+	}
+    }
+
+  return true;
+}
+
+/* Helper for vect_synth_mult_by_constant.  Apply a binary operation
+   CODE to operands OP1 and OP2, creating a new temporary SSA var in
+   the process.  Append the resulting assignment statement to the sequence
+   in STMT_VINFO.  Return the newly created SSA variable which is the result
+   of the binary operation.  */
+
+static tree
+apply_binop_and_append_stmt (tree_code code, tree op1, tree op2,
+			     stmt_vec_info stmt_vinfo)
+{
+  tree itype = TREE_TYPE (op1);
+  tree tmp_var = vect_recog_temp_ssa_var (itype, NULL);
+  gimple *stmt = gimple_build_assign (tmp_var, code, op1, op2);
+  append_pattern_def_seq (stmt_vinfo, stmt);
+  return tmp_var;
+}
+
+/* Synthesize a multiplication of OP by an INTEGER_CST VAL using shifts
+   and simple arithmetic operations to be vectorized.  Record the statements
+   produced in STMT_VINFO and return the last statement in the sequence or
+   NULL if it's not possible to synthesize such a multiplication.
+   This function mirrors the behavior of expand_mult_const in expmed.c but
+   works on tree-ssa form.  */
+
+static gimple *
+vect_synth_mult_by_constant (tree op, tree val,
+			     stmt_vec_info stmt_vinfo)
+{
+  tree itype = TREE_TYPE (op);
+  machine_mode mode = TYPE_MODE (itype);
+  struct algorithm alg;
+  mult_variant variant;
+  if (!tree_fits_shwi_p (val))
+    return NULL;
+
+  HOST_WIDE_INT hwval = tree_to_shwi (val);
+  /* Use MAX_COST here as we don't want to limit the sequence based on
+     rtx costs.  The vectorizer's benefit analysis will decide whether
+     it's beneficial to do this.  */
+  bool possible = choose_mult_variant (mode, hwval, &alg,
+					&variant, MAX_COST);
+  if (!possible)
+    return NULL;
+
+  bool cast_to_unsigned_p
+    = cast_mult_synth_to_unsigned (&alg, variant, itype, val);
+
+  tree multtype = cast_to_unsigned_p ? unsigned_type_for (itype) : itype;
+
+  if (!target_supports_mult_synth_alg (&alg, variant, multtype))
+    return NULL;
+
+  tree accumulator;
+
+  /* Clear out the sequence of statements so we can populate it below.  */
+  STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) = NULL;
+  gimple *stmt = NULL;
 
-   Mult with constants that are negative power of two.
-   S2: b_t = a_t * -n
+  if (alg.op[0] == alg_zero)
+    accumulator = build_int_cst (multtype, 0);
+  else if (cast_to_unsigned_p)
+    {
+      accumulator = vect_recog_temp_ssa_var (multtype, NULL);
+      stmt = gimple_build_assign (accumulator, CONVERT_EXPR, op);
+      append_pattern_def_seq (stmt_vinfo, stmt);
+    }
+  else
+    accumulator = op;
+
+  bool needs_fixup = (variant == negate_variant)
+		      || (variant == add_variant);
+
+  for (int i = 1; i < alg.ops; i++)
+    {
+      tree shft_log = build_int_cst (multtype, alg.log[i]);
+      tree accum_tmp = vect_recog_temp_ssa_var (multtype, NULL);
+      tree tmp_var = NULL_TREE;
+
+      /* TODO: Targets that don't support vector shifts could synthesize
+	 them using vector additions.  target_supports_mult_synth_alg would
+	 need to be updated to allow this.  */
+      switch (alg.op[i])
+	{
+	case alg_shift:
+	  stmt = gimple_build_assign (accum_tmp, LSHIFT_EXPR, accumulator,
+				       shft_log);
+	  break;
+	case alg_add_t_m2:
+	  tmp_var = apply_binop_and_append_stmt (LSHIFT_EXPR, op,
+						  shft_log, stmt_vinfo);
+	  stmt = gimple_build_assign (accum_tmp, PLUS_EXPR, accumulator,
+				       tmp_var);
+	  break;
+	case alg_sub_t_m2:
+	  tmp_var = apply_binop_and_append_stmt (LSHIFT_EXPR, op,
+						  shft_log, stmt_vinfo);
+	  stmt = gimple_build_assign (accum_tmp, MINUS_EXPR, accumulator,
+				       tmp_var);
+	  break;
+	case alg_add_t2_m:
+	  tmp_var = apply_binop_and_append_stmt (LSHIFT_EXPR, accumulator,
+						  shft_log, stmt_vinfo);
+	  stmt = gimple_build_assign (accum_tmp, PLUS_EXPR, tmp_var, op);
+	  break;
+	case alg_sub_t2_m:
+	  tmp_var = apply_binop_and_append_stmt (LSHIFT_EXPR, accumulator,
+						  shft_log, stmt_vinfo);
+	  stmt = gimple_build_assign (accum_tmp, MINUS_EXPR, tmp_var, op);
+	  break;
+	case alg_add_factor:
+	  tmp_var = apply_binop_and_append_stmt (LSHIFT_EXPR, accumulator,
+						  shft_log, stmt_vinfo);
+	  stmt = gimple_build_assign (accum_tmp, PLUS_EXPR, accumulator,
+				       tmp_var);
+	  break;
+	case alg_sub_factor:
+	  tmp_var = apply_binop_and_append_stmt (LSHIFT_EXPR, accumulator,
+						  shft_log, stmt_vinfo);
+	  stmt = gimple_build_assign (accum_tmp, MINUS_EXPR, tmp_var,
+				      accumulator);
+	  break;
+	default:
+	  gcc_unreachable ();
+	}
+      /* We don't want to append the last stmt in the sequence to stmt_vinfo
+	 but rather return it directly.  */
+      if ((i < alg.ops - 1) || needs_fixup || cast_to_unsigned_p)
+	append_pattern_def_seq (stmt_vinfo, stmt);
+      accumulator = accum_tmp;
+    }
+  if (variant == negate_variant)
+    {
+      tree accum_tmp = vect_recog_temp_ssa_var (multtype, NULL);
+      stmt = gimple_build_assign (accum_tmp, NEGATE_EXPR, accumulator);
+      accumulator = accum_tmp;
+      if (cast_to_unsigned_p)
+	append_pattern_def_seq (stmt_vinfo, stmt);
+    }
+  else if (variant == add_variant)
+    {
+      tree accum_tmp = vect_recog_temp_ssa_var (multtype, NULL);
+      stmt = gimple_build_assign (accum_tmp, PLUS_EXPR, accumulator, op);
+      accumulator = accum_tmp;
+      if (cast_to_unsigned_p)
+	append_pattern_def_seq (stmt_vinfo, stmt);
+    }
+
+  /* Convert back to the signed type if needed.  */
+  if (cast_to_unsigned_p)
+    {
+      tree accum_tmp = vect_recog_temp_ssa_var (itype, NULL);
+      stmt = gimple_build_assign (accum_tmp, CONVERT_EXPR, accumulator);
+    }
+  return stmt;
+}
+
+/* Detect multiplication by a constant and convert it into a sequence of
+   shifts, additions, subtractions and negations.  We reuse the
+   choose_mult_variant algorithms from expmed.c.
 
    Input/Output:
 
    STMTS: Contains a stmt from which the pattern search begins,
-   i.e. the mult stmt.  Convert the mult operation to LSHIFT if
-   constant operand is a power of 2.
-   type a_t, b_t
-   S1': b_t = a_t << log2 (n)
-
-   Convert the mult operation to LSHIFT and followed by a NEGATE
-   if constant operand is a negative power of 2.
-   type a_t, b_t, res_T;
-   S2': b_t = a_t << log2 (n)
-   S3': res_T  = - (b_t)
+   i.e. the mult stmt.
 
  Output:
 
@@ -2164,8 +2413,8 @@ vect_recog_vector_vector_shift_pattern (vec<gimple *> *stmts,
 
   * TYPE_OUT: The type of the output of this pattern.
 
-  * Return value: A new stmt that will be used to replace the multiplication
-    S1 or S2 stmt.  */
+  * Return value: A new stmt that will be used to replace
+    the multiplication.  */
 
 static gimple *
 vect_recog_mult_pattern (vec<gimple *> *stmts,
@@ -2173,11 +2422,8 @@ vect_recog_mult_pattern (vec<gimple *> *stmts,
 {
   gimple *last_stmt = stmts->pop ();
   tree oprnd0, oprnd1, vectype, itype;
-  gimple *pattern_stmt, *def_stmt;
-  optab optab;
+  gimple *pattern_stmt;
   stmt_vec_info stmt_vinfo = vinfo_for_stmt (last_stmt);
-  int power2_val, power2_neg_val;
-  tree shift;
 
   if (!is_gimple_assign (last_stmt))
     return NULL;
@@ -2201,52 +2447,17 @@ vect_recog_mult_pattern (vec<gimple *> *stmts,
 
   /* If the target can handle vectorized multiplication natively,
      don't attempt to optimize this.  */
-  optab = optab_for_tree_code (MULT_EXPR, vectype, optab_default);
-  if (optab != unknown_optab)
+  optab mul_optab = optab_for_tree_code (MULT_EXPR, vectype, optab_default);
+  if (mul_optab != unknown_optab)
     {
       machine_mode vec_mode = TYPE_MODE (vectype);
-      int icode = (int) optab_handler (optab, vec_mode);
+      int icode = (int) optab_handler (mul_optab, vec_mode);
       if (icode != CODE_FOR_nothing)
-	return NULL;
-    }
-
-  /* If target cannot handle vector left shift then we cannot
-     optimize and bail out.  */
-  optab = optab_for_tree_code (LSHIFT_EXPR, vectype, optab_vector);
-  if (!optab
-      || optab_handler (optab, TYPE_MODE (vectype)) == CODE_FOR_nothing)
-    return NULL;
-
-  power2_val = wi::exact_log2 (oprnd1);
-  power2_neg_val = wi::exact_log2 (wi::neg (oprnd1));
-
-  /* Handle constant operands that are postive or negative powers of 2.  */
-  if (power2_val != -1)
-    {
-      shift = build_int_cst (itype, power2_val);
-      pattern_stmt
-	= gimple_build_assign (vect_recog_temp_ssa_var (itype, NULL),
-			       LSHIFT_EXPR, oprnd0, shift);
+       return NULL;
     }
-  else if (power2_neg_val != -1)
-    {
-      /* If the target cannot handle vector NEGATE then we cannot
-	 do the optimization.  */
-      optab = optab_for_tree_code (NEGATE_EXPR, vectype, optab_vector);
-      if (!optab
-	  || optab_handler (optab, TYPE_MODE (vectype)) == CODE_FOR_nothing)
-	return NULL;
 
-      shift = build_int_cst (itype, power2_neg_val);
-      def_stmt
-	= gimple_build_assign (vect_recog_temp_ssa_var (itype, NULL),
-			       LSHIFT_EXPR, oprnd0, shift);
-      new_pattern_def_seq (stmt_vinfo, def_stmt);
-      pattern_stmt
-	 = gimple_build_assign (vect_recog_temp_ssa_var (itype, NULL),
-				NEGATE_EXPR, gimple_assign_lhs (def_stmt));
-    }
-  else
+  pattern_stmt = vect_synth_mult_by_constant (oprnd0, oprnd1, stmt_vinfo);
+  if (!pattern_stmt)
     return NULL;
 
   /* Pattern detected.  */
