This is the first half of my previous patch series (https://gcc.gnu.org/ml/gcc-patches/2014-09/msg01456.html), that is, the part making the REDUC_..._EXPR tree codes endian-neutral and adding new reduce-to-scalar optabs in place of the endianness-dependent reduc_[us](plus|min|max)_optab.
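
To give an idea of the semantic change (an illustrative sketch only, not part of the patches; reduc_old and reduc_new are hypothetical stand-ins, written in C using GCC's generic vector extensions):

typedef int v4si __attribute__ ((vector_size (16)));

/* Old scheme: the target pattern produces a vector whose lane 0
   (little-endian) or lane 3 (big-endian) holds the sum, and the mid-end
   must then extract that endian-dependent lane.  */
int
reduc_old (v4si v)
{
  v4si partial = { v[0] + v[1] + v[2] + v[3], 0, 0, 0 }; /* stand-in for the
                                                            vector optab */
  return partial[0];   /* on big-endian this would have to be lane 3 */
}

/* New scheme: reduc_plus_scal_<mode> hands back the scalar directly, so no
   BYTES_BIG_ENDIAN lane selection is needed in the mid-end.  */
int
reduc_new (v4si v)
{
  return v[0] + v[1] + v[2] + v[3];   /* stand-in for the scalar optab */
}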

I'm leaving the vec_shr portion out of this patch series, as the link between the two halves is only the end goal of removing an "if (BYTES_BIG_ENDIAN)" from tree-vect-loop.c; this series removes that from one code path, so it can stand alone.

Patches 1-6 are as previously posted, apart from rebasing and removing the old/poisoned AArch64 patterns as per the maintainer's request. Patches 1, 2, 4, 5 and 6 have already been approved; patch 3 was discussed somewhat, but I think we decided against most of the ideas raised, and I have added a comment to scalar_reduc_to_vector. Rereading Richie's "Otherwise the patch looks good to me", I now wonder whether I should have taken that as an approval, but I didn't read it that way at the time.

Patches 7-11 migrate ARM, x86, IA64 (I think), and mostly PowerPC, to the new reduc_(plus|[us](min|max))_scal_optab. I have not managed to work out how to do the same for MIPS (specifically, what I need to add to mips_expand_vec_reduc), and have had no response from the maintainers, so I am leaving that for now. I also haven't migrated (or worked out how to target) rs6000/paired.md; help would be most welcome.


The suggestion was then to "complete" the migration by removing the old optabs. There are a few options here, and I'll follow up with appropriate patches according to the feedback received. I see the following options:

(1) just delete the old optabs (and the migration code). This would performance-regress the MIPS backend, but should not break it, although one should really do *something* with the then-unused reduc_[us](plus|min|max)_optab in config/mips/loongson.md.

(2) also rename reduc_..._scal_optab back to reduc_..._optab; this would break the MIPS backend if something were not done with its existing patterns.

(2a) Alternatively I could just use a different new name, e.g. reduce_..., reduct_..., vec_reduc_..., anything that's less of a mouthful than reduc_..._scal. While a name only very slightly different from the current reduc_... might be confusing, so might changing the meaning of the optab, and its signature, while keeping the existing name; so I am open to suggestions.

Cheers, Alan
commit 9819291c17610dcdcca19a3d9ea3a4260df0577e
Author: Alan Lawrence <alan.lawre...@arm.com>
Date:   Thu Aug 21 13:05:43 2014 +0100

    Temporarily remove gimple_fold

diff --git a/gcc/config/aarch64/aarch64-builtins.c b/gcc/config/aarch64/aarch64-builtins.c
index 3dba1b2..a49da89 100644
--- a/gcc/config/aarch64/aarch64-builtins.c
+++ b/gcc/config/aarch64/aarch64-builtins.c
@@ -1188,6 +1188,9 @@ aarch64_fold_builtin (tree fndecl, int n_args ATTRIBUTE_UNUSED, tree *args,
   return NULL_TREE;
 }
 
+/* Handling of reduction operations temporarily removed so as to decouple
+   changes to tree codes from AArch64 NEON Intrinsics.  */
+#if 0
 bool
 aarch64_gimple_fold_builtin (gimple_stmt_iterator *gsi)
 {
@@ -1259,6 +1262,7 @@ aarch64_gimple_fold_builtin (gimple_stmt_iterator *gsi)
 
   return changed;
 }
+#endif
 
 void
 aarch64_atomic_assign_expand_fenv (tree *hold, tree *clear, tree *update)
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index db5ff59..27d82f3 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -10015,8 +10015,8 @@ aarch64_asan_shadow_offset (void)
 #undef TARGET_FRAME_POINTER_REQUIRED
 #define TARGET_FRAME_POINTER_REQUIRED aarch64_frame_pointer_required
 
-#undef TARGET_GIMPLE_FOLD_BUILTIN
-#define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
+//#undef TARGET_GIMPLE_FOLD_BUILTIN
+//#define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
 
 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
commit bf6d5d32c552ce1c6ccd890f501db4f39291088f
Author: Alan Lawrence <alan.lawre...@arm.com>
Date:   Tue Jul 29 11:46:01 2014 +0100

    Make tree codes produce scalar, with NOP_EXPRs. (tree-vect-loop.c mess)

diff --git a/gcc/expr.c b/gcc/expr.c
index a6233f3..c792028 100644
--- a/gcc/expr.c
+++ b/gcc/expr.c
@@ -9044,7 +9044,17 @@ expand_expr_real_2 (sepops ops, rtx target, enum machine_mode tmode,
       {
         op0 = expand_normal (treeop0);
         this_optab = optab_for_tree_code (code, type, optab_default);
-        temp = expand_unop (mode, this_optab, op0, target, unsignedp);
+        enum machine_mode vec_mode = TYPE_MODE (TREE_TYPE (treeop0));
+        temp = expand_unop (vec_mode, this_optab, op0, NULL_RTX, unsignedp);
+        gcc_assert (temp);
+        /* The tree code produces a scalar result, but (somewhat by convention)
+           the optab produces a vector with the result in element 0 if
+           little-endian, or element N-1 if big-endian.  So pull the scalar
+           result out of that element.  */
+        int index = BYTES_BIG_ENDIAN ? GET_MODE_NUNITS (vec_mode) - 1 : 0;
+        int bitsize = GET_MODE_BITSIZE (GET_MODE_INNER (vec_mode));
+        temp = extract_bit_field (temp, bitsize, bitsize * index, unsignedp,
+				  target, mode, mode);
         gcc_assert (temp);
         return temp;
       }
diff --git a/gcc/fold-const.c b/gcc/fold-const.c
index 9f1bc09..9bb86f9 100644
--- a/gcc/fold-const.c
+++ b/gcc/fold-const.c
@@ -8246,12 +8246,13 @@ fold_unary_loc (location_t loc, enum tree_code code, tree type, tree op0)
     case REDUC_MAX_EXPR:
     case REDUC_PLUS_EXPR:
       {
-	unsigned int nelts = TYPE_VECTOR_SUBPARTS (type), i;
+	unsigned int nelts, i;
 	tree *elts;
 	enum tree_code subcode;
 
 	if (TREE_CODE (op0) != VECTOR_CST)
 	  return NULL_TREE;
+        nelts = TYPE_VECTOR_SUBPARTS (TREE_TYPE (op0));
 
 	elts = XALLOCAVEC (tree, nelts);
 	if (!vec_cst_ctor_to_array (op0, elts))
@@ -8270,10 +8271,9 @@ fold_unary_loc (location_t loc, enum tree_code code, tree type, tree op0)
 	    elts[0] = const_binop (subcode, elts[0], elts[i]);
 	    if (elts[0] == NULL_TREE || !CONSTANT_CLASS_P (elts[0]))
 	      return NULL_TREE;
-	    elts[i] = build_zero_cst (TREE_TYPE (type));
 	  }
 
-	return build_vector (type, elts);
+	return elts[0];
       }
 
     default:
diff --git a/gcc/tree-cfg.c b/gcc/tree-cfg.c
index cdab639..14fff81 100644
--- a/gcc/tree-cfg.c
+++ b/gcc/tree-cfg.c
@@ -3539,12 +3539,21 @@ verify_gimple_assign_unary (gimple stmt)
 
         return false;
       }
-
-    case VEC_UNPACK_HI_EXPR:
-    case VEC_UNPACK_LO_EXPR:
     case REDUC_MAX_EXPR:
     case REDUC_MIN_EXPR:
     case REDUC_PLUS_EXPR:
+      if (!VECTOR_TYPE_P (rhs1_type)
+	  || !useless_type_conversion_p (lhs_type, TREE_TYPE (rhs1_type)))
+        {
+	  error ("reduction should convert from vector to element type");
+	  debug_generic_expr (lhs_type);
+	  debug_generic_expr (rhs1_type);
+	  return true;
+	}
+      return false;
+
+    case VEC_UNPACK_HI_EXPR:
+    case VEC_UNPACK_LO_EXPR:
     case VEC_UNPACK_FLOAT_HI_EXPR:
     case VEC_UNPACK_FLOAT_LO_EXPR:
       /* FIXME.  */
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index fd1166f..8d97e17 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -1892,9 +1892,9 @@ vect_analyze_loop (struct loop *loop)
 
    Output:
    REDUC_CODE - the corresponding tree-code to be used to reduce the
-      vector of partial results into a single scalar result (which
-      will also reside in a vector) or ERROR_MARK if the operation is
-      a supported reduction operation, but does not have such tree-code.
+      vector of partial results into a single scalar result, or ERROR_MARK
+      if the operation is a supported reduction operation, but does not have
+      such a tree-code.
 
    Return FALSE if CODE currently cannot be vectorized as reduction.  */
 
@@ -4168,6 +4168,7 @@ vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple stmt,
   if (reduc_code != ERROR_MARK && !slp_reduc)
     {
       tree tmp;
+      tree vec_elem_type;
 
       /*** Case 1:  Create:
            v_out2 = reduc_expr <v_out1>  */
@@ -4176,14 +4177,26 @@ vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple stmt,
         dump_printf_loc (MSG_NOTE, vect_location,
 			 "Reduce using direct vector reduction.\n");
 
-      vec_dest = vect_create_destination_var (scalar_dest, vectype);
-      tmp = build1 (reduc_code, vectype, new_phi_result);
-      epilog_stmt = gimple_build_assign (vec_dest, tmp);
-      new_temp = make_ssa_name (vec_dest, epilog_stmt);
+      vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
+      if (!useless_type_conversion_p (scalar_type, vec_elem_type))
+	{
+          tree tmp_dest =
+	      vect_create_destination_var (scalar_dest, vec_elem_type);
+	  tmp = build1 (reduc_code, vec_elem_type, new_phi_result);
+	  epilog_stmt = gimple_build_assign (tmp_dest, tmp);
+	  new_temp = make_ssa_name (tmp_dest, epilog_stmt);
+	  gimple_assign_set_lhs (epilog_stmt, new_temp);
+	  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
+
+	  tmp = build1 (NOP_EXPR, scalar_type, new_temp);
+	}
+      else
+	tmp = build1 (reduc_code, scalar_type, new_phi_result);
+      epilog_stmt = gimple_build_assign (new_scalar_dest, tmp);
+      new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
       gimple_assign_set_lhs (epilog_stmt, new_temp);
       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
-
-      extract_scalar_result = true;
+      scalar_results.safe_push (new_temp);
     }
   else
     {
diff --git a/gcc/tree.def b/gcc/tree.def
index bd39e4b..c830e4b 100644
--- a/gcc/tree.def
+++ b/gcc/tree.def
@@ -1161,10 +1161,9 @@ DEFTREECODE (TRANSACTION_EXPR, "transaction_expr", tcc_expression, 1)
    result (e.g. summing the elements of the vector, finding the minimum over
    the vector elements, etc).
    Operand 0 is a vector.
-   The expression returns a vector of the same type, with the first
-   element in the vector holding the result of the reduction of all elements
-   of the operand.  The content of the other elements in the returned vector
-   is undefined.  */
+   The expression returns a scalar, with type the same as the elements of the
+   vector, holding the result of the reduction of all elements of the operand.
+   */
 DEFTREECODE (REDUC_MAX_EXPR, "reduc_max_expr", tcc_unary, 1)
 DEFTREECODE (REDUC_MIN_EXPR, "reduc_min_expr", tcc_unary, 1)
 DEFTREECODE (REDUC_PLUS_EXPR, "reduc_plus_expr", tcc_unary, 1)
commit ab17754b481aa163cbdda053c0cb7ce65e1aa8e9
Author: Alan Lawrence <alan.lawre...@arm.com>
Date:   Thu Sep 25 16:58:13 2014 +0100

    Add new optabs (2nd posted version: comment scalar_reduc_to_vector; revert sp)

diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
index ccebc70..e7347cc 100644
--- a/gcc/doc/md.texi
+++ b/gcc/doc/md.texi
@@ -4717,29 +4717,48 @@ it is unspecified which of the two operands is returned as the result.
 @cindex @code{reduc_smax_@var{m}} instruction pattern
 @item @samp{reduc_smin_@var{m}}, @samp{reduc_smax_@var{m}}
 Find the signed minimum/maximum of the elements of a vector. The vector is
-operand 1, and the scalar result is stored in the least significant bits of
+operand 1, and the result is stored in the least significant bits of
 operand 0 (also a vector). The output and input vector should have the same
-modes.
+modes. These are legacy optabs, and platforms should prefer to implement
+@samp{reduc_smin_scal_@var{m}} and @samp{reduc_smax_scal_@var{m}}.
 
 @cindex @code{reduc_umin_@var{m}} instruction pattern
 @cindex @code{reduc_umax_@var{m}} instruction pattern
 @item @samp{reduc_umin_@var{m}}, @samp{reduc_umax_@var{m}}
 Find the unsigned minimum/maximum of the elements of a vector. The vector is
-operand 1, and the scalar result is stored in the least significant bits of
+operand 1, and the result is stored in the least significant bits of
 operand 0 (also a vector). The output and input vector should have the same
-modes.
+modes. These are legacy optabs, and platforms should prefer to implement
+@samp{reduc_umin_scal_@var{m}} and @samp{reduc_umax_scal_@var{m}}.
 
 @cindex @code{reduc_splus_@var{m}} instruction pattern
-@item @samp{reduc_splus_@var{m}}
-Compute the sum of the signed elements of a vector. The vector is operand 1,
-and the scalar result is stored in the least significant bits of operand 0
-(also a vector). The output and input vector should have the same modes.
-
 @cindex @code{reduc_uplus_@var{m}} instruction pattern
-@item @samp{reduc_uplus_@var{m}}
-Compute the sum of the unsigned elements of a vector. The vector is operand 1,
-and the scalar result is stored in the least significant bits of operand 0
+@item @samp{reduc_splus_@var{m}}, @samp{reduc_uplus_@var{m}}
+Compute the sum of the signed/unsigned elements of a vector. The vector is
+operand 1, and the result is stored in the least significant bits of operand 0
 (also a vector). The output and input vector should have the same modes.
+These are legacy optabs, and platforms should prefer to implement
+@samp{reduc_plus_scal_@var{m}}.
+
+@cindex @code{reduc_smin_scal_@var{m}} instruction pattern
+@cindex @code{reduc_smax_scal_@var{m}} instruction pattern
+@item @samp{reduc_smin_scal_@var{m}}, @samp{reduc_smax_scal_@var{m}}
+Find the signed minimum/maximum of the elements of a vector. The vector is
+operand 1, and operand 0 is the scalar result, with mode equal to the mode of
+the elements of the input vector.
+
+@cindex @code{reduc_umin_scal_@var{m}} instruction pattern
+@cindex @code{reduc_umax_scal_@var{m}} instruction pattern
+@item @samp{reduc_umin_scal_@var{m}}, @samp{reduc_umax_scal_@var{m}}
+Find the unsigned minimum/maximum of the elements of a vector. The vector is
+operand 1, and operand 0 is the scalar result, with mode equal to the mode of
+the elements of the input vector.
+
+@cindex @code{reduc_plus_scal_@var{m}} instruction pattern
+@item @samp{reduc_plus_scal_@var{m}}
+Compute the sum of the elements of a vector. The vector is operand 1, and
+operand 0 is the scalar result, with mode equal to the mode of the elements of
+the input vector.
 
 @cindex @code{sdot_prod@var{m}} instruction pattern
 @item @samp{sdot_prod@var{m}}
diff --git a/gcc/expr.c b/gcc/expr.c
index c792028..3763614 100644
--- a/gcc/expr.c
+++ b/gcc/expr.c
@@ -9045,6 +9045,24 @@ expand_expr_real_2 (sepops ops, rtx target, enum machine_mode tmode,
         op0 = expand_normal (treeop0);
         this_optab = optab_for_tree_code (code, type, optab_default);
         enum machine_mode vec_mode = TYPE_MODE (TREE_TYPE (treeop0));
+
+	if (optab_handler (this_optab, vec_mode) != CODE_FOR_nothing)
+	  {
+	    struct expand_operand ops[2];
+	    enum insn_code icode = optab_handler (this_optab, vec_mode);
+
+	    create_output_operand (&ops[0], target, mode);
+	    create_input_operand (&ops[1], op0, vec_mode);
+	    if (maybe_expand_insn (icode, 2, ops))
+	      {
+		target = ops[0].value;
+		if (GET_MODE (target) != mode)
+		  return gen_lowpart (tmode, target);
+		return target;
+	      }
+	  }
+	/* Fall back to optab with vector result, and then extract scalar.  */
+	this_optab = scalar_reduc_to_vector (this_optab, type);
         temp = expand_unop (vec_mode, this_optab, op0, NULL_RTX, unsignedp);
         gcc_assert (temp);
         /* The tree code produces a scalar result, but (somewhat by convention)
diff --git a/gcc/optabs.c b/gcc/optabs.c
index d55a6bb..f0547e5 100644
--- a/gcc/optabs.c
+++ b/gcc/optabs.c
@@ -500,13 +500,15 @@ optab_for_tree_code (enum tree_code code, const_tree type,
       return fma_optab;
 
     case REDUC_MAX_EXPR:
-      return TYPE_UNSIGNED (type) ? reduc_umax_optab : reduc_smax_optab;
+      return TYPE_UNSIGNED (type)
+	     ? reduc_umax_scal_optab : reduc_smax_scal_optab;
 
     case REDUC_MIN_EXPR:
-      return TYPE_UNSIGNED (type) ? reduc_umin_optab : reduc_smin_optab;
+      return TYPE_UNSIGNED (type)
+	     ? reduc_umin_scal_optab : reduc_smin_scal_optab;
 
     case REDUC_PLUS_EXPR:
-      return TYPE_UNSIGNED (type) ? reduc_uplus_optab : reduc_splus_optab;
+      return reduc_plus_scal_optab;
 
     case VEC_LSHIFT_EXPR:
       return vec_shl_optab;
@@ -602,7 +604,26 @@ optab_for_tree_code (enum tree_code code, const_tree type,
       return unknown_optab;
     }
 }
-
+
+/* Given optab UNOPTAB that reduces a vector to a scalar, find instead the old
+   optab that produces a vector with the reduction result in one element,
+   for a tree with type TYPE.  */
+
+optab
+scalar_reduc_to_vector (optab unoptab, const_tree type)
+{
+  switch (unoptab)
+    {
+    case reduc_plus_scal_optab:
+      return TYPE_UNSIGNED (type) ? reduc_uplus_optab : reduc_splus_optab;
+
+    case reduc_smin_scal_optab: return reduc_smin_optab;
+    case reduc_umin_scal_optab: return reduc_umin_optab;
+    case reduc_smax_scal_optab: return reduc_smax_optab;
+    case reduc_umax_scal_optab: return reduc_umax_optab;
+    default: return unknown_optab;
+    }
+}
 
 /* Expand vector widening operations.
 
diff --git a/gcc/optabs.def b/gcc/optabs.def
index b755470..131ea04 100644
--- a/gcc/optabs.def
+++ b/gcc/optabs.def
@@ -243,12 +243,20 @@ OPTAB_D (sin_optab, "sin$a2")
 OPTAB_D (sincos_optab, "sincos$a3")
 OPTAB_D (tan_optab, "tan$a2")
 
+/* Vector reduction to a scalar.  */
+OPTAB_D (reduc_smax_scal_optab, "reduc_smax_scal_$a")
+OPTAB_D (reduc_smin_scal_optab, "reduc_smin_scal_$a")
+OPTAB_D (reduc_plus_scal_optab, "reduc_plus_scal_$a")
+OPTAB_D (reduc_umax_scal_optab, "reduc_umax_scal_$a")
+OPTAB_D (reduc_umin_scal_optab, "reduc_umin_scal_$a")
+/* (Old) Vector reduction, returning a vector with the result in one lane.  */
 OPTAB_D (reduc_smax_optab, "reduc_smax_$a")
 OPTAB_D (reduc_smin_optab, "reduc_smin_$a")
 OPTAB_D (reduc_splus_optab, "reduc_splus_$a")
 OPTAB_D (reduc_umax_optab, "reduc_umax_$a")
 OPTAB_D (reduc_umin_optab, "reduc_umin_$a")
 OPTAB_D (reduc_uplus_optab, "reduc_uplus_$a")
+
 OPTAB_D (sdot_prod_optab, "sdot_prod$I$a")
 OPTAB_D (ssum_widen_optab, "widen_ssum$I$a3")
 OPTAB_D (udot_prod_optab, "udot_prod$I$a")
diff --git a/gcc/optabs.h b/gcc/optabs.h
index 089b15a..1f8e44d 100644
--- a/gcc/optabs.h
+++ b/gcc/optabs.h
@@ -162,6 +162,11 @@ enum optab_subtype
    vector shifts and rotates */
 extern optab optab_for_tree_code (enum tree_code, const_tree, enum optab_subtype);
 
+/* Given an optab that reduces a vector to a scalar, find instead the old
+   optab that produces a vector with the reduction result in one element,
+   for a tree with the specified type.  */
+extern optab scalar_reduc_to_vector (optab, const_tree type);
+
 /* The various uses that a comparison can have; used by can_compare_p:
    jumps, conditional moves, store flag operations.  */
 enum can_compare_purpose
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index 8d97e17..e8b108d 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -5102,15 +5102,17 @@ vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
 
           epilog_reduc_code = ERROR_MARK;
         }
-
-      if (reduc_optab
-          && optab_handler (reduc_optab, vec_mode) == CODE_FOR_nothing)
+      else if (optab_handler (reduc_optab, vec_mode) == CODE_FOR_nothing)
         {
-          if (dump_enabled_p ())
-	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-			     "reduc op not supported by target.\n");
+          optab = scalar_reduc_to_vector (reduc_optab, vectype_out);
+          if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
+            {
+              if (dump_enabled_p ())
+	        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+				 "reduc op not supported by target.\n");
 
-          epilog_reduc_code = ERROR_MARK;
+	      epilog_reduc_code = ERROR_MARK;
+	    }
         }
     }
   else
commit 0a87420deca7f94d4e4f637ab89606b8d14a7775
Author: Alan Lawrence <alan.lawre...@arm.com>
Date:   Wed Aug 13 17:23:31 2014 +0100

    AArch64: use new reduc_plus_scal_optab, inc. for __builtins; remove old.

diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index 2367436..7e77b2e 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -214,9 +214,8 @@
   BUILTIN_VSDQ_I (SHIFTIMM, sqshl_n, 0)
   BUILTIN_VSDQ_I (USHIFTIMM, uqshl_n, 0)
 
-  /* Implemented by reduc_<sur>plus_<mode>.  */
-  BUILTIN_VALL (UNOP, reduc_splus_, 10)
-  BUILTIN_VDQ (UNOP, reduc_uplus_, 10)
+  /* Implemented by aarch64_reduc_plus_<mode>.  */
+  BUILTIN_VALL (UNOP, reduc_plus_scal_, 10)
 
   /* Implemented by reduc_<maxmin_uns>_<mode>.  */
   BUILTIN_VDQIF (UNOP, reduc_smax_, 10)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index cab26a3..a5b9f3d 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -1760,25 +1760,52 @@
 
 ;; 'across lanes' add.
 
-(define_insn "reduc_<sur>plus_<mode>"
+(define_expand "reduc_plus_scal_<mode>"
+  [(match_operand:<VEL> 0 "register_operand" "=w")
+   (unspec:VDQ [(match_operand:VDQ 1 "register_operand" "w")]
+	       UNSPEC_ADDV)]
+  "TARGET_SIMD"
+  {
+    rtx elt = GEN_INT (ENDIAN_LANE_N (<MODE>mode, 0));
+    rtx scratch = gen_reg_rtx (<MODE>mode);
+    emit_insn (gen_aarch64_reduc_plus_internal<mode> (scratch, operands[1]));
+    emit_insn (gen_aarch64_get_lane<mode> (operands[0], scratch, elt));
+    DONE;
+  }
+)
+
+(define_expand "reduc_plus_scal_<mode>"
+  [(match_operand:<VEL> 0 "register_operand" "=w")
+   (match_operand:V2F 1 "register_operand" "w")]
+  "TARGET_SIMD"
+  {
+    rtx elt = GEN_INT (ENDIAN_LANE_N (<MODE>mode, 0));
+    rtx scratch = gen_reg_rtx (<MODE>mode);
+    emit_insn (gen_aarch64_reduc_plus_internal<mode> (scratch, operands[1]));
+    emit_insn (gen_aarch64_get_lane<mode> (operands[0], scratch, elt));
+    DONE;
+  }
+)
+
+(define_insn "aarch64_reduc_plus_internal<mode>"
  [(set (match_operand:VDQV 0 "register_operand" "=w")
        (unspec:VDQV [(match_operand:VDQV 1 "register_operand" "w")]
-		    SUADDV))]
+		    UNSPEC_ADDV))]
  "TARGET_SIMD"
  "add<VDQV:vp>\\t%<Vetype>0, %1.<Vtype>"
   [(set_attr "type" "neon_reduc_add<q>")]
 )
 
-(define_insn "reduc_<sur>plus_v2si"
+(define_insn "aarch64_reduc_plus_internalv2si"
  [(set (match_operand:V2SI 0 "register_operand" "=w")
        (unspec:V2SI [(match_operand:V2SI 1 "register_operand" "w")]
-		    SUADDV))]
+		    UNSPEC_ADDV))]
  "TARGET_SIMD"
  "addp\\t%0.2s, %1.2s, %1.2s"
   [(set_attr "type" "neon_reduc_add")]
 )
 
-(define_insn "reduc_splus_<mode>"
+(define_insn "aarch64_reduc_plus_internal<mode>"
  [(set (match_operand:V2F 0 "register_operand" "=w")
        (unspec:V2F [(match_operand:V2F 1 "register_operand" "w")]
 		   UNSPEC_FADDV))]
@@ -1796,14 +1823,17 @@
   [(set_attr "type" "neon_fp_reduc_add_s_q")]
 )
 
-(define_expand "reduc_splus_v4sf"
- [(set (match_operand:V4SF 0 "register_operand")
+(define_expand "reduc_plus_scal_v4sf"
+ [(set (match_operand:SF 0 "register_operand")
        (unspec:V4SF [(match_operand:V4SF 1 "register_operand")]
 		    UNSPEC_FADDV))]
  "TARGET_SIMD"
 {
-  emit_insn (gen_aarch64_addpv4sf (operands[0], operands[1]));
-  emit_insn (gen_aarch64_addpv4sf (operands[0], operands[0]));
+  rtx elt = GEN_INT (ENDIAN_LANE_N (V4SFmode, 0));
+  rtx scratch = gen_reg_rtx (V4SFmode);
+  emit_insn (gen_aarch64_addpv4sf (scratch, operands[1]));
+  emit_insn (gen_aarch64_addpv4sf (scratch, scratch));
+  emit_insn (gen_aarch64_get_lanev4sf (operands[0], scratch, elt));
   DONE;
 })
 
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 9b1873f..d17e7fe 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -13209,121 +13209,103 @@ vaddd_u64 (uint64_t __a, uint64_t __b)
 __extension__ static __inline int8_t __attribute__ ((__always_inline__))
 vaddv_s8 (int8x8_t __a)
 {
-  return vget_lane_s8 (__builtin_aarch64_reduc_splus_v8qi (__a), 0);
+  return __builtin_aarch64_reduc_plus_scal_v8qi (__a);
 }
 
 __extension__ static __inline int16_t __attribute__ ((__always_inline__))
 vaddv_s16 (int16x4_t __a)
 {
-  return vget_lane_s16 (__builtin_aarch64_reduc_splus_v4hi (__a), 0);
+  return __builtin_aarch64_reduc_plus_scal_v4hi (__a);
 }
 
 __extension__ static __inline int32_t __attribute__ ((__always_inline__))
 vaddv_s32 (int32x2_t __a)
 {
-  return vget_lane_s32 (__builtin_aarch64_reduc_splus_v2si (__a), 0);
+  return __builtin_aarch64_reduc_plus_scal_v2si (__a);
 }
 
 __extension__ static __inline uint8_t __attribute__ ((__always_inline__))
 vaddv_u8 (uint8x8_t __a)
 {
-  return vget_lane_u8 ((uint8x8_t)
-		__builtin_aarch64_reduc_uplus_v8qi ((int8x8_t) __a),
-		0);
+  return (uint8_t) __builtin_aarch64_reduc_plus_scal_v8qi ((int8x8_t) __a);
 }
 
 __extension__ static __inline uint16_t __attribute__ ((__always_inline__))
 vaddv_u16 (uint16x4_t __a)
 {
-  return vget_lane_u16 ((uint16x4_t)
-		__builtin_aarch64_reduc_uplus_v4hi ((int16x4_t) __a),
-		0);
+  return (uint16_t) __builtin_aarch64_reduc_plus_scal_v4hi ((int16x4_t) __a);
 }
 
 __extension__ static __inline uint32_t __attribute__ ((__always_inline__))
 vaddv_u32 (uint32x2_t __a)
 {
-  return vget_lane_u32 ((uint32x2_t)
-		__builtin_aarch64_reduc_uplus_v2si ((int32x2_t) __a),
-		0);
+  return (uint32_t) __builtin_aarch64_reduc_plus_scal_v2si ((int32x2_t) __a);
 }
 
 __extension__ static __inline int8_t __attribute__ ((__always_inline__))
 vaddvq_s8 (int8x16_t __a)
 {
-  return vgetq_lane_s8 (__builtin_aarch64_reduc_splus_v16qi (__a),
-			0);
+  return __builtin_aarch64_reduc_plus_scal_v16qi (__a);
 }
 
 __extension__ static __inline int16_t __attribute__ ((__always_inline__))
 vaddvq_s16 (int16x8_t __a)
 {
-  return vgetq_lane_s16 (__builtin_aarch64_reduc_splus_v8hi (__a), 0);
+  return __builtin_aarch64_reduc_plus_scal_v8hi (__a);
 }
 
 __extension__ static __inline int32_t __attribute__ ((__always_inline__))
 vaddvq_s32 (int32x4_t __a)
 {
-  return vgetq_lane_s32 (__builtin_aarch64_reduc_splus_v4si (__a), 0);
+  return __builtin_aarch64_reduc_plus_scal_v4si (__a);
 }
 
 __extension__ static __inline int64_t __attribute__ ((__always_inline__))
 vaddvq_s64 (int64x2_t __a)
 {
-  return vgetq_lane_s64 (__builtin_aarch64_reduc_splus_v2di (__a), 0);
+  return __builtin_aarch64_reduc_plus_scal_v2di (__a);
 }
 
 __extension__ static __inline uint8_t __attribute__ ((__always_inline__))
 vaddvq_u8 (uint8x16_t __a)
 {
-  return vgetq_lane_u8 ((uint8x16_t)
-		__builtin_aarch64_reduc_uplus_v16qi ((int8x16_t) __a),
-		0);
+  return (uint8_t) __builtin_aarch64_reduc_plus_scal_v16qi ((int8x16_t) __a);
 }
 
 __extension__ static __inline uint16_t __attribute__ ((__always_inline__))
 vaddvq_u16 (uint16x8_t __a)
 {
-  return vgetq_lane_u16 ((uint16x8_t)
-		__builtin_aarch64_reduc_uplus_v8hi ((int16x8_t) __a),
-		0);
+  return (uint16_t) __builtin_aarch64_reduc_plus_scal_v8hi ((int16x8_t) __a);
 }
 
 __extension__ static __inline uint32_t __attribute__ ((__always_inline__))
 vaddvq_u32 (uint32x4_t __a)
 {
-  return vgetq_lane_u32 ((uint32x4_t)
-		__builtin_aarch64_reduc_uplus_v4si ((int32x4_t) __a),
-		0);
+  return (uint32_t) __builtin_aarch64_reduc_plus_scal_v4si ((int32x4_t) __a);
 }
 
 __extension__ static __inline uint64_t __attribute__ ((__always_inline__))
 vaddvq_u64 (uint64x2_t __a)
 {
-  return vgetq_lane_u64 ((uint64x2_t)
-		__builtin_aarch64_reduc_uplus_v2di ((int64x2_t) __a),
-		0);
+  return (uint64_t) __builtin_aarch64_reduc_plus_scal_v2di ((int64x2_t) __a);
 }
 
 __extension__ static __inline float32_t __attribute__ ((__always_inline__))
 vaddv_f32 (float32x2_t __a)
 {
-  float32x2_t __t = __builtin_aarch64_reduc_splus_v2sf (__a);
-  return vget_lane_f32 (__t, 0);
+  return __builtin_aarch64_reduc_plus_scal_v2sf (__a);
 }
 
 __extension__ static __inline float32_t __attribute__ ((__always_inline__))
 vaddvq_f32 (float32x4_t __a)
 {
-  float32x4_t __t = __builtin_aarch64_reduc_splus_v4sf (__a);
-  return vgetq_lane_f32 (__t, 0);
+  return __builtin_aarch64_reduc_plus_scal_v4sf (__a);
 }
 
 __extension__ static __inline float64_t __attribute__ ((__always_inline__))
 vaddvq_f64 (float64x2_t __a)
 {
-  float64x2_t __t = __builtin_aarch64_reduc_splus_v2df (__a);
-  return vgetq_lane_f64 (__t, 0);
+  return __builtin_aarch64_reduc_plus_scal_v2df (__a);
 }
 
 /* vbsl  */
@@ -18875,7 +18857,7 @@ vpadd_u32 (uint32x2_t __a, uint32x2_t __b)
 __extension__ static __inline float64_t __attribute__ ((__always_inline__))
 vpaddd_f64 (float64x2_t __a)
 {
-  return vgetq_lane_f64 (__builtin_aarch64_reduc_splus_v2df (__a), 0);
+  return __builtin_aarch64_reduc_plus_scal_v2df (__a);
 }
 
 __extension__ static __inline int64_t __attribute__ ((__always_inline__))
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index efd006f..74c71fc 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -213,8 +213,7 @@
     UNSPEC_FMINNMV	; Used in aarch64-simd.md.
     UNSPEC_FMINV	; Used in aarch64-simd.md.
     UNSPEC_FADDV	; Used in aarch64-simd.md.
-    UNSPEC_SADDV	; Used in aarch64-simd.md.
-    UNSPEC_UADDV	; Used in aarch64-simd.md.
+    UNSPEC_ADDV		; Used in aarch64-simd.md.
     UNSPEC_SMAXV	; Used in aarch64-simd.md.
     UNSPEC_SMINV	; Used in aarch64-simd.md.
     UNSPEC_UMAXV	; Used in aarch64-simd.md.
@@ -859,8 +858,6 @@
 (define_int_iterator FMAXMINV [UNSPEC_FMAXV UNSPEC_FMINV
 			       UNSPEC_FMAXNMV UNSPEC_FMINNMV])
 
-(define_int_iterator SUADDV [UNSPEC_SADDV UNSPEC_UADDV])
-
 (define_int_iterator HADDSUB [UNSPEC_SHADD UNSPEC_UHADD
 			      UNSPEC_SRHADD UNSPEC_URHADD
 			      UNSPEC_SHSUB UNSPEC_UHSUB
@@ -965,7 +962,6 @@
 		      (UNSPEC_SUBHN2 "") (UNSPEC_RSUBHN2 "r")
 		      (UNSPEC_SQXTN "s") (UNSPEC_UQXTN "u")
 		      (UNSPEC_USQADD "us") (UNSPEC_SUQADD "su")
-		      (UNSPEC_SADDV "s") (UNSPEC_UADDV "u")
 		      (UNSPEC_SSLI  "s") (UNSPEC_USLI  "u")
 		      (UNSPEC_SSRI  "s") (UNSPEC_USRI  "u")
 		      (UNSPEC_USRA  "u") (UNSPEC_SSRA  "s")
commit c15a1d55498eade13cef550b7635e5791bb48d79
Author: Alan Lawrence <alan.lawre...@arm.com>
Date:   Wed Aug 13 17:23:58 2014 +0100

    AArch64: reduc_[us](min|max)_scal_optab, inc _nan variant and __builtins.
    
    Also combine V2F and V4SF variants.

diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index 7e77b2e..4bedb4a 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -217,13 +217,13 @@
   /* Implemented by aarch64_reduc_plus_<mode>.  */
   BUILTIN_VALL (UNOP, reduc_plus_scal_, 10)
 
-  /* Implemented by reduc_<maxmin_uns>_<mode>.  */
-  BUILTIN_VDQIF (UNOP, reduc_smax_, 10)
-  BUILTIN_VDQIF (UNOP, reduc_smin_, 10)
-  BUILTIN_VDQ_BHSI (UNOP, reduc_umax_, 10)
-  BUILTIN_VDQ_BHSI (UNOP, reduc_umin_, 10)
-  BUILTIN_VDQF (UNOP, reduc_smax_nan_, 10)
-  BUILTIN_VDQF (UNOP, reduc_smin_nan_, 10)
+  /* Implemented by reduc_<maxmin_uns>_scal_<mode> (producing scalar).  */
+  BUILTIN_VDQIF (UNOP, reduc_smax_scal_, 10)
+  BUILTIN_VDQIF (UNOP, reduc_smin_scal_, 10)
+  BUILTIN_VDQ_BHSI (UNOPU, reduc_umax_scal_, 10)
+  BUILTIN_VDQ_BHSI (UNOPU, reduc_umin_scal_, 10)
+  BUILTIN_VDQF (UNOP, reduc_smax_nan_scal_, 10)
+  BUILTIN_VDQF (UNOP, reduc_smin_nan_scal_, 10)
 
   /* Implemented by <maxmin><mode>3.
      smax variants map to fmaxnm,
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index a5b9f3d..c17e1ee 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -1847,7 +1847,40 @@
 
 ;; 'across lanes' max and min ops.
 
-(define_insn "reduc_<maxmin_uns>_<mode>"
+;; Template for outputting a scalar, so we can create __builtins which can be
+;; gimple_fold'd to the REDUC_(MAX|MIN)_EXPR tree code.  (This is FP smax/smin).
+(define_expand "reduc_<maxmin_uns>_scal_<mode>"
+  [(match_operand:<VEL> 0 "register_operand")
+   (unspec:VDQF [(match_operand:VDQF 1 "register_operand")]
+		FMAXMINV)]
+  "TARGET_SIMD"
+  {
+    rtx elt = GEN_INT (ENDIAN_LANE_N (<MODE>mode, 0));
+    rtx scratch = gen_reg_rtx (<MODE>mode);
+    emit_insn (gen_aarch64_reduc_<maxmin_uns>_internal<mode> (scratch,
+							      operands[1]));
+    emit_insn (gen_aarch64_get_lane<mode> (operands[0], scratch, elt));
+    DONE;
+  }
+)
+
+;; Likewise for integer cases, signed and unsigned.
+(define_expand "reduc_<maxmin_uns>_scal_<mode>"
+  [(match_operand:<VEL> 0 "register_operand")
+   (unspec:VDQ_BHSI [(match_operand:VDQ_BHSI 1 "register_operand")]
+		    MAXMINV)]
+  "TARGET_SIMD"
+  {
+    rtx elt = GEN_INT (ENDIAN_LANE_N (<MODE>mode, 0));
+    rtx scratch = gen_reg_rtx (<MODE>mode);
+    emit_insn (gen_aarch64_reduc_<maxmin_uns>_internal<mode> (scratch,
+							      operands[1]));
+    emit_insn (gen_aarch64_get_lane<mode> (operands[0], scratch, elt));
+    DONE;
+  }
+)
+
+(define_insn "aarch64_reduc_<maxmin_uns>_internal<mode>"
  [(set (match_operand:VDQV_S 0 "register_operand" "=w")
        (unspec:VDQV_S [(match_operand:VDQV_S 1 "register_operand" "w")]
 		    MAXMINV))]
@@ -1856,7 +1889,7 @@
   [(set_attr "type" "neon_reduc_minmax<q>")]
 )
 
-(define_insn "reduc_<maxmin_uns>_v2si"
+(define_insn "aarch64_reduc_<maxmin_uns>_internalv2si"
  [(set (match_operand:V2SI 0 "register_operand" "=w")
        (unspec:V2SI [(match_operand:V2SI 1 "register_operand" "w")]
 		    MAXMINV))]
@@ -1865,24 +1898,15 @@
   [(set_attr "type" "neon_reduc_minmax")]
 )
 
-(define_insn "reduc_<maxmin_uns>_<mode>"
- [(set (match_operand:V2F 0 "register_operand" "=w")
-       (unspec:V2F [(match_operand:V2F 1 "register_operand" "w")]
+(define_insn "aarch64_reduc_<maxmin_uns>_internal<mode>"
+ [(set (match_operand:VDQF 0 "register_operand" "=w")
+       (unspec:VDQF [(match_operand:VDQF 1 "register_operand" "w")]
 		    FMAXMINV))]
  "TARGET_SIMD"
- "<maxmin_uns_op>p\\t%<Vetype>0, %1.<Vtype>"
+ "<maxmin_uns_op><vp>\\t%<Vetype>0, %1.<Vtype>"
   [(set_attr "type" "neon_fp_reduc_minmax_<Vetype><q>")]
 )
 
-(define_insn "reduc_<maxmin_uns>_v4sf"
- [(set (match_operand:V4SF 0 "register_operand" "=w")
-       (unspec:V4SF [(match_operand:V4SF 1 "register_operand" "w")]
-		    FMAXMINV))]
- "TARGET_SIMD"
- "<maxmin_uns_op>v\\t%s0, %1.4s"
-  [(set_attr "type" "neon_fp_reduc_minmax_s_q")]
-)
-
 ;; aarch64_simd_bsl may compile to any of bsl/bif/bit depending on register
 ;; allocation.
 ;; Operand 1 is the mask, operands 2 and 3 are the bitfields from which
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index d17e7fe..9b0ff30 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -17688,106 +17688,91 @@ vmaxnmq_f64 (float64x2_t __a, float64x2_t __b)
 __extension__ static __inline float32_t __attribute__ ((__always_inline__))
 vmaxv_f32 (float32x2_t __a)
 {
-  return vget_lane_f32 (__builtin_aarch64_reduc_smax_nan_v2sf (__a),
-			0);
+  return __builtin_aarch64_reduc_smax_nan_scal_v2sf (__a);
 }
 
 __extension__ static __inline int8_t __attribute__ ((__always_inline__))
 vmaxv_s8 (int8x8_t __a)
 {
-  return vget_lane_s8 (__builtin_aarch64_reduc_smax_v8qi (__a), 0);
+  return __builtin_aarch64_reduc_smax_scal_v8qi (__a);
 }
 
 __extension__ static __inline int16_t __attribute__ ((__always_inline__))
 vmaxv_s16 (int16x4_t __a)
 {
-  return vget_lane_s16 (__builtin_aarch64_reduc_smax_v4hi (__a), 0);
+  return __builtin_aarch64_reduc_smax_scal_v4hi (__a);
 }
 
 __extension__ static __inline int32_t __attribute__ ((__always_inline__))
 vmaxv_s32 (int32x2_t __a)
 {
-  return vget_lane_s32 (__builtin_aarch64_reduc_smax_v2si (__a), 0);
+  return __builtin_aarch64_reduc_smax_scal_v2si (__a);
 }
 
 __extension__ static __inline uint8_t __attribute__ ((__always_inline__))
 vmaxv_u8 (uint8x8_t __a)
 {
-  return vget_lane_u8 ((uint8x8_t)
-		__builtin_aarch64_reduc_umax_v8qi ((int8x8_t) __a),
-		0);
+  return __builtin_aarch64_reduc_umax_scal_v8qi_uu (__a);
 }
 
 __extension__ static __inline uint16_t __attribute__ ((__always_inline__))
 vmaxv_u16 (uint16x4_t __a)
 {
-  return vget_lane_u16 ((uint16x4_t)
-		__builtin_aarch64_reduc_umax_v4hi ((int16x4_t) __a),
-		0);
+  return __builtin_aarch64_reduc_umax_scal_v4hi_uu (__a);
 }
 
 __extension__ static __inline uint32_t __attribute__ ((__always_inline__))
 vmaxv_u32 (uint32x2_t __a)
 {
-  return vget_lane_u32 ((uint32x2_t)
-		__builtin_aarch64_reduc_umax_v2si ((int32x2_t) __a),
-		0);
+  return __builtin_aarch64_reduc_umax_scal_v2si_uu (__a);
 }
 
 __extension__ static __inline float32_t __attribute__ ((__always_inline__))
 vmaxvq_f32 (float32x4_t __a)
 {
-  return vgetq_lane_f32 (__builtin_aarch64_reduc_smax_nan_v4sf (__a),
-			 0);
+  return __builtin_aarch64_reduc_smax_nan_scal_v4sf (__a);
 }
 
 __extension__ static __inline float64_t __attribute__ ((__always_inline__))
 vmaxvq_f64 (float64x2_t __a)
 {
-  return vgetq_lane_f64 (__builtin_aarch64_reduc_smax_nan_v2df (__a),
-			 0);
+  return __builtin_aarch64_reduc_smax_nan_scal_v2df (__a);
 }
 
 __extension__ static __inline int8_t __attribute__ ((__always_inline__))
 vmaxvq_s8 (int8x16_t __a)
 {
-  return vgetq_lane_s8 (__builtin_aarch64_reduc_smax_v16qi (__a), 0);
+  return __builtin_aarch64_reduc_smax_scal_v16qi (__a);
 }
 
 __extension__ static __inline int16_t __attribute__ ((__always_inline__))
 vmaxvq_s16 (int16x8_t __a)
 {
-  return vgetq_lane_s16 (__builtin_aarch64_reduc_smax_v8hi (__a), 0);
+  return __builtin_aarch64_reduc_smax_scal_v8hi (__a);
 }
 
 __extension__ static __inline int32_t __attribute__ ((__always_inline__))
 vmaxvq_s32 (int32x4_t __a)
 {
-  return vgetq_lane_s32 (__builtin_aarch64_reduc_smax_v4si (__a), 0);
+  return __builtin_aarch64_reduc_smax_scal_v4si (__a);
 }
 
 __extension__ static __inline uint8_t __attribute__ ((__always_inline__))
 vmaxvq_u8 (uint8x16_t __a)
 {
-  return vgetq_lane_u8 ((uint8x16_t)
-		__builtin_aarch64_reduc_umax_v16qi ((int8x16_t) __a),
-		0);
+  return __builtin_aarch64_reduc_umax_scal_v16qi_uu (__a);
 }
 
 __extension__ static __inline uint16_t __attribute__ ((__always_inline__))
 vmaxvq_u16 (uint16x8_t __a)
 {
-  return vgetq_lane_u16 ((uint16x8_t)
-		__builtin_aarch64_reduc_umax_v8hi ((int16x8_t) __a),
-		0);
+  return __builtin_aarch64_reduc_umax_scal_v8hi_uu (__a);
 }
 
 __extension__ static __inline uint32_t __attribute__ ((__always_inline__))
 vmaxvq_u32 (uint32x4_t __a)
 {
-  return vgetq_lane_u32 ((uint32x4_t)
-		__builtin_aarch64_reduc_umax_v4si ((int32x4_t) __a),
-		0);
+  return __builtin_aarch64_reduc_umax_scal_v4si_uu (__a);
 }
 
 /* vmaxnmv  */
@@ -17795,20 +17780,19 @@ vmaxvq_u32 (uint32x4_t __a)
 __extension__ static __inline float32_t __attribute__ ((__always_inline__))
 vmaxnmv_f32 (float32x2_t __a)
 {
-  return vget_lane_f32 (__builtin_aarch64_reduc_smax_v2sf (__a),
-			0);
+  return __builtin_aarch64_reduc_smax_scal_v2sf (__a);
 }
 
 __extension__ static __inline float32_t __attribute__ ((__always_inline__))
 vmaxnmvq_f32 (float32x4_t __a)
 {
-  return vgetq_lane_f32 (__builtin_aarch64_reduc_smax_v4sf (__a), 0);
+  return __builtin_aarch64_reduc_smax_scal_v4sf (__a);
 }
 
 __extension__ static __inline float64_t __attribute__ ((__always_inline__))
 vmaxnmvq_f64 (float64x2_t __a)
 {
-  return vgetq_lane_f64 (__builtin_aarch64_reduc_smax_v2df (__a), 0);
+  return __builtin_aarch64_reduc_smax_scal_v2df (__a);
 }
 
 /* vmin  */
@@ -17934,107 +17918,91 @@ vminnmq_f64 (float64x2_t __a, float64x2_t __b)
 __extension__ static __inline float32_t __attribute__ ((__always_inline__))
 vminv_f32 (float32x2_t __a)
 {
-  return vget_lane_f32 (__builtin_aarch64_reduc_smin_nan_v2sf (__a),
-			0);
+  return __builtin_aarch64_reduc_smin_nan_scal_v2sf (__a);
 }
 
 __extension__ static __inline int8_t __attribute__ ((__always_inline__))
 vminv_s8 (int8x8_t __a)
 {
-  return vget_lane_s8 (__builtin_aarch64_reduc_smin_v8qi (__a),
-		       0);
+  return __builtin_aarch64_reduc_smin_scal_v8qi (__a);
 }
 
 __extension__ static __inline int16_t __attribute__ ((__always_inline__))
 vminv_s16 (int16x4_t __a)
 {
-  return vget_lane_s16 (__builtin_aarch64_reduc_smin_v4hi (__a), 0);
+  return __builtin_aarch64_reduc_smin_scal_v4hi (__a);
 }
 
 __extension__ static __inline int32_t __attribute__ ((__always_inline__))
 vminv_s32 (int32x2_t __a)
 {
-  return vget_lane_s32 (__builtin_aarch64_reduc_smin_v2si (__a), 0);
+  return __builtin_aarch64_reduc_smin_scal_v2si (__a);
 }
 
 __extension__ static __inline uint8_t __attribute__ ((__always_inline__))
 vminv_u8 (uint8x8_t __a)
 {
-  return vget_lane_u8 ((uint8x8_t)
-		__builtin_aarch64_reduc_umin_v8qi ((int8x8_t) __a),
-		0);
+  return __builtin_aarch64_reduc_umin_scal_v8qi_uu (__a);
 }
 
 __extension__ static __inline uint16_t __attribute__ ((__always_inline__))
 vminv_u16 (uint16x4_t __a)
 {
-  return vget_lane_u16 ((uint16x4_t)
-		__builtin_aarch64_reduc_umin_v4hi ((int16x4_t) __a),
-		0);
+  return __builtin_aarch64_reduc_umin_scal_v4hi_uu (__a);
 }
 
 __extension__ static __inline uint32_t __attribute__ ((__always_inline__))
 vminv_u32 (uint32x2_t __a)
 {
-  return vget_lane_u32 ((uint32x2_t)
-		__builtin_aarch64_reduc_umin_v2si ((int32x2_t) __a),
-		0);
+  return __builtin_aarch64_reduc_umin_scal_v2si_uu (__a);
 }
 
 __extension__ static __inline float32_t __attribute__ ((__always_inline__))
 vminvq_f32 (float32x4_t __a)
 {
-  return vgetq_lane_f32 (__builtin_aarch64_reduc_smin_nan_v4sf (__a),
-			 0);
+  return __builtin_aarch64_reduc_smin_nan_scal_v4sf (__a);
 }
 
 __extension__ static __inline float64_t __attribute__ ((__always_inline__))
 vminvq_f64 (float64x2_t __a)
 {
-  return vgetq_lane_f64 (__builtin_aarch64_reduc_smin_nan_v2df (__a),
-			 0);
+  return __builtin_aarch64_reduc_smin_nan_scal_v2df (__a);
 }
 
 __extension__ static __inline int8_t __attribute__ ((__always_inline__))
 vminvq_s8 (int8x16_t __a)
 {
-  return vgetq_lane_s8 (__builtin_aarch64_reduc_smin_v16qi (__a), 0);
+  return __builtin_aarch64_reduc_smin_scal_v16qi (__a);
 }
 
 __extension__ static __inline int16_t __attribute__ ((__always_inline__))
 vminvq_s16 (int16x8_t __a)
 {
-  return vgetq_lane_s16 (__builtin_aarch64_reduc_smin_v8hi (__a), 0);
+  return __builtin_aarch64_reduc_smin_scal_v8hi (__a);
 }
 
 __extension__ static __inline int32_t __attribute__ ((__always_inline__))
 vminvq_s32 (int32x4_t __a)
 {
-  return vgetq_lane_s32 (__builtin_aarch64_reduc_smin_v4si (__a), 0);
+  return __builtin_aarch64_reduc_smin_scal_v4si (__a);
 }
 
 __extension__ static __inline uint8_t __attribute__ ((__always_inline__))
 vminvq_u8 (uint8x16_t __a)
 {
-  return vgetq_lane_u8 ((uint8x16_t)
-		__builtin_aarch64_reduc_umin_v16qi ((int8x16_t) __a),
-		0);
+  return __builtin_aarch64_reduc_umin_scal_v16qi_uu (__a);
 }
 
 __extension__ static __inline uint16_t __attribute__ ((__always_inline__))
 vminvq_u16 (uint16x8_t __a)
 {
-  return vgetq_lane_u16 ((uint16x8_t)
-		__builtin_aarch64_reduc_umin_v8hi ((int16x8_t) __a),
-		0);
+  return __builtin_aarch64_reduc_umin_scal_v8hi_uu (__a);
 }
 
 __extension__ static __inline uint32_t __attribute__ ((__always_inline__))
 vminvq_u32 (uint32x4_t __a)
 {
-  return vgetq_lane_u32 ((uint32x4_t)
-		__builtin_aarch64_reduc_umin_v4si ((int32x4_t) __a),
-		0);
+  return __builtin_aarch64_reduc_umin_scal_v4si_uu (__a);
 }
 
 /* vminnmv  */
@@ -18042,19 +18010,19 @@ vminvq_u32 (uint32x4_t __a)
 __extension__ static __inline float32_t __attribute__ ((__always_inline__))
 vminnmv_f32 (float32x2_t __a)
 {
-  return vget_lane_f32 (__builtin_aarch64_reduc_smin_v2sf (__a), 0);
+  return __builtin_aarch64_reduc_smin_scal_v2sf (__a);
 }
 
 __extension__ static __inline float32_t __attribute__ ((__always_inline__))
 vminnmvq_f32 (float32x4_t __a)
 {
-  return vgetq_lane_f32 (__builtin_aarch64_reduc_smin_v4sf (__a), 0);
+  return __builtin_aarch64_reduc_smin_scal_v4sf (__a);
 }
 
 __extension__ static __inline float64_t __attribute__ ((__always_inline__))
 vminnmvq_f64 (float64x2_t __a)
 {
-  return vgetq_lane_f64 (__builtin_aarch64_reduc_smin_v2df (__a), 0);
+  return __builtin_aarch64_reduc_smin_scal_v2df (__a);
 }
 
 /* vmla */
commit 846d5932041e04bbf386efbc739aee9749051bc7
Author: Alan Lawrence <alan.lawre...@arm.com>
Date:   Wed Aug 13 17:25:13 2014 +0100

    AArch64: Reintroduce gimple_fold for min+max+plus

diff --git a/gcc/config/aarch64/aarch64-builtins.c b/gcc/config/aarch64/aarch64-builtins.c
index a49da89..283469b 100644
--- a/gcc/config/aarch64/aarch64-builtins.c
+++ b/gcc/config/aarch64/aarch64-builtins.c
@@ -1188,9 +1188,6 @@ aarch64_fold_builtin (tree fndecl, int n_args ATTRIBUTE_UNUSED, tree *args,
   return NULL_TREE;
 }
 
-/* Handling of reduction operations temporarily removed so as to decouple
-   changes to tree codes from AArch64 NEON Intrinsics.  */
-#if 0
 bool
 aarch64_gimple_fold_builtin (gimple_stmt_iterator *gsi)
 {
@@ -1200,19 +1197,6 @@ aarch64_gimple_fold_builtin (gimple_stmt_iterator *gsi)
   tree fndecl;
   gimple new_stmt = NULL;
 
-  /* The operations folded below are reduction operations.  These are
-     defined to leave their result in the 0'th element (from the perspective
-     of GCC).  The architectural instruction we are folding will leave the
-     result in the 0'th element (from the perspective of the architecture).
-     For big-endian systems, these perspectives are not aligned.
-
-     It is therefore wrong to perform this fold on big-endian.  There
-     are some tricks we could play with shuffling, but the mid-end is
-     inconsistent in the way it treats reduction operations, so we will
-     end up in difficulty.  Until we fix the ambiguity - just bail out.  */
-  if (BYTES_BIG_ENDIAN)
-    return false;
-
   if (call)
     {
       fndecl = gimple_call_fndecl (stmt);
@@ -1224,23 +1208,28 @@ aarch64_gimple_fold_builtin (gimple_stmt_iterator *gsi)
 			? gimple_call_arg_ptr (stmt, 0)
 			: &error_mark_node);
 
+	  /* We use gimple's REDUC_(PLUS|MIN|MAX)_EXPRs for float, signed int
+	     and unsigned int; it will distinguish according to the types of
+	     the arguments to the __builtin.  */
 	  switch (fcode)
 	    {
-	      BUILTIN_VALL (UNOP, reduc_splus_, 10)
-		new_stmt = gimple_build_assign_with_ops (
+	      BUILTIN_VALL (UNOP, reduc_plus_scal_, 10)
+	        new_stmt = gimple_build_assign_with_ops (
 						REDUC_PLUS_EXPR,
 						gimple_call_lhs (stmt),
 						args[0],
 						NULL_TREE);
 		break;
-	      BUILTIN_VDQIF (UNOP, reduc_smax_, 10)
+	      BUILTIN_VDQIF (UNOP, reduc_smax_scal_, 10)
+	      BUILTIN_VDQ_BHSI (UNOPU, reduc_umax_scal_, 10)
 		new_stmt = gimple_build_assign_with_ops (
 						REDUC_MAX_EXPR,
 						gimple_call_lhs (stmt),
 						args[0],
 						NULL_TREE);
 		break;
-	      BUILTIN_VDQIF (UNOP, reduc_smin_, 10)
+	      BUILTIN_VDQIF (UNOP, reduc_smin_scal_, 10)
+	      BUILTIN_VDQ_BHSI (UNOPU, reduc_umin_scal_, 10)
 		new_stmt = gimple_build_assign_with_ops (
 						REDUC_MIN_EXPR,
 						gimple_call_lhs (stmt),
@@ -1262,7 +1251,6 @@ aarch64_gimple_fold_builtin (gimple_stmt_iterator *gsi)
 
   return changed;
 }
-#endif
 
 void
 aarch64_atomic_assign_expand_fenv (tree *hold, tree *clear, tree *update)
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 27d82f3..db5ff59 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -10015,8 +10015,8 @@ aarch64_asan_shadow_offset (void)
 #undef TARGET_FRAME_POINTER_REQUIRED
 #define TARGET_FRAME_POINTER_REQUIRED aarch64_frame_pointer_required
 
-//#undef TARGET_GIMPLE_FOLD_BUILTIN
-//#define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
+#undef TARGET_GIMPLE_FOLD_BUILTIN
+#define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
 
 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
