This patch uses IFN_COND_* to vectorise conditionally-executed,
potentially-trapping arithmetic, such as most floating-point
ops with -ftrapping-math.  E.g.:

    if (cond) { ... x = a + b; ... }

becomes:

    ...
    x = IFN_COND_ADD (cond, a, b);
    ...

When this transformation is done on its own, the value of x for
!cond isn't important.

However, the patch also looks for the equivalent of:

    y = cond ? x : a;

in which the "then" value is the result of the conditionally-executed
operation and the "else" value is the first operand of that operation.
Since this "else" value is exactly the one that IFN_COND_* guarantees
for a false condition, y is always equal to x and we can replace y with x.
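
To sketch the combined effect (reusing the names from the example above):

    x = IFN_COND_ADD (cond, a, b);
    y = cond ? x : a;

becomes:

    x = IFN_COND_ADD (cond, a, b);
    y = x;

after which the COND_EXPR can be removed as dead code.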

The patch also adds new conditional functions for multiplication,
division and modulus, which previously weren't needed.  This enables an
extra fully-masked reduction (of dubious value) in gcc.dg/vect/pr53773.c.
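
For example, a loop like the following (modelled on the new
cond_arith_1.c test; the function name is just for illustration):

    /* Same shape as the loops in the new cond_arith tests.  */
    void
    f (int *__restrict x, int *__restrict y, int *__restrict z,
       int *__restrict pred, int n)
    {
      for (int i = 0; i < n; ++i)
        x[i] = pred[i] != 1 ? y[i] / z[i] : y[i];
    }

can now be if-converted to use IFN_COND_DIV and vectorised for SVE as a
predicated SDIV, with no separate SEL needed for the "else" value.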

Tested on aarch64-linux-gnu (with and without SVE), aarch64_be-elf
and x86_64-linux-gnu.  OK to install?

Richard


2018-05-16  Richard Sandiford  <richard.sandif...@linaro.org>

gcc/
        * internal-fn.def (IFN_COND_MUL, IFN_COND_DIV, IFN_COND_MOD): New
        internal functions.
        * internal-fn.h (vectorized_internal_fn_supported_p): Declare.
        * internal-fn.c (FOR_EACH_CODE_MAPPING): Handle IFN_COND_MUL,
        IFN_COND_DIV and IFN_COND_MOD.
        (get_conditional_internal_fn): Handle RDIV_EXPR.
        (can_interpret_as_conditional_op_p): Use RDIV_EXPR for floating-point
        divisions.
        (internal_fn_mask_index): Handle conditional internal functions.
        (vectorized_internal_fn_supported_p): New function.
        * optabs.def (cond_smul_optab, cond_sdiv_optab, cond_smod_optab)
        (cond_udiv_optab, cond_umod_optab): New optabs.
        * tree-if-conv.c: Include internal-fn.h.
        (any_pred_load_store): Replace with...
        (need_to_predicate): ...this new variable.
        (redundant_ssa_names): New variable.
        (ifcvt_can_use_mask_load_store): Move initial checks to...
        (ifcvt_can_predicate): ...this new function.  Handle tree codes
        for which a conditional internal function exists.
        (if_convertible_gimple_assign_stmt_p): Use ifcvt_can_predicate
        instead of ifcvt_can_use_mask_load_store.  Update after variable
        name change.
        (predicate_load_or_store): New function, split out from
        predicate_mem_writes.
        (check_redundant_cond_expr, predicate_rhs_code): New functions.
        (predicate_mem_writes): Rename to...
        (predicate_statements): ...this.  Use predicate_load_or_store
        and predicate_rhs_code.
        (combine_blocks, tree_if_conversion): Update after above name changes.
        (ifcvt_local_dce): Handle redundant_ssa_names.
        * tree-vect-patterns.c (vect_recog_mask_conversion_pattern): Handle
        general conditional functions.
        * tree-vect-stmts.c (vectorizable_call): Likewise.
        * config/aarch64/aarch64-sve.md (cond_<optab><mode>): New pattern
        for SVE_COND_INT2_SD_OP.
        * config/aarch64/iterators.md (UNSPEC_COND_MUL, UNSPEC_COND_SDIV)
        (UNSPEC_COND_UDIV): New unspecs.
        (SVE_COND_INT2_OP): Include UNSPEC_COND_MUL.
        (SVE_COND_INT2_SD_OP): New int iterator.
        (SVE_COND_FP2_OP): Include UNSPEC_COND_MUL and UNSPEC_COND_SDIV.
        (optab, sve_int_op): Handle UNSPEC_COND_MUL, UNSPEC_COND_SDIV
        and UNSPEC_COND_UDIV.
        (sve_fp_op): Handle UNSPEC_COND_MUL and UNSPEC_COND_SDIV.

gcc/testsuite/
        * gcc.dg/vect/pr53773.c: Do not expect a scalar tail when using
        fully-masked loops with a fixed vector length.
        * gcc.target/aarch64/sve/cond_arith_1.c: New test.
        * gcc.target/aarch64/sve/cond_arith_1_run.c: Likewise.
        * gcc.target/aarch64/sve/cond_arith_2.c: Likewise.
        * gcc.target/aarch64/sve/cond_arith_2_run.c: Likewise.
        * gcc.target/aarch64/sve/cond_arith_3.c: Likewise.
        * gcc.target/aarch64/sve/cond_arith_3_run.c: Likewise.

Index: gcc/internal-fn.def
===================================================================
--- gcc/internal-fn.def 2018-05-16 11:06:14.191592902 +0100
+++ gcc/internal-fn.def 2018-05-16 11:06:14.513574219 +0100
@@ -149,6 +149,11 @@ DEF_INTERNAL_OPTAB_FN (COND_FNMA_REV, EC
 
 DEF_INTERNAL_OPTAB_FN (COND_ADD, ECF_CONST, cond_add, cond_binary)
 DEF_INTERNAL_OPTAB_FN (COND_SUB, ECF_CONST, cond_sub, cond_binary)
+DEF_INTERNAL_OPTAB_FN (COND_MUL, ECF_CONST, cond_smul, cond_binary)
+DEF_INTERNAL_SIGNED_OPTAB_FN (COND_DIV, ECF_CONST, first,
+                             cond_sdiv, cond_udiv, cond_binary)
+DEF_INTERNAL_SIGNED_OPTAB_FN (COND_MOD, ECF_CONST, first,
+                             cond_smod, cond_umod, cond_binary)
 DEF_INTERNAL_SIGNED_OPTAB_FN (COND_MIN, ECF_CONST, first,
                              cond_smin, cond_umin, cond_binary)
 DEF_INTERNAL_SIGNED_OPTAB_FN (COND_MAX, ECF_CONST, first,
Index: gcc/internal-fn.h
===================================================================
--- gcc/internal-fn.h   2018-05-16 11:06:14.191592902 +0100
+++ gcc/internal-fn.h   2018-05-16 11:06:14.513574219 +0100
@@ -206,4 +206,6 @@ extern void expand_internal_call (gcall
 extern void expand_internal_call (internal_fn, gcall *);
 extern void expand_PHI (internal_fn, gcall *);
 
+extern bool vectorized_internal_fn_supported_p (internal_fn, tree);
+
 #endif
Index: gcc/internal-fn.c
===================================================================
--- gcc/internal-fn.c   2018-05-16 11:06:14.191592902 +0100
+++ gcc/internal-fn.c   2018-05-16 11:06:14.513574219 +0100
@@ -3208,6 +3208,9 @@ #define DEF_INTERNAL_FN(CODE, FLAGS, FNS
 #define FOR_EACH_CODE_MAPPING(T) \
   T (PLUS_EXPR, IFN_COND_ADD) \
   T (MINUS_EXPR, IFN_COND_SUB) \
+  T (MULT_EXPR, IFN_COND_MUL) \
+  T (TRUNC_DIV_EXPR, IFN_COND_DIV) \
+  T (TRUNC_MOD_EXPR, IFN_COND_MOD) \
   T (MIN_EXPR, IFN_COND_MIN) \
   T (MAX_EXPR, IFN_COND_MAX) \
   T (BIT_AND_EXPR, IFN_COND_AND) \
@@ -3229,13 +3232,16 @@ get_conditional_internal_fn (tree_code c
 #define CASE(CODE, IFN) case CODE: return IFN;
       FOR_EACH_CODE_MAPPING(CASE)
 #undef CASE
+    case RDIV_EXPR:
+      return IFN_COND_DIV;
     default:
       return IFN_LAST;
     }
 }
 
 /* If IFN implements the conditional form of a tree code, return that
-   tree code, otherwise return ERROR_MARK.  */
+   tree code, otherwise return ERROR_MARK.  If the codes for integer
+   and floating-point operations are different, return the integer one.  */
 
 static tree_code
 conditional_internal_fn_code (internal_fn ifn)
@@ -3285,13 +3291,19 @@ can_interpret_as_conditional_op_p (gimpl
        tree_code code = conditional_internal_fn_code (ifn);
        if (code != ERROR_MARK)
          {
-           *code_out = code;
            *cond_out = gimple_call_arg (call, 0);
            if (integer_truep (*cond_out))
              *cond_out = NULL_TREE;
            unsigned int nargs = gimple_call_num_args (call) - 1;
            for (unsigned int i = 0; i < 3; ++i)
              ops[i] = i < nargs ? gimple_call_arg (call, i + 1) : NULL_TREE;
+
+           /* CODE is set for integer operations.  Adjust it if
+              floating-point ones are different.  */
+           if (code == TRUNC_DIV_EXPR && FLOAT_TYPE_P (TREE_TYPE (ops[0])))
+             code = RDIV_EXPR;
+
+           *code_out = code;
            return true;
          }
       }
@@ -3362,6 +3374,10 @@ internal_fn_mask_index (internal_fn fn)
 {
   switch (fn)
     {
+    case IFN_COND_FMA_REV:
+    case IFN_COND_FNMA_REV:
+      return 0;
+
     case IFN_MASK_LOAD:
     case IFN_MASK_LOAD_LANES:
     case IFN_MASK_STORE:
@@ -3375,7 +3391,7 @@ internal_fn_mask_index (internal_fn fn)
       return 4;
 
     default:
-      return -1;
+      return conditional_internal_fn_code (fn) != ERROR_MARK ? 0 : -1;
     }
 }
 
@@ -3440,6 +3456,26 @@ expand_internal_call (gcall *stmt)
   expand_internal_call (gimple_call_internal_fn (stmt), stmt);
 }
 
+/* If TYPE is a vector type, return true if IFN is a direct internal
+   function that is supported for that type.  If TYPE is a scalar type,
+   return true if IFN is a direct internal function that is supported for
+   the target's preferred vector version of TYPE.  */
+
+bool
+vectorized_internal_fn_supported_p (internal_fn ifn, tree type)
+{
+  scalar_mode smode;
+  if (!VECTOR_TYPE_P (type) && is_a <scalar_mode> (TYPE_MODE (type), &smode))
+    {
+      machine_mode vmode = targetm.vectorize.preferred_simd_mode (smode);
+      if (VECTOR_MODE_P (vmode))
+       type = build_vector_type_for_mode (type, vmode);
+    }
+
+  return (VECTOR_MODE_P (TYPE_MODE (type))
+         && direct_internal_fn_supported_p (ifn, type, OPTIMIZE_FOR_SPEED));
+}
+
 void
 expand_PHI (internal_fn, gcall *)
 {
Index: gcc/optabs.def
===================================================================
--- gcc/optabs.def      2018-05-16 11:06:14.191592902 +0100
+++ gcc/optabs.def      2018-05-16 11:06:14.513574219 +0100
@@ -222,8 +222,13 @@ OPTAB_D (notcc_optab, "not$acc")
 OPTAB_D (movcc_optab, "mov$acc")
 OPTAB_D (cond_add_optab, "cond_add$a")
 OPTAB_D (cond_sub_optab, "cond_sub$a")
+OPTAB_D (cond_smul_optab, "cond_mul$a")
 OPTAB_D (cond_fma_rev_optab, "cond_fma_rev$a")
 OPTAB_D (cond_fnma_rev_optab, "cond_fnma_rev$a")
+OPTAB_D (cond_sdiv_optab, "cond_div$a")
+OPTAB_D (cond_smod_optab, "cond_mod$a")
+OPTAB_D (cond_udiv_optab, "cond_udiv$a")
+OPTAB_D (cond_umod_optab, "cond_umod$a")
 OPTAB_D (cond_and_optab, "cond_and$a")
 OPTAB_D (cond_ior_optab, "cond_ior$a")
 OPTAB_D (cond_xor_optab, "cond_xor$a")
Index: gcc/tree-if-conv.c
===================================================================
--- gcc/tree-if-conv.c  2018-05-16 11:06:14.191592902 +0100
+++ gcc/tree-if-conv.c  2018-05-16 11:06:14.517573987 +0100
@@ -116,15 +116,18 @@ Software Foundation; either version 3, o
 #include "builtins.h"
 #include "params.h"
 #include "cfganal.h"
+#include "internal-fn.h"
 
 /* Only handle PHIs with no more arguments unless we are asked to by
    simd pragma.  */
 #define MAX_PHI_ARG_NUM \
   ((unsigned) PARAM_VALUE (PARAM_MAX_TREE_IF_CONVERSION_PHI_ARGS))
 
-/* Indicate if new load/store that needs to be predicated is introduced
-   during if conversion.  */
-static bool any_pred_load_store;
+/* True if we've converted a statement that was only executed when some
+   condition C was true, and if for correctness we need to predicate the
+   statement to ensure that it is a no-op when C is false.  See
+   predicate_statements for the kinds of predication we support.  */
+static bool need_to_predicate;
 
 /* Indicate if there are any complicated PHIs that need to be handled in
    if-conversion.  Complicated PHI has more than two arguments and can't
@@ -193,6 +196,9 @@ innermost_loop_behavior_hash::equal (con
 /* Hash table to store <base reference, DR> pairs.  */
 static hash_map<tree_operand_hash, data_reference_p> *baseref_DR_map;
 
+/* List of redundant SSA names: the first should be replaced by the second.  */
+static vec< std::pair<tree, tree> > redundant_ssa_names;
+
 /* Structure used to predicate basic blocks.  This is attached to the
    ->aux field of the BBs in the loop to be if-converted.  */
 struct bb_predicate {
@@ -919,19 +925,10 @@ ifcvt_memrefs_wont_trap (gimple *stmt, v
 static bool
 ifcvt_can_use_mask_load_store (gimple *stmt)
 {
-  tree lhs, ref;
-  machine_mode mode;
-  basic_block bb = gimple_bb (stmt);
-  bool is_load;
-
-  if (!(flag_tree_loop_vectorize || bb->loop_father->force_vectorize)
-      || bb->loop_father->dont_vectorize
-      || !gimple_assign_single_p (stmt)
-      || gimple_has_volatile_ops (stmt))
-    return false;
-
   /* Check whether this is a load or store.  */
-  lhs = gimple_assign_lhs (stmt);
+  tree lhs = gimple_assign_lhs (stmt);
+  bool is_load;
+  tree ref;
   if (gimple_store_p (stmt))
     {
       if (!is_gimple_val (gimple_assign_rhs1 (stmt)))
@@ -952,7 +949,7 @@ ifcvt_can_use_mask_load_store (gimple *s
 
   /* Mask should be integer mode of the same size as the load/store
      mode.  */
-  mode = TYPE_MODE (TREE_TYPE (lhs));
+  machine_mode mode = TYPE_MODE (TREE_TYPE (lhs));
   if (!int_mode_for_mode (mode).exists () || VECTOR_MODE_P (mode))
     return false;
 
@@ -962,6 +959,32 @@ ifcvt_can_use_mask_load_store (gimple *s
   return false;
 }
 
+/* Return true if STMT could be converted from an operation that is
+   unconditional to one that is conditional on a bb predicate mask.  */
+
+static bool
+ifcvt_can_predicate (gimple *stmt)
+{
+  basic_block bb = gimple_bb (stmt);
+
+  if (!(flag_tree_loop_vectorize || bb->loop_father->force_vectorize)
+      || bb->loop_father->dont_vectorize
+      || gimple_has_volatile_ops (stmt))
+    return false;
+
+  if (gimple_assign_single_p (stmt))
+    return ifcvt_can_use_mask_load_store (stmt);
+
+  tree_code code = gimple_assign_rhs_code (stmt);
+  tree lhs_type = TREE_TYPE (gimple_assign_lhs (stmt));
+  tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
+  if (!types_compatible_p (lhs_type, rhs_type))
+    return false;
+  internal_fn cond_fn = get_conditional_internal_fn (code);
+  return (cond_fn != IFN_LAST
+         && vectorized_internal_fn_supported_p (cond_fn, lhs_type));
+}
+
 /* Return true when STMT is if-convertible.
 
    GIMPLE_ASSIGN statement is not if-convertible if,
@@ -1006,10 +1029,10 @@ if_convertible_gimple_assign_stmt_p (gim
        || ! ifcvt_memrefs_wont_trap (stmt, refs))
       && gimple_could_trap_p (stmt))
     {
-      if (ifcvt_can_use_mask_load_store (stmt))
+      if (ifcvt_can_predicate (stmt))
        {
          gimple_set_plf (stmt, GF_PLF_2, true);
-         any_pred_load_store = true;
+         need_to_predicate = true;
          return true;
        }
       if (dump_file && (dump_flags & TDF_DETAILS))
@@ -1020,7 +1043,7 @@ if_convertible_gimple_assign_stmt_p (gim
   /* When if-converting stores force versioning, likewise if we
      ended up generating store data races.  */
   if (gimple_vdef (stmt))
-    any_pred_load_store = true;
+    need_to_predicate = true;
 
   return true;
 }
@@ -2052,7 +2075,7 @@ insert_gimplified_predicates (loop_p loo
       stmts = bb_predicate_gimplified_stmts (bb);
       if (stmts)
        {
-         if (any_pred_load_store)
+         if (need_to_predicate)
            {
              /* Insert the predicate of the BB just after the label,
                 as the if-conversion of memory writes will use this
@@ -2080,7 +2103,7 @@ insert_gimplified_predicates (loop_p loo
     }
 }
 
-/* Helper function for predicate_mem_writes. Returns index of existent
+/* Helper function for predicate_statements. Returns index of existent
    mask if it was created for given SIZE and -1 otherwise.  */
 
 static int
@@ -2094,6 +2117,126 @@ mask_exists (int size, vec<int> vec)
   return -1;
 }
 
+/* Helper function for predicate_statements.  STMT is a memory read or
+   write and it needs to be predicated by MASK.  Return a statement
+   that does so.  */
+
+static gimple *
+predicate_load_or_store (gimple_stmt_iterator *gsi, gassign *stmt, tree mask)
+{
+  gcall *new_stmt;
+
+  tree lhs = gimple_assign_lhs (stmt);
+  tree rhs = gimple_assign_rhs1 (stmt);
+  tree ref = TREE_CODE (lhs) == SSA_NAME ? rhs : lhs;
+  mark_addressable (ref);
+  tree addr = force_gimple_operand_gsi (gsi, build_fold_addr_expr (ref),
+                                       true, NULL_TREE, true, GSI_SAME_STMT);
+  tree ptr = build_int_cst (reference_alias_ptr_type (ref),
+                           get_object_alignment (ref));
+  /* Copy points-to info if possible.  */
+  if (TREE_CODE (addr) == SSA_NAME && !SSA_NAME_PTR_INFO (addr))
+    copy_ref_info (build2 (MEM_REF, TREE_TYPE (ref), addr, ptr),
+                  ref);
+  if (TREE_CODE (lhs) == SSA_NAME)
+    {
+      new_stmt
+       = gimple_build_call_internal (IFN_MASK_LOAD, 3, addr,
+                                     ptr, mask);
+      gimple_call_set_lhs (new_stmt, lhs);
+      gimple_set_vuse (new_stmt, gimple_vuse (stmt));
+    }
+  else
+    {
+      new_stmt
+       = gimple_build_call_internal (IFN_MASK_STORE, 4, addr, ptr,
+                                     mask, rhs);
+      gimple_set_vuse (new_stmt, gimple_vuse (stmt));
+      gimple_set_vdef (new_stmt, gimple_vdef (stmt));
+      SSA_NAME_DEF_STMT (gimple_vdef (new_stmt)) = new_stmt;
+    }
+  gimple_call_set_nothrow (new_stmt, true);
+  return new_stmt;
+}
+
+/* STMT uses OP_LHS.  Check whether it has the form:
+
+     ... = OP_MASK ? OP_LHS : X;
+
+   Return X if so, otherwise return null.  OP_MASK is an SSA_NAME that is
+   known to have value OP_COND.  */
+
+static tree
+check_redundant_cond_expr (gimple *stmt, tree op_mask, tree op_cond,
+                          tree op_lhs)
+{
+  gassign *assign = dyn_cast <gassign *> (stmt);
+  if (!assign || gimple_assign_rhs_code (assign) != COND_EXPR)
+    return NULL_TREE;
+
+  tree use_cond = gimple_assign_rhs1 (assign);
+  tree if_true = gimple_assign_rhs2 (assign);
+  tree if_false = gimple_assign_rhs3 (assign);
+
+  if ((use_cond == op_mask || operand_equal_p (use_cond, op_cond, 0))
+      && if_true == op_lhs)
+    return if_false;
+
+  return NULL_TREE;
+}
+
+/* Helper function for predicate_statements.  STMT is a potentially-trapping
+   arithmetic operation that needs to be predicated by MASK, an SSA_NAME that
+   has value COND.  Return a statement that does so.  */
+
+static gimple *
+predicate_rhs_code (gassign *stmt, tree mask, tree cond)
+{
+  tree lhs = gimple_assign_lhs (stmt);
+  tree_code code = gimple_assign_rhs_code (stmt);
+  unsigned int nops = gimple_num_ops (stmt);
+
+  /* Construct the arguments to the conditional internal function.   */
+  auto_vec<tree, 8> args;
+  args.safe_grow (nops);
+  args[0] = mask;
+  for (unsigned int i = 1; i < nops; ++i)
+    args[i] = gimple_op (stmt, i);
+
+  /* Look for uses of the result to see whether they are COND_EXPRs that can
+     be folded into the conditional call, swapping arguments 1 and 2 if
+     necessary.  */
+  imm_use_iterator imm_iter;
+  gimple *use_stmt;
+  bool can_swap_p = commutative_tree_code (code);
+  FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
+    {
+      tree which = check_redundant_cond_expr (use_stmt, mask, cond, lhs);
+      if (can_swap_p && which == args[2])
+       std::swap (args[1], args[2]);
+      if (which == args[1])
+       {
+         /* We have:
+
+              LHS = IFN_COND (MASK, ARGS[1], ...);
+              X = MASK ? LHS : ARGS[1];
+
+            which makes X equivalent to LHS.  */
+         tree use_lhs = gimple_assign_lhs (use_stmt);
+         redundant_ssa_names.safe_push (std::make_pair (use_lhs, lhs));
+         can_swap_p = false;
+       }
+    }
+
+  /* Create and insert the call.  */
+  internal_fn cond_fn = get_conditional_internal_fn (code);
+  gcall *new_stmt = gimple_build_call_internal_vec (cond_fn, args);
+  gimple_call_set_lhs (new_stmt, lhs);
+  gimple_call_set_nothrow (new_stmt, true);
+
+  return new_stmt;
+}
+
 /* Predicate each write to memory in LOOP.
 
    This function transforms control flow constructs containing memory
@@ -2158,7 +2301,7 @@ mask_exists (int size, vec<int> vec)
    |   goto bb_1
    | end_bb_4
 
-   predicate_mem_writes is then predicating the memory write as follows:
+   predicate_statements is then predicating the memory write as follows:
 
    | bb_0
    |   i = 0
@@ -2202,7 +2345,7 @@ mask_exists (int size, vec<int> vec)
 */
 
 static void
-predicate_mem_writes (loop_p loop)
+predicate_statements (loop_p loop)
 {
   unsigned int i, orig_loop_num_nodes = loop->num_nodes;
   auto_vec<int, 1> vect_sizes;
@@ -2214,7 +2357,6 @@ predicate_mem_writes (loop_p loop)
       basic_block bb = ifc_bbs[i];
       tree cond = bb_predicate (bb);
       bool swap;
-      gimple *stmt;
       int index;
 
       if (is_true_predicate (cond))
@@ -2232,7 +2374,8 @@ predicate_mem_writes (loop_p loop)
 
       for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);)
        {
-         if (!gimple_assign_single_p (stmt = gsi_stmt (gsi)))
+         gassign *stmt = dyn_cast <gassign *> (gsi_stmt (gsi));
+         if (!stmt)
            ;
          else if (is_false_predicate (cond)
                   && gimple_vdef (stmt))
@@ -2245,19 +2388,13 @@ predicate_mem_writes (loop_p loop)
          else if (gimple_plf (stmt, GF_PLF_2))
            {
              tree lhs = gimple_assign_lhs (stmt);
-             tree rhs = gimple_assign_rhs1 (stmt);
-             tree ref, addr, ptr, mask;
-             gcall *new_stmt;
+             tree mask;
+             gimple *new_stmt;
              gimple_seq stmts = NULL;
              machine_mode mode = TYPE_MODE (TREE_TYPE (lhs));
              /* We checked before setting GF_PLF_2 that an equivalent
                 integer mode exists.  */
              int bitsize = GET_MODE_BITSIZE (mode).to_constant ();
-             ref = TREE_CODE (lhs) == SSA_NAME ? rhs : lhs;
-             mark_addressable (ref);
-             addr = force_gimple_operand_gsi (&gsi, build_fold_addr_expr (ref),
-                                              true, NULL_TREE, true,
-                                              GSI_SAME_STMT);
              if (!vect_sizes.is_empty ()
                  && (index = mask_exists (bitsize, vect_sizes)) != -1)
                /* Use created mask.  */
@@ -2285,30 +2422,10 @@ predicate_mem_writes (loop_p loop)
                  vect_sizes.safe_push (bitsize);
                  vect_masks.safe_push (mask);
                }
-             ptr = build_int_cst (reference_alias_ptr_type (ref),
-                                  get_object_alignment (ref));
-             /* Copy points-to info if possible.  */
-             if (TREE_CODE (addr) == SSA_NAME && !SSA_NAME_PTR_INFO (addr))
-               copy_ref_info (build2 (MEM_REF, TREE_TYPE (ref), addr, ptr),
-                              ref);
-             if (TREE_CODE (lhs) == SSA_NAME)
-               {
-                 new_stmt
-                   = gimple_build_call_internal (IFN_MASK_LOAD, 3, addr,
-                                                 ptr, mask);
-                 gimple_call_set_lhs (new_stmt, lhs);
-                 gimple_set_vuse (new_stmt, gimple_vuse (stmt));
-               }
+             if (gimple_assign_single_p (stmt))
+               new_stmt = predicate_load_or_store (&gsi, stmt, mask);
              else
-               {
-                 new_stmt
-                   = gimple_build_call_internal (IFN_MASK_STORE, 4, addr, ptr,
-                                                 mask, rhs);
-                 gimple_set_vuse (new_stmt, gimple_vuse (stmt));
-                 gimple_set_vdef (new_stmt, gimple_vdef (stmt));
-                 SSA_NAME_DEF_STMT (gimple_vdef (new_stmt)) = new_stmt;
-               }
-             gimple_call_set_nothrow (new_stmt, true);
+               new_stmt = predicate_rhs_code (stmt, mask, cond);
 
              gsi_replace (&gsi, new_stmt, true);
            }
@@ -2392,8 +2509,8 @@ combine_blocks (struct loop *loop)
   insert_gimplified_predicates (loop);
   predicate_all_scalar_phis (loop);
 
-  if (any_pred_load_store)
-    predicate_mem_writes (loop);
+  if (need_to_predicate)
+    predicate_statements (loop);
 
   /* Merge basic blocks: first remove all the edges in the loop,
      except for those from the exit block.  */
@@ -2733,6 +2850,12 @@ ifcvt_local_dce (basic_block bb)
   enum gimple_code code;
   use_operand_p use_p;
   imm_use_iterator imm_iter;
+  std::pair <tree, tree> *name_pair;
+  unsigned int i;
+
+  FOR_EACH_VEC_ELT (redundant_ssa_names, i, name_pair)
+    replace_uses_by (name_pair->first, name_pair->second);
+  redundant_ssa_names.release ();
 
   worklist.create (64);
   /* Consider all phi as live statements.  */
@@ -2833,7 +2956,7 @@ tree_if_conversion (struct loop *loop)
  again:
   rloop = NULL;
   ifc_bbs = NULL;
-  any_pred_load_store = false;
+  need_to_predicate = false;
   any_complicated_phi = false;
 
   /* Apply more aggressive if-conversion when loop or its outer loop were
@@ -2854,7 +2977,7 @@ tree_if_conversion (struct loop *loop)
       || !dbg_cnt (if_conversion_tree))
     goto cleanup;
 
-  if ((any_pred_load_store || any_complicated_phi)
+  if ((need_to_predicate || any_complicated_phi)
       && ((!flag_tree_loop_vectorize && !loop->force_vectorize)
          || loop->dont_vectorize))
     goto cleanup;
@@ -2864,7 +2987,7 @@ tree_if_conversion (struct loop *loop)
      Either version this loop, or if the pattern is right for outer-loop
      vectorization, version the outer loop.  In the latter case we will
      still if-convert the original inner loop.  */
-  if (any_pred_load_store
+  if (need_to_predicate
       || any_complicated_phi
       || flag_tree_loop_if_convert != 1)
     {
Index: gcc/tree-vect-patterns.c
===================================================================
--- gcc/tree-vect-patterns.c    2018-05-16 11:06:14.191592902 +0100
+++ gcc/tree-vect-patterns.c    2018-05-16 11:06:14.517573987 +0100
@@ -3955,64 +3955,67 @@ vect_recog_mask_conversion_pattern (vec<
 
   /* Check for MASK_LOAD ans MASK_STORE calls requiring mask conversion.  */
   if (is_gimple_call (last_stmt)
-      && gimple_call_internal_p (last_stmt)
-      && (gimple_call_internal_fn (last_stmt) == IFN_MASK_STORE
-         || gimple_call_internal_fn (last_stmt) == IFN_MASK_LOAD))
+      && gimple_call_internal_p (last_stmt))
     {
       gcall *pattern_stmt;
-      bool load = (gimple_call_internal_fn (last_stmt) == IFN_MASK_LOAD);
 
-      if (load)
+      internal_fn ifn = gimple_call_internal_fn (last_stmt);
+      int mask_argno = internal_fn_mask_index (ifn);
+      if (mask_argno < 0)
+       return NULL;
+
+      bool store_p = internal_store_fn_p (ifn);
+      if (store_p)
        {
-         lhs = gimple_call_lhs (last_stmt);
-         vectype1 = get_vectype_for_scalar_type (TREE_TYPE (lhs));
+         int rhs_index = internal_fn_stored_value_index (ifn);
+         tree rhs = gimple_call_arg (last_stmt, rhs_index);
+         vectype1 = get_vectype_for_scalar_type (TREE_TYPE (rhs));
        }
       else
        {
-         rhs2 = gimple_call_arg (last_stmt, 3);
-         vectype1 = get_vectype_for_scalar_type (TREE_TYPE (rhs2));
+         lhs = gimple_call_lhs (last_stmt);
+         vectype1 = get_vectype_for_scalar_type (TREE_TYPE (lhs));
        }
 
-      rhs1 = gimple_call_arg (last_stmt, 2);
-      rhs1_type = search_type_for_mask (rhs1, vinfo);
-      if (!rhs1_type)
+      tree mask_arg = gimple_call_arg (last_stmt, mask_argno);
+      tree mask_arg_type = search_type_for_mask (mask_arg, vinfo);
+      if (!mask_arg_type)
        return NULL;
-      vectype2 = get_mask_type_for_scalar_type (rhs1_type);
+      vectype2 = get_mask_type_for_scalar_type (mask_arg_type);
 
       if (!vectype1 || !vectype2
          || known_eq (TYPE_VECTOR_SUBPARTS (vectype1),
                       TYPE_VECTOR_SUBPARTS (vectype2)))
        return NULL;
 
-      tmp = build_mask_conversion (rhs1, vectype1, stmt_vinfo, vinfo);
+      tmp = build_mask_conversion (mask_arg, vectype1, stmt_vinfo, vinfo);
 
-      if (load)
+      auto_vec<tree, 8> args;
+      unsigned int nargs = gimple_call_num_args (last_stmt);
+      args.safe_grow (nargs);
+      for (unsigned int i = 0; i < nargs; ++i)
+       args[i] = ((int) i == mask_argno
+                  ? tmp
+                  : gimple_call_arg (last_stmt, i));
+      pattern_stmt = gimple_build_call_internal_vec (ifn, args);
+
+      if (!store_p)
        {
          lhs = vect_recog_temp_ssa_var (TREE_TYPE (lhs), NULL);
-         pattern_stmt
-           = gimple_build_call_internal (IFN_MASK_LOAD, 3,
-                                         gimple_call_arg (last_stmt, 0),
-                                         gimple_call_arg (last_stmt, 1),
-                                         tmp);
          gimple_call_set_lhs (pattern_stmt, lhs);
        }
-      else
-         pattern_stmt
-           = gimple_build_call_internal (IFN_MASK_STORE, 4,
-                                         gimple_call_arg (last_stmt, 0),
-                                         gimple_call_arg (last_stmt, 1),
-                                         tmp,
-                                         gimple_call_arg (last_stmt, 3));
-
       gimple_call_set_nothrow (pattern_stmt, true);
 
       pattern_stmt_info = new_stmt_vec_info (pattern_stmt, vinfo);
       set_vinfo_for_stmt (pattern_stmt, pattern_stmt_info);
-      STMT_VINFO_DATA_REF (pattern_stmt_info)
-       = STMT_VINFO_DATA_REF (stmt_vinfo);
-      STMT_VINFO_DR_WRT_VEC_LOOP (pattern_stmt_info)
-       = STMT_VINFO_DR_WRT_VEC_LOOP (stmt_vinfo);
-      DR_STMT (STMT_VINFO_DATA_REF (stmt_vinfo)) = pattern_stmt;
+      if (STMT_VINFO_DATA_REF (stmt_vinfo))
+       {
+         STMT_VINFO_DATA_REF (pattern_stmt_info)
+           = STMT_VINFO_DATA_REF (stmt_vinfo);
+         STMT_VINFO_DR_WRT_VEC_LOOP (pattern_stmt_info)
+           = STMT_VINFO_DR_WRT_VEC_LOOP (stmt_vinfo);
+         DR_STMT (STMT_VINFO_DATA_REF (stmt_vinfo)) = pattern_stmt;
+       }
 
       *type_out = vectype1;
       *type_in = vectype1;
Index: gcc/tree-vect-stmts.c
===================================================================
--- gcc/tree-vect-stmts.c       2018-05-16 11:06:14.191592902 +0100
+++ gcc/tree-vect-stmts.c       2018-05-16 11:06:14.518573929 +0100
@@ -3016,7 +3016,8 @@ vectorizable_call (gimple *gs, gimple_st
   int ndts = 3;
   gimple *new_stmt = NULL;
   int ncopies, j;
-  vec<tree> vargs = vNULL;
+  auto_vec<tree, 8> vargs;
+  auto_vec<tree, 8> orig_vargs;
   enum { NARROW, NONE, WIDEN } modifier;
   size_t i, nargs;
   tree lhs;
@@ -3059,18 +3060,34 @@ vectorizable_call (gimple *gs, gimple_st
     return false;
 
   /* Ignore the argument of IFN_GOMP_SIMD_LANE, it is magic.  */
-  if (gimple_call_internal_p (stmt)
-      && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE)
+  combined_fn cfn = gimple_call_combined_fn (stmt);
+  if (cfn == CFN_GOMP_SIMD_LANE)
     {
       nargs = 0;
       rhs_type = unsigned_type_node;
     }
 
+  int mask_opno = -1;
+  if (internal_fn_p (cfn))
+    mask_opno = internal_fn_mask_index (as_internal_fn (cfn));
+
   for (i = 0; i < nargs; i++)
     {
       tree opvectype;
 
       op = gimple_call_arg (stmt, i);
+      if (!vect_is_simple_use (op, vinfo, &def_stmt, &dt[i], &opvectype))
+       {
+         if (dump_enabled_p ())
+           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                            "use not simple.\n");
+         return false;
+       }
+
+      /* Skip the mask argument to an internal function.  This operand
+        has been converted via a pattern if necessary.  */
+      if ((int) i == mask_opno)
+       continue;
 
       /* We can only handle calls with arguments of the same type.  */
       if (rhs_type
@@ -3084,14 +3101,6 @@ vectorizable_call (gimple *gs, gimple_st
       if (!rhs_type)
        rhs_type = TREE_TYPE (op);
 
-      if (!vect_is_simple_use (op, vinfo, &def_stmt, &dt[i], &opvectype))
-       {
-         if (dump_enabled_p ())
-           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                             "use not simple.\n");
-         return false;
-       }
-
       if (!vectype_in)
        vectype_in = opvectype;
       else if (opvectype
@@ -3149,7 +3158,6 @@ vectorizable_call (gimple *gs, gimple_st
      to vectorize other operations in the loop.  */
   fndecl = NULL_TREE;
   internal_fn ifn = IFN_LAST;
-  combined_fn cfn = gimple_call_combined_fn (stmt);
   tree callee = gimple_call_fndecl (stmt);
 
   /* First try using an internal function.  */
@@ -3213,6 +3221,7 @@ vectorizable_call (gimple *gs, gimple_st
      needs to be generated.  */
   gcc_assert (ncopies >= 1);
 
+  vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
   if (!vec_stmt) /* transformation not required.  */
     {
       STMT_VINFO_TYPE (stmt_info) = call_vec_info_type;
@@ -3226,7 +3235,13 @@ vectorizable_call (gimple *gs, gimple_st
            add_stmt_cost (stmt_info->vinfo->target_cost_data, ncopies / 2,
                           vec_promote_demote, stmt_info, 0, vect_body);
        }
-
+      if (loop_vinfo && mask_opno >= 0)
+       {
+         unsigned int nvectors = (slp_node
+                                  ? SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node)
+                                  : ncopies);
+         vect_record_loop_mask (loop_vinfo, masks, nvectors, vectype_out);
+       }
       return true;
     }
 
@@ -3239,25 +3254,24 @@ vectorizable_call (gimple *gs, gimple_st
   scalar_dest = gimple_call_lhs (stmt);
   vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
 
+  bool masked_loop_p = loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
+
   prev_stmt_info = NULL;
   if (modifier == NONE || ifn != IFN_LAST)
     {
       tree prev_res = NULL_TREE;
+      vargs.safe_grow (nargs);
+      orig_vargs.safe_grow (nargs);
       for (j = 0; j < ncopies; ++j)
        {
          /* Build argument list for the vectorized call.  */
-         if (j == 0)
-           vargs.create (nargs);
-         else
-           vargs.truncate (0);
-
          if (slp_node)
            {
              auto_vec<vec<tree> > vec_defs (nargs);
              vec<tree> vec_oprnds0;
 
              for (i = 0; i < nargs; i++)
-               vargs.quick_push (gimple_call_arg (stmt, i));
+               vargs[i] = gimple_call_arg (stmt, i);
              vect_get_slp_defs (vargs, slp_node, &vec_defs);
              vec_oprnds0 = vec_defs[0];
 
@@ -3272,6 +3286,9 @@ vectorizable_call (gimple *gs, gimple_st
                    }
                  if (modifier == NARROW)
                    {
+                     /* We don't define any narrowing conditional functions
+                        at present.  */
+                     gcc_assert (mask_opno < 0);
                      tree half_res = make_ssa_name (vectype_in);
                      gcall *call
                        = gimple_build_call_internal_vec (ifn, vargs);
@@ -3290,6 +3307,17 @@ vectorizable_call (gimple *gs, gimple_st
                    }
                  else
                    {
+                     if (mask_opno >= 0 && masked_loop_p)
+                       {
+                         unsigned int vec_num = vec_oprnds0.length ();
+                         /* Always true for SLP.  */
+                         gcc_assert (ncopies == 1);
+                         tree mask = vect_get_loop_mask (gsi, masks, vec_num,
+                                                         vectype_out, i);
+                         vargs[mask_opno] = prepare_load_store_mask
+                           (TREE_TYPE (mask), mask, vargs[mask_opno], gsi);
+                       }
+
                      gcall *call;
                      if (ifn != IFN_LAST)
                        call = gimple_build_call_internal_vec (ifn, vargs);
@@ -3319,17 +3347,22 @@ vectorizable_call (gimple *gs, gimple_st
                vec_oprnd0
                  = vect_get_vec_def_for_operand (op, stmt);
              else
-               {
-                 vec_oprnd0 = gimple_call_arg (new_stmt, i);
-                 vec_oprnd0
-                    = vect_get_vec_def_for_stmt_copy (dt[i], vec_oprnd0);
-               }
+               vec_oprnd0
+                 = vect_get_vec_def_for_stmt_copy (dt[i], orig_vargs[i]);
+
+             orig_vargs[i] = vargs[i] = vec_oprnd0;
+           }
 
-             vargs.quick_push (vec_oprnd0);
+         if (mask_opno >= 0 && masked_loop_p)
+           {
+             tree mask = vect_get_loop_mask (gsi, masks, ncopies,
+                                             vectype_out, j);
+             vargs[mask_opno]
+               = prepare_load_store_mask (TREE_TYPE (mask), mask,
+                                          vargs[mask_opno], gsi);
            }
 
-         if (gimple_call_internal_p (stmt)
-             && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE)
+         if (cfn == CFN_GOMP_SIMD_LANE)
            {
              tree cst = build_index_vector (vectype_out, j * nunits_out, 1);
              tree new_var
@@ -3341,6 +3374,9 @@ vectorizable_call (gimple *gs, gimple_st
            }
          else if (modifier == NARROW)
            {
+             /* We don't define any narrowing conditional functions at
+                present.  */
+             gcc_assert (mask_opno < 0);
              tree half_res = make_ssa_name (vectype_in);
              gcall *call = gimple_build_call_internal_vec (ifn, vargs);
              gimple_call_set_lhs (call, half_res);
@@ -3380,6 +3416,8 @@ vectorizable_call (gimple *gs, gimple_st
     }
   else if (modifier == NARROW)
     {
+      /* We don't define any narrowing conditional functions at present.  */
+      gcc_assert (mask_opno < 0);
       for (j = 0; j < ncopies; ++j)
        {
          /* Build argument list for the vectorized call.  */
Index: gcc/config/aarch64/aarch64-sve.md
===================================================================
--- gcc/config/aarch64/aarch64-sve.md   2018-05-16 11:06:14.191592902 +0100
+++ gcc/config/aarch64/aarch64-sve.md   2018-05-16 11:06:14.511574335 +0100
@@ -1769,6 +1769,17 @@ (define_insn "cond_<optab><mode>"
   "<sve_int_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>"
 )
 
+(define_insn "cond_<optab><mode>"
+  [(set (match_operand:SVE_SDI 0 "register_operand" "=w")
+       (unspec:SVE_SDI
+         [(match_operand:<VPRED> 1 "register_operand" "Upl")
+          (match_operand:SVE_SDI 2 "register_operand" "0")
+          (match_operand:SVE_SDI 3 "register_operand" "w")]
+         SVE_COND_INT2_SD_OP))]
+  "TARGET_SVE"
+  "<sve_int_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>"
+)
+
 ;; Set operand 0 to the last active element in operand 3, or to tied
 ;; operand 1 if no elements are active.
 (define_insn "fold_extract_last_<mode>"
Index: gcc/config/aarch64/iterators.md
===================================================================
--- gcc/config/aarch64/iterators.md     2018-05-16 11:06:14.191592902 +0100
+++ gcc/config/aarch64/iterators.md     2018-05-16 11:06:14.512574277 +0100
@@ -442,6 +442,9 @@ (define_c_enum "unspec"
     UNSPEC_UMUL_HIGHPART ; Used in aarch64-sve.md.
     UNSPEC_COND_ADD    ; Used in aarch64-sve.md.
     UNSPEC_COND_SUB    ; Used in aarch64-sve.md.
+    UNSPEC_COND_MUL    ; Used in aarch64-sve.md.
+    UNSPEC_COND_SDIV   ; Used in aarch64-sve.md.
+    UNSPEC_COND_UDIV   ; Used in aarch64-sve.md.
     UNSPEC_COND_SMAX   ; Used in aarch64-sve.md.
     UNSPEC_COND_UMAX   ; Used in aarch64-sve.md.
     UNSPEC_COND_SMIN   ; Used in aarch64-sve.md.
@@ -1502,13 +1505,17 @@ (define_int_iterator UNPACK_UNSIGNED [UN
 (define_int_iterator MUL_HIGHPART [UNSPEC_SMUL_HIGHPART UNSPEC_UMUL_HIGHPART])
 
 (define_int_iterator SVE_COND_INT2_OP [UNSPEC_COND_ADD UNSPEC_COND_SUB
+                                      UNSPEC_COND_MUL
                                       UNSPEC_COND_SMAX UNSPEC_COND_UMAX
                                       UNSPEC_COND_SMIN UNSPEC_COND_UMIN
                                       UNSPEC_COND_AND
                                       UNSPEC_COND_ORR
                                       UNSPEC_COND_EOR])
 
-(define_int_iterator SVE_COND_FP2_OP [UNSPEC_COND_ADD UNSPEC_COND_SUB])
+(define_int_iterator SVE_COND_INT2_SD_OP [UNSPEC_COND_SDIV UNSPEC_COND_UDIV])
+
+(define_int_iterator SVE_COND_FP2_OP [UNSPEC_COND_ADD UNSPEC_COND_SUB
+                                     UNSPEC_COND_MUL UNSPEC_COND_SDIV])
 
 (define_int_iterator SVE_COND_FP3_OP [UNSPEC_COND_FMLA UNSPEC_COND_FMLS])
 
@@ -1541,6 +1548,9 @@ (define_int_attr optab [(UNSPEC_ANDF "an
                        (UNSPEC_XORV "xor")
                        (UNSPEC_COND_ADD "add")
                        (UNSPEC_COND_SUB "sub")
+                       (UNSPEC_COND_MUL "mul")
+                       (UNSPEC_COND_SDIV "div")
+                       (UNSPEC_COND_UDIV "udiv")
                        (UNSPEC_COND_SMAX "smax")
                        (UNSPEC_COND_UMAX "umax")
                        (UNSPEC_COND_SMIN "smin")
@@ -1759,6 +1769,9 @@ (define_int_attr cmp_op [(UNSPEC_COND_LT
 
 (define_int_attr sve_int_op [(UNSPEC_COND_ADD "add")
                             (UNSPEC_COND_SUB "sub")
+                            (UNSPEC_COND_MUL "mul")
+                            (UNSPEC_COND_SDIV "sdiv")
+                            (UNSPEC_COND_UDIV "udiv")
                             (UNSPEC_COND_SMAX "smax")
                             (UNSPEC_COND_UMAX "umax")
                             (UNSPEC_COND_SMIN "smin")
@@ -1769,5 +1782,7 @@ (define_int_attr sve_int_op [(UNSPEC_CON
 
 (define_int_attr sve_fp_op [(UNSPEC_COND_ADD "fadd")
                            (UNSPEC_COND_SUB "fsub")
+                           (UNSPEC_COND_MUL "fmul")
+                           (UNSPEC_COND_SDIV "fdiv")
                            (UNSPEC_COND_FMLA "fmla")
                            (UNSPEC_COND_FMLS "fmls")])
Index: gcc/testsuite/gcc.dg/vect/pr53773.c
===================================================================
--- gcc/testsuite/gcc.dg/vect/pr53773.c 2018-05-16 11:06:14.191592902 +0100
+++ gcc/testsuite/gcc.dg/vect/pr53773.c 2018-05-16 11:06:14.515574103 +0100
@@ -14,5 +14,8 @@ foo (int integral, int decimal, int powe
   return integral+decimal;
 }
 
-/* { dg-final { scan-tree-dump-times "\\* 10" 2 "optimized" } } */
+/* We can avoid a scalar tail when using fully-masked loops with a fixed
+   vector length.  */
+/* { dg-final { scan-tree-dump-times "\\* 10" 2 "optimized" { target { { ! vect_fully_masked } || vect_variable_length } } } } */
+/* { dg-final { scan-tree-dump-times "\\* 10" 0 "optimized" { target { vect_fully_masked && { ! vect_variable_length } } } } } */
 
Index: gcc/testsuite/gcc.target/aarch64/sve/cond_arith_1.c
===================================================================
--- /dev/null   2018-04-20 16:19:46.369131350 +0100
+++ gcc/testsuite/gcc.target/aarch64/sve/cond_arith_1.c 2018-05-16 11:06:14.515574103 +0100
@@ -0,0 +1,64 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include <stdint.h>
+
+#define TEST(TYPE, NAME, OP)                           \
+  void __attribute__ ((noinline, noclone))             \
+  test_##TYPE##_##NAME (TYPE *__restrict x,            \
+                       TYPE *__restrict y,             \
+                       TYPE *__restrict z,             \
+                       TYPE *__restrict pred, int n)   \
+  {                                                    \
+    for (int i = 0; i < n; ++i)                                \
+      x[i] = pred[i] != 1 ? y[i] OP z[i] : y[i];       \
+  }
+
+#define TEST_INT_TYPE(TYPE) \
+  TEST (TYPE, div, /)
+
+#define TEST_FP_TYPE(TYPE) \
+  TEST (TYPE, add, +) \
+  TEST (TYPE, sub, -) \
+  TEST (TYPE, mul, *) \
+  TEST (TYPE, div, /)
+
+#define TEST_ALL \
+  TEST_INT_TYPE (int8_t) \
+  TEST_INT_TYPE (uint8_t) \
+  TEST_INT_TYPE (int16_t) \
+  TEST_INT_TYPE (uint16_t) \
+  TEST_INT_TYPE (int32_t) \
+  TEST_INT_TYPE (uint32_t) \
+  TEST_INT_TYPE (int64_t) \
+  TEST_INT_TYPE (uint64_t) \
+  TEST_FP_TYPE (float) \
+  TEST_FP_TYPE (double)
+
+TEST_ALL
+
+/* { dg-final { scan-assembler-not {\t.div\tz[0-9]+\.b} } } */         \
+/* { dg-final { scan-assembler-not {\t.div\tz[0-9]+\.h} } } */         \
+/* { dg-final { scan-assembler-times {\tsdiv\tz[0-9]+\.s, p[0-7]/m,} 7 } } */
+/* At present we don't vectorize the uint8_t or uint16_t loops because the
+   division is done directly in the narrow type, rather than being widened
+   to int first.  */
+/* { dg-final { scan-assembler-times {\tudiv\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tsdiv\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tudiv\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfdiv\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfdiv\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* We fail to optimize away the SEL for the int8_t and int16_t loops,
+   because the 32-bit result is converted before selection.  */
+/* { dg-final { scan-assembler-times {\tsel\t} 2 } } */
Index: gcc/testsuite/gcc.target/aarch64/sve/cond_arith_1_run.c
===================================================================
--- /dev/null   2018-04-20 16:19:46.369131350 +0100
+++ gcc/testsuite/gcc.target/aarch64/sve/cond_arith_1_run.c     2018-05-16 11:06:14.516574045 +0100
@@ -0,0 +1,33 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include "cond_arith_1.c"
+
+#define N 99
+
+#undef TEST
+#define TEST(TYPE, NAME, OP)                                   \
+  {                                                            \
+    TYPE x[N], y[N], z[N], pred[N];                            \
+    for (int i = 0; i < N; ++i)                                        \
+      {                                                                \
+       y[i] = i * i;                                           \
+       z[i] = ((i + 2) % 3) * (i + 1);                         \
+       pred[i] = i % 3;                                        \
+      }                                                                \
+    test_##TYPE##_##NAME (x, y, z, pred, N);                   \
+    for (int i = 0; i < N; ++i)                                        \
+      {                                                                \
+       TYPE expected = i % 3 != 1 ? y[i] OP z[i] : y[i];       \
+       if (x[i] != expected)                                   \
+         __builtin_abort ();                                   \
+       asm volatile ("" ::: "memory");                         \
+      }                                                                \
+  }
+
+int
+main (void)
+{
+  TEST_ALL
+  return 0;
+}
Index: gcc/testsuite/gcc.target/aarch64/sve/cond_arith_2.c
===================================================================
--- /dev/null   2018-04-20 16:19:46.369131350 +0100
+++ gcc/testsuite/gcc.target/aarch64/sve/cond_arith_2.c 2018-05-16 11:06:14.516574045 +0100
@@ -0,0 +1,63 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -fno-vect-cost-model" } */
+
+#include <stdint.h>
+
+#define TEST(DATA_TYPE, PRED_TYPE, NAME, OP)                           \
+  void __attribute__ ((noinline, noclone))                             \
+  test_##DATA_TYPE##_##PRED_TYPE##_##NAME (DATA_TYPE *__restrict x,    \
+                                          DATA_TYPE *__restrict y,     \
+                                          DATA_TYPE *__restrict z,     \
+                                          PRED_TYPE *__restrict pred,  \
+                                          int n)                       \
+  {                                                                    \
+    for (int i = 0; i < n; ++i)                                                \
+      x[i] = pred[i] != 1 ? y[i] OP z[i] : y[i];                       \
+  }
+
+#define TEST_INT_TYPE(DATA_TYPE, PRED_TYPE) \
+  TEST (DATA_TYPE, PRED_TYPE, div, /)
+
+#define TEST_FP_TYPE(DATA_TYPE, PRED_TYPE) \
+  TEST (DATA_TYPE, PRED_TYPE, add, +) \
+  TEST (DATA_TYPE, PRED_TYPE, sub, -) \
+  TEST (DATA_TYPE, PRED_TYPE, mul, *) \
+  TEST (DATA_TYPE, PRED_TYPE, div, /)
+
+#define TEST_ALL \
+  TEST_INT_TYPE (int32_t, int8_t) \
+  TEST_INT_TYPE (uint32_t, int8_t) \
+  TEST_INT_TYPE (int32_t, int16_t) \
+  TEST_INT_TYPE (uint32_t, int16_t) \
+  TEST_INT_TYPE (int64_t, int8_t) \
+  TEST_INT_TYPE (uint64_t, int8_t) \
+  TEST_INT_TYPE (int64_t, int16_t) \
+  TEST_INT_TYPE (uint64_t, int16_t) \
+  TEST_INT_TYPE (int64_t, int32_t) \
+  TEST_INT_TYPE (uint64_t, int32_t) \
+  TEST_FP_TYPE (float, int8_t) \
+  TEST_FP_TYPE (float, int16_t) \
+  TEST_FP_TYPE (double, int8_t) \
+  TEST_FP_TYPE (double, int16_t) \
+  TEST_FP_TYPE (double, int32_t)
+
+TEST_ALL
+
+/* { dg-final { scan-assembler-times {\tsdiv\tz[0-9]+\.s, p[0-7]/m,} 6 } } */
+/* { dg-final { scan-assembler-times {\tudiv\tz[0-9]+\.s, p[0-7]/m,} 6 } } */
+/* { dg-final { scan-assembler-times {\tsdiv\tz[0-9]+\.d, p[0-7]/m,} 14 } } */
+/* { dg-final { scan-assembler-times {\tudiv\tz[0-9]+\.d, p[0-7]/m,} 14 } } */
+
+/* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.s, p[0-7]/m,} 6 } } */
+/* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.d, p[0-7]/m,} 14 } } */
+
+/* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.s, p[0-7]/m,} 6 } } */
+/* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.d, p[0-7]/m,} 14 } } */
+
+/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.s, p[0-7]/m,} 6 } } */
+/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.d, p[0-7]/m,} 14 } } */
+
+/* { dg-final { scan-assembler-times {\tfdiv\tz[0-9]+\.s, p[0-7]/m,} 6 } } */
+/* { dg-final { scan-assembler-times {\tfdiv\tz[0-9]+\.d, p[0-7]/m,} 14 } } */
+
+/* { dg-final { scan-assembler-not {\tsel\t} } } */
Index: gcc/testsuite/gcc.target/aarch64/sve/cond_arith_2_run.c
===================================================================
--- /dev/null   2018-04-20 16:19:46.369131350 +0100
+++ gcc/testsuite/gcc.target/aarch64/sve/cond_arith_2_run.c     2018-05-16 11:06:14.516574045 +0100
@@ -0,0 +1,34 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include "cond_arith_2.c"
+
+#define N 99
+
+#undef TEST
+#define TEST(DATA_TYPE, PRED_TYPE, NAME, OP)                   \
+  {                                                            \
+    DATA_TYPE x[N], y[N], z[N];                                        \
+    PRED_TYPE pred[N];                                         \
+    for (int i = 0; i < N; ++i)                                        \
+      {                                                                \
+       y[i] = i * i;                                           \
+       z[i] = ((i + 2) % 3) * (i + 1);                         \
+       pred[i] = i % 3;                                        \
+      }                                                                \
+    test_##DATA_TYPE##_##PRED_TYPE##_##NAME (x, y, z, pred, N);        \
+    for (int i = 0; i < N; ++i)                                        \
+      {                                                                \
+       DATA_TYPE expected = i % 3 != 1 ? y[i] OP z[i] : y[i];  \
+       if (x[i] != expected)                                   \
+         __builtin_abort ();                                   \
+       asm volatile ("" ::: "memory");                         \
+      }                                                                \
+  }
+
+int
+main (void)
+{
+  TEST_ALL
+  return 0;
+}
Index: gcc/testsuite/gcc.target/aarch64/sve/cond_arith_3.c
===================================================================
--- /dev/null   2018-04-20 16:19:46.369131350 +0100
+++ gcc/testsuite/gcc.target/aarch64/sve/cond_arith_3.c 2018-05-16 11:06:14.516574045 +0100
@@ -0,0 +1,62 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include <stdint.h>
+
+#define TEST(TYPE, NAME, OP)                           \
+  void __attribute__ ((noinline, noclone))             \
+  test_##TYPE##_##NAME (TYPE *__restrict x,            \
+                       TYPE *__restrict y,             \
+                       TYPE *__restrict z,             \
+                       TYPE *__restrict pred, int n)   \
+  {                                                    \
+    for (int i = 0; i < n; ++i)                                \
+      x[i] = pred[i] != 1 ? y[i] OP z[i] : 1;          \
+  }
+
+#define TEST_INT_TYPE(TYPE) \
+  TEST (TYPE, div, /)
+
+#define TEST_FP_TYPE(TYPE) \
+  TEST (TYPE, add, +) \
+  TEST (TYPE, sub, -) \
+  TEST (TYPE, mul, *) \
+  TEST (TYPE, div, /)
+
+#define TEST_ALL \
+  TEST_INT_TYPE (int8_t) \
+  TEST_INT_TYPE (uint8_t) \
+  TEST_INT_TYPE (int16_t) \
+  TEST_INT_TYPE (uint16_t) \
+  TEST_INT_TYPE (int32_t) \
+  TEST_INT_TYPE (uint32_t) \
+  TEST_INT_TYPE (int64_t) \
+  TEST_INT_TYPE (uint64_t) \
+  TEST_FP_TYPE (float) \
+  TEST_FP_TYPE (double)
+
+TEST_ALL
+
+/* { dg-final { scan-assembler-not {\t.div\tz[0-9]+\.b} } } */         \
+/* { dg-final { scan-assembler-not {\t.div\tz[0-9]+\.h} } } */         \
+/* { dg-final { scan-assembler-times {\tsdiv\tz[0-9]+\.s, p[0-7]/m,} 7 } } */
+/* At present we don't vectorize the uint8_t or uint16_t loops because the
+   division is done directly in the narrow type, rather than being widened
+   to int first.  */
+/* { dg-final { scan-assembler-times {\tudiv\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tsdiv\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tudiv\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfdiv\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfdiv\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tsel\t} 14 } } */
Index: gcc/testsuite/gcc.target/aarch64/sve/cond_arith_3_run.c
===================================================================
--- /dev/null   2018-04-20 16:19:46.369131350 +0100
+++ gcc/testsuite/gcc.target/aarch64/sve/cond_arith_3_run.c     2018-05-16 
11:06:14.516574045 +0100
@@ -0,0 +1,34 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include "cond_arith_3.c"
+
+#define N 99
+
+#undef TEST
+#define TEST(TYPE, NAME, OP)                                   \
+  {                                                            \
+    TYPE x[N], y[N], z[N], pred[N];                            \
+    for (int i = 0; i < N; ++i)                                        \
+      {                                                                \
+       x[i] = -1;                                              \
+       y[i] = i * i;                                           \
+       z[i] = ((i + 2) % 3) * (i + 1);                         \
+       pred[i] = i % 3;                                        \
+      }                                                                \
+    test_##TYPE##_##NAME (x, y, z, pred, N);                   \
+    for (int i = 0; i < N; ++i)                                        \
+      {                                                                \
+       TYPE expected = i % 3 != 1 ? y[i] OP z[i] : 1;          \
+       if (x[i] != expected)                                   \
+         __builtin_abort ();                                   \
+       asm volatile ("" ::: "memory");                         \
+      }                                                                \
+  }
+
+int
+main (void)
+{
+  TEST_ALL
+  return 0;
+}
