On Fri, 21 Jun 2019 at 08:57, Jakub Jelinek <[email protected]> wrote:
>
> Hi!
>
> The following patch adds exclusive scan support for simd. It is similar to
> the inclusive scan; we just need to swap the input and scan phases and
> use a slightly different pattern at the start of the scan phase, so that it
> computes what we need.
>
> Bootstrapped/regtested on x86_64-linux and i686-linux, committed to trunk.
>
> 2019-06-21 Jakub Jelinek <[email protected]>
>
> * omp-low.c (lower_rec_simd_input_clauses): Add rvar2 argument,
> create another "omp scan inscan exclusive" array if
> !ctx->scan_inclusive.
> (lower_rec_input_clauses): Handle exclusive scan inscan reductions.
> (lower_omp_scan): Likewise.
> * tree-vectorizer.h (struct _stmt_vec_info): Use 3-bit instead of
> 2-bit bitfield for simd_lane_access_p member.
> * tree-vect-data-refs.c (vect_analyze_data_refs): Also handle
> aux == (void *)-4 as simd lane access.
> * tree-vect-stmts.c (check_scan_store): Handle exclusive scan. Update
> comment with permutations to show the canonical permutation order.
> (vectorizable_scan_store): Handle exclusive scan.
> (vectorizable_store): Call vectorizable_scan_store even for
> STMT_VINFO_SIMD_LANE_ACCESS_P > 3.
>
> * gcc.dg/vect/vect-simd-12.c: New test.
> * gcc.dg/vect/vect-simd-13.c: New test.
> * gcc.dg/vect/vect-simd-14.c: New test.
> * gcc.dg/vect/vect-simd-15.c: New test.
> * gcc.target/i386/sse2-vect-simd-12.c: New test.
> * gcc.target/i386/sse2-vect-simd-13.c: New test.
> * gcc.target/i386/sse2-vect-simd-14.c: New test.
> * gcc.target/i386/sse2-vect-simd-15.c: New test.
> * gcc.target/i386/avx2-vect-simd-12.c: New test.
> * gcc.target/i386/avx2-vect-simd-13.c: New test.
> * gcc.target/i386/avx2-vect-simd-14.c: New test.
> * gcc.target/i386/avx2-vect-simd-15.c: New test.
> * gcc.target/i386/avx512f-vect-simd-12.c: New test.
> * gcc.target/i386/avx512f-vect-simd-13.c: New test.
> * gcc.target/i386/avx512f-vect-simd-14.c: New test.
> * gcc.target/i386/avx512bw-vect-simd-15.c: New test.
> * g++.dg/vect/simd-6.cc: New test.
> * g++.dg/vect/simd-7.cc: New test.
> * g++.dg/vect/simd-8.cc: New test.
> * g++.dg/vect/simd-9.cc: New test.
> * c-c++-common/gomp/scan-2.c: Don't expect any diagnostics.
>
> --- gcc/omp-low.c.jj 2019-06-20 13:26:29.085150770 +0200
> +++ gcc/omp-low.c 2019-06-20 15:46:25.964253058 +0200
> @@ -3692,7 +3692,8 @@ struct omplow_simd_context {
> static bool
> lower_rec_simd_input_clauses (tree new_var, omp_context *ctx,
> omplow_simd_context *sctx, tree &ivar,
> - tree &lvar, tree *rvar = NULL)
> + tree &lvar, tree *rvar = NULL,
> + tree *rvar2 = NULL)
> {
> if (known_eq (sctx->max_vf, 0U))
> {
> @@ -3767,6 +3768,25 @@ lower_rec_simd_input_clauses (tree new_v
> *rvar = build4 (ARRAY_REF, TREE_TYPE (new_var), iavar,
> sctx->lastlane, NULL_TREE, NULL_TREE);
> TREE_THIS_NOTRAP (*rvar) = 1;
> +
> + if (!ctx->scan_inclusive)
> + {
> + /* And for exclusive scan yet another one, which will
> + hold the value during the scan phase. */
> + tree savar = create_tmp_var_raw (atype);
> + if (TREE_ADDRESSABLE (new_var))
> + TREE_ADDRESSABLE (savar) = 1;
> + DECL_ATTRIBUTES (savar)
> + = tree_cons (get_identifier ("omp simd array"), NULL,
> + tree_cons (get_identifier ("omp simd inscan "
> + "exclusive"), NULL,
> + DECL_ATTRIBUTES (savar)));
> + gimple_add_tmp_var (savar);
> + ctx->cb.decl_map->put (iavar, savar);
> + *rvar2 = build4 (ARRAY_REF, TREE_TYPE (new_var), savar,
> + sctx->idx, NULL_TREE, NULL_TREE);
> + TREE_THIS_NOTRAP (*rvar2) = 1;
> + }
> }
> ivar = build4 (ARRAY_REF, TREE_TYPE (new_var), iavar, sctx->idx,
> NULL_TREE, NULL_TREE);
> @@ -5185,14 +5205,15 @@ lower_rec_input_clauses (tree clauses, g
> new_vard = TREE_OPERAND (new_var, 0);
> gcc_assert (DECL_P (new_vard));
> }
> - tree rvar = NULL_TREE, *rvarp = NULL;
> + tree rvar = NULL_TREE, *rvarp = NULL, rvar2 = NULL_TREE;
> if (is_simd
> && OMP_CLAUSE_CODE (c) == OMP_CLAUSE_REDUCTION
> && OMP_CLAUSE_REDUCTION_INSCAN (c))
> rvarp = &rvar;
> if (is_simd
> && lower_rec_simd_input_clauses (new_var, ctx, &sctx,
> - ivar, lvar, rvarp))
> + ivar, lvar, rvarp,
> + &rvar2))
> {
> if (new_vard == new_var)
> {
> @@ -5220,6 +5241,14 @@ lower_rec_input_clauses (tree clauses, g
> (c, ivar2, build_outer_var_ref (var,
> ctx));
> gimplify_and_add (x, &llist[0]);
>
> + if (rvar2)
> + {
> + x = lang_hooks.decls.omp_clause_default_ctor
> + (c, unshare_expr (rvar2),
> + build_outer_var_ref (var, ctx));
> + gimplify_and_add (x, &llist[0]);
> + }
> +
> /* For types that need construction, add another
> private var which will be default constructed
> and optionally initialized with
> @@ -5229,7 +5258,9 @@ lower_rec_input_clauses (tree clauses, g
> iteration. */
> tree nv = create_tmp_var_raw (TREE_TYPE (ivar));
> gimple_add_tmp_var (nv);
> - ctx->cb.decl_map->put (TREE_OPERAND (ivar, 0),
> + ctx->cb.decl_map->put (TREE_OPERAND (rvar2
> + ? rvar2
> + : ivar, 0),
> nv);
> x = lang_hooks.decls.omp_clause_default_ctor
> (c, nv, build_outer_var_ref (var, ctx));
> @@ -5296,6 +5327,18 @@ lower_rec_input_clauses (tree clauses, g
> gimplify_stmt (&dtor, &tseq);
> gimple_seq_add_seq (&llist[1], tseq);
> }
> +
> + if (rvar2)
> + {
> + x = lang_hooks.decls.omp_clause_dtor (c, rvar2);
> + if (x)
> + {
> + tseq = NULL;
> + dtor = x;
> + gimplify_stmt (&dtor, &tseq);
> + gimple_seq_add_seq (&llist[1], tseq);
> + }
> + }
> break;
> }
> if (x)
> @@ -5390,6 +5433,24 @@ lower_rec_input_clauses (tree clauses, g
> gimple_seq_add_seq (ilist, tseq);
> }
> OMP_CLAUSE_REDUCTION_GIMPLE_INIT (c) = NULL;
> + if (!ctx->scan_inclusive)
> + {
> + tree nv2
> + = create_tmp_var_raw (TREE_TYPE (new_var));
> + gimple_add_tmp_var (nv2);
> + ctx->cb.decl_map->put (nv, nv2);
> + x = lang_hooks.decls.omp_clause_default_ctor
> + (c, nv2, build_outer_var_ref (var, ctx));
> + gimplify_and_add (x, ilist);
> + x = lang_hooks.decls.omp_clause_dtor (c, nv2);
> + if (x)
> + {
> + tseq = NULL;
> + dtor = x;
> + gimplify_stmt (&dtor, &tseq);
> + gimple_seq_add_seq (dlist, tseq);
> + }
> + }
> x = lang_hooks.decls.omp_clause_dtor (c, nv);
> if (x)
> {
> @@ -5399,6 +5460,21 @@ lower_rec_input_clauses (tree clauses, g
> gimple_seq_add_seq (dlist, tseq);
> }
> }
> + else if (!ctx->scan_inclusive
> + && TREE_ADDRESSABLE (TREE_TYPE (new_var)))
> + {
> + tree nv2 = create_tmp_var_raw (TREE_TYPE (new_var));
> + gimple_add_tmp_var (nv2);
> + ctx->cb.decl_map->put (new_vard, nv2);
> + x = lang_hooks.decls.omp_clause_dtor (c, nv2);
> + if (x)
> + {
> + tseq = NULL;
> + dtor = x;
> + gimplify_stmt (&dtor, &tseq);
> + gimple_seq_add_seq (dlist, tseq);
> + }
> + }
> DECL_HAS_VALUE_EXPR_P (placeholder) = 0;
> goto do_dtor;
> }
> @@ -5487,14 +5563,15 @@ lower_rec_input_clauses (tree clauses, g
> new_vard = TREE_OPERAND (new_var, 0);
> gcc_assert (DECL_P (new_vard));
> }
> - tree rvar = NULL_TREE, *rvarp = NULL;
> + tree rvar = NULL_TREE, *rvarp = NULL, rvar2 = NULL_TREE;
> if (is_simd
> && OMP_CLAUSE_CODE (c) == OMP_CLAUSE_REDUCTION
> && OMP_CLAUSE_REDUCTION_INSCAN (c))
> rvarp = &rvar;
> if (is_simd
> && lower_rec_simd_input_clauses (new_var, ctx, &sctx,
> - ivar, lvar, rvarp))
> + ivar, lvar, rvarp,
> + &rvar2))
> {
> if (new_vard != new_var)
> {
> @@ -8573,18 +8650,40 @@ lower_omp_scan (gimple_stmt_iterator *gs
> gimple_seq before = NULL;
> omp_context *octx = ctx->outer;
> gcc_assert (octx);
> + if (!octx->scan_inclusive && !has_clauses)
> + {
> + gimple_stmt_iterator gsi2 = *gsi_p;
> + gsi_next (&gsi2);
> + gimple *stmt2 = gsi_stmt (gsi2);
> + /* For exclusive scan, swap GIMPLE_OMP_SCAN without clauses
> + with following GIMPLE_OMP_SCAN with clauses, so that input_phase,
> + the one with exclusive clause(s), comes first. */
> + if (stmt2
> + && gimple_code (stmt2) == GIMPLE_OMP_SCAN
> + && gimple_omp_scan_clauses (as_a <gomp_scan *> (stmt2)) != NULL)
> + {
> + gsi_remove (gsi_p, false);
> + gsi_insert_after (gsi_p, stmt, GSI_SAME_STMT);
> + ctx = maybe_lookup_ctx (stmt2);
> + gcc_assert (ctx);
> + lower_omp_scan (gsi_p, ctx);
> + return;
> + }
> + }
> +
> bool input_phase = has_clauses ^ octx->scan_inclusive;
> if (gimple_code (octx->stmt) == GIMPLE_OMP_FOR
> && (gimple_omp_for_kind (octx->stmt) & GF_OMP_FOR_SIMD)
> - && !gimple_omp_for_combined_into_p (octx->stmt)
> - && octx->scan_inclusive)
> + && !gimple_omp_for_combined_into_p (octx->stmt))
> {
> if (tree c = omp_find_clause (gimple_omp_for_clauses (octx->stmt),
> OMP_CLAUSE__SIMDUID_))
> {
> tree uid = OMP_CLAUSE__SIMDUID__DECL (c);
> lane = create_tmp_var (unsigned_type_node);
> - tree t = build_int_cst (integer_type_node, 1 + !input_phase);
> + tree t = build_int_cst (integer_type_node,
> + input_phase ? 1
> + : octx->scan_inclusive ? 2 : 3);
> gimple *g
> = gimple_build_call_internal (IFN_GOMP_SIMD_LANE, 2, uid, t);
> gimple_call_set_lhs (g, lane);
> @@ -8601,6 +8700,8 @@ lower_omp_scan (gimple_stmt_iterator *gs
> tree val = new_var;
> tree var2 = NULL_TREE;
> tree var3 = NULL_TREE;
> + tree var4 = NULL_TREE;
> + tree lane0 = NULL_TREE;
> tree new_vard = new_var;
> if (omp_is_reference (var))
> {
> @@ -8623,16 +8724,26 @@ lower_omp_scan (gimple_stmt_iterator *gs
> DECL_ATTRIBUTES (v)))
> {
> val = unshare_expr (val);
> + lane0 = TREE_OPERAND (val, 1);
> TREE_OPERAND (val, 1) = lane;
> var2 = lookup_decl (v, octx);
> + if (!octx->scan_inclusive)
> + var4 = lookup_decl (var2, octx);
> if (input_phase
> && OMP_CLAUSE_REDUCTION_PLACEHOLDER (c))
> - var3 = maybe_lookup_decl (var2, octx);
> + var3 = maybe_lookup_decl (var4 ? var4 : var2, octx);
> if (!input_phase)
> {
> var2 = build4 (ARRAY_REF, TREE_TYPE (val),
> var2, lane, NULL_TREE, NULL_TREE);
> TREE_THIS_NOTRAP (var2) = 1;
> + if (!octx->scan_inclusive)
> + {
> + var4 = build4 (ARRAY_REF, TREE_TYPE (val),
> + var4, lane, NULL_TREE,
> + NULL_TREE);
> + TREE_THIS_NOTRAP (var4) = 1;
> + }
> }
> else
> var2 = val;
> @@ -8643,12 +8754,28 @@ lower_omp_scan (gimple_stmt_iterator *gs
> else
> {
> var2 = build_outer_var_ref (var, octx);
> - if (input_phase && OMP_CLAUSE_REDUCTION_PLACEHOLDER (c))
> + if (OMP_CLAUSE_REDUCTION_PLACEHOLDER (c))
> {
> var3 = maybe_lookup_decl (new_vard, octx);
> - if (var3 == new_vard)
> + if (var3 == new_vard || var3 == NULL_TREE)
> var3 = NULL_TREE;
> + else if (!octx->scan_inclusive && !input_phase)
> + {
> + var4 = maybe_lookup_decl (var3, octx);
> + if (var4 == var3 || var4 == NULL_TREE)
> + {
> + if (TREE_ADDRESSABLE (TREE_TYPE (new_var)))
> + {
> + var4 = var3;
> + var3 = NULL_TREE;
> + }
> + else
> + var4 = NULL_TREE;
> + }
> + }
> }
> + if (!octx->scan_inclusive && !input_phase && var4 == NULL_TREE)
> + var4 = create_tmp_var (TREE_TYPE (val));
> }
> if (OMP_CLAUSE_REDUCTION_PLACEHOLDER (c))
> {
> @@ -8689,9 +8816,17 @@ lower_omp_scan (gimple_stmt_iterator *gs
> }
> else
> {
> + tree x;
> + if (!octx->scan_inclusive)
> + {
> + tree v4 = unshare_expr (var4);
> + tree v2 = unshare_expr (var2);
> + x = lang_hooks.decls.omp_clause_assign_op (c, v4, v2);
> + gimplify_and_add (x, &before);
> + }
> gimple_seq tseq = OMP_CLAUSE_REDUCTION_GIMPLE_MERGE (c);
> - tree x = (DECL_HAS_VALUE_EXPR_P (new_vard)
> - ? DECL_VALUE_EXPR (new_vard) : NULL_TREE);
> + x = (DECL_HAS_VALUE_EXPR_P (new_vard)
> + ? DECL_VALUE_EXPR (new_vard) : NULL_TREE);
> tree vexpr = val;
> if (x && omp_is_reference (var))
> vexpr = build_fold_addr_expr_loc (clause_loc, val);
> @@ -8706,8 +8841,18 @@ lower_omp_scan (gimple_stmt_iterator *gs
> SET_DECL_VALUE_EXPR (new_vard, x);
> SET_DECL_VALUE_EXPR (placeholder, NULL_TREE);
> DECL_HAS_VALUE_EXPR_P (placeholder) = 0;
> - x = lang_hooks.decls.omp_clause_assign_op (c, val, var2);
> - gimplify_and_add (x, &before);
> + if (octx->scan_inclusive)
> + {
> + x = lang_hooks.decls.omp_clause_assign_op (c, val,
> + var2);
> + gimplify_and_add (x, &before);
> + }
> + else if (lane0 == NULL_TREE)
> + {
> + x = lang_hooks.decls.omp_clause_assign_op (c, val,
> + var4);
> + gimplify_and_add (x, &before);
> + }
> }
> }
> else
> @@ -8728,10 +8873,29 @@ lower_omp_scan (gimple_stmt_iterator *gs
>
> tree x = build2 (code, TREE_TYPE (var2),
> unshare_expr (var2), unshare_expr (val));
> - gimplify_assign (unshare_expr (var2), x, &before);
> - gimplify_assign (val, var2, &before);
> + if (octx->scan_inclusive)
> + {
> + gimplify_assign (unshare_expr (var2), x, &before);
> + gimplify_assign (val, var2, &before);
> + }
> + else
> + {
> + gimplify_assign (unshare_expr (var4),
> + unshare_expr (var2), &before);
> + gimplify_assign (var2, x, &before);
> + if (lane0 == NULL_TREE)
> + gimplify_assign (val, var4, &before);
> + }
> }
> }
> + if (!octx->scan_inclusive && !input_phase && lane0)
> + {
> + tree vexpr = unshare_expr (var4);
> + TREE_OPERAND (vexpr, 1) = lane0;
> + if (omp_is_reference (var))
> + vexpr = build_fold_addr_expr_loc (clause_loc, vexpr);
> + SET_DECL_VALUE_EXPR (new_vard, vexpr);
> + }
> }
> }
> else if (has_clauses)
> --- gcc/tree-vectorizer.h.jj 2019-06-20 13:26:29.078150879 +0200
> +++ gcc/tree-vectorizer.h 2019-06-20 14:18:04.241075200 +0200
> @@ -917,7 +917,7 @@ struct _stmt_vec_info {
> bool strided_p;
>
> /* For both loads and stores. */
> - unsigned simd_lane_access_p : 2;
> + unsigned simd_lane_access_p : 3;
>
> /* Classifies how the load or store is going to be implemented
> for loop vectorization. */
> --- gcc/tree-vect-data-refs.c.jj 2019-06-20 13:55:35.421150589 +0200
> +++ gcc/tree-vect-data-refs.c 2019-06-20 14:18:04.240075216 +0200
> @@ -4223,7 +4223,8 @@ vect_analyze_data_refs (vec_info *vinfo,
> /* See if this was detected as SIMD lane access. */
> if (dr->aux == (void *)-1
> || dr->aux == (void *)-2
> - || dr->aux == (void *)-3)
> + || dr->aux == (void *)-3
> + || dr->aux == (void *)-4)
> {
> if (nested_in_vect_loop_p (loop, stmt_info))
> return opt_result::failure_at (stmt_info->stmt,
> --- gcc/tree-vect-stmts.c.jj 2019-06-20 13:26:29.084150785 +0200
> +++ gcc/tree-vect-stmts.c 2019-06-20 14:18:04.239075231 +0200
> @@ -6512,7 +6512,37 @@ check_scan_store (stmt_vec_info stmt_inf
> kinds are there in order to allow optimizing the initializer store
> and combiner sequence, e.g. if it is originally some C++ish user
> defined reduction, but allow the vectorizer to pattern recognize it
> - and turn into the appropriate vectorized scan. */
> + and turn into the appropriate vectorized scan.
> +
> + For exclusive scan, this is slightly different:
> + #pragma omp simd reduction(inscan,+:r)
> + for (...)
> + {
> + use (r);
> + #pragma omp scan exclusive (r)
> + r += something ();
> + }
> + shall have body with:
> + // Initialization for input phase, store the reduction initializer:
> + _20 = .GOMP_SIMD_LANE (simduid.3_14(D), 0);
> + _21 = .GOMP_SIMD_LANE (simduid.3_14(D), 1);
> + D.2042[_21] = 0;
> + // Actual input phase:
> + ...
> + r.0_5 = D.2042[_20];
> + _6 = _4 + r.0_5;
> + D.2042[_20] = _6;
> + // Initialization for scan phase:
> + _25 = .GOMP_SIMD_LANE (simduid.3_14(D), 3);
> + _26 = D.2043[_25];
> + D.2044[_25] = _26;
> + _27 = D.2042[_25];
> + _28 = _26 + _27;
> + D.2043[_25] = _28;
> + // Actual scan phase:
> + ...
> + r.1_8 = D.2044[_20];
> + ... */
>
> if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) == 2)
> {
> @@ -6553,26 +6583,52 @@ check_scan_store (stmt_vec_info stmt_inf
> if (TREE_CODE (rhs) != SSA_NAME)
> goto fail;
>
> - use_operand_p use_p;
> - imm_use_iterator iter;
> gimple *other_store_stmt = NULL;
> - FOR_EACH_IMM_USE_FAST (use_p, iter, rhs)
> + tree var = TREE_OPERAND (DR_BASE_ADDRESS (dr_info->dr), 0);
> + bool inscan_var_store
> + = lookup_attribute ("omp simd inscan", DECL_ATTRIBUTES (var)) != NULL;
> +
> + if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) == 4)
> {
> - gimple *use_stmt = USE_STMT (use_p);
> - if (use_stmt == stmt || is_gimple_debug (use_stmt))
> - continue;
> - if (gimple_bb (use_stmt) != gimple_bb (stmt)
> - || !gimple_store_p (use_stmt)
> - || other_store_stmt)
> - goto fail;
> - other_store_stmt = use_stmt;
> + if (!inscan_var_store)
> + {
> + use_operand_p use_p;
> + imm_use_iterator iter;
> + FOR_EACH_IMM_USE_FAST (use_p, iter, rhs)
> + {
> + gimple *use_stmt = USE_STMT (use_p);
> + if (use_stmt == stmt || is_gimple_debug (use_stmt))
> + continue;
> + if (gimple_bb (use_stmt) != gimple_bb (stmt)
> + || !is_gimple_assign (use_stmt)
> + || gimple_assign_rhs_class (use_stmt) != GIMPLE_BINARY_RHS
> + || other_store_stmt
> + || TREE_CODE (gimple_assign_lhs (use_stmt)) != SSA_NAME)
> + goto fail;
> + other_store_stmt = use_stmt;
> + }
> + if (other_store_stmt == NULL)
> + goto fail;
> + rhs = gimple_assign_lhs (other_store_stmt);
> + if (!single_imm_use (rhs, &use_p, &other_store_stmt))
> + goto fail;
> + }
> }
> - if (other_store_stmt == NULL)
> - goto fail;
> - stmt_vec_info other_store_stmt_info
> - = loop_vinfo->lookup_stmt (other_store_stmt);
> - if (other_store_stmt_info == NULL
> - || STMT_VINFO_SIMD_LANE_ACCESS_P (other_store_stmt_info) != 3)
> + else if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) == 3)
> + {
> + use_operand_p use_p;
> + imm_use_iterator iter;
> + FOR_EACH_IMM_USE_FAST (use_p, iter, rhs)
> + {
> + gimple *use_stmt = USE_STMT (use_p);
> + if (use_stmt == stmt || is_gimple_debug (use_stmt))
> + continue;
> + if (other_store_stmt)
> + goto fail;
> + other_store_stmt = use_stmt;
> + }
> + }
> + else
> goto fail;
>
> gimple *def_stmt = SSA_NAME_DEF_STMT (rhs);
> @@ -6599,8 +6655,7 @@ check_scan_store (stmt_vec_info stmt_inf
>
> tree rhs1 = gimple_assign_rhs1 (def_stmt);
> tree rhs2 = gimple_assign_rhs2 (def_stmt);
> - if (TREE_CODE (rhs1) != SSA_NAME
> - || TREE_CODE (rhs2) != SSA_NAME)
> + if (TREE_CODE (rhs1) != SSA_NAME || TREE_CODE (rhs2) != SSA_NAME)
> goto fail;
>
> gimple *load1_stmt = SSA_NAME_DEF_STMT (rhs1);
> @@ -6615,22 +6670,83 @@ check_scan_store (stmt_vec_info stmt_inf
> stmt_vec_info load2_stmt_info = loop_vinfo->lookup_stmt (load2_stmt);
> if (load1_stmt_info == NULL
> || load2_stmt_info == NULL
> - || STMT_VINFO_SIMD_LANE_ACCESS_P (load1_stmt_info) != 3
> - || STMT_VINFO_SIMD_LANE_ACCESS_P (load2_stmt_info) != 3)
> + || (STMT_VINFO_SIMD_LANE_ACCESS_P (load1_stmt_info)
> + != STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info))
> + || (STMT_VINFO_SIMD_LANE_ACCESS_P (load2_stmt_info)
> + != STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info)))
> goto fail;
>
> - if (scan_operand_equal_p (gimple_assign_lhs (stmt),
> + if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) == 4 && inscan_var_store)
> + {
> + dr_vec_info *load1_dr_info = STMT_VINFO_DR_INFO (load1_stmt_info);
> + if (TREE_CODE (DR_BASE_ADDRESS (load1_dr_info->dr)) != ADDR_EXPR
> + || !VAR_P (TREE_OPERAND (DR_BASE_ADDRESS (load1_dr_info->dr), 0)))
> + goto fail;
> + tree var1 = TREE_OPERAND (DR_BASE_ADDRESS (load1_dr_info->dr), 0);
> + tree lrhs;
> + if (lookup_attribute ("omp simd inscan", DECL_ATTRIBUTES (var1)))
> + lrhs = rhs1;
> + else
> + lrhs = rhs2;
> + use_operand_p use_p;
> + imm_use_iterator iter;
> + FOR_EACH_IMM_USE_FAST (use_p, iter, lrhs)
> + {
> + gimple *use_stmt = USE_STMT (use_p);
> + if (use_stmt == def_stmt || is_gimple_debug (use_stmt))
> + continue;
> + if (other_store_stmt)
> + goto fail;
> + other_store_stmt = use_stmt;
> + }
> + }
> +
> + if (other_store_stmt == NULL)
> + goto fail;
> + if (gimple_bb (other_store_stmt) != gimple_bb (stmt)
> + || !gimple_store_p (other_store_stmt))
> + goto fail;
> +
> + stmt_vec_info other_store_stmt_info
> + = loop_vinfo->lookup_stmt (other_store_stmt);
> + if (other_store_stmt_info == NULL
> + || (STMT_VINFO_SIMD_LANE_ACCESS_P (other_store_stmt_info)
> + != STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info)))
> + goto fail;
> +
> + gimple *stmt1 = stmt;
> + gimple *stmt2 = other_store_stmt;
> + if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) == 4 && !inscan_var_store)
> + std::swap (stmt1, stmt2);
> + if (scan_operand_equal_p (gimple_assign_lhs (stmt1),
> gimple_assign_rhs1 (load2_stmt)))
> {
> std::swap (rhs1, rhs2);
> std::swap (load1_stmt, load2_stmt);
> std::swap (load1_stmt_info, load2_stmt_info);
> }
> - if (!scan_operand_equal_p (gimple_assign_lhs (stmt),
> - gimple_assign_rhs1 (load1_stmt))
> - || !scan_operand_equal_p (gimple_assign_lhs (other_store_stmt),
> + if (!scan_operand_equal_p (gimple_assign_lhs (stmt1),
> + gimple_assign_rhs1 (load1_stmt)))
> + goto fail;
> +
> + tree var3 = NULL_TREE;
> + if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) == 3
> + && !scan_operand_equal_p (gimple_assign_lhs (stmt2),
> gimple_assign_rhs1 (load2_stmt)))
> goto fail;
> + else if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) == 4)
> + {
> + dr_vec_info *load2_dr_info = STMT_VINFO_DR_INFO (load2_stmt_info);
> + if (TREE_CODE (DR_BASE_ADDRESS (load2_dr_info->dr)) != ADDR_EXPR
> + || !VAR_P (TREE_OPERAND (DR_BASE_ADDRESS (load2_dr_info->dr), 0)))
> + goto fail;
> + var3 = TREE_OPERAND (DR_BASE_ADDRESS (load2_dr_info->dr), 0);
> + if (!lookup_attribute ("omp simd array", DECL_ATTRIBUTES (var3))
> + || lookup_attribute ("omp simd inscan", DECL_ATTRIBUTES (var3))
> + || lookup_attribute ("omp simd inscan exclusive",
> + DECL_ATTRIBUTES (var3)))
> + goto fail;
> + }
>
> dr_vec_info *other_dr_info = STMT_VINFO_DR_INFO (other_store_stmt_info);
> if (TREE_CODE (DR_BASE_ADDRESS (other_dr_info->dr)) != ADDR_EXPR
> @@ -6648,6 +6764,14 @@ check_scan_store (stmt_vec_info stmt_inf
> if (lookup_attribute ("omp simd inscan", DECL_ATTRIBUTES (var1)))
> std::swap (var1, var2);
>
> + if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) == 4)
> + {
> + if (!lookup_attribute ("omp simd inscan exclusive",
> + DECL_ATTRIBUTES (var1)))
> + goto fail;
> + var1 = var3;
> + }
> +
> if (loop_vinfo->scan_map == NULL)
> goto fail;
> tree *init = loop_vinfo->scan_map->get (var1);
> @@ -6655,6 +6779,7 @@ check_scan_store (stmt_vec_info stmt_inf
> goto fail;
>
> /* The IL is as expected, now check if we can actually vectorize it.
> + Inclusive scan:
> _26 = D.2043[_25];
> _27 = D.2042[_25];
> _28 = _26 + _27;
> @@ -6664,21 +6789,49 @@ check_scan_store (stmt_vec_info stmt_inf
> from the D.2042[_21] = 0; store):
> _30 = MEM <vector(8) int> [(int *)&D.2043];
> _31 = MEM <vector(8) int> [(int *)&D.2042];
> - _32 = VEC_PERM_EXPR <_31, _40, { 8, 0, 1, 2, 3, 4, 5, 6 }>;
> + _32 = VEC_PERM_EXPR <_40, _31, { 0, 8, 9, 10, 11, 12, 13, 14 }>;
> _33 = _31 + _32;
> // _33 = { _31[0], _31[0]+_31[1], _31[1]+_31[2], ..., _31[6]+_31[7] };
> - _34 = VEC_PERM_EXPR <_33, _40, { 8, 9, 0, 1, 2, 3, 4, 5 }>;
> + _34 = VEC_PERM_EXPR <_40, _33, { 0, 1, 8, 9, 10, 11, 12, 13 }>;
> _35 = _33 + _34;
> // _35 = { _31[0], _31[0]+_31[1], _31[0]+.._31[2], _31[0]+.._31[3],
> // _31[1]+.._31[4], ... _31[4]+.._31[7] };
> - _36 = VEC_PERM_EXPR <_35, _40, { 8, 9, 10, 11, 0, 1, 2, 3 }>;
> + _36 = VEC_PERM_EXPR <_40, _35, { 0, 1, 2, 3, 8, 9, 10, 11 }>;
> _37 = _35 + _36;
> // _37 = { _31[0], _31[0]+_31[1], _31[0]+.._31[2], _31[0]+.._31[3],
> // _31[0]+.._31[4], ... _31[0]+.._31[7] };
> _38 = _30 + _37;
> _39 = VEC_PERM_EXPR <_38, _38, { 7, 7, 7, 7, 7, 7, 7, 7 }>;
> MEM <vector(8) int> [(int *)&D.2043] = _39;
> - MEM <vector(8) int> [(int *)&D.2042] = _38; */
> + MEM <vector(8) int> [(int *)&D.2042] = _38;
> + Exclusive scan:
> + _26 = D.2043[_25];
> + D.2044[_25] = _26;
> + _27 = D.2042[_25];
> + _28 = _26 + _27;
> + D.2043[_25] = _28;
> + should be vectorized as (where _40 is the vectorized rhs
> + from the D.2042[_21] = 0; store):
> + _30 = MEM <vector(8) int> [(int *)&D.2043];
> + _31 = MEM <vector(8) int> [(int *)&D.2042];
> + _32 = VEC_PERM_EXPR <_40, _31, { 0, 8, 9, 10, 11, 12, 13, 14 }>;
> + _33 = VEC_PERM_EXPR <_40, _32, { 0, 8, 9, 10, 11, 12, 13, 14 }>;
> + _34 = _32 + _33;
> + // _34 = { 0, _31[0], _31[0]+_31[1], _31[1]+_31[2], _31[2]+_31[3],
> + // _31[3]+_31[4], ... _31[5]+.._31[6] };
> + _35 = VEC_PERM_EXPR <_40, _34, { 0, 1, 8, 9, 10, 11, 12, 13 }>;
> + _36 = _34 + _35;
> + // _36 = { 0, _31[0], _31[0]+_31[1], _31[0]+.._31[2], _31[0]+.._31[3],
> + // _31[1]+.._31[4], ... _31[3]+.._31[6] };
> + _37 = VEC_PERM_EXPR <_40, _36, { 0, 1, 2, 3, 8, 9, 10, 11 }>;
> + _38 = _36 + _37;
> + // _38 = { 0, _31[0], _31[0]+_31[1], _31[0]+.._31[2], _31[0]+.._31[3],
> + // _31[0]+.._31[4], ... _31[0]+.._31[6] };
> + _39 = _30 + _38;
> + _50 = _31 + _39;
> + _51 = VEC_PERM_EXPR <_50, _50, { 7, 7, 7, 7, 7, 7, 7, 7 }>;
> + MEM <vector(8) int> [(int *)&D.2044] = _39;
> + MEM <vector(8) int> [(int *)&D.2042] = _51; */
> enum machine_mode vec_mode = TYPE_MODE (vectype);
> optab optab = optab_for_tree_code (code, vectype, optab_default);
> if (!optab || optab_handler (optab, vec_mode) == CODE_FOR_nothing)
> @@ -6715,6 +6868,24 @@ vectorizable_scan_store (stmt_vec_info s
> tree rhs = gimple_assign_rhs1 (stmt);
> gcc_assert (TREE_CODE (rhs) == SSA_NAME);
>
> + tree var = TREE_OPERAND (DR_BASE_ADDRESS (dr_info->dr), 0);
> + bool inscan_var_store
> + = lookup_attribute ("omp simd inscan", DECL_ATTRIBUTES (var)) != NULL;
> +
> + if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) == 4 && !inscan_var_store)
> + {
> + use_operand_p use_p;
> + imm_use_iterator iter;
> + FOR_EACH_IMM_USE_FAST (use_p, iter, rhs)
> + {
> + gimple *use_stmt = USE_STMT (use_p);
> + if (use_stmt == stmt || is_gimple_debug (use_stmt))
> + continue;
> + rhs = gimple_assign_lhs (use_stmt);
> + break;
> + }
> + }
> +
> gimple *def_stmt = SSA_NAME_DEF_STMT (rhs);
> enum tree_code code = gimple_assign_rhs_code (def_stmt);
> if (code == POINTER_PLUS_EXPR)
> @@ -6737,15 +6908,12 @@ vectorizable_scan_store (stmt_vec_info s
> {
> std::swap (rhs1, rhs2);
> std::swap (var1, var2);
> + std::swap (load1_dr_info, load2_dr_info);
> }
>
> tree *init = loop_vinfo->scan_map->get (var1);
> gcc_assert (init);
>
> - tree var = TREE_OPERAND (DR_BASE_ADDRESS (dr_info->dr), 0);
> - bool inscan_var_store
> - = lookup_attribute ("omp simd inscan", DECL_ATTRIBUTES (var)) != NULL;
> -
> unsigned HOST_WIDE_INT nunits;
> if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits))
> gcc_unreachable ();
> @@ -6789,29 +6957,50 @@ vectorizable_scan_store (stmt_vec_info s
> tree vec_oprnd1 = NULL_TREE;
> tree vec_oprnd2 = NULL_TREE;
> tree vec_oprnd3 = NULL_TREE;
> - tree dataref_ptr = unshare_expr (DR_BASE_ADDRESS (dr_info->dr));
> + tree dataref_ptr = DR_BASE_ADDRESS (dr_info->dr);
> tree dataref_offset = build_int_cst (ref_type, 0);
> tree bump = vect_get_data_ptr_increment (dr_info, vectype,
> VMAT_CONTIGUOUS);
> + tree ldataref_ptr = NULL_TREE;
> tree orig = NULL_TREE;
> + if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) == 4 && !inscan_var_store)
> + ldataref_ptr = DR_BASE_ADDRESS (load1_dr_info->dr);
> for (int j = 0; j < ncopies; j++)
> {
> stmt_vec_info new_stmt_info;
> if (j == 0)
> {
> vec_oprnd1 = vect_get_vec_def_for_operand (*init, stmt_info);
> - vec_oprnd2 = vect_get_vec_def_for_operand (rhs1, stmt_info);
> + if (ldataref_ptr == NULL)
> + vec_oprnd2 = vect_get_vec_def_for_operand (rhs1, stmt_info);
> vec_oprnd3 = vect_get_vec_def_for_operand (rhs2, stmt_info);
> orig = vec_oprnd3;
> }
> else
> {
> vec_oprnd1 = vect_get_vec_def_for_stmt_copy (vinfo, vec_oprnd1);
> - vec_oprnd2 = vect_get_vec_def_for_stmt_copy (vinfo, vec_oprnd2);
> + if (ldataref_ptr == NULL)
> + vec_oprnd2 = vect_get_vec_def_for_stmt_copy (vinfo, vec_oprnd2);
> vec_oprnd3 = vect_get_vec_def_for_stmt_copy (vinfo, vec_oprnd3);
> if (!inscan_var_store)
> dataref_offset = int_const_binop (PLUS_EXPR, dataref_offset,
> bump);
> }
>
> + if (ldataref_ptr)
> + {
> + vec_oprnd2 = make_ssa_name (vectype);
> + tree data_ref = fold_build2 (MEM_REF, vectype,
> + unshare_expr (ldataref_ptr),
> + dataref_offset);
> + vect_copy_ref_info (data_ref, DR_REF (load1_dr_info->dr));
> + gimple *g = gimple_build_assign (vec_oprnd2, data_ref);
> + new_stmt_info = vect_finish_stmt_generation (stmt_info, g, gsi);
> + if (prev_stmt_info == NULL)
> + STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
> + else
> + STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info;
> + prev_stmt_info = new_stmt_info;
> + }
> +
> tree v = vec_oprnd2;
> for (int i = 0; i < units_log2; ++i)
> {
> @@ -6848,6 +7037,17 @@ vectorizable_scan_store (stmt_vec_info s
> new_temp = new_temp2;
> }
>
> + /* For exclusive scan, perform the perms[i] permutation once
> + more. */
> + if (i == 0
> + && STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) == 4
> + && v == vec_oprnd2)
> + {
> + v = new_temp;
> + --i;
> + continue;
> + }
> +
> tree new_temp2 = make_ssa_name (vectype);
> g = gimple_build_assign (new_temp2, code, v, new_temp);
> new_stmt_info = vect_finish_stmt_generation (stmt_info, g, gsi);
> @@ -6863,16 +7063,30 @@ vectorizable_scan_store (stmt_vec_info s
> STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info;
> prev_stmt_info = new_stmt_info;
>
> + tree last_perm_arg = new_temp;
> + /* For exclusive scan, new_temp computed above is the exclusive scan
> + prefix sum. Turn it into inclusive prefix sum for the broadcast
> + of the last element into orig. */
> + if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) == 4)
> + {
> + last_perm_arg = make_ssa_name (vectype);
> + g = gimple_build_assign (last_perm_arg, code, new_temp, vec_oprnd2);
> + new_stmt_info = vect_finish_stmt_generation (stmt_info, g, gsi);
> + STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info;
> + prev_stmt_info = new_stmt_info;
> + }
> +
> orig = make_ssa_name (vectype);
> - g = gimple_build_assign (orig, VEC_PERM_EXPR, new_temp, new_temp,
> - perms[units_log2]);
> + g = gimple_build_assign (orig, VEC_PERM_EXPR, last_perm_arg,
> + last_perm_arg, perms[units_log2]);
> new_stmt_info = vect_finish_stmt_generation (stmt_info, g, gsi);
> STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info;
> prev_stmt_info = new_stmt_info;
>
> if (!inscan_var_store)
> {
> - tree data_ref = fold_build2 (MEM_REF, vectype, dataref_ptr,
> + tree data_ref = fold_build2 (MEM_REF, vectype,
> + unshare_expr (dataref_ptr),
> dataref_offset);
> vect_copy_ref_info (data_ref, DR_REF (dr_info->dr));
> g = gimple_build_assign (data_ref, new_temp);
> @@ -6888,7 +7102,8 @@ vectorizable_scan_store (stmt_vec_info s
> if (j != 0)
> dataref_offset = int_const_binop (PLUS_EXPR, dataref_offset, bump);
>
> - tree data_ref = fold_build2 (MEM_REF, vectype, dataref_ptr,
> + tree data_ref = fold_build2 (MEM_REF, vectype,
> + unshare_expr (dataref_ptr),
> dataref_offset);
> vect_copy_ref_info (data_ref, DR_REF (dr_info->dr));
> gimple *g = gimple_build_assign (data_ref, orig);
> @@ -7325,7 +7540,7 @@ vectorizable_store (stmt_vec_info stmt_i
> }
> return true;
> }
> - else if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) == 3)
> + else if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) >= 3)
> return vectorizable_scan_store (stmt_info, gsi, vec_stmt, ncopies);
>
> if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
> --- gcc/testsuite/gcc.dg/vect/vect-simd-12.c.jj 2019-06-20 15:08:50.260400440 +0200
> +++ gcc/testsuite/gcc.dg/vect/vect-simd-12.c 2019-06-20 15:08:24.332805239 +0200
> @@ -0,0 +1,122 @@
> +/* { dg-require-effective-target size32plus } */
> +/* { dg-additional-options "-fopenmp-simd" } */
> +/* { dg-additional-options "-mavx" { target avx_runtime } } */
> +/* { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 2 "vect" { target i?86-*-* x86_64-*-* } } } */
> +
> +#ifndef main
> +#include "tree-vect.h"
> +#endif
> +
> +int r, a[1024], b[1024];
> +
> +__attribute__((noipa)) void
> +foo (int *a, int *b)
> +{
> + #pragma omp simd reduction (inscan, +:r)
> + for (int i = 0; i < 1024; i++)
> + {
> + b[i] = r;
> + #pragma omp scan exclusive(r)
> + r += a[i];
> + }
> +}
> +
> +__attribute__((noipa)) int
> +bar (void)
> +{
> + int s = 0;
> + #pragma omp simd reduction (inscan, +:s)
> + for (int i = 0; i < 1024; i++)
> + {
> + b[i] = s;
> + #pragma omp scan exclusive(s)
> + s += 2 * a[i];
> + }
> + return s;
> +}
> +
> +__attribute__((noipa)) void
> +baz (int *a, int *b)
> +{
> + #pragma omp simd reduction (inscan, +:r) if (simd: 0)
> + for (int i = 0; i < 1024; i++)
> + {
> + b[i] = r;
> + #pragma omp scan exclusive(r)
> + r += a[i];
> + }
> +}
> +
> +__attribute__((noipa)) int
> +qux (void)
> +{
> + int s = 0;
> + #pragma omp simd reduction (inscan, +:s) simdlen (1)
> + for (int i = 0; i < 1024; i++)
> + {
> + b[i] = s;
> + #pragma omp scan exclusive(s)
> + s += 2 * a[i];
> + }
> + return s;
> +}
> +
> +int
> +main ()
> +{
> + int s = 0;
> +#ifndef main
> + check_vect ();
> +#endif
> + for (int i = 0; i < 1024; ++i)
> + {
> + a[i] = i;
> + b[i] = -1;
> + asm ("" : "+g" (i));
> + }
> + foo (a, b);
> + if (r != 1024 * 1023 / 2)
> + abort ();
> + for (int i = 0; i < 1024; ++i)
> + {
> + if (b[i] != s)
> + abort ();
> + else
> + b[i] = 25;
> + s += i;
> + }
> + if (bar () != 1024 * 1023)
> + abort ();
> + s = 0;
> + for (int i = 0; i < 1024; ++i)
> + {
> + if (b[i] != s)
> + abort ();
> + else
> + b[i] = -1;
> + s += 2 * i;
> + }
> + r = 0;
> + baz (a, b);
> + if (r != 1024 * 1023 / 2)
> + abort ();
> + s = 0;
> + for (int i = 0; i < 1024; ++i)
> + {
> + if (b[i] != s)
> + abort ();
> + else
> + b[i] = -25;
> + s += i;
> + }
> + if (qux () != 1024 * 1023)
> + abort ();
> + s = 0;
> + for (int i = 0; i < 1024; ++i)
> + {
> + if (b[i] != s)
> + abort ();
> + s += 2 * i;
> + }
> + return 0;
> +}
> --- gcc/testsuite/gcc.dg/vect/vect-simd-13.c.jj 2019-06-20 15:47:23.580359715
> +0200
> +++ gcc/testsuite/gcc.dg/vect/vect-simd-13.c 2019-06-20 15:13:23.500134387
> +0200
> @@ -0,0 +1,124 @@
> +/* { dg-require-effective-target size32plus } */
> +/* { dg-additional-options "-fopenmp-simd" } */
> +/* { dg-additional-options "-mavx" { target avx_runtime } } */
> +/* { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 2 "vect" { target i?86-*-* x86_64-*-* } } } */
> +
> +#ifndef main
> +#include "tree-vect.h"
> +#endif
> +
> +int r, a[1024], b[1024];
> +
> +#pragma omp declare reduction (foo: int: omp_out += omp_in) initializer (omp_priv = 0)
> +
> +__attribute__((noipa)) void
> +foo (int *a, int *b)
> +{
> + #pragma omp simd reduction (inscan, foo:r)
> + for (int i = 0; i < 1024; i++)
> + {
> + b[i] = r;
> + #pragma omp scan exclusive(r)
> + r += a[i];
> + }
> +}
> +
> +__attribute__((noipa)) int
> +bar (void)
> +{
> + int s = 0;
> + #pragma omp simd reduction (inscan, foo:s)
> + for (int i = 0; i < 1024; i++)
> + {
> + b[i] = s;
> + #pragma omp scan exclusive(s)
> + s += 2 * a[i];
> + }
> + return s;
> +}
> +
> +__attribute__((noipa)) void
> +baz (int *a, int *b)
> +{
> + #pragma omp simd reduction (inscan, foo:r) if (simd: 0)
> + for (int i = 0; i < 1024; i++)
> + {
> + b[i] = r;
> + #pragma omp scan exclusive(r)
> + r += a[i];
> + }
> +}
> +
> +__attribute__((noipa)) int
> +qux (void)
> +{
> + int s = 0;
> + #pragma omp simd reduction (inscan, foo:s) simdlen (1)
> + for (int i = 0; i < 1024; i++)
> + {
> + b[i] = s;
> + #pragma omp scan exclusive(s)
> + s += 2 * a[i];
> + }
> + return s;
> +}
> +
> +int
> +main ()
> +{
> + int s = 0;
> +#ifndef main
> + check_vect ();
> +#endif
> + for (int i = 0; i < 1024; ++i)
> + {
> + a[i] = i;
> + b[i] = -1;
> + asm ("" : "+g" (i));
> + }
> + foo (a, b);
> + if (r != 1024 * 1023 / 2)
> + abort ();
> + for (int i = 0; i < 1024; ++i)
> + {
> + if (b[i] != s)
> + abort ();
> + else
> + b[i] = 25;
> + s += i;
> + }
> + if (bar () != 1024 * 1023)
> + abort ();
> + s = 0;
> + for (int i = 0; i < 1024; ++i)
> + {
> + if (b[i] != s)
> + abort ();
> + else
> + b[i] = -1;
> + s += 2 * i;
> + }
> + r = 0;
> + baz (a, b);
> + if (r != 1024 * 1023 / 2)
> + abort ();
> + s = 0;
> + for (int i = 0; i < 1024; ++i)
> + {
> + if (b[i] != s)
> + abort ();
> + else
> + b[i] = -25;
> + s += i;
> + }
> + if (qux () != 1024 * 1023)
> + abort ();
> + s = 0;
> + for (int i = 0; i < 1024; ++i)
> + {
> + if (b[i] != s)
> + abort ();
> + s += 2 * i;
> + }
> + return 0;
> +}
> --- gcc/testsuite/gcc.dg/vect/vect-simd-14.c.jj 2019-06-20 15:48:30.536321539
> +0200
> +++ gcc/testsuite/gcc.dg/vect/vect-simd-14.c 2019-06-20 15:54:39.291617792
> +0200
> @@ -0,0 +1,94 @@
> +/* { dg-require-effective-target size32plus } */
> +/* { dg-additional-options "-fopenmp-simd" } */
> +/* { dg-additional-options "-mavx" { target avx_runtime } } */
> +/* { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 2 "vect" { target i?86-*-* x86_64-*-* } } } */
> +
> +#ifndef main
> +#include "tree-vect.h"
> +#endif
> +
> +float r = 1.0f, a[1024], b[1024];
> +
> +__attribute__((noipa)) void
> +foo (float *a, float *b)
> +{
> + #pragma omp simd reduction (inscan, *:r)
> + for (int i = 0; i < 1024; i++)
> + {
> + b[i] = r;
> + #pragma omp scan exclusive(r)
> + r *= a[i];
> + }
> +}
> +
> +__attribute__((noipa)) float
> +bar (void)
> +{
> + float s = -__builtin_inff ();
> + #pragma omp simd reduction (inscan, max:s)
> + for (int i = 0; i < 1024; i++)
> + {
> + b[i] = s;
> + #pragma omp scan exclusive(s)
> + s = s > a[i] ? s : a[i];
> + }
> + return s;
> +}
> +
> +int
> +main ()
> +{
> + float s = 1.0f;
> +#ifndef main
> + check_vect ();
> +#endif
> + for (int i = 0; i < 1024; ++i)
> + {
> + if (i < 80)
> + a[i] = (i & 1) ? 0.25f : 0.5f;
> + else if (i < 200)
> + a[i] = (i % 3) == 0 ? 2.0f : (i % 3) == 1 ? 4.0f : 1.0f;
> + else if (i < 280)
> + a[i] = (i & 1) ? 0.25f : 0.5f;
> + else if (i < 380)
> + a[i] = (i % 3) == 0 ? 2.0f : (i % 3) == 1 ? 4.0f : 1.0f;
> + else
> + switch (i % 6)
> + {
> + case 0: a[i] = 0.25f; break;
> + case 1: a[i] = 2.0f; break;
> + case 2: a[i] = -1.0f; break;
> + case 3: a[i] = -4.0f; break;
> + case 4: a[i] = 0.5f; break;
> + case 5: a[i] = 1.0f; break;
> + default: a[i] = 0.0f; break;
> + }
> + b[i] = -19.0f;
> + asm ("" : "+g" (i));
> + }
> + foo (a, b);
> + if (r * 16384.0f != 0.125f)
> + abort ();
> + float m = -175.25f;
> + for (int i = 0; i < 1024; ++i)
> + {
> + if (b[i] != s)
> + abort ();
> + else
> + b[i] = -231.75f;
> + s *= a[i];
> + a[i] = m - ((i % 3) == 1 ? 2.0f : (i % 3) == 2 ? 4.0f : 0.0f);
> + m += 0.75f;
> + }
> + if (bar () != 592.0f)
> + abort ();
> + s = -__builtin_inff ();
> + for (int i = 0; i < 1024; ++i)
> + {
> + if (b[i] != s)
> + abort ();
> + if (s < a[i])
> + s = a[i];
> + }
> + return 0;
> +}
Hi,
I've noticed that this new test (gcc.dg/vect/vect-simd-14.c)
fails at execution time on arm targets.
It does pass on aarch64.
Christophe
> --- gcc/testsuite/gcc.dg/vect/vect-simd-15.c.jj 2019-06-20 15:50:34.483399705
> +0200
> +++ gcc/testsuite/gcc.dg/vect/vect-simd-15.c 2019-06-20 15:52:09.976919050
> +0200
> @@ -0,0 +1,186 @@
> +/* { dg-require-effective-target size32plus } */
> +/* { dg-additional-options "-fopenmp-simd" } */
> +/* { dg-additional-options "-mavx" { target avx_runtime } } */
> +/* { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 2 "vect" { target i?86-*-* x86_64-*-* } } } */
> +
> +#ifndef main
> +#include "tree-vect.h"
> +#endif
> +
> +int r, a[1024], b[1024];
> +unsigned short r2, b2[1024];
> +unsigned char r3, b3[1024];
> +
> +__attribute__((noipa)) void
> +foo (int *a, int *b, unsigned short *b2, unsigned char *b3)
> +{
> + #pragma omp simd reduction (inscan, +:r, r2, r3)
> + for (int i = 0; i < 1024; i++)
> + {
> + {
> + b[i] = r;
> + b2[i] = r2;
> + b3[i] = r3;
> + }
> + #pragma omp scan exclusive(r, r2, r3)
> + { r += a[i]; r2 += a[i]; r3 += a[i]; }
> + }
> +}
> +
> +__attribute__((noipa)) int
> +bar (unsigned short *s2p, unsigned char *s3p)
> +{
> + int s = 0;
> + unsigned short s2 = 0;
> + unsigned char s3 = 0;
> + #pragma omp simd reduction (inscan, +:s, s2, s3)
> + for (int i = 0; i < 1024; i++)
> + {
> + { b[i] = s; b2[i] = s2; b3[i] = s3; }
> + #pragma omp scan exclusive(s, s2, s3)
> + {
> + s += 2 * a[i];
> + s2 += 2 * a[i];
> + s3 += 2 * a[i];
> + }
> + }
> + *s2p = s2;
> + *s3p = s3;
> + return s;
> +}
> +
> +__attribute__((noipa)) void
> +baz (int *a, int *b, unsigned short *b2, unsigned char *b3)
> +{
> + #pragma omp simd reduction (inscan, +:r, r2, r3) if (simd: 0)
> + for (int i = 0; i < 1024; i++)
> + {
> + {
> + b[i] = r;
> + b2[i] = r2;
> + b3[i] = r3;
> + }
> + #pragma omp scan exclusive(r, r2, r3)
> + {
> + r += a[i];
> + r2 += a[i];
> + r3 += a[i];
> + }
> + }
> +}
> +
> +__attribute__((noipa)) int
> +qux (unsigned short *s2p, unsigned char *s3p)
> +{
> + int s = 0;
> + unsigned short s2 = 0;
> + unsigned char s3 = 0;
> + #pragma omp simd reduction (inscan, +:s, s2, s3) simdlen (1)
> + for (int i = 0; i < 1024; i++)
> + {
> + { b[i] = s; b2[i] = s2; b3[i] = s3; }
> + #pragma omp scan exclusive(s, s2, s3)
> + { s += 2 * a[i]; s2 += 2 * a[i]; s3 += 2 * a[i]; }
> + }
> + *s2p = s2;
> + *s3p = s3;
> + return s;
> +}
> +
> +int
> +main ()
> +{
> + int s = 0;
> + unsigned short s2;
> + unsigned char s3;
> +#ifndef main
> + check_vect ();
> +#endif
> + for (int i = 0; i < 1024; ++i)
> + {
> + a[i] = i;
> + b[i] = -1;
> + b2[i] = -1;
> + b3[i] = -1;
> + asm ("" : "+g" (i));
> + }
> + foo (a, b, b2, b3);
> + if (r != 1024 * 1023 / 2
> + || r2 != (unsigned short) r
> + || r3 != (unsigned char) r)
> + abort ();
> + for (int i = 0; i < 1024; ++i)
> + {
> + if (b[i] != s
> + || b2[i] != (unsigned short) s
> + || b3[i] != (unsigned char) s)
> + abort ();
> + else
> + {
> + b[i] = 25;
> + b2[i] = 24;
> + b3[i] = 26;
> + }
> + s += i;
> + }
> + if (bar (&s2, &s3) != 1024 * 1023)
> + abort ();
> + if (s2 != (unsigned short) (1024 * 1023)
> + || s3 != (unsigned char) (1024 * 1023))
> + abort ();
> + s = 0;
> + for (int i = 0; i < 1024; ++i)
> + {
> + if (b[i] != s
> + || b2[i] != (unsigned short) s
> + || b3[i] != (unsigned char) s)
> + abort ();
> + else
> + {
> + b[i] = -1;
> + b2[i] = -1;
> + b3[i] = -1;
> + }
> + s += 2 * i;
> + }
> + r = 0;
> + r2 = 0;
> + r3 = 0;
> + baz (a, b, b2, b3);
> + if (r != 1024 * 1023 / 2
> + || r2 != (unsigned short) r
> + || r3 != (unsigned char) r)
> + abort ();
> + s = 0;
> + for (int i = 0; i < 1024; ++i)
> + {
> + if (b[i] != s
> + || b2[i] != (unsigned short) s
> + || b3[i] != (unsigned char) s)
> + abort ();
> + else
> + {
> + b[i] = 25;
> + b2[i] = 24;
> + b3[i] = 26;
> + }
> + s += i;
> + }
> + s2 = 0;
> + s3 = 0;
> + if (qux (&s2, &s3) != 1024 * 1023)
> + abort ();
> + if (s2 != (unsigned short) (1024 * 1023)
> + || s3 != (unsigned char) (1024 * 1023))
> + abort ();
> + s = 0;
> + for (int i = 0; i < 1024; ++i)
> + {
> + if (b[i] != s
> + || b2[i] != (unsigned short) s
> + || b3[i] != (unsigned char) s)
> + abort ();
> + s += 2 * i;
> + }
> + return 0;
> +}
> --- gcc/testsuite/gcc.target/i386/sse2-vect-simd-12.c.jj 2019-06-20
> 15:58:35.276983324 +0200
> +++ gcc/testsuite/gcc.target/i386/sse2-vect-simd-12.c 2019-06-20
> 15:58:35.274983355 +0200
> @@ -0,0 +1,16 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -fopenmp-simd -msse2 -mno-sse3 -fdump-tree-vect-details" } */
> +/* { dg-require-effective-target sse2 } */
> +/* { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 2 "vect" } } */
> +
> +#include "sse2-check.h"
> +
> +#define main() do_main ()
> +
> +#include "../../gcc.dg/vect/vect-simd-12.c"
> +
> +static void
> +sse2_test (void)
> +{
> + do_main ();
> +}
> --- gcc/testsuite/gcc.target/i386/sse2-vect-simd-13.c.jj 2019-06-20
> 15:58:35.283983216 +0200
> +++ gcc/testsuite/gcc.target/i386/sse2-vect-simd-13.c 2019-06-20
> 15:58:35.281983247 +0200
> @@ -0,0 +1,16 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -fopenmp-simd -msse2 -mno-sse3 -fdump-tree-vect-details" } */
> +/* { dg-require-effective-target sse2 } */
> +/* { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 2 "vect" } } */
> +
> +#include "sse2-check.h"
> +
> +#define main() do_main ()
> +
> +#include "../../gcc.dg/vect/vect-simd-13.c"
> +
> +static void
> +sse2_test (void)
> +{
> + do_main ();
> +}
> --- gcc/testsuite/gcc.target/i386/sse2-vect-simd-14.c.jj 2019-06-20
> 15:58:35.288983139 +0200
> +++ gcc/testsuite/gcc.target/i386/sse2-vect-simd-14.c 2019-06-20
> 15:58:35.287983154 +0200
> @@ -0,0 +1,15 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -fopenmp-simd -msse2 -mno-sse3 -fdump-tree-vect-details" } */
> +/* { dg-require-effective-target sse2 } */
> +
> +#include "sse2-check.h"
> +
> +#define main() do_main ()
> +
> +#include "../../gcc.dg/vect/vect-simd-14.c"
> +
> +static void
> +sse2_test (void)
> +{
> + do_main ();
> +}
> --- gcc/testsuite/gcc.target/i386/sse2-vect-simd-15.c.jj 2019-06-20
> 15:58:35.293983061 +0200
> +++ gcc/testsuite/gcc.target/i386/sse2-vect-simd-15.c 2019-06-20
> 15:58:35.292983077 +0200
> @@ -0,0 +1,16 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -fopenmp-simd -msse2 -mno-sse3 -fdump-tree-vect-details" } */
> +/* { dg-require-effective-target sse2 } */
> +/* { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 2 "vect" } } */
> +
> +#include "sse2-check.h"
> +
> +#define main() do_main ()
> +
> +#include "../../gcc.dg/vect/vect-simd-15.c"
> +
> +static void
> +sse2_test (void)
> +{
> + do_main ();
> +}
> --- gcc/testsuite/gcc.target/i386/avx2-vect-simd-12.c.jj 2019-06-20
> 15:58:35.299982969 +0200
> +++ gcc/testsuite/gcc.target/i386/avx2-vect-simd-12.c 2019-06-20
> 15:58:35.297982999 +0200
> @@ -0,0 +1,16 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -fopenmp-simd -mavx2 -fdump-tree-vect-details" } */
> +/* { dg-require-effective-target avx2 } */
> +/* { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 2 "vect" } } */
> +
> +#include "avx2-check.h"
> +
> +#define main() do_main ()
> +
> +#include "../../gcc.dg/vect/vect-simd-12.c"
> +
> +static void
> +avx2_test (void)
> +{
> + do_main ();
> +}
> --- gcc/testsuite/gcc.target/i386/avx2-vect-simd-13.c.jj 2019-06-20
> 15:58:35.305982876 +0200
> +++ gcc/testsuite/gcc.target/i386/avx2-vect-simd-13.c 2019-06-20
> 15:58:35.303982907 +0200
> @@ -0,0 +1,16 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -fopenmp-simd -mavx2 -fdump-tree-vect-details" } */
> +/* { dg-require-effective-target avx2 } */
> +/* { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 2 "vect" } } */
> +
> +#include "avx2-check.h"
> +
> +#define main() do_main ()
> +
> +#include "../../gcc.dg/vect/vect-simd-13.c"
> +
> +static void
> +avx2_test (void)
> +{
> + do_main ();
> +}
> --- gcc/testsuite/gcc.target/i386/avx2-vect-simd-14.c.jj 2019-06-20
> 15:58:35.310982799 +0200
> +++ gcc/testsuite/gcc.target/i386/avx2-vect-simd-14.c 2019-06-20
> 15:58:35.309982815 +0200
> @@ -0,0 +1,16 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -fopenmp-simd -mavx2 -fdump-tree-vect-details" } */
> +/* { dg-require-effective-target avx2 } */
> +/* { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 2 "vect" } } */
> +
> +#include "avx2-check.h"
> +
> +#define main() do_main ()
> +
> +#include "../../gcc.dg/vect/vect-simd-14.c"
> +
> +static void
> +avx2_test (void)
> +{
> + do_main ();
> +}
> --- gcc/testsuite/gcc.target/i386/avx2-vect-simd-15.c.jj 2019-06-20
> 15:58:35.316982707 +0200
> +++ gcc/testsuite/gcc.target/i386/avx2-vect-simd-15.c 2019-06-20
> 15:58:35.314982738 +0200
> @@ -0,0 +1,16 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -fopenmp-simd -mavx2 -fdump-tree-vect-details" } */
> +/* { dg-require-effective-target avx2 } */
> +/* { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 2 "vect" } } */
> +
> +#include "avx2-check.h"
> +
> +#define main() do_main ()
> +
> +#include "../../gcc.dg/vect/vect-simd-15.c"
> +
> +static void
> +avx2_test (void)
> +{
> + do_main ();
> +}
> --- gcc/testsuite/gcc.target/i386/avx512f-vect-simd-12.c.jj 2019-06-20
> 15:58:35.323982599 +0200
> +++ gcc/testsuite/gcc.target/i386/avx512f-vect-simd-12.c 2019-06-20
> 15:58:35.321982630 +0200
> @@ -0,0 +1,16 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -fopenmp-simd -mavx512f -mprefer-vector-width=512 -fdump-tree-vect-details" } */
> +/* { dg-require-effective-target avx512f } */
> +/* { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 2 "vect" } } */
> +
> +#include "avx512f-check.h"
> +
> +#define main() do_main ()
> +
> +#include "../../gcc.dg/vect/vect-simd-12.c"
> +
> +static void
> +avx512f_test (void)
> +{
> + do_main ();
> +}
> --- gcc/testsuite/gcc.target/i386/avx512f-vect-simd-13.c.jj 2019-06-20
> 15:58:35.328982522 +0200
> +++ gcc/testsuite/gcc.target/i386/avx512f-vect-simd-13.c 2019-06-20
> 15:58:35.326982553 +0200
> @@ -0,0 +1,16 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -fopenmp-simd -mavx512f -mprefer-vector-width=512 -fdump-tree-vect-details" } */
> +/* { dg-require-effective-target avx512f } */
> +/* { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 2 "vect" } } */
> +
> +#include "avx512f-check.h"
> +
> +#define main() do_main ()
> +
> +#include "../../gcc.dg/vect/vect-simd-13.c"
> +
> +static void
> +avx512f_test (void)
> +{
> + do_main ();
> +}
> --- gcc/testsuite/gcc.target/i386/avx512f-vect-simd-14.c.jj 2019-06-20
> 15:58:35.333982445 +0200
> +++ gcc/testsuite/gcc.target/i386/avx512f-vect-simd-14.c 2019-06-20
> 15:58:35.332982461 +0200
> @@ -0,0 +1,16 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -fopenmp-simd -mavx512f -mprefer-vector-width=512 -fdump-tree-vect-details" } */
> +/* { dg-require-effective-target avx512f } */
> +/* { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 2 "vect" } } */
> +
> +#include "avx512f-check.h"
> +
> +#define main() do_main ()
> +
> +#include "../../gcc.dg/vect/vect-simd-14.c"
> +
> +static void
> +avx512f_test (void)
> +{
> + do_main ();
> +}
> --- gcc/testsuite/gcc.target/i386/avx512bw-vect-simd-15.c.jj 2019-06-20
> 15:58:35.347982230 +0200
> +++ gcc/testsuite/gcc.target/i386/avx512bw-vect-simd-15.c 2019-06-20
> 15:58:35.346982245 +0200
> @@ -0,0 +1,16 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -fopenmp-simd -mavx512bw -mprefer-vector-width=512 -fdump-tree-vect-details" } */
> +/* { dg-require-effective-target avx512bw } */
> +/* { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 2 "vect" } } */
> +
> +#include "avx512bw-check.h"
> +
> +#define main() do_main ()
> +
> +#include "../../gcc.dg/vect/vect-simd-15.c"
> +
> +static void
> +avx512bw_test (void)
> +{
> + do_main ();
> +}
> --- gcc/testsuite/g++.dg/vect/simd-6.cc.jj 2019-06-20 16:00:34.800142524
> +0200
> +++ gcc/testsuite/g++.dg/vect/simd-6.cc 2019-06-20 16:07:41.722559826 +0200
> @@ -0,0 +1,161 @@
> +// { dg-require-effective-target size32plus }
> +// { dg-additional-options "-fopenmp-simd" }
> +// { dg-additional-options "-mavx" { target avx_runtime } }
> +// { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 2 "vect" { xfail *-*-* } } }
> +
> +#include "../../gcc.dg/vect/tree-vect.h"
> +
> +template <typename T>
> +struct S {
> + inline S ();
> + inline ~S ();
> + inline S (const S &);
> + inline S & operator= (const S &);
> + T s;
> +};
> +
> +template <typename T>
> +S<T>::S () : s (0)
> +{
> +}
> +
> +template <typename T>
> +S<T>::~S ()
> +{
> +}
> +
> +template <typename T>
> +S<T>::S (const S &x)
> +{
> + s = x.s;
> +}
> +
> +template <typename T>
> +S<T> &
> +S<T>::operator= (const S &x)
> +{
> + s = x.s;
> + return *this;
> +}
> +
> +template <typename T>
> +static inline void
> +ini (S<T> &x)
> +{
> + x.s = 0;
> +}
> +
> +S<int> r, a[1024], b[1024];
> +
> +#pragma omp declare reduction (+: S<int>: omp_out.s += omp_in.s)
> +#pragma omp declare reduction (plus: S<int>: omp_out.s += omp_in.s) initializer (ini (omp_priv))
> +
> +template <typename T>
> +__attribute__((noipa)) void
> +foo (S<T> *a, S<T> *b)
> +{
> + #pragma omp simd reduction (inscan, +:r)
> + for (int i = 0; i < 1024; i++)
> + {
> + b[i] = r;
> + #pragma omp scan exclusive(r)
> + r.s += a[i].s;
> + }
> +}
> +
> +template <typename T>
> +__attribute__((noipa)) S<T>
> +bar (void)
> +{
> + S<T> s;
> + #pragma omp simd reduction (inscan, plus:s)
> + for (int i = 0; i < 1024; i++)
> + {
> + b[i] = s;
> + #pragma omp scan exclusive(s)
> + s.s += 2 * a[i].s;
> + }
> + return S<T> (s);
> +}
> +
> +__attribute__((noipa)) void
> +baz (S<int> *a, S<int> *b)
> +{
> + #pragma omp simd reduction (inscan, +:r) simdlen(1)
> + for (int i = 0; i < 1024; i++)
> + {
> + b[i] = r;
> + #pragma omp scan exclusive(r)
> + r.s += a[i].s;
> + }
> +}
> +
> +__attribute__((noipa)) S<int>
> +qux (void)
> +{
> + S<int> s;
> + #pragma omp simd if (0) reduction (inscan, plus:s)
> + for (int i = 0; i < 1024; i++)
> + {
> + b[i] = s;
> + #pragma omp scan exclusive(s)
> + s.s += 2 * a[i].s;
> + }
> + return S<int> (s);
> +}
> +
> +int
> +main ()
> +{
> + S<int> s;
> + check_vect ();
> + for (int i = 0; i < 1024; ++i)
> + {
> + a[i].s = i;
> + b[i].s = -1;
> + asm ("" : "+g" (i));
> + }
> + foo (a, b);
> + if (r.s != 1024 * 1023 / 2)
> + abort ();
> + for (int i = 0; i < 1024; ++i)
> + {
> + if (b[i].s != s.s)
> + abort ();
> + else
> + b[i].s = 25;
> + s.s += i;
> + }
> + if (bar<int> ().s != 1024 * 1023)
> + abort ();
> + s.s = 0;
> + for (int i = 0; i < 1024; ++i)
> + {
> + if (b[i].s != s.s)
> + abort ();
> + s.s += 2 * i;
> + }
> + r.s = 0;
> + baz (a, b);
> + if (r.s != 1024 * 1023 / 2)
> + abort ();
> + s.s = 0;
> + for (int i = 0; i < 1024; ++i)
> + {
> + if (b[i].s != s.s)
> + abort ();
> + else
> + b[i].s = 25;
> + s.s += i;
> + }
> + if (qux ().s != 1024 * 1023)
> + abort ();
> + s.s = 0;
> + for (int i = 0; i < 1024; ++i)
> + {
> + if (b[i].s != s.s)
> + abort ();
> + s.s += 2 * i;
> + }
> + return 0;
> +}
> --- gcc/testsuite/g++.dg/vect/simd-7.cc.jj 2019-06-20 16:00:51.095891542
> +0200
> +++ gcc/testsuite/g++.dg/vect/simd-7.cc 2019-06-20 16:12:50.222747875 +0200
> @@ -0,0 +1,124 @@
> +// { dg-require-effective-target size32plus }
> +// { dg-additional-options "-fopenmp-simd" }
> +// { dg-additional-options "-mavx" { target avx_runtime } }
> +// { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 2 "vect" { target i?86-*-* x86_64-*-* } } } */
> +
> +#include "../../gcc.dg/vect/tree-vect.h"
> +
> +int r, a[1024], b[1024], q;
> +
> +template <typename T, typename U>
> +__attribute__((noipa)) void
> +foo (T a, T b, U r)
> +{
> + #pragma omp simd reduction (inscan, +:r)
> + for (int i = 0; i < 1024; i++)
> + {
> + b[i] = r;
> + #pragma omp scan exclusive(r)
> + r += a[i];
> + }
> +}
> +
> +template <typename T>
> +__attribute__((noipa)) T
> +bar (void)
> +{
> + T &s = q;
> + q = 0;
> + #pragma omp simd reduction (inscan, +:s)
> + for (int i = 0; i < 1024; i++)
> + {
> + b[i] = s;
> + #pragma omp scan exclusive(s)
> + s += 2 * a[i];
> + }
> + return s;
> +}
> +
> +template <typename T>
> +__attribute__((noipa)) void
> +baz (T *a, T *b, T &r)
> +{
> + #pragma omp simd reduction (inscan, +:r) if (simd: 0)
> + for (T i = 0; i < 1024; i++)
> + {
> + b[i] = r;
> + #pragma omp scan exclusive(r)
> + r += a[i];
> + }
> +}
> +
> +template <typename T>
> +__attribute__((noipa)) int
> +qux (void)
> +{
> + T s = q;
> + q = 0;
> + #pragma omp simd reduction (inscan, +:s) simdlen (1)
> + for (int i = 0; i < 1024; i++)
> + {
> + b[i] = s;
> + #pragma omp scan exclusive(s)
> + s += 2 * a[i];
> + }
> + return s;
> +}
> +
> +int
> +main ()
> +{
> + int s = 0;
> + check_vect ();
> + for (int i = 0; i < 1024; ++i)
> + {
> + a[i] = i;
> + b[i] = -1;
> + asm ("" : "+g" (i));
> + }
> + foo<int *, int &> (a, b, r);
> + if (r != 1024 * 1023 / 2)
> + abort ();
> + for (int i = 0; i < 1024; ++i)
> + {
> + if (b[i] != s)
> + abort ();
> + else
> + b[i] = 25;
> + s += i;
> + }
> + if (bar<int> () != 1024 * 1023)
> + abort ();
> + s = 0;
> + for (int i = 0; i < 1024; ++i)
> + {
> + if (b[i] != s)
> + abort ();
> + else
> + b[i] = -1;
> + s += 2 * i;
> + }
> + r = 0;
> + baz<int> (a, b, r);
> + if (r != 1024 * 1023 / 2)
> + abort ();
> + s = 0;
> + for (int i = 0; i < 1024; ++i)
> + {
> + if (b[i] != s)
> + abort ();
> + else
> + b[i] = -25;
> + s += i;
> + }
> + if (qux<int &> () != 1024 * 1023)
> + abort ();
> + s = 0;
> + for (int i = 0; i < 1024; ++i)
> + {
> + if (b[i] != s)
> + abort ();
> + s += 2 * i;
> + }
> + return 0;
> +}
> --- gcc/testsuite/g++.dg/vect/simd-8.cc.jj 2019-06-20 16:00:54.154844430
> +0200
> +++ gcc/testsuite/g++.dg/vect/simd-8.cc 2019-06-20 16:15:37.994133891 +0200
> @@ -0,0 +1,122 @@
> +// { dg-require-effective-target size32plus }
> +// { dg-additional-options "-fopenmp-simd" }
> +// { dg-additional-options "-mavx" { target avx_runtime } }
> +// { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 2 "vect" { target i?86-*-* x86_64-*-* } } }
> +
> +#include "../../gcc.dg/vect/tree-vect.h"
> +
> +int r, a[1024], b[1024], q;
> +
> +#pragma omp declare reduction (foo: int: omp_out += omp_in) initializer (omp_priv = 0)
> +
> +__attribute__((noipa)) void
> +foo (int *a, int *b, int &r)
> +{
> + #pragma omp simd reduction (inscan, foo:r)
> + for (int i = 0; i < 1024; i++)
> + {
> + b[i] = r;
> + #pragma omp scan exclusive(r)
> + r += a[i];
> + }
> +}
> +
> +__attribute__((noipa)) int
> +bar (void)
> +{
> + int &s = q;
> + q = 0;
> + #pragma omp simd reduction (inscan, foo:s)
> + for (int i = 0; i < 1024; i++)
> + {
> + b[i] = s;
> + #pragma omp scan exclusive(s)
> + s += 2 * a[i];
> + }
> + return s;
> +}
> +
> +__attribute__((noipa)) void
> +baz (int *a, int *b, int &r)
> +{
> + #pragma omp simd reduction (inscan, foo:r) if (simd: 0)
> + for (int i = 0; i < 1024; i++)
> + {
> + b[i] = r;
> + #pragma omp scan exclusive(r)
> + r += a[i];
> + }
> +}
> +
> +__attribute__((noipa)) int
> +qux (void)
> +{
> + int &s = q;
> + q = 0;
> + #pragma omp simd reduction (inscan, foo:s) simdlen (1)
> + for (int i = 0; i < 1024; i++)
> + {
> + b[i] = s;
> + #pragma omp scan exclusive(s)
> + s += 2 * a[i];
> + }
> + return s;
> +}
> +
> +int
> +main ()
> +{
> + int s = 0;
> + check_vect ();
> + for (int i = 0; i < 1024; ++i)
> + {
> + a[i] = i;
> + b[i] = -1;
> + asm ("" : "+g" (i));
> + }
> + foo (a, b, r);
> + if (r != 1024 * 1023 / 2)
> + abort ();
> + for (int i = 0; i < 1024; ++i)
> + {
> + if (b[i] != s)
> + abort ();
> + else
> + b[i] = 25;
> + s += i;
> + }
> + if (bar () != 1024 * 1023)
> + abort ();
> + s = 0;
> + for (int i = 0; i < 1024; ++i)
> + {
> + if (b[i] != s)
> + abort ();
> + else
> + b[i] = -1;
> + s += 2 * i;
> + }
> + r = 0;
> + baz (a, b, r);
> + if (r != 1024 * 1023 / 2)
> + abort ();
> + s = 0;
> + for (int i = 0; i < 1024; ++i)
> + {
> + if (b[i] != s)
> + abort ();
> + else
> + b[i] = -25;
> + s += i;
> + }
> + if (qux () != 1024 * 1023)
> + abort ();
> + s = 0;
> + for (int i = 0; i < 1024; ++i)
> + {
> + if (b[i] != s)
> + abort ();
> + s += 2 * i;
> + }
> + return 0;
> +}
> --- gcc/testsuite/g++.dg/vect/simd-9.cc.jj 2019-06-20 16:00:57.197797566
> +0200
> +++ gcc/testsuite/g++.dg/vect/simd-9.cc 2019-06-20 16:17:27.484427949 +0200
> @@ -0,0 +1,153 @@
> +// { dg-require-effective-target size32plus }
> +// { dg-additional-options "-fopenmp-simd" }
> +// { dg-additional-options "-mavx" { target avx_runtime } }
> +// { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 2 "vect" { xfail *-*-* } } }
> +
> +#include "../../gcc.dg/vect/tree-vect.h"
> +
> +struct S {
> + inline S ();
> + inline ~S ();
> + inline S (const S &);
> + inline S & operator= (const S &);
> + int s;
> +};
> +
> +S::S () : s (0)
> +{
> +}
> +
> +S::~S ()
> +{
> +}
> +
> +S::S (const S &x)
> +{
> + s = x.s;
> +}
> +
> +S &
> +S::operator= (const S &x)
> +{
> + s = x.s;
> + return *this;
> +}
> +
> +static inline void
> +ini (S &x)
> +{
> + x.s = 0;
> +}
> +
> +S r, a[1024], b[1024];
> +
> +#pragma omp declare reduction (+: S: omp_out.s += omp_in.s)
> +#pragma omp declare reduction (plus: S: omp_out.s += omp_in.s) initializer (ini (omp_priv))
> +
> +__attribute__((noipa)) void
> +foo (S *a, S *b, S &r)
> +{
> + #pragma omp simd reduction (inscan, +:r)
> + for (int i = 0; i < 1024; i++)
> + {
> + b[i] = r;
> + #pragma omp scan exclusive(r)
> + r.s += a[i].s;
> + }
> +}
> +
> +__attribute__((noipa)) S
> +bar (void)
> +{
> + S s;
> + #pragma omp simd reduction (inscan, plus:s)
> + for (int i = 0; i < 1024; i++)
> + {
> + b[i] = s;
> + #pragma omp scan exclusive(s)
> + s.s += 2 * a[i].s;
> + }
> + return s;
> +}
> +
> +__attribute__((noipa)) void
> +baz (S *a, S *b, S &r)
> +{
> + #pragma omp simd reduction (inscan, +:r) simdlen(1)
> + for (int i = 0; i < 1024; i++)
> + {
> + b[i] = r;
> + #pragma omp scan exclusive(r)
> + r.s += a[i].s;
> + }
> +}
> +
> +__attribute__((noipa)) S
> +qux (void)
> +{
> + S s;
> + #pragma omp simd if (0) reduction (inscan, plus:s)
> + for (int i = 0; i < 1024; i++)
> + {
> + b[i] = s;
> + #pragma omp scan exclusive(s)
> + s.s += 2 * a[i].s;
> + }
> + return s;
> +}
> +
> +int
> +main ()
> +{
> + S s;
> + check_vect ();
> + for (int i = 0; i < 1024; ++i)
> + {
> + a[i].s = i;
> + b[i].s = -1;
> + asm ("" : "+g" (i));
> + }
> + foo (a, b, r);
> + if (r.s != 1024 * 1023 / 2)
> + abort ();
> + for (int i = 0; i < 1024; ++i)
> + {
> + if (b[i].s != s.s)
> + abort ();
> + else
> + b[i].s = 25;
> + s.s += i;
> + }
> + if (bar ().s != 1024 * 1023)
> + abort ();
> + s.s = 0;
> + for (int i = 0; i < 1024; ++i)
> + {
> + if (b[i].s != s.s)
> + abort ();
> + s.s += 2 * i;
> + }
> + r.s = 0;
> + baz (a, b, r);
> + if (r.s != 1024 * 1023 / 2)
> + abort ();
> + s.s = 0;
> + for (int i = 0; i < 1024; ++i)
> + {
> + if (b[i].s != s.s)
> + abort ();
> + else
> + b[i].s = 25;
> + s.s += i;
> + }
> + if (qux ().s != 1024 * 1023)
> + abort ();
> + s.s = 0;
> + for (int i = 0; i < 1024; ++i)
> + {
> + if (b[i].s != s.s)
> + abort ();
> + s.s += 2 * i;
> + }
> + return 0;
> +}
> --- gcc/testsuite/c-c++-common/gomp/scan-2.c.jj 2019-06-10 14:18:17.461525669
> +0200
> +++ gcc/testsuite/c-c++-common/gomp/scan-2.c 2019-06-20 23:54:03.615422149
> +0200
> @@ -8,7 +8,7 @@ f1 (int *c, int *d)
> for (i = 0; i < 64; i++)
> {
> d[i] = a;
> - #pragma omp scan exclusive (a) /* { dg-message "sorry, unimplemented: '#pragma omp scan' not supported yet" } */
> + #pragma omp scan exclusive (a)
> a += c[i];
> }
> }
>
> Jakub