On Wed, 27 Nov 2024, Tamar Christina wrote:
> Hi All,
>
> The testcase
>
> #include <stdint.h>
> #include <string.h>
>
> #define N 8
> #define L 8
>
> void f(const uint8_t * restrict seq1,
> const uint8_t *idx, uint8_t *seq_out) {
> for (int i = 0; i < L; ++i) {
> uint8_t h = idx[i];
> memcpy((void *)&seq_out[i * N], (const void *)&seq1[h * N / 2], N / 2);
> }
> }
>
> compiled with -O3 -mcpu=neoverse-n1+sve
>
> is miscompiled to:
>
> ld1w z31.s, p3/z, [x23, z29.s, sxtw]
> ld1w z29.s, p7/z, [x23, z30.s, sxtw]
> st1w z29.s, p7, [x24, z12.s, sxtw]
> st1w z31.s, p7, [x24, z12.s, sxtw]
>
> rather than
>
> ld1w z31.s, p3/z, [x23, z29.s, sxtw]
> ld1w z29.s, p7/z, [x23, z30.s, sxtw]
> st1w z29.s, p7, [x24, z12.s, sxtw]
> addvl x3, x24, #2
> st1w z31.s, p3, [x3, z12.s, sxtw]
>
> Two things go wrong here: the wrong mask is used and the address pointers
> for the stores are wrong.
>
> This happens because the codegen loop in vectorizable_store is a nested
> loop: the outer loop iterates over ncopies and the inner loop over vec_num.
>
> For SLP, ncopies == 1 and vec_num == SLP_NUM_STMS, but the loop mask is
> determined only by the outer loop index and the address pointer is only
> updated in the outer loop.
>
> As such, for SLP we always use the same predicate and the same memory
> location. This patch flattens the two loops, iterating over
> ncopies * vec_num instead, and simplifies the indexing.
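>
> With the flattening, the per-statement index drives both the mask and the
> address; again just an illustrative sketch of the new indexing, not the
> patch itself:
>
> #include <cstdio>
>
> int main ()
> {
>   const int ncopies = 1, vec_num = 2;
>   const int num_stmts = ncopies * vec_num;
>   for (int j = 0; j < num_stmts; j++)
>     /* Each vector statement now gets its own mask slot and pointer step.  */
>     std::printf ("store %d: mask slot %d, pointer step %d\n", j, j, j);
>   return 0;
> }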
>
> This does not fully fix the gcc_r miscompile in SPEC CPU 2017, as the error
> moves somewhere else; I will look at that next. It does, however, fix some
> other libraries that also started failing.
>
> Bootstrapped and regtested on aarch64-none-linux-gnu,
> arm-none-linux-gnueabihf, and x86_64-pc-linux-gnu (-m32 and -m64) with no
> issues.
>
> Ok for master?
OK.
Thanks,
Richard.
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
> PR tree-optimization/117557
> * tree-vect-stmts.cc (vectorizable_store): Flatten the ncopies and
> vec_num loops.
>
> gcc/testsuite/ChangeLog:
>
> PR tree-optimization/117557
> * gcc.target/aarch64/pr117557.c: New test.
>
> ---
> diff --git a/gcc/testsuite/gcc.target/aarch64/pr117557.c b/gcc/testsuite/gcc.target/aarch64/pr117557.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..80b3fde41109988db70eafd715224df0b0029cd1
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/pr117557.c
> @@ -0,0 +1,29 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -mcpu=neoverse-n1+sve -fdump-tree-vect" } */
> +/* { dg-final { check-function-bodies "**" "" } } */
> +
> +#include <stdint.h>
> +#include <string.h>
> +
> +#define N 8
> +#define L 8
> +
> +/*
> +**f:
> +** ...
> +** ld1w z[0-9]+.s, p([0-9]+)/z, \[x[0-9]+, z[0-9]+.s, sxtw\]
> +** ld1w z[0-9]+.s, p([0-9]+)/z, \[x[0-9]+, z[0-9]+.s, sxtw\]
> +** st1w z[0-9]+.s, p\1, \[x[0-9]+, z[0-9]+.s, sxtw\]
> +** incb x([0-9]+), all, mul #2
> +** st1w z[0-9]+.s, p\2, \[x\3, z[0-9]+.s, sxtw\]
> +** ret
> +** ...
> +*/
> +void f(const uint8_t * restrict seq1,
> + const uint8_t *idx, uint8_t *seq_out) {
> + for (int i = 0; i < L; ++i) {
> + uint8_t h = idx[i];
> + memcpy((void *)&seq_out[i * N], (const void *)&seq1[h * N / 2], N / 2);
> + }
> +}
> +
> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> index c2d5818b2786123fac7afe290d85c7dd2bda4308..4759c274f3ccbb111a907576539b2a8efb7726a3 100644
> --- a/gcc/tree-vect-stmts.cc
> +++ b/gcc/tree-vect-stmts.cc
> @@ -9228,7 +9228,8 @@ vectorizable_store (vec_info *vinfo,
> gcc_assert (!grouped_store);
> auto_vec<tree> vec_offsets;
> unsigned int inside_cost = 0, prologue_cost = 0;
> - for (j = 0; j < ncopies; j++)
> + int num_stmts = ncopies * vec_num;
> + for (j = 0; j < num_stmts; j++)
> {
> gimple *new_stmt;
> if (j == 0)
> @@ -9246,14 +9247,14 @@ vectorizable_store (vec_info *vinfo,
> vect_get_slp_defs (op_node, gvec_oprnds[0]);
> else
> vect_get_vec_defs_for_operand (vinfo, first_stmt_info,
> - ncopies, op, gvec_oprnds[0]);
> + num_stmts, op, gvec_oprnds[0]);
> if (mask)
> {
> if (slp_node)
> vect_get_slp_defs (mask_node, &vec_masks);
> else
> vect_get_vec_defs_for_operand (vinfo, stmt_info,
> - ncopies,
> + num_stmts,
> mask, &vec_masks,
> mask_vectype);
> }
> @@ -9279,281 +9280,280 @@ vectorizable_store (vec_info *vinfo,
> }
>
> new_stmt = NULL;
> - for (i = 0; i < vec_num; ++i)
> + if (!costing_p)
> {
> - if (!costing_p)
> - {
> - vec_oprnd = (*gvec_oprnds[0])[vec_num * j + i];
> - if (mask)
> - vec_mask = vec_masks[vec_num * j + i];
> - /* We should have catched mismatched types earlier. */
> - gcc_assert (useless_type_conversion_p (vectype,
> - TREE_TYPE (vec_oprnd)));
> - }
> - unsigned HOST_WIDE_INT align;
> - tree final_mask = NULL_TREE;
> - tree final_len = NULL_TREE;
> - tree bias = NULL_TREE;
> - if (!costing_p)
> + vec_oprnd = (*gvec_oprnds[0])[j];
> + if (mask)
> + vec_mask = vec_masks[j];
> + /* We should have catched mismatched types earlier. */
> + gcc_assert (useless_type_conversion_p (vectype,
> + TREE_TYPE (vec_oprnd)));
> + }
> + unsigned HOST_WIDE_INT align;
> + tree final_mask = NULL_TREE;
> + tree final_len = NULL_TREE;
> + tree bias = NULL_TREE;
> + if (!costing_p)
> + {
> + if (loop_masks)
> + final_mask = vect_get_loop_mask (loop_vinfo, gsi,
> + loop_masks, num_stmts,
> + vectype, j);
> + if (vec_mask)
> + final_mask = prepare_vec_mask (loop_vinfo, mask_vectype,
> + final_mask, vec_mask, gsi);
> + }
> +
> + if (gs_info.ifn != IFN_LAST)
> + {
> + if (costing_p)
> {
> - if (loop_masks)
> - final_mask = vect_get_loop_mask (loop_vinfo, gsi,
> - loop_masks,
> - ncopies * vec_num,
> - vectype, j);
> - if (vec_mask)
> - final_mask = prepare_vec_mask (loop_vinfo, mask_vectype,
> - final_mask, vec_mask, gsi);
> + unsigned int cnunits = vect_nunits_for_cost (vectype);
> + inside_cost
> + += record_stmt_cost (cost_vec, cnunits, scalar_store,
> + stmt_info, slp_node, 0,
> + vect_body);
> + continue;
> }
>
> - if (gs_info.ifn != IFN_LAST)
> - {
> - if (costing_p)
> - {
> - unsigned int cnunits = vect_nunits_for_cost (vectype);
> - inside_cost
> - += record_stmt_cost (cost_vec, cnunits, scalar_store,
> - stmt_info, slp_node, 0,
> - vect_body);
> - continue;
> - }
> + if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
> + vec_offset = vec_offsets[j];
>
> - if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
> - vec_offset = vec_offsets[vec_num * j + i];
> - tree scale = size_int (gs_info.scale);
> + tree scale = size_int (gs_info.scale);
>
> - if (gs_info.ifn == IFN_MASK_LEN_SCATTER_STORE)
> - {
> - if (loop_lens)
> - final_len = vect_get_loop_len (loop_vinfo, gsi,
> - loop_lens,
> - ncopies * vec_num,
> - vectype, j, 1);
> - else
> - final_len = size_int (TYPE_VECTOR_SUBPARTS (vectype));
> - signed char biasval
> - = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
> - bias = build_int_cst (intQI_type_node, biasval);
> - if (!final_mask)
> - {
> - mask_vectype = truth_type_for (vectype);
> - final_mask = build_minus_one_cst (mask_vectype);
> - }
> - }
> + if (gs_info.ifn == IFN_MASK_LEN_SCATTER_STORE)
> + {
> + if (loop_lens)
> + final_len = vect_get_loop_len (loop_vinfo, gsi,
> + loop_lens, num_stmts,
> + vectype, j, 1);
> + else
> + final_len = size_int (TYPE_VECTOR_SUBPARTS (vectype));
>
> - gcall *call;
> - if (final_len && final_mask)
> + signed char biasval
> + = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
> + bias = build_int_cst (intQI_type_node, biasval);
> + if (!final_mask)
> {
> - if (VECTOR_TYPE_P (TREE_TYPE (vec_offset)))
> - call = gimple_build_call_internal (
> - IFN_MASK_LEN_SCATTER_STORE, 7, dataref_ptr,
> - vec_offset, scale, vec_oprnd, final_mask, final_len,
> - bias);
> - else
> - /* Non-vector offset indicates that prefer to take
> - MASK_LEN_STRIDED_STORE instead of the
> - IFN_MASK_SCATTER_STORE with direct stride arg. */
> - call = gimple_build_call_internal (
> - IFN_MASK_LEN_STRIDED_STORE, 6, dataref_ptr,
> - vec_offset, vec_oprnd, final_mask, final_len, bias);
> + mask_vectype = truth_type_for (vectype);
> + final_mask = build_minus_one_cst (mask_vectype);
> }
> - else if (final_mask)
> - call = gimple_build_call_internal
> - (IFN_MASK_SCATTER_STORE, 5, dataref_ptr,
> - vec_offset, scale, vec_oprnd, final_mask);
> + }
> +
> + gcall *call;
> + if (final_len && final_mask)
> + {
> + if (VECTOR_TYPE_P (TREE_TYPE (vec_offset)))
> + call = gimple_build_call_internal (
> + IFN_MASK_LEN_SCATTER_STORE, 7, dataref_ptr,
> + vec_offset, scale, vec_oprnd, final_mask, final_len,
> + bias);
> else
> - call = gimple_build_call_internal (IFN_SCATTER_STORE, 4,
> - dataref_ptr, vec_offset,
> - scale, vec_oprnd);
> - gimple_call_set_nothrow (call, true);
> - vect_finish_stmt_generation (vinfo, stmt_info, call, gsi);
> - new_stmt = call;
> + /* Non-vector offset indicates that prefer to take
> + MASK_LEN_STRIDED_STORE instead of the
> + IFN_MASK_SCATTER_STORE with direct stride arg. */
> + call = gimple_build_call_internal (
> + IFN_MASK_LEN_STRIDED_STORE, 6, dataref_ptr,
> + vec_offset, vec_oprnd, final_mask, final_len, bias);
> }
> - else if (gs_info.decl)
> + else if (final_mask)
> + call = gimple_build_call_internal
> + (IFN_MASK_SCATTER_STORE, 5, dataref_ptr,
> + vec_offset, scale, vec_oprnd, final_mask);
> + else
> + call = gimple_build_call_internal (IFN_SCATTER_STORE, 4,
> + dataref_ptr, vec_offset,
> + scale, vec_oprnd);
> + gimple_call_set_nothrow (call, true);
> + vect_finish_stmt_generation (vinfo, stmt_info, call, gsi);
> + new_stmt = call;
> + }
> + else if (gs_info.decl)
> + {
> + /* The builtin decls path for scatter is legacy, x86 only. */
> + gcc_assert (nunits.is_constant ()
> + && (!final_mask
> + || SCALAR_INT_MODE_P
> + (TYPE_MODE (TREE_TYPE (final_mask)))));
> + if (costing_p)
> {
> - /* The builtin decls path for scatter is legacy, x86 only. */
> - gcc_assert (nunits.is_constant ()
> - && (!final_mask
> - || SCALAR_INT_MODE_P
> - (TYPE_MODE (TREE_TYPE (final_mask)))));
> - if (costing_p)
> - {
> - unsigned int cnunits = vect_nunits_for_cost (vectype);
> - inside_cost
> - += record_stmt_cost (cost_vec, cnunits, scalar_store,
> - stmt_info, slp_node, 0, vect_body);
> - continue;
> - }
> - poly_uint64 offset_nunits
> - = TYPE_VECTOR_SUBPARTS (gs_info.offset_vectype);
> - if (known_eq (nunits, offset_nunits))
> - {
> - new_stmt = vect_build_one_scatter_store_call
> + unsigned int cnunits = vect_nunits_for_cost (vectype);
> + inside_cost
> + += record_stmt_cost (cost_vec, cnunits, scalar_store,
> + stmt_info, slp_node, 0, vect_body);
> + continue;
> + }
> +
> + poly_uint64 offset_nunits
> + = TYPE_VECTOR_SUBPARTS (gs_info.offset_vectype);
> + if (known_eq (nunits, offset_nunits))
> + {
> + new_stmt = vect_build_one_scatter_store_call
> (vinfo, stmt_info, gsi, &gs_info,
> - dataref_ptr, vec_offsets[vec_num * j + i],
> + dataref_ptr, vec_offsets[j],
> vec_oprnd, final_mask);
> - vect_finish_stmt_generation (vinfo, stmt_info,
> - new_stmt, gsi);
> - }
> - else if (known_eq (nunits, offset_nunits * 2))
> - {
> - /* We have a offset vector with half the number of
> - lanes but the builtins will store full vectype
> - data from the lower lanes. */
> - new_stmt = vect_build_one_scatter_store_call
> + vect_finish_stmt_generation (vinfo, stmt_info,
> + new_stmt, gsi);
> + }
> + else if (known_eq (nunits, offset_nunits * 2))
> + {
> + /* We have a offset vector with half the number of
> + lanes but the builtins will store full vectype
> + data from the lower lanes. */
> + new_stmt = vect_build_one_scatter_store_call
> (vinfo, stmt_info, gsi, &gs_info,
> - dataref_ptr,
> - vec_offsets[2 * vec_num * j + 2 * i],
> + dataref_ptr, vec_offsets[2 * j],
> vec_oprnd, final_mask);
> - vect_finish_stmt_generation (vinfo, stmt_info,
> + vect_finish_stmt_generation (vinfo, stmt_info,
> new_stmt, gsi);
> - int count = nunits.to_constant ();
> - vec_perm_builder sel (count, count, 1);
> - sel.quick_grow (count);
> - for (int i = 0; i < count; ++i)
> - sel[i] = i | (count / 2);
> - vec_perm_indices indices (sel, 2, count);
> - tree perm_mask
> - = vect_gen_perm_mask_checked (vectype, indices);
> - new_stmt = gimple_build_assign (NULL_TREE, VEC_PERM_EXPR,
> - vec_oprnd, vec_oprnd,
> - perm_mask);
> - vec_oprnd = make_ssa_name (vectype);
> - gimple_set_lhs (new_stmt, vec_oprnd);
> - vect_finish_stmt_generation (vinfo, stmt_info,
> - new_stmt, gsi);
> - if (final_mask)
> - {
> - new_stmt = gimple_build_assign (NULL_TREE,
> - VEC_UNPACK_HI_EXPR,
> - final_mask);
> - final_mask = make_ssa_name
> + int count = nunits.to_constant ();
> + vec_perm_builder sel (count, count, 1);
> + sel.quick_grow (count);
> + for (int i = 0; i < count; ++i)
> + sel[i] = i | (count / 2);
> + vec_perm_indices indices (sel, 2, count);
> + tree perm_mask
> + = vect_gen_perm_mask_checked (vectype, indices);
> + new_stmt = gimple_build_assign (NULL_TREE, VEC_PERM_EXPR,
> + vec_oprnd, vec_oprnd,
> + perm_mask);
> + vec_oprnd = make_ssa_name (vectype);
> + gimple_set_lhs (new_stmt, vec_oprnd);
> + vect_finish_stmt_generation (vinfo, stmt_info,
> + new_stmt, gsi);
> + if (final_mask)
> + {
> + new_stmt = gimple_build_assign (NULL_TREE,
> + VEC_UNPACK_HI_EXPR,
> + final_mask);
> + final_mask = make_ssa_name
> (truth_type_for (gs_info.offset_vectype));
> - gimple_set_lhs (new_stmt, final_mask);
> - vect_finish_stmt_generation (vinfo, stmt_info,
> - new_stmt, gsi);
> + gimple_set_lhs (new_stmt, final_mask);
> + vect_finish_stmt_generation (vinfo, stmt_info,
> + new_stmt, gsi);
> }
> - new_stmt = vect_build_one_scatter_store_call
> - (vinfo, stmt_info, gsi, &gs_info,
> - dataref_ptr,
> - vec_offsets[2 * vec_num * j + 2 * i + 1],
> - vec_oprnd, final_mask);
> - vect_finish_stmt_generation (vinfo, stmt_info,
> - new_stmt, gsi);
> - }
> - else if (known_eq (nunits * 2, offset_nunits))
> - {
> - /* We have a offset vector with double the number of
> - lanes. Select the low/high part accordingly. */
> - vec_offset = vec_offsets[(vec_num * j + i) / 2];
> - if ((vec_num * j + i) & 1)
> - {
> - int count = offset_nunits.to_constant ();
> - vec_perm_builder sel (count, count, 1);
> - sel.quick_grow (count);
> - for (int i = 0; i < count; ++i)
> - sel[i] = i | (count / 2);
> - vec_perm_indices indices (sel, 2, count);
> - tree perm_mask = vect_gen_perm_mask_checked
> - (TREE_TYPE (vec_offset), indices);
> - new_stmt = gimple_build_assign (NULL_TREE,
> - VEC_PERM_EXPR,
> - vec_offset,
> - vec_offset,
> - perm_mask);
> - vec_offset = make_ssa_name (TREE_TYPE (vec_offset));
> - gimple_set_lhs (new_stmt, vec_offset);
> - vect_finish_stmt_generation (vinfo, stmt_info,
> - new_stmt, gsi);
> - }
> - new_stmt = vect_build_one_scatter_store_call
> +
> + new_stmt = vect_build_one_scatter_store_call
> + (vinfo, stmt_info, gsi, &gs_info,
> + dataref_ptr, vec_offsets[2 * j + 1],
> + vec_oprnd, final_mask);
> + vect_finish_stmt_generation (vinfo, stmt_info,
> + new_stmt, gsi);
> + }
> + else if (known_eq (nunits * 2, offset_nunits))
> + {
> + /* We have a offset vector with double the number of
> + lanes. Select the low/high part accordingly. */
> + vec_offset = vec_offsets[j / 2];
> + if (j & 1)
> + {
> + int count = offset_nunits.to_constant ();
> + vec_perm_builder sel (count, count, 1);
> + sel.quick_grow (count);
> + for (int i = 0; i < count; ++i)
> + sel[i] = i | (count / 2);
> + vec_perm_indices indices (sel, 2, count);
> + tree perm_mask = vect_gen_perm_mask_checked
> + (TREE_TYPE (vec_offset), indices);
> + new_stmt = gimple_build_assign (NULL_TREE,
> + VEC_PERM_EXPR,
> + vec_offset,
> + vec_offset,
> + perm_mask);
> + vec_offset = make_ssa_name (TREE_TYPE (vec_offset));
> + gimple_set_lhs (new_stmt, vec_offset);
> + vect_finish_stmt_generation (vinfo, stmt_info,
> + new_stmt, gsi);
> + }
> +
> + new_stmt = vect_build_one_scatter_store_call
> (vinfo, stmt_info, gsi, &gs_info,
> dataref_ptr, vec_offset,
> vec_oprnd, final_mask);
> - vect_finish_stmt_generation (vinfo, stmt_info,
> - new_stmt, gsi);
> - }
> - else
> - gcc_unreachable ();
> - }
> - else
> + vect_finish_stmt_generation (vinfo, stmt_info,
> + new_stmt, gsi);
> + }
> + else
> + gcc_unreachable ();
> + }
> + else
> + {
> + /* Emulated scatter. */
> + gcc_assert (!final_mask);
> + if (costing_p)
> {
> - /* Emulated scatter. */
> - gcc_assert (!final_mask);
> - if (costing_p)
> - {
> - unsigned int cnunits = vect_nunits_for_cost (vectype);
> - /* For emulated scatter N offset vector element extracts
> - (we assume the scalar scaling and ptr + offset add is
> - consumed by the load). */
> - inside_cost
> - += record_stmt_cost (cost_vec, cnunits, vec_to_scalar,
> - stmt_info, slp_node, 0, vect_body);
> - /* N scalar stores plus extracting the elements. */
> - inside_cost
> - += record_stmt_cost (cost_vec, cnunits, vec_to_scalar,
> - stmt_info, slp_node, 0, vect_body);
> - inside_cost
> - += record_stmt_cost (cost_vec, cnunits, scalar_store,
> - stmt_info, slp_node, 0, vect_body);
> - continue;
> - }
> + unsigned int cnunits = vect_nunits_for_cost (vectype);
> + /* For emulated scatter N offset vector element extracts
> + (we assume the scalar scaling and ptr + offset add is
> + consumed by the load). */
> + inside_cost
> + += record_stmt_cost (cost_vec, cnunits, vec_to_scalar,
> + stmt_info, slp_node, 0, vect_body);
> + /* N scalar stores plus extracting the elements. */
> + inside_cost
> + += record_stmt_cost (cost_vec, cnunits, vec_to_scalar,
> + stmt_info, slp_node, 0, vect_body);
> + inside_cost
> + += record_stmt_cost (cost_vec, cnunits, scalar_store,
> + stmt_info, slp_node, 0, vect_body);
> + continue;
> + }
>
> - unsigned HOST_WIDE_INT const_nunits = nunits.to_constant ();
> - unsigned HOST_WIDE_INT const_offset_nunits
> - = TYPE_VECTOR_SUBPARTS (gs_info.offset_vectype).to_constant ();
> - vec<constructor_elt, va_gc> *ctor_elts;
> - vec_alloc (ctor_elts, const_nunits);
> - gimple_seq stmts = NULL;
> - tree elt_type = TREE_TYPE (vectype);
> - unsigned HOST_WIDE_INT elt_size
> - = tree_to_uhwi (TYPE_SIZE (elt_type));
> - /* We support offset vectors with more elements
> - than the data vector for now. */
> - unsigned HOST_WIDE_INT factor
> - = const_offset_nunits / const_nunits;
> - vec_offset = vec_offsets[(vec_num * j + i) / factor];
> - unsigned elt_offset
> - = ((vec_num * j + i) % factor) * const_nunits;
> - tree idx_type = TREE_TYPE (TREE_TYPE (vec_offset));
> - tree scale = size_int (gs_info.scale);
> - align = get_object_alignment (DR_REF (first_dr_info->dr));
> - tree ltype = build_aligned_type (TREE_TYPE (vectype), align);
> - for (unsigned k = 0; k < const_nunits; ++k)
> - {
> - /* Compute the offsetted pointer. */
> - tree boff = size_binop (MULT_EXPR, TYPE_SIZE (idx_type),
> - bitsize_int (k + elt_offset));
> - tree idx
> - = gimple_build (&stmts, BIT_FIELD_REF, idx_type,
> - vec_offset, TYPE_SIZE (idx_type), boff);
> - idx = gimple_convert (&stmts, sizetype, idx);
> - idx = gimple_build (&stmts, MULT_EXPR, sizetype,
> - idx, scale);
> - tree ptr
> - = gimple_build (&stmts, PLUS_EXPR,
> - TREE_TYPE (dataref_ptr),
> - dataref_ptr, idx);
> - ptr = gimple_convert (&stmts, ptr_type_node, ptr);
> - /* Extract the element to be stored. */
> - tree elt
> - = gimple_build (&stmts, BIT_FIELD_REF,
> - TREE_TYPE (vectype),
> - vec_oprnd, TYPE_SIZE (elt_type),
> - bitsize_int (k * elt_size));
> - gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
> - stmts = NULL;
> - tree ref
> - = build2 (MEM_REF, ltype, ptr,
> - build_int_cst (ref_type, 0));
> - new_stmt = gimple_build_assign (ref, elt);
> - vect_finish_stmt_generation (vinfo, stmt_info,
> - new_stmt, gsi);
> - }
> - if (slp)
> - slp_node->push_vec_def (new_stmt);
> + unsigned HOST_WIDE_INT const_nunits = nunits.to_constant ();
> + unsigned HOST_WIDE_INT const_offset_nunits
> + = TYPE_VECTOR_SUBPARTS (gs_info.offset_vectype).to_constant ();
> + vec<constructor_elt, va_gc> *ctor_elts;
> + vec_alloc (ctor_elts, const_nunits);
> + gimple_seq stmts = NULL;
> + tree elt_type = TREE_TYPE (vectype);
> + unsigned HOST_WIDE_INT elt_size
> + = tree_to_uhwi (TYPE_SIZE (elt_type));
> + /* We support offset vectors with more elements
> + than the data vector for now. */
> + unsigned HOST_WIDE_INT factor
> + = const_offset_nunits / const_nunits;
> + vec_offset = vec_offsets[j / factor];
> + unsigned elt_offset
> + = (j % factor) * const_nunits;
> + tree idx_type = TREE_TYPE (TREE_TYPE (vec_offset));
> + tree scale = size_int (gs_info.scale);
> + align = get_object_alignment (DR_REF (first_dr_info->dr));
> + tree ltype = build_aligned_type (TREE_TYPE (vectype), align);
> + for (unsigned k = 0; k < const_nunits; ++k)
> + {
> + /* Compute the offsetted pointer. */
> + tree boff = size_binop (MULT_EXPR, TYPE_SIZE (idx_type),
> + bitsize_int (k + elt_offset));
> + tree idx
> + = gimple_build (&stmts, BIT_FIELD_REF, idx_type,
> + vec_offset, TYPE_SIZE (idx_type), boff);
> + idx = gimple_convert (&stmts, sizetype, idx);
> + idx = gimple_build (&stmts, MULT_EXPR, sizetype,
> + idx, scale);
> + tree ptr
> + = gimple_build (&stmts, PLUS_EXPR,
> + TREE_TYPE (dataref_ptr),
> + dataref_ptr, idx);
> + ptr = gimple_convert (&stmts, ptr_type_node, ptr);
> + /* Extract the element to be stored. */
> + tree elt
> + = gimple_build (&stmts, BIT_FIELD_REF,
> + TREE_TYPE (vectype),
> + vec_oprnd, TYPE_SIZE (elt_type),
> + bitsize_int (k * elt_size));
> + gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
> + stmts = NULL;
> + tree ref
> + = build2 (MEM_REF, ltype, ptr,
> + build_int_cst (ref_type, 0));
> + new_stmt = gimple_build_assign (ref, elt);
> + vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
> }
> +
> + if (slp)
> + slp_node->push_vec_def (new_stmt);
> }
> +
> if (!slp && !costing_p)
> STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
> }
>
--
Richard Biener <[email protected]>
SUSE Software Solutions Germany GmbH,
Frankenstrasse 146, 90461 Nuernberg, Germany;
GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)