Prathamesh Kulkarni <prathamesh.kulka...@linaro.org> writes: > On Mon, 6 Jun 2022 at 16:29, Richard Sandiford > <richard.sandif...@arm.com> wrote: >> >> Prathamesh Kulkarni <prathamesh.kulka...@linaro.org> writes: >> >> > { >> >> > /* The pattern matching functions above are written to look for a >> >> > small >> >> > number to begin the sequence (0, 1, N/2). If we begin with an >> >> > index >> >> > @@ -24084,6 +24112,12 @@ aarch64_expand_vec_perm_const_1 (struct >> >> > expand_vec_perm_d *d) >> >> > || d->vec_flags == VEC_SVE_PRED) >> >> > && known_gt (nelt, 1)) >> >> > { >> >> > + /* If operand and result modes differ, then only check >> >> > + for dup case. */ >> >> > + if (d->vmode != op_mode) >> >> > + return (d->vec_flags == VEC_SVE_DATA) >> >> > + ? aarch64_evpc_sve_dup (d, op_mode) : false; >> >> > + >> >> >> >> I think it'd be more future-proof to format this as: >> >> >> >> if (d->vmod == d->op_mode) >> >> { >> >> …existing code… >> >> } >> >> else >> >> { >> >> if (aarch64_evpc_sve_dup (d)) >> >> return true; >> >> } >> >> >> >> with the d->vec_flags == VEC_SVE_DATA check being in aarch64_evpc_sve_dup, >> >> alongside the op_mode check. I think we'll be adding more checks here >> >> over time. >> > Um I was wondering if we should structure it as: >> > if (d->vmode == d->op_mode) >> > { >> > ...existing code... >> > } >> > if (aarch64_evpc_sve_dup (d)) >> > return true; >> > >> > So we check for dup irrespective of d->vmode == d->op_mode ? >> >> Yeah, I can see the attraction of that. I think the else is better >> though because the fallback TBL handling will (rightly) come at the end >> of the existing code. Without the else, we'd have specific tests like >> DUP after generic ones like TBL, so the reader would have to work out >> for themselves that DUP and TBL handle disjoint cases. >> >> >> > if (aarch64_evpc_rev_local (d)) >> >> > return true; >> >> > else if (aarch64_evpc_rev_global (d)) >> >> > @@ -24105,7 +24139,12 @@ aarch64_expand_vec_perm_const_1 (struct >> >> > expand_vec_perm_d *d) >> >> > else if (aarch64_evpc_reencode (d)) >> >> > return true; >> >> > if (d->vec_flags == VEC_SVE_DATA) >> >> > - return aarch64_evpc_sve_tbl (d); >> >> > + { >> >> > + if (aarch64_evpc_sve_tbl (d)) >> >> > + return true; >> >> > + else if (aarch64_evpc_sve_dup (d, op_mode)) >> >> > + return true; >> >> > + } >> >> > else if (d->vec_flags == VEC_ADVSIMD) >> >> > return aarch64_evpc_tbl (d); >> >> > } >> >> >> >> Is this part still needed, given the above? >> >> >> >> Thanks, >> >> Richard >> >> >> >> > @@ -24119,9 +24158,6 @@ aarch64_vectorize_vec_perm_const (machine_mode >> >> > vmode, machine_mode op_mode, >> >> > rtx target, rtx op0, rtx op1, >> >> > const vec_perm_indices &sel) >> >> > { >> >> > - if (vmode != op_mode) >> >> > - return false; >> >> > - >> >> > struct expand_vec_perm_d d; >> >> > >> >> > /* Check whether the mask can be applied to a single vector. */ >> >> > @@ -24154,10 +24190,10 @@ aarch64_vectorize_vec_perm_const >> >> > (machine_mode vmode, machine_mode op_mode, >> >> > d.testing_p = !target; >> >> > >> >> > if (!d.testing_p) >> >> > - return aarch64_expand_vec_perm_const_1 (&d); >> >> > + return aarch64_expand_vec_perm_const_1 (&d, op_mode); >> >> > >> >> > rtx_insn *last = get_last_insn (); >> >> > - bool ret = aarch64_expand_vec_perm_const_1 (&d); >> >> > + bool ret = aarch64_expand_vec_perm_const_1 (&d, op_mode); >> >> > gcc_assert (last == get_last_insn ()); >> >> > >> >> > return ret; >> > >> > diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc >> > b/gcc/config/aarch64/aarch64-sve-builtins-base.cc >> > index bee410929bd..1a804b1ab73 100644 >> > --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc >> > +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc >> > @@ -44,6 +44,7 @@ >> > #include "aarch64-sve-builtins-shapes.h" >> > #include "aarch64-sve-builtins-base.h" >> > #include "aarch64-sve-builtins-functions.h" >> > +#include "ssa.h" >> > >> > using namespace aarch64_sve; >> > >> > @@ -1207,6 +1208,64 @@ public: >> > insn_code icode = code_for_aarch64_sve_ld1rq (e.vector_mode (0)); >> > return e.use_contiguous_load_insn (icode); >> > } >> > + >> > + gimple * >> > + fold (gimple_folder &f) const override >> > + { >> > + tree arg0 = gimple_call_arg (f.call, 0); >> > + tree arg1 = gimple_call_arg (f.call, 1); >> > + >> > + /* Transform: >> > + lhs = svld1rq ({-1, -1, ... }, arg1) >> > + into: >> > + tmp = mem_ref<vectype> [(int * {ref-all}) arg1] >> > + lhs = vec_perm_expr<tmp, tmp, {0, 1, 2, 3, ...}>. >> > + on little endian target. >> > + vectype is the corresponding ADVSIMD type. */ >> > + >> > + if (!BYTES_BIG_ENDIAN >> > + && integer_all_onesp (arg0)) >> > + { >> > + tree lhs = gimple_call_lhs (f.call); >> > + tree lhs_type = TREE_TYPE (lhs); >> > + poly_uint64 lhs_len = TYPE_VECTOR_SUBPARTS (lhs_type); >> > + tree eltype = TREE_TYPE (lhs_type); >> > + >> > + scalar_mode elmode = GET_MODE_INNER (TYPE_MODE (lhs_type)); >> > + machine_mode vq_mode = aarch64_vq_mode (elmode).require (); >> > + tree vectype = build_vector_type_for_mode (eltype, vq_mode); >> > + >> > + tree elt_ptr_type >> > + = build_pointer_type_for_mode (eltype, VOIDmode, true); >> > + tree zero = build_zero_cst (elt_ptr_type); >> > + >> > + /* Use element type alignment. */ >> > + tree access_type >> > + = build_aligned_type (vectype, TYPE_ALIGN (eltype)); >> > + >> > + tree mem_ref_lhs = make_ssa_name_fn (cfun, access_type, 0); >> > + tree mem_ref_op = fold_build2 (MEM_REF, access_type, arg1, zero); >> > + gimple *mem_ref_stmt >> > + = gimple_build_assign (mem_ref_lhs, mem_ref_op); >> > + gsi_insert_before (f.gsi, mem_ref_stmt, GSI_SAME_STMT); >> > + >> > + int source_nelts = TYPE_VECTOR_SUBPARTS (access_type).to_constant (); >> > + vec_perm_builder sel (lhs_len, source_nelts, 1); >> > + for (int i = 0; i < source_nelts; i++) >> > + sel.quick_push (i); >> > + >> > + vec_perm_indices indices (sel, 1, source_nelts); >> > + gcc_checking_assert (can_vec_perm_const_p (TYPE_MODE (lhs_type), >> > + TYPE_MODE (access_type), >> > + indices)); >> > + tree mask_type = build_vector_type (ssizetype, lhs_len); >> > + tree mask = vec_perm_indices_to_tree (mask_type, indices); >> > + return gimple_build_assign (lhs, VEC_PERM_EXPR, >> > + mem_ref_lhs, mem_ref_lhs, mask); >> > + } >> > + >> > + return NULL; >> > + } >> > }; >> > >> > class svld1ro_impl : public load_replicate >> > diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc >> > index d4c575ce976..bb24701b0d2 100644 >> > --- a/gcc/config/aarch64/aarch64.cc >> > +++ b/gcc/config/aarch64/aarch64.cc >> > @@ -23395,8 +23395,10 @@ struct expand_vec_perm_d >> > { >> > rtx target, op0, op1; >> > vec_perm_indices perm; >> > + machine_mode op_mode; >> > machine_mode vmode; >> > unsigned int vec_flags; >> > + unsigned int op_vec_flags; >> >> Very minor, but it would be good to keep the order consistent: >> output mode first or input mode first. Guess it might as well >> be output mode first, to match the hook: >> >> machine_mode vmode; >> machine_mode op_mode; >> unsigned int vec_flags; >> unsigned int op_vec_flags; >> >> > bool one_vector_p; >> > bool testing_p; >> > }; >> > @@ -23945,6 +23947,32 @@ aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d) >> > return true; >> > } >> > >> > +/* Try to implement D using SVE dup instruction. */ >> > + >> > +static bool >> > +aarch64_evpc_sve_dup (struct expand_vec_perm_d *d) >> > +{ >> > + if (BYTES_BIG_ENDIAN >> > + || !d->one_vector_p >> > + || d->vec_flags != VEC_SVE_DATA >> > + || d->op_vec_flags != VEC_ADVSIMD >> >> Sorry, one more: DUPQ only handles 128-bit AdvSIMD modes, so we also need: >> >> || !known_eq (GET_MODE_BITSIZE (d->op_mode), 128) >> >> This isn't redundant with any of the other tests. >> >> (We can use DUP .D for 64-bit input vectors, but that's a separate patch.) >> >> OK with those changes (including using "else" :-)), thanks. > Hi, > The patch regressed vdup_n_3.c and vzip_{2,3,4}.c because > aarch64_expand_vec_perm_const_1 > was getting passed uninitialized values for d->op_mode and > d->op_vec_flags when called from > aarch64_evpc_reencode. The attached patch fixes the issue by setting > newd.op_mode to newd.vmode and likewise for op_vec_flags. > Does that look OK ? > Bootstrap+test in progress on aarch64-linux-gnu.
OK, thanks. > PS: How to bootstrap with SVE enabled ? > Shall make BOOT_CFLAGS="-mcpu=generic+sve" be sufficient ? > Currently I only tested the patch with normal bootstrap+test. That should work, but it's probably easier to configure with --with-cpu=generic+sve instead (or just pick an actual SVE CPU instead of generic+sve). The testsuite will then be run with SVE enabled too. Richard