Re: [1/2] PR96463 - aarch64 specific changes
Prathamesh Kulkarni writes: > On Mon, 6 Jun 2022 at 16:29, Richard Sandiford > wrote: >> >> Prathamesh Kulkarni writes: >> >> > { >> >> >/* The pattern matching functions above are written to look for a >> >> > small >> >> > number to begin the sequence (0, 1, N/2). If we begin with an >> >> > index >> >> > @@ -24084,6 +24112,12 @@ aarch64_expand_vec_perm_const_1 (struct >> >> > expand_vec_perm_d *d) >> >> > || d->vec_flags == VEC_SVE_PRED) >> >> >&& known_gt (nelt, 1)) >> >> > { >> >> > + /* If operand and result modes differ, then only check >> >> > + for dup case. */ >> >> > + if (d->vmode != op_mode) >> >> > + return (d->vec_flags == VEC_SVE_DATA) >> >> > + ? aarch64_evpc_sve_dup (d, op_mode) : false; >> >> > + >> >> >> >> I think it'd be more future-proof to format this as: >> >> >> >> if (d->vmod == d->op_mode) >> >> { >> >> …existing code… >> >> } >> >> else >> >> { >> >> if (aarch64_evpc_sve_dup (d)) >> >> return true; >> >> } >> >> >> >> with the d->vec_flags == VEC_SVE_DATA check being in aarch64_evpc_sve_dup, >> >> alongside the op_mode check. I think we'll be adding more checks here >> >> over time. >> > Um I was wondering if we should structure it as: >> > if (d->vmode == d->op_mode) >> > { >> > ...existing code... >> > } >> > if (aarch64_evpc_sve_dup (d)) >> > return true; >> > >> > So we check for dup irrespective of d->vmode == d->op_mode ? >> >> Yeah, I can see the attraction of that. I think the else is better >> though because the fallback TBL handling will (rightly) come at the end >> of the existing code. Without the else, we'd have specific tests like >> DUP after generic ones like TBL, so the reader would have to work out >> for themselves that DUP and TBL handle disjoint cases. 
>> >> >> >if (aarch64_evpc_rev_local (d)) >> >> > return true; >> >> >else if (aarch64_evpc_rev_global (d)) >> >> > @@ -24105,7 +24139,12 @@ aarch64_expand_vec_perm_const_1 (struct >> >> > expand_vec_perm_d *d) >> >> >else if (aarch64_evpc_reencode (d)) >> >> > return true; >> >> >if (d->vec_flags == VEC_SVE_DATA) >> >> > - return aarch64_evpc_sve_tbl (d); >> >> > + { >> >> > + if (aarch64_evpc_sve_tbl (d)) >> >> > + return true; >> >> > + else if (aarch64_evpc_sve_dup (d, op_mode)) >> >> > + return true; >> >> > + } >> >> >else if (d->vec_flags == VEC_ADVSIMD) >> >> > return aarch64_evpc_tbl (d); >> >> > } >> >> >> >> Is this part still needed, given the above? >> >> >> >> Thanks, >> >> Richard >> >> >> >> > @@ -24119,9 +24158,6 @@ aarch64_vectorize_vec_perm_const (machine_mode >> >> > vmode, machine_mode op_mode, >> >> > rtx target, rtx op0, rtx op1, >> >> > const vec_perm_indices &sel) >> >> > { >> >> > - if (vmode != op_mode) >> >> > -return false; >> >> > - >> >> >struct expand_vec_perm_d d; >> >> > >> >> >/* Check whether the mask can be applied to a single vector. 
*/ >> >> > @@ -24154,10 +24190,10 @@ aarch64_vectorize_vec_perm_const >> >> > (machine_mode vmode, machine_mode op_mode, >> >> >d.testing_p = !target; >> >> > >> >> >if (!d.testing_p) >> >> > -return aarch64_expand_vec_perm_const_1 (&d); >> >> > +return aarch64_expand_vec_perm_const_1 (&d, op_mode); >> >> > >> >> >rtx_insn *last = get_last_insn (); >> >> > - bool ret = aarch64_expand_vec_perm_const_1 (&d); >> >> > + bool ret = aarch64_expand_vec_perm_const_1 (&d, op_mode); >> >> >gcc_assert (last == get_last_insn ()); >> >> > >> >> >return ret; >> > >> > diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc >> > b/gcc/config/aarch64/aarch64-sve-builtins-base.cc >> > index bee410929bd..1a804b1ab73 100644 >> > --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc >> > +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc >> > @@ -44,6 +44,7 @@ >> > #include "aarch64-sve-builtins-shapes.h" >> > #include "aarch64-sve-builtins-base.h" >> > #include "aarch64-sve-builtins-functions.h" >> > +#include "ssa.h" >> > >> > using namespace aarch64_sve; >> > >> > @@ -1207,6 +1208,64 @@ public: >> > insn_code icode = code_for_aarch64_sve_ld1rq (e.vector_mode (0)); >> > return e.use_contiguous_load_insn (icode); >> >} >> > + >> > + gimple * >> > + fold (gimple_folder &f) const override >> > + { >> > +tree arg0 = gimple_call_arg (f.call, 0); >> > +tree arg1 = gimple_call_arg (f.call, 1); >> > + >> > +/* Transform: >> > + lhs = svld1rq ({-1, -1, ... }, arg1) >> > + into: >> > + tmp = mem_ref [(int * {ref-all}) arg1] >> > + lhs = vec_perm_expr. >> > + on little endian target. >> > + vectype is the corresponding ADVSIMD type. */ >> > + >> > +if (!BYTES_BIG_ENDIAN >> > + && integer_all_onesp (arg0)) >> > + { >> > + tre
Re: [1/2] PR96463 - aarch64 specific changes
On Mon, 6 Jun 2022 at 16:29, Richard Sandiford wrote: > > Prathamesh Kulkarni writes: > >> > { > >> >/* The pattern matching functions above are written to look for a > >> > small > >> > number to begin the sequence (0, 1, N/2). If we begin with an > >> > index > >> > @@ -24084,6 +24112,12 @@ aarch64_expand_vec_perm_const_1 (struct > >> > expand_vec_perm_d *d) > >> > || d->vec_flags == VEC_SVE_PRED) > >> >&& known_gt (nelt, 1)) > >> > { > >> > + /* If operand and result modes differ, then only check > >> > + for dup case. */ > >> > + if (d->vmode != op_mode) > >> > + return (d->vec_flags == VEC_SVE_DATA) > >> > + ? aarch64_evpc_sve_dup (d, op_mode) : false; > >> > + > >> > >> I think it'd be more future-proof to format this as: > >> > >> if (d->vmod == d->op_mode) > >> { > >> …existing code… > >> } > >> else > >> { > >> if (aarch64_evpc_sve_dup (d)) > >> return true; > >> } > >> > >> with the d->vec_flags == VEC_SVE_DATA check being in aarch64_evpc_sve_dup, > >> alongside the op_mode check. I think we'll be adding more checks here > >> over time. > > Um I was wondering if we should structure it as: > > if (d->vmode == d->op_mode) > > { > > ...existing code... > > } > > if (aarch64_evpc_sve_dup (d)) > > return true; > > > > So we check for dup irrespective of d->vmode == d->op_mode ? > > Yeah, I can see the attraction of that. I think the else is better > though because the fallback TBL handling will (rightly) come at the end > of the existing code. Without the else, we'd have specific tests like > DUP after generic ones like TBL, so the reader would have to work out > for themselves that DUP and TBL handle disjoint cases. 
> > >> >if (aarch64_evpc_rev_local (d)) > >> > return true; > >> >else if (aarch64_evpc_rev_global (d)) > >> > @@ -24105,7 +24139,12 @@ aarch64_expand_vec_perm_const_1 (struct > >> > expand_vec_perm_d *d) > >> >else if (aarch64_evpc_reencode (d)) > >> > return true; > >> >if (d->vec_flags == VEC_SVE_DATA) > >> > - return aarch64_evpc_sve_tbl (d); > >> > + { > >> > + if (aarch64_evpc_sve_tbl (d)) > >> > + return true; > >> > + else if (aarch64_evpc_sve_dup (d, op_mode)) > >> > + return true; > >> > + } > >> >else if (d->vec_flags == VEC_ADVSIMD) > >> > return aarch64_evpc_tbl (d); > >> > } > >> > >> Is this part still needed, given the above? > >> > >> Thanks, > >> Richard > >> > >> > @@ -24119,9 +24158,6 @@ aarch64_vectorize_vec_perm_const (machine_mode > >> > vmode, machine_mode op_mode, > >> > rtx target, rtx op0, rtx op1, > >> > const vec_perm_indices &sel) > >> > { > >> > - if (vmode != op_mode) > >> > -return false; > >> > - > >> >struct expand_vec_perm_d d; > >> > > >> >/* Check whether the mask can be applied to a single vector. 
*/ > >> > @@ -24154,10 +24190,10 @@ aarch64_vectorize_vec_perm_const (machine_mode > >> > vmode, machine_mode op_mode, > >> >d.testing_p = !target; > >> > > >> >if (!d.testing_p) > >> > -return aarch64_expand_vec_perm_const_1 (&d); > >> > +return aarch64_expand_vec_perm_const_1 (&d, op_mode); > >> > > >> >rtx_insn *last = get_last_insn (); > >> > - bool ret = aarch64_expand_vec_perm_const_1 (&d); > >> > + bool ret = aarch64_expand_vec_perm_const_1 (&d, op_mode); > >> >gcc_assert (last == get_last_insn ()); > >> > > >> >return ret; > > > > diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc > > b/gcc/config/aarch64/aarch64-sve-builtins-base.cc > > index bee410929bd..1a804b1ab73 100644 > > --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc > > +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc > > @@ -44,6 +44,7 @@ > > #include "aarch64-sve-builtins-shapes.h" > > #include "aarch64-sve-builtins-base.h" > > #include "aarch64-sve-builtins-functions.h" > > +#include "ssa.h" > > > > using namespace aarch64_sve; > > > > @@ -1207,6 +1208,64 @@ public: > > insn_code icode = code_for_aarch64_sve_ld1rq (e.vector_mode (0)); > > return e.use_contiguous_load_insn (icode); > >} > > + > > + gimple * > > + fold (gimple_folder &f) const override > > + { > > +tree arg0 = gimple_call_arg (f.call, 0); > > +tree arg1 = gimple_call_arg (f.call, 1); > > + > > +/* Transform: > > + lhs = svld1rq ({-1, -1, ... }, arg1) > > + into: > > + tmp = mem_ref [(int * {ref-all}) arg1] > > + lhs = vec_perm_expr. > > + on little endian target. > > + vectype is the corresponding ADVSIMD type. */ > > + > > +if (!BYTES_BIG_ENDIAN > > + && integer_all_onesp (arg0)) > > + { > > + tree lhs = gimple_call_lhs (f.call); > > + tree lhs_type = TREE_TYPE (lhs); > > + poly_uint64 lhs_len = TYPE_VECTOR_SUBPARTS (lhs_type); > > + tree eltype = TRE
Re: [1/2] PR96463 - aarch64 specific changes
Prathamesh Kulkarni writes: >> > { >> >/* The pattern matching functions above are written to look for a small >> > number to begin the sequence (0, 1, N/2). If we begin with an index >> > @@ -24084,6 +24112,12 @@ aarch64_expand_vec_perm_const_1 (struct >> > expand_vec_perm_d *d) >> > || d->vec_flags == VEC_SVE_PRED) >> >&& known_gt (nelt, 1)) >> > { >> > + /* If operand and result modes differ, then only check >> > + for dup case. */ >> > + if (d->vmode != op_mode) >> > + return (d->vec_flags == VEC_SVE_DATA) >> > + ? aarch64_evpc_sve_dup (d, op_mode) : false; >> > + >> >> I think it'd be more future-proof to format this as: >> >> if (d->vmod == d->op_mode) >> { >> …existing code… >> } >> else >> { >> if (aarch64_evpc_sve_dup (d)) >> return true; >> } >> >> with the d->vec_flags == VEC_SVE_DATA check being in aarch64_evpc_sve_dup, >> alongside the op_mode check. I think we'll be adding more checks here >> over time. > Um I was wondering if we should structure it as: > if (d->vmode == d->op_mode) > { > ...existing code... > } > if (aarch64_evpc_sve_dup (d)) > return true; > > So we check for dup irrespective of d->vmode == d->op_mode ? Yeah, I can see the attraction of that. I think the else is better though because the fallback TBL handling will (rightly) come at the end of the existing code. Without the else, we'd have specific tests like DUP after generic ones like TBL, so the reader would have to work out for themselves that DUP and TBL handle disjoint cases. 
>> >if (aarch64_evpc_rev_local (d)) >> > return true; >> >else if (aarch64_evpc_rev_global (d)) >> > @@ -24105,7 +24139,12 @@ aarch64_expand_vec_perm_const_1 (struct >> > expand_vec_perm_d *d) >> >else if (aarch64_evpc_reencode (d)) >> > return true; >> >if (d->vec_flags == VEC_SVE_DATA) >> > - return aarch64_evpc_sve_tbl (d); >> > + { >> > + if (aarch64_evpc_sve_tbl (d)) >> > + return true; >> > + else if (aarch64_evpc_sve_dup (d, op_mode)) >> > + return true; >> > + } >> >else if (d->vec_flags == VEC_ADVSIMD) >> > return aarch64_evpc_tbl (d); >> > } >> >> Is this part still needed, given the above? >> >> Thanks, >> Richard >> >> > @@ -24119,9 +24158,6 @@ aarch64_vectorize_vec_perm_const (machine_mode >> > vmode, machine_mode op_mode, >> > rtx target, rtx op0, rtx op1, >> > const vec_perm_indices &sel) >> > { >> > - if (vmode != op_mode) >> > -return false; >> > - >> >struct expand_vec_perm_d d; >> > >> >/* Check whether the mask can be applied to a single vector. */ >> > @@ -24154,10 +24190,10 @@ aarch64_vectorize_vec_perm_const (machine_mode >> > vmode, machine_mode op_mode, >> >d.testing_p = !target; >> > >> >if (!d.testing_p) >> > -return aarch64_expand_vec_perm_const_1 (&d); >> > +return aarch64_expand_vec_perm_const_1 (&d, op_mode); >> > >> >rtx_insn *last = get_last_insn (); >> > - bool ret = aarch64_expand_vec_perm_const_1 (&d); >> > + bool ret = aarch64_expand_vec_perm_const_1 (&d, op_mode); >> >gcc_assert (last == get_last_insn ()); >> > >> >return ret; > > diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc > b/gcc/config/aarch64/aarch64-sve-builtins-base.cc > index bee410929bd..1a804b1ab73 100644 > --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc > +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc > @@ -44,6 +44,7 @@ > #include "aarch64-sve-builtins-shapes.h" > #include "aarch64-sve-builtins-base.h" > #include "aarch64-sve-builtins-functions.h" > +#include "ssa.h" > > using namespace aarch64_sve; > > @@ -1207,6 +1208,64 @@ public: > 
insn_code icode = code_for_aarch64_sve_ld1rq (e.vector_mode (0)); > return e.use_contiguous_load_insn (icode); >} > + > + gimple * > + fold (gimple_folder &f) const override > + { > +tree arg0 = gimple_call_arg (f.call, 0); > +tree arg1 = gimple_call_arg (f.call, 1); > + > +/* Transform: > + lhs = svld1rq ({-1, -1, ... }, arg1) > + into: > + tmp = mem_ref [(int * {ref-all}) arg1] > + lhs = vec_perm_expr. > + on little endian target. > + vectype is the corresponding ADVSIMD type. */ > + > +if (!BYTES_BIG_ENDIAN > + && integer_all_onesp (arg0)) > + { > + tree lhs = gimple_call_lhs (f.call); > + tree lhs_type = TREE_TYPE (lhs); > + poly_uint64 lhs_len = TYPE_VECTOR_SUBPARTS (lhs_type); > + tree eltype = TREE_TYPE (lhs_type); > + > + scalar_mode elmode = GET_MODE_INNER (TYPE_MODE (lhs_type)); > + machine_mode vq_mode = aarch64_vq_mode (elmode).require (); > + tree vectype = build_vector_type_for_mode (eltype, vq_mode); > + > + tree elt_ptr_type > + = build_pointer_type_for_mode (eltype, VOIDmode, true); > + tree ze
Re: [1/2] PR96463 - aarch64 specific changes
On Wed, 1 Jun 2022 at 14:12, Richard Sandiford wrote: > > Prathamesh Kulkarni writes: > > On Thu, 12 May 2022 at 16:15, Richard Sandiford > > wrote: > >> > >> Prathamesh Kulkarni writes: > >> > On Wed, 11 May 2022 at 12:44, Richard Sandiford > >> > wrote: > >> >> > >> >> Prathamesh Kulkarni writes: > >> >> > On Fri, 6 May 2022 at 16:00, Richard Sandiford > >> >> > wrote: > >> >> >> > >> >> >> Prathamesh Kulkarni writes: > >> >> >> > diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc > >> >> >> > b/gcc/config/aarch64/aarch64-sve-builtins-base.cc > >> >> >> > index c24c0548724..1ef4ea2087b 100644 > >> >> >> > --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc > >> >> >> > +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc > >> >> >> > @@ -44,6 +44,14 @@ > >> >> >> > #include "aarch64-sve-builtins-shapes.h" > >> >> >> > #include "aarch64-sve-builtins-base.h" > >> >> >> > #include "aarch64-sve-builtins-functions.h" > >> >> >> > +#include "aarch64-builtins.h" > >> >> >> > +#include "gimple-ssa.h" > >> >> >> > +#include "tree-phinodes.h" > >> >> >> > +#include "tree-ssa-operands.h" > >> >> >> > +#include "ssa-iterators.h" > >> >> >> > +#include "stringpool.h" > >> >> >> > +#include "value-range.h" > >> >> >> > +#include "tree-ssanames.h" > >> >> >> > >> >> >> Minor, but: I think the preferred approach is to include "ssa.h" > >> >> >> rather than include some of these headers directly. 
> >> >> >> > >> >> >> > > >> >> >> > using namespace aarch64_sve; > >> >> >> > > >> >> >> > @@ -1207,6 +1215,56 @@ public: > >> >> >> > insn_code icode = code_for_aarch64_sve_ld1rq (e.vector_mode > >> >> >> > (0)); > >> >> >> > return e.use_contiguous_load_insn (icode); > >> >> >> >} > >> >> >> > + > >> >> >> > + gimple * > >> >> >> > + fold (gimple_folder &f) const OVERRIDE > >> >> >> > + { > >> >> >> > +tree arg0 = gimple_call_arg (f.call, 0); > >> >> >> > +tree arg1 = gimple_call_arg (f.call, 1); > >> >> >> > + > >> >> >> > +/* Transform: > >> >> >> > + lhs = svld1rq ({-1, -1, ... }, arg1) > >> >> >> > + into: > >> >> >> > + tmp = mem_ref [(int * {ref-all}) arg1] > >> >> >> > + lhs = vec_perm_expr. > >> >> >> > + on little endian target. */ > >> >> >> > + > >> >> >> > +if (!BYTES_BIG_ENDIAN > >> >> >> > + && integer_all_onesp (arg0)) > >> >> >> > + { > >> >> >> > + tree lhs = gimple_call_lhs (f.call); > >> >> >> > + auto simd_type = aarch64_get_simd_info_for_type (Int32x4_t); > >> >> >> > >> >> >> Does this work for other element sizes? I would have expected it > >> >> >> to be the (128-bit) Advanced SIMD vector associated with the same > >> >> >> element type as the SVE vector. > >> >> >> > >> >> >> The testcase should cover more than just int32x4_t -> svint32_t, > >> >> >> just to be sure. > >> >> > In the attached patch, it obtains corresponding advsimd type with: > >> >> > > >> >> > tree eltype = TREE_TYPE (lhs_type); > >> >> > unsigned nunits = 128 / TREE_INT_CST_LOW (TYPE_SIZE (eltype)); > >> >> > tree vectype = build_vector_type (eltype, nunits); > >> >> > > >> >> > While this seems to work with different element sizes, I am not sure > >> >> > if it's > >> >> > the correct approach ? > >> >> > >> >> Yeah, that looks correct. 
Other SVE code uses aarch64_vq_mode > >> >> to get the vector mode associated with a .Q “element”, so an > >> >> alternative would be: > >> >> > >> >> machine_mode vq_mode = aarch64_vq_mode (TYPE_MODE (eltype)).require > >> >> (); > >> >> tree vectype = build_vector_type_for_mode (eltype, vq_mode); > >> >> > >> >> which is more explicit about wanting an Advanced SIMD vector. > >> >> > >> >> >> > + > >> >> >> > + tree elt_ptr_type > >> >> >> > + = build_pointer_type_for_mode (simd_type.eltype, VOIDmode, > >> >> >> > true); > >> >> >> > + tree zero = build_zero_cst (elt_ptr_type); > >> >> >> > + > >> >> >> > + /* Use element type alignment. */ > >> >> >> > + tree access_type > >> >> >> > + = build_aligned_type (simd_type.itype, TYPE_ALIGN > >> >> >> > (simd_type.eltype)); > >> >> >> > + > >> >> >> > + tree tmp = make_ssa_name_fn (cfun, access_type, 0); > >> >> >> > + gimple *mem_ref_stmt > >> >> >> > + = gimple_build_assign (tmp, fold_build2 (MEM_REF, > >> >> >> > access_type, arg1, zero)); > >> >> >> > >> >> >> Long line. Might be easier to format by assigning the fold_build2 > >> >> >> result > >> >> >> to a temporary variable. > >> >> >> > >> >> >> > + gsi_insert_before (f.gsi, mem_ref_stmt, GSI_SAME_STMT); > >> >> >> > + > >> >> >> > + tree mem_ref_lhs = gimple_get_lhs (mem_ref_stmt); > >> >> >> > + tree vectype = TREE_TYPE (mem_ref_lhs); > >> >> >> > + tree lhs_type = TREE_TYPE (lhs); > >> >> >> > >> >> >> Is this necessary? The code above supplied the types and I wouldn't > >> >> >> have expected them to change during the build process. > >> >> >> > >> >> >> > + > >> >> >> > + int source_nelts = TYPE_VECTOR_S
Re: [1/2] PR96463 - aarch64 specific changes
Prathamesh Kulkarni writes: > On Thu, 12 May 2022 at 16:15, Richard Sandiford > wrote: >> >> Prathamesh Kulkarni writes: >> > On Wed, 11 May 2022 at 12:44, Richard Sandiford >> > wrote: >> >> >> >> Prathamesh Kulkarni writes: >> >> > On Fri, 6 May 2022 at 16:00, Richard Sandiford >> >> > wrote: >> >> >> >> >> >> Prathamesh Kulkarni writes: >> >> >> > diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc >> >> >> > b/gcc/config/aarch64/aarch64-sve-builtins-base.cc >> >> >> > index c24c0548724..1ef4ea2087b 100644 >> >> >> > --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc >> >> >> > +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc >> >> >> > @@ -44,6 +44,14 @@ >> >> >> > #include "aarch64-sve-builtins-shapes.h" >> >> >> > #include "aarch64-sve-builtins-base.h" >> >> >> > #include "aarch64-sve-builtins-functions.h" >> >> >> > +#include "aarch64-builtins.h" >> >> >> > +#include "gimple-ssa.h" >> >> >> > +#include "tree-phinodes.h" >> >> >> > +#include "tree-ssa-operands.h" >> >> >> > +#include "ssa-iterators.h" >> >> >> > +#include "stringpool.h" >> >> >> > +#include "value-range.h" >> >> >> > +#include "tree-ssanames.h" >> >> >> >> >> >> Minor, but: I think the preferred approach is to include "ssa.h" >> >> >> rather than include some of these headers directly. >> >> >> >> >> >> > >> >> >> > using namespace aarch64_sve; >> >> >> > >> >> >> > @@ -1207,6 +1215,56 @@ public: >> >> >> > insn_code icode = code_for_aarch64_sve_ld1rq (e.vector_mode >> >> >> > (0)); >> >> >> > return e.use_contiguous_load_insn (icode); >> >> >> >} >> >> >> > + >> >> >> > + gimple * >> >> >> > + fold (gimple_folder &f) const OVERRIDE >> >> >> > + { >> >> >> > +tree arg0 = gimple_call_arg (f.call, 0); >> >> >> > +tree arg1 = gimple_call_arg (f.call, 1); >> >> >> > + >> >> >> > +/* Transform: >> >> >> > + lhs = svld1rq ({-1, -1, ... }, arg1) >> >> >> > + into: >> >> >> > + tmp = mem_ref [(int * {ref-all}) arg1] >> >> >> > + lhs = vec_perm_expr. 
>> >> >> > + on little endian target. */ >> >> >> > + >> >> >> > +if (!BYTES_BIG_ENDIAN >> >> >> > + && integer_all_onesp (arg0)) >> >> >> > + { >> >> >> > + tree lhs = gimple_call_lhs (f.call); >> >> >> > + auto simd_type = aarch64_get_simd_info_for_type (Int32x4_t); >> >> >> >> >> >> Does this work for other element sizes? I would have expected it >> >> >> to be the (128-bit) Advanced SIMD vector associated with the same >> >> >> element type as the SVE vector. >> >> >> >> >> >> The testcase should cover more than just int32x4_t -> svint32_t, >> >> >> just to be sure. >> >> > In the attached patch, it obtains corresponding advsimd type with: >> >> > >> >> > tree eltype = TREE_TYPE (lhs_type); >> >> > unsigned nunits = 128 / TREE_INT_CST_LOW (TYPE_SIZE (eltype)); >> >> > tree vectype = build_vector_type (eltype, nunits); >> >> > >> >> > While this seems to work with different element sizes, I am not sure if >> >> > it's >> >> > the correct approach ? >> >> >> >> Yeah, that looks correct. Other SVE code uses aarch64_vq_mode >> >> to get the vector mode associated with a .Q “element”, so an >> >> alternative would be: >> >> >> >> machine_mode vq_mode = aarch64_vq_mode (TYPE_MODE (eltype)).require >> >> (); >> >> tree vectype = build_vector_type_for_mode (eltype, vq_mode); >> >> >> >> which is more explicit about wanting an Advanced SIMD vector. >> >> >> >> >> > + >> >> >> > + tree elt_ptr_type >> >> >> > + = build_pointer_type_for_mode (simd_type.eltype, VOIDmode, >> >> >> > true); >> >> >> > + tree zero = build_zero_cst (elt_ptr_type); >> >> >> > + >> >> >> > + /* Use element type alignment. */ >> >> >> > + tree access_type >> >> >> > + = build_aligned_type (simd_type.itype, TYPE_ALIGN >> >> >> > (simd_type.eltype)); >> >> >> > + >> >> >> > + tree tmp = make_ssa_name_fn (cfun, access_type, 0); >> >> >> > + gimple *mem_ref_stmt >> >> >> > + = gimple_build_assign (tmp, fold_build2 (MEM_REF, >> >> >> > access_type, arg1, zero)); >> >> >> >> >> >> Long line. 
Might be easier to format by assigning the fold_build2 >> >> >> result >> >> >> to a temporary variable. >> >> >> >> >> >> > + gsi_insert_before (f.gsi, mem_ref_stmt, GSI_SAME_STMT); >> >> >> > + >> >> >> > + tree mem_ref_lhs = gimple_get_lhs (mem_ref_stmt); >> >> >> > + tree vectype = TREE_TYPE (mem_ref_lhs); >> >> >> > + tree lhs_type = TREE_TYPE (lhs); >> >> >> >> >> >> Is this necessary? The code above supplied the types and I wouldn't >> >> >> have expected them to change during the build process. >> >> >> >> >> >> > + >> >> >> > + int source_nelts = TYPE_VECTOR_SUBPARTS (vectype).to_constant >> >> >> > (); >> >> >> > + vec_perm_builder sel (TYPE_VECTOR_SUBPARTS (lhs_type), >> >> >> > source_nelts, 1); >> >> >> > + for (int i = 0; i < source_nelts; i++) >> >> >> > + sel.quick_push (i); >> >> >> > + >> >> >> > + vec_perm_indices indic
Re: [1/2] PR96463 - aarch64 specific changes
On Thu, 12 May 2022 at 16:15, Richard Sandiford wrote: > > Prathamesh Kulkarni writes: > > On Wed, 11 May 2022 at 12:44, Richard Sandiford > > wrote: > >> > >> Prathamesh Kulkarni writes: > >> > On Fri, 6 May 2022 at 16:00, Richard Sandiford > >> > wrote: > >> >> > >> >> Prathamesh Kulkarni writes: > >> >> > diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc > >> >> > b/gcc/config/aarch64/aarch64-sve-builtins-base.cc > >> >> > index c24c0548724..1ef4ea2087b 100644 > >> >> > --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc > >> >> > +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc > >> >> > @@ -44,6 +44,14 @@ > >> >> > #include "aarch64-sve-builtins-shapes.h" > >> >> > #include "aarch64-sve-builtins-base.h" > >> >> > #include "aarch64-sve-builtins-functions.h" > >> >> > +#include "aarch64-builtins.h" > >> >> > +#include "gimple-ssa.h" > >> >> > +#include "tree-phinodes.h" > >> >> > +#include "tree-ssa-operands.h" > >> >> > +#include "ssa-iterators.h" > >> >> > +#include "stringpool.h" > >> >> > +#include "value-range.h" > >> >> > +#include "tree-ssanames.h" > >> >> > >> >> Minor, but: I think the preferred approach is to include "ssa.h" > >> >> rather than include some of these headers directly. > >> >> > >> >> > > >> >> > using namespace aarch64_sve; > >> >> > > >> >> > @@ -1207,6 +1215,56 @@ public: > >> >> > insn_code icode = code_for_aarch64_sve_ld1rq (e.vector_mode (0)); > >> >> > return e.use_contiguous_load_insn (icode); > >> >> >} > >> >> > + > >> >> > + gimple * > >> >> > + fold (gimple_folder &f) const OVERRIDE > >> >> > + { > >> >> > +tree arg0 = gimple_call_arg (f.call, 0); > >> >> > +tree arg1 = gimple_call_arg (f.call, 1); > >> >> > + > >> >> > +/* Transform: > >> >> > + lhs = svld1rq ({-1, -1, ... }, arg1) > >> >> > + into: > >> >> > + tmp = mem_ref [(int * {ref-all}) arg1] > >> >> > + lhs = vec_perm_expr. > >> >> > + on little endian target. 
*/ > >> >> > + > >> >> > +if (!BYTES_BIG_ENDIAN > >> >> > + && integer_all_onesp (arg0)) > >> >> > + { > >> >> > + tree lhs = gimple_call_lhs (f.call); > >> >> > + auto simd_type = aarch64_get_simd_info_for_type (Int32x4_t); > >> >> > >> >> Does this work for other element sizes? I would have expected it > >> >> to be the (128-bit) Advanced SIMD vector associated with the same > >> >> element type as the SVE vector. > >> >> > >> >> The testcase should cover more than just int32x4_t -> svint32_t, > >> >> just to be sure. > >> > In the attached patch, it obtains corresponding advsimd type with: > >> > > >> > tree eltype = TREE_TYPE (lhs_type); > >> > unsigned nunits = 128 / TREE_INT_CST_LOW (TYPE_SIZE (eltype)); > >> > tree vectype = build_vector_type (eltype, nunits); > >> > > >> > While this seems to work with different element sizes, I am not sure if > >> > it's > >> > the correct approach ? > >> > >> Yeah, that looks correct. Other SVE code uses aarch64_vq_mode > >> to get the vector mode associated with a .Q “element”, so an > >> alternative would be: > >> > >> machine_mode vq_mode = aarch64_vq_mode (TYPE_MODE (eltype)).require (); > >> tree vectype = build_vector_type_for_mode (eltype, vq_mode); > >> > >> which is more explicit about wanting an Advanced SIMD vector. > >> > >> >> > + > >> >> > + tree elt_ptr_type > >> >> > + = build_pointer_type_for_mode (simd_type.eltype, VOIDmode, > >> >> > true); > >> >> > + tree zero = build_zero_cst (elt_ptr_type); > >> >> > + > >> >> > + /* Use element type alignment. */ > >> >> > + tree access_type > >> >> > + = build_aligned_type (simd_type.itype, TYPE_ALIGN > >> >> > (simd_type.eltype)); > >> >> > + > >> >> > + tree tmp = make_ssa_name_fn (cfun, access_type, 0); > >> >> > + gimple *mem_ref_stmt > >> >> > + = gimple_build_assign (tmp, fold_build2 (MEM_REF, > >> >> > access_type, arg1, zero)); > >> >> > >> >> Long line. Might be easier to format by assigning the fold_build2 > >> >> result > >> >> to a temporary variable. 
> >> >> > >> >> > + gsi_insert_before (f.gsi, mem_ref_stmt, GSI_SAME_STMT); > >> >> > + > >> >> > + tree mem_ref_lhs = gimple_get_lhs (mem_ref_stmt); > >> >> > + tree vectype = TREE_TYPE (mem_ref_lhs); > >> >> > + tree lhs_type = TREE_TYPE (lhs); > >> >> > >> >> Is this necessary? The code above supplied the types and I wouldn't > >> >> have expected them to change during the build process. > >> >> > >> >> > + > >> >> > + int source_nelts = TYPE_VECTOR_SUBPARTS (vectype).to_constant > >> >> > (); > >> >> > + vec_perm_builder sel (TYPE_VECTOR_SUBPARTS (lhs_type), > >> >> > source_nelts, 1); > >> >> > + for (int i = 0; i < source_nelts; i++) > >> >> > + sel.quick_push (i); > >> >> > + > >> >> > + vec_perm_indices indices (sel, 1, source_nelts); > >> >> > + gcc_checking_assert (can_vec_perm_const_p (TYPE_MODE > >> >> > (lhs_type), indices)); > >> >> > + tree mask = vec_perm_indice
Re: [1/2] PR96463 - aarch64 specific changes
Prathamesh Kulkarni writes: > On Wed, 11 May 2022 at 12:44, Richard Sandiford > wrote: >> >> Prathamesh Kulkarni writes: >> > On Fri, 6 May 2022 at 16:00, Richard Sandiford >> > wrote: >> >> >> >> Prathamesh Kulkarni writes: >> >> > diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc >> >> > b/gcc/config/aarch64/aarch64-sve-builtins-base.cc >> >> > index c24c0548724..1ef4ea2087b 100644 >> >> > --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc >> >> > +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc >> >> > @@ -44,6 +44,14 @@ >> >> > #include "aarch64-sve-builtins-shapes.h" >> >> > #include "aarch64-sve-builtins-base.h" >> >> > #include "aarch64-sve-builtins-functions.h" >> >> > +#include "aarch64-builtins.h" >> >> > +#include "gimple-ssa.h" >> >> > +#include "tree-phinodes.h" >> >> > +#include "tree-ssa-operands.h" >> >> > +#include "ssa-iterators.h" >> >> > +#include "stringpool.h" >> >> > +#include "value-range.h" >> >> > +#include "tree-ssanames.h" >> >> >> >> Minor, but: I think the preferred approach is to include "ssa.h" >> >> rather than include some of these headers directly. >> >> >> >> > >> >> > using namespace aarch64_sve; >> >> > >> >> > @@ -1207,6 +1215,56 @@ public: >> >> > insn_code icode = code_for_aarch64_sve_ld1rq (e.vector_mode (0)); >> >> > return e.use_contiguous_load_insn (icode); >> >> >} >> >> > + >> >> > + gimple * >> >> > + fold (gimple_folder &f) const OVERRIDE >> >> > + { >> >> > +tree arg0 = gimple_call_arg (f.call, 0); >> >> > +tree arg1 = gimple_call_arg (f.call, 1); >> >> > + >> >> > +/* Transform: >> >> > + lhs = svld1rq ({-1, -1, ... }, arg1) >> >> > + into: >> >> > + tmp = mem_ref [(int * {ref-all}) arg1] >> >> > + lhs = vec_perm_expr. >> >> > + on little endian target. 
*/ >> >> > + >> >> > +if (!BYTES_BIG_ENDIAN >> >> > + && integer_all_onesp (arg0)) >> >> > + { >> >> > + tree lhs = gimple_call_lhs (f.call); >> >> > + auto simd_type = aarch64_get_simd_info_for_type (Int32x4_t); >> >> >> >> Does this work for other element sizes? I would have expected it >> >> to be the (128-bit) Advanced SIMD vector associated with the same >> >> element type as the SVE vector. >> >> >> >> The testcase should cover more than just int32x4_t -> svint32_t, >> >> just to be sure. >> > In the attached patch, it obtains corresponding advsimd type with: >> > >> > tree eltype = TREE_TYPE (lhs_type); >> > unsigned nunits = 128 / TREE_INT_CST_LOW (TYPE_SIZE (eltype)); >> > tree vectype = build_vector_type (eltype, nunits); >> > >> > While this seems to work with different element sizes, I am not sure if >> > it's >> > the correct approach ? >> >> Yeah, that looks correct. Other SVE code uses aarch64_vq_mode >> to get the vector mode associated with a .Q “element”, so an >> alternative would be: >> >> machine_mode vq_mode = aarch64_vq_mode (TYPE_MODE (eltype)).require (); >> tree vectype = build_vector_type_for_mode (eltype, vq_mode); >> >> which is more explicit about wanting an Advanced SIMD vector. >> >> >> > + >> >> > + tree elt_ptr_type >> >> > + = build_pointer_type_for_mode (simd_type.eltype, VOIDmode, >> >> > true); >> >> > + tree zero = build_zero_cst (elt_ptr_type); >> >> > + >> >> > + /* Use element type alignment. */ >> >> > + tree access_type >> >> > + = build_aligned_type (simd_type.itype, TYPE_ALIGN >> >> > (simd_type.eltype)); >> >> > + >> >> > + tree tmp = make_ssa_name_fn (cfun, access_type, 0); >> >> > + gimple *mem_ref_stmt >> >> > + = gimple_build_assign (tmp, fold_build2 (MEM_REF, access_type, >> >> > arg1, zero)); >> >> >> >> Long line. Might be easier to format by assigning the fold_build2 result >> >> to a temporary variable. 
>> >> >> >> > + gsi_insert_before (f.gsi, mem_ref_stmt, GSI_SAME_STMT); >> >> > + >> >> > + tree mem_ref_lhs = gimple_get_lhs (mem_ref_stmt); >> >> > + tree vectype = TREE_TYPE (mem_ref_lhs); >> >> > + tree lhs_type = TREE_TYPE (lhs); >> >> >> >> Is this necessary? The code above supplied the types and I wouldn't >> >> have expected them to change during the build process. >> >> >> >> > + >> >> > + int source_nelts = TYPE_VECTOR_SUBPARTS (vectype).to_constant (); >> >> > + vec_perm_builder sel (TYPE_VECTOR_SUBPARTS (lhs_type), >> >> > source_nelts, 1); >> >> > + for (int i = 0; i < source_nelts; i++) >> >> > + sel.quick_push (i); >> >> > + >> >> > + vec_perm_indices indices (sel, 1, source_nelts); >> >> > + gcc_checking_assert (can_vec_perm_const_p (TYPE_MODE (lhs_type), >> >> > indices)); >> >> > + tree mask = vec_perm_indices_to_tree (lhs_type, indices); >> >> > + return gimple_build_assign (lhs, VEC_PERM_EXPR, mem_ref_lhs, >> >> > mem_ref_lhs, mask); >> >> >> >> Nit: long line. >> >> >> >> > + } >> >> > + >> >> > +return NULL; >> >> > + } >> >> > }; >> >> > >> >> > class svld1ro_impl : public load_replicate >> >> > diff
Re: [1/2] PR96463 - aarch64 specific changes
On Wed, 11 May 2022 at 12:44, Richard Sandiford wrote: > > Prathamesh Kulkarni writes: > > On Fri, 6 May 2022 at 16:00, Richard Sandiford > > wrote: > >> > >> Prathamesh Kulkarni writes: > >> > diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc > >> > b/gcc/config/aarch64/aarch64-sve-builtins-base.cc > >> > index c24c0548724..1ef4ea2087b 100644 > >> > --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc > >> > +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc > >> > @@ -44,6 +44,14 @@ > >> > #include "aarch64-sve-builtins-shapes.h" > >> > #include "aarch64-sve-builtins-base.h" > >> > #include "aarch64-sve-builtins-functions.h" > >> > +#include "aarch64-builtins.h" > >> > +#include "gimple-ssa.h" > >> > +#include "tree-phinodes.h" > >> > +#include "tree-ssa-operands.h" > >> > +#include "ssa-iterators.h" > >> > +#include "stringpool.h" > >> > +#include "value-range.h" > >> > +#include "tree-ssanames.h" > >> > >> Minor, but: I think the preferred approach is to include "ssa.h" > >> rather than include some of these headers directly. > >> > >> > > >> > using namespace aarch64_sve; > >> > > >> > @@ -1207,6 +1215,56 @@ public: > >> > insn_code icode = code_for_aarch64_sve_ld1rq (e.vector_mode (0)); > >> > return e.use_contiguous_load_insn (icode); > >> >} > >> > + > >> > + gimple * > >> > + fold (gimple_folder &f) const OVERRIDE > >> > + { > >> > +tree arg0 = gimple_call_arg (f.call, 0); > >> > +tree arg1 = gimple_call_arg (f.call, 1); > >> > + > >> > +/* Transform: > >> > + lhs = svld1rq ({-1, -1, ... }, arg1) > >> > + into: > >> > + tmp = mem_ref [(int * {ref-all}) arg1] > >> > + lhs = vec_perm_expr. > >> > + on little endian target. */ > >> > + > >> > +if (!BYTES_BIG_ENDIAN > >> > + && integer_all_onesp (arg0)) > >> > + { > >> > + tree lhs = gimple_call_lhs (f.call); > >> > + auto simd_type = aarch64_get_simd_info_for_type (Int32x4_t); > >> > >> Does this work for other element sizes? 
I would have expected it > >> to be the (128-bit) Advanced SIMD vector associated with the same > >> element type as the SVE vector. > >> > >> The testcase should cover more than just int32x4_t -> svint32_t, > >> just to be sure. > > In the attached patch, it obtains corresponding advsimd type with: > > > > tree eltype = TREE_TYPE (lhs_type); > > unsigned nunits = 128 / TREE_INT_CST_LOW (TYPE_SIZE (eltype)); > > tree vectype = build_vector_type (eltype, nunits); > > > > While this seems to work with different element sizes, I am not sure if it's > > the correct approach ? > > Yeah, that looks correct. Other SVE code uses aarch64_vq_mode > to get the vector mode associated with a .Q “element”, so an > alternative would be: > > machine_mode vq_mode = aarch64_vq_mode (TYPE_MODE (eltype)).require (); > tree vectype = build_vector_type_for_mode (eltype, vq_mode); > > which is more explicit about wanting an Advanced SIMD vector. > > >> > + > >> > + tree elt_ptr_type > >> > + = build_pointer_type_for_mode (simd_type.eltype, VOIDmode, true); > >> > + tree zero = build_zero_cst (elt_ptr_type); > >> > + > >> > + /* Use element type alignment. */ > >> > + tree access_type > >> > + = build_aligned_type (simd_type.itype, TYPE_ALIGN > >> > (simd_type.eltype)); > >> > + > >> > + tree tmp = make_ssa_name_fn (cfun, access_type, 0); > >> > + gimple *mem_ref_stmt > >> > + = gimple_build_assign (tmp, fold_build2 (MEM_REF, access_type, > >> > arg1, zero)); > >> > >> Long line. Might be easier to format by assigning the fold_build2 result > >> to a temporary variable. > >> > >> > + gsi_insert_before (f.gsi, mem_ref_stmt, GSI_SAME_STMT); > >> > + > >> > + tree mem_ref_lhs = gimple_get_lhs (mem_ref_stmt); > >> > + tree vectype = TREE_TYPE (mem_ref_lhs); > >> > + tree lhs_type = TREE_TYPE (lhs); > >> > >> Is this necessary? The code above supplied the types and I wouldn't > >> have expected them to change during the build process. 
> >> > >> > + > >> > + int source_nelts = TYPE_VECTOR_SUBPARTS (vectype).to_constant (); > >> > + vec_perm_builder sel (TYPE_VECTOR_SUBPARTS (lhs_type), > >> > source_nelts, 1); > >> > + for (int i = 0; i < source_nelts; i++) > >> > + sel.quick_push (i); > >> > + > >> > + vec_perm_indices indices (sel, 1, source_nelts); > >> > + gcc_checking_assert (can_vec_perm_const_p (TYPE_MODE (lhs_type), > >> > indices)); > >> > + tree mask = vec_perm_indices_to_tree (lhs_type, indices); > >> > + return gimple_build_assign (lhs, VEC_PERM_EXPR, mem_ref_lhs, > >> > mem_ref_lhs, mask); > >> > >> Nit: long line. > >> > >> > + } > >> > + > >> > +return NULL; > >> > + } > >> > }; > >> > > >> > class svld1ro_impl : public load_replicate > >> > diff --git a/gcc/config/aarch64/aarch64.cc > >> > b/gcc/config/aarch64/aarch64.cc > >> > index f650abbc4ce..47810fec804 100644 > >> > --- a/gcc/config/aarch64/aarch64.cc > >> > +++
Re: [1/2] PR96463 - aarch64 specific changes
Prathamesh Kulkarni writes: > On Fri, 6 May 2022 at 16:00, Richard Sandiford > wrote: >> >> Prathamesh Kulkarni writes: >> > diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc >> > b/gcc/config/aarch64/aarch64-sve-builtins-base.cc >> > index c24c0548724..1ef4ea2087b 100644 >> > --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc >> > +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc >> > @@ -44,6 +44,14 @@ >> > #include "aarch64-sve-builtins-shapes.h" >> > #include "aarch64-sve-builtins-base.h" >> > #include "aarch64-sve-builtins-functions.h" >> > +#include "aarch64-builtins.h" >> > +#include "gimple-ssa.h" >> > +#include "tree-phinodes.h" >> > +#include "tree-ssa-operands.h" >> > +#include "ssa-iterators.h" >> > +#include "stringpool.h" >> > +#include "value-range.h" >> > +#include "tree-ssanames.h" >> >> Minor, but: I think the preferred approach is to include "ssa.h" >> rather than include some of these headers directly. >> >> > >> > using namespace aarch64_sve; >> > >> > @@ -1207,6 +1215,56 @@ public: >> > insn_code icode = code_for_aarch64_sve_ld1rq (e.vector_mode (0)); >> > return e.use_contiguous_load_insn (icode); >> >} >> > + >> > + gimple * >> > + fold (gimple_folder &f) const OVERRIDE >> > + { >> > +tree arg0 = gimple_call_arg (f.call, 0); >> > +tree arg1 = gimple_call_arg (f.call, 1); >> > + >> > +/* Transform: >> > + lhs = svld1rq ({-1, -1, ... }, arg1) >> > + into: >> > + tmp = mem_ref [(int * {ref-all}) arg1] >> > + lhs = vec_perm_expr. >> > + on little endian target. */ >> > + >> > +if (!BYTES_BIG_ENDIAN >> > + && integer_all_onesp (arg0)) >> > + { >> > + tree lhs = gimple_call_lhs (f.call); >> > + auto simd_type = aarch64_get_simd_info_for_type (Int32x4_t); >> >> Does this work for other element sizes? I would have expected it >> to be the (128-bit) Advanced SIMD vector associated with the same >> element type as the SVE vector. >> >> The testcase should cover more than just int32x4_t -> svint32_t, >> just to be sure. 
> In the attached patch, it obtains corresponding advsimd type with: > > tree eltype = TREE_TYPE (lhs_type); > unsigned nunits = 128 / TREE_INT_CST_LOW (TYPE_SIZE (eltype)); > tree vectype = build_vector_type (eltype, nunits); > > While this seems to work with different element sizes, I am not sure if it's > the correct approach ? Yeah, that looks correct. Other SVE code uses aarch64_vq_mode to get the vector mode associated with a .Q “element”, so an alternative would be: machine_mode vq_mode = aarch64_vq_mode (TYPE_MODE (eltype)).require (); tree vectype = build_vector_type_for_mode (eltype, vq_mode); which is more explicit about wanting an Advanced SIMD vector. >> > + >> > + tree elt_ptr_type >> > + = build_pointer_type_for_mode (simd_type.eltype, VOIDmode, true); >> > + tree zero = build_zero_cst (elt_ptr_type); >> > + >> > + /* Use element type alignment. */ >> > + tree access_type >> > + = build_aligned_type (simd_type.itype, TYPE_ALIGN >> > (simd_type.eltype)); >> > + >> > + tree tmp = make_ssa_name_fn (cfun, access_type, 0); >> > + gimple *mem_ref_stmt >> > + = gimple_build_assign (tmp, fold_build2 (MEM_REF, access_type, >> > arg1, zero)); >> >> Long line. Might be easier to format by assigning the fold_build2 result >> to a temporary variable. >> >> > + gsi_insert_before (f.gsi, mem_ref_stmt, GSI_SAME_STMT); >> > + >> > + tree mem_ref_lhs = gimple_get_lhs (mem_ref_stmt); >> > + tree vectype = TREE_TYPE (mem_ref_lhs); >> > + tree lhs_type = TREE_TYPE (lhs); >> >> Is this necessary? The code above supplied the types and I wouldn't >> have expected them to change during the build process. 
>> >> > + >> > + int source_nelts = TYPE_VECTOR_SUBPARTS (vectype).to_constant (); >> > + vec_perm_builder sel (TYPE_VECTOR_SUBPARTS (lhs_type), source_nelts, >> > 1); >> > + for (int i = 0; i < source_nelts; i++) >> > + sel.quick_push (i); >> > + >> > + vec_perm_indices indices (sel, 1, source_nelts); >> > + gcc_checking_assert (can_vec_perm_const_p (TYPE_MODE (lhs_type), >> > indices)); >> > + tree mask = vec_perm_indices_to_tree (lhs_type, indices); >> > + return gimple_build_assign (lhs, VEC_PERM_EXPR, mem_ref_lhs, >> > mem_ref_lhs, mask); >> >> Nit: long line. >> >> > + } >> > + >> > +return NULL; >> > + } >> > }; >> > >> > class svld1ro_impl : public load_replicate >> > diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc >> > index f650abbc4ce..47810fec804 100644 >> > --- a/gcc/config/aarch64/aarch64.cc >> > +++ b/gcc/config/aarch64/aarch64.cc >> > @@ -23969,6 +23969,35 @@ aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d) >> >return true; >> > } >> > >> > +/* Try to implement D using SVE dup instruction. */ >> > + >> > +static bool >> > +aarch64_evpc_sve_dup (struct expand_vec_perm_d *d) >> > +{ >> > + if (BYTES_BIG_ENDIAN
Re: [1/2] PR96463 - aarch64 specific changes
On Fri, 6 May 2022 at 16:00, Richard Sandiford wrote: > > Prathamesh Kulkarni writes: > > diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc > > b/gcc/config/aarch64/aarch64-sve-builtins-base.cc > > index c24c0548724..1ef4ea2087b 100644 > > --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc > > +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc > > @@ -44,6 +44,14 @@ > > #include "aarch64-sve-builtins-shapes.h" > > #include "aarch64-sve-builtins-base.h" > > #include "aarch64-sve-builtins-functions.h" > > +#include "aarch64-builtins.h" > > +#include "gimple-ssa.h" > > +#include "tree-phinodes.h" > > +#include "tree-ssa-operands.h" > > +#include "ssa-iterators.h" > > +#include "stringpool.h" > > +#include "value-range.h" > > +#include "tree-ssanames.h" > > Minor, but: I think the preferred approach is to include "ssa.h" > rather than include some of these headers directly. > > > > > using namespace aarch64_sve; > > > > @@ -1207,6 +1215,56 @@ public: > > insn_code icode = code_for_aarch64_sve_ld1rq (e.vector_mode (0)); > > return e.use_contiguous_load_insn (icode); > >} > > + > > + gimple * > > + fold (gimple_folder &f) const OVERRIDE > > + { > > +tree arg0 = gimple_call_arg (f.call, 0); > > +tree arg1 = gimple_call_arg (f.call, 1); > > + > > +/* Transform: > > + lhs = svld1rq ({-1, -1, ... }, arg1) > > + into: > > + tmp = mem_ref [(int * {ref-all}) arg1] > > + lhs = vec_perm_expr. > > + on little endian target. */ > > + > > +if (!BYTES_BIG_ENDIAN > > + && integer_all_onesp (arg0)) > > + { > > + tree lhs = gimple_call_lhs (f.call); > > + auto simd_type = aarch64_get_simd_info_for_type (Int32x4_t); > > Does this work for other element sizes? I would have expected it > to be the (128-bit) Advanced SIMD vector associated with the same > element type as the SVE vector. > > The testcase should cover more than just int32x4_t -> svint32_t, > just to be sure. 
In the attached patch, it obtains corresponding advsimd type with: tree eltype = TREE_TYPE (lhs_type); unsigned nunits = 128 / TREE_INT_CST_LOW (TYPE_SIZE (eltype)); tree vectype = build_vector_type (eltype, nunits); While this seems to work with different element sizes, I am not sure if it's the correct approach ? > > > + > > + tree elt_ptr_type > > + = build_pointer_type_for_mode (simd_type.eltype, VOIDmode, true); > > + tree zero = build_zero_cst (elt_ptr_type); > > + > > + /* Use element type alignment. */ > > + tree access_type > > + = build_aligned_type (simd_type.itype, TYPE_ALIGN > > (simd_type.eltype)); > > + > > + tree tmp = make_ssa_name_fn (cfun, access_type, 0); > > + gimple *mem_ref_stmt > > + = gimple_build_assign (tmp, fold_build2 (MEM_REF, access_type, > > arg1, zero)); > > Long line. Might be easier to format by assigning the fold_build2 result > to a temporary variable. > > > + gsi_insert_before (f.gsi, mem_ref_stmt, GSI_SAME_STMT); > > + > > + tree mem_ref_lhs = gimple_get_lhs (mem_ref_stmt); > > + tree vectype = TREE_TYPE (mem_ref_lhs); > > + tree lhs_type = TREE_TYPE (lhs); > > Is this necessary? The code above supplied the types and I wouldn't > have expected them to change during the build process. > > > + > > + int source_nelts = TYPE_VECTOR_SUBPARTS (vectype).to_constant (); > > + vec_perm_builder sel (TYPE_VECTOR_SUBPARTS (lhs_type), source_nelts, > > 1); > > + for (int i = 0; i < source_nelts; i++) > > + sel.quick_push (i); > > + > > + vec_perm_indices indices (sel, 1, source_nelts); > > + gcc_checking_assert (can_vec_perm_const_p (TYPE_MODE (lhs_type), > > indices)); > > + tree mask = vec_perm_indices_to_tree (lhs_type, indices); > > + return gimple_build_assign (lhs, VEC_PERM_EXPR, mem_ref_lhs, > > mem_ref_lhs, mask); > > Nit: long line. 
> > > + } > > + > > +return NULL; > > + } > > }; > > > > class svld1ro_impl : public load_replicate > > diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc > > index f650abbc4ce..47810fec804 100644 > > --- a/gcc/config/aarch64/aarch64.cc > > +++ b/gcc/config/aarch64/aarch64.cc > > @@ -23969,6 +23969,35 @@ aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d) > >return true; > > } > > > > +/* Try to implement D using SVE dup instruction. */ > > + > > +static bool > > +aarch64_evpc_sve_dup (struct expand_vec_perm_d *d) > > +{ > > + if (BYTES_BIG_ENDIAN > > + || d->perm.length ().is_constant () > > + || !d->one_vector_p > > + || d->target == NULL > > + || d->op0 == NULL > > These last two lines mean that we always return false for d->testing. > The idea instead is that the return value should be the same for both > d->testing and !d->testing. The difference is that for !d->testing we > also emit code to do the permute. > > > + || GET_MODE_NUNITS (GET_MODE (d->target)).is_constant () > > Sorry, I've forgotten the context now, but: these
Re: [1/2] PR96463 - aarch64 specific changes
Prathamesh Kulkarni writes: > diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc > b/gcc/config/aarch64/aarch64-sve-builtins-base.cc > index c24c0548724..1ef4ea2087b 100644 > --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc > +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc > @@ -44,6 +44,14 @@ > #include "aarch64-sve-builtins-shapes.h" > #include "aarch64-sve-builtins-base.h" > #include "aarch64-sve-builtins-functions.h" > +#include "aarch64-builtins.h" > +#include "gimple-ssa.h" > +#include "tree-phinodes.h" > +#include "tree-ssa-operands.h" > +#include "ssa-iterators.h" > +#include "stringpool.h" > +#include "value-range.h" > +#include "tree-ssanames.h" Minor, but: I think the preferred approach is to include "ssa.h" rather than include some of these headers directly. > > using namespace aarch64_sve; > > @@ -1207,6 +1215,56 @@ public: > insn_code icode = code_for_aarch64_sve_ld1rq (e.vector_mode (0)); > return e.use_contiguous_load_insn (icode); >} > + > + gimple * > + fold (gimple_folder &f) const OVERRIDE > + { > +tree arg0 = gimple_call_arg (f.call, 0); > +tree arg1 = gimple_call_arg (f.call, 1); > + > +/* Transform: > + lhs = svld1rq ({-1, -1, ... }, arg1) > + into: > + tmp = mem_ref [(int * {ref-all}) arg1] > + lhs = vec_perm_expr. > + on little endian target. */ > + > +if (!BYTES_BIG_ENDIAN > + && integer_all_onesp (arg0)) > + { > + tree lhs = gimple_call_lhs (f.call); > + auto simd_type = aarch64_get_simd_info_for_type (Int32x4_t); Does this work for other element sizes? I would have expected it to be the (128-bit) Advanced SIMD vector associated with the same element type as the SVE vector. The testcase should cover more than just int32x4_t -> svint32_t, just to be sure. > + > + tree elt_ptr_type > + = build_pointer_type_for_mode (simd_type.eltype, VOIDmode, true); > + tree zero = build_zero_cst (elt_ptr_type); > + > + /* Use element type alignment. 
*/ > + tree access_type > + = build_aligned_type (simd_type.itype, TYPE_ALIGN (simd_type.eltype)); > + > + tree tmp = make_ssa_name_fn (cfun, access_type, 0); > + gimple *mem_ref_stmt > + = gimple_build_assign (tmp, fold_build2 (MEM_REF, access_type, arg1, > zero)); Long line. Might be easier to format by assigning the fold_build2 result to a temporary variable. > + gsi_insert_before (f.gsi, mem_ref_stmt, GSI_SAME_STMT); > + > + tree mem_ref_lhs = gimple_get_lhs (mem_ref_stmt); > + tree vectype = TREE_TYPE (mem_ref_lhs); > + tree lhs_type = TREE_TYPE (lhs); Is this necessary? The code above supplied the types and I wouldn't have expected them to change during the build process. > + > + int source_nelts = TYPE_VECTOR_SUBPARTS (vectype).to_constant (); > + vec_perm_builder sel (TYPE_VECTOR_SUBPARTS (lhs_type), source_nelts, 1); > + for (int i = 0; i < source_nelts; i++) > + sel.quick_push (i); > + > + vec_perm_indices indices (sel, 1, source_nelts); > + gcc_checking_assert (can_vec_perm_const_p (TYPE_MODE (lhs_type), > indices)); > + tree mask = vec_perm_indices_to_tree (lhs_type, indices); > + return gimple_build_assign (lhs, VEC_PERM_EXPR, mem_ref_lhs, > mem_ref_lhs, mask); Nit: long line. > + } > + > +return NULL; > + } > }; > > class svld1ro_impl : public load_replicate > diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc > index f650abbc4ce..47810fec804 100644 > --- a/gcc/config/aarch64/aarch64.cc > +++ b/gcc/config/aarch64/aarch64.cc > @@ -23969,6 +23969,35 @@ aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d) >return true; > } > > +/* Try to implement D using SVE dup instruction. */ > + > +static bool > +aarch64_evpc_sve_dup (struct expand_vec_perm_d *d) > +{ > + if (BYTES_BIG_ENDIAN > + || d->perm.length ().is_constant () > + || !d->one_vector_p > + || d->target == NULL > + || d->op0 == NULL These last two lines mean that we always return false for d->testing. 
The idea instead is that the return value should be the same for both d->testing and !d->testing. The difference is that for !d->testing we also emit code to do the permute. > + || GET_MODE_NUNITS (GET_MODE (d->target)).is_constant () Sorry, I've forgotten the context now, but: these positive tests for is_constant surprised me. Do we really only want to do this for variable-length SVE code generation, rather than fixed-length? > + || !GET_MODE_NUNITS (GET_MODE (d->op0)).is_constant ()) > +return false; > + > + if (d->testing_p) > +return true; This should happen after the later tests, once we're sure that the permute vector has the right form. If the issue is that op0 isn't provided for testing then I think the hook needs to be passed the input mode alongside the result mode. It might then be better to test: aarch64_classify_vector_mode (...input_mode...) == VEC_ADVSIMD (des
Re: [1/2] PR96463 - aarch64 specific changes
On Mon, 27 Dec 2021 at 15:54, Prathamesh Kulkarni wrote: > > On Fri, 17 Dec 2021 at 17:03, Richard Sandiford > wrote: > > > > Prathamesh Kulkarni writes: > > > Hi, > > > The patch folds: > > > lhs = svld1rq ({-1, -1, -1, ...}, &v[0]) > > > into: > > > lhs = vec_perm_expr > > > and expands above vec_perm_expr using aarch64_expand_sve_dupq. > > > > > > With patch, for following test: > > > #include > > > #include > > > > > > svint32_t > > > foo (int32x4_t x) > > > { > > > return svld1rq (svptrue_b8 (), &x[0]); > > > } > > > > > > it generates following code: > > > foo: > > > .LFB4350: > > > dup z0.q, z0.q[0] > > > ret > > > > > > and passes bootstrap+test on aarch64-linux-gnu. > > > But I am not sure if the changes to aarch64_evpc_sve_tbl > > > are correct. > > > > Just in case: I was only using int32x4_t in the PR as an example. > > The same thing should work for all element types. > > > > > > > > Thanks, > > > Prathamesh > > > > > > diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc > > > b/gcc/config/aarch64/aarch64-sve-builtins-base.cc > > > index 02e42a71e5e..e21bbec360c 100644 > > > --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc > > > +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc > > > @@ -1207,6 +1207,56 @@ public: > > > insn_code icode = code_for_aarch64_sve_ld1rq (e.vector_mode (0)); > > > return e.use_contiguous_load_insn (icode); > > >} > > > + > > > + gimple * > > > + fold (gimple_folder &f) const OVERRIDE > > > + { > > > +tree arg0 = gimple_call_arg (f.call, 0); > > > +tree arg1 = gimple_call_arg (f.call, 1); > > > + > > > +/* Transform: > > > + lhs = svld1rq ({-1, -1, ... }, &v[0]) > > > + into: > > > + lhs = vec_perm_expr. > > > + on little endian target. 
*/ > > > + > > > +if (!BYTES_BIG_ENDIAN > > > + && integer_all_onesp (arg0) > > > + && TREE_CODE (arg1) == ADDR_EXPR) > > > + { > > > + tree t = TREE_OPERAND (arg1, 0); > > > + if (TREE_CODE (t) == ARRAY_REF) > > > + { > > > + tree index = TREE_OPERAND (t, 1); > > > + t = TREE_OPERAND (t, 0); > > > + if (integer_zerop (index) && TREE_CODE (t) == VIEW_CONVERT_EXPR) > > > + { > > > + t = TREE_OPERAND (t, 0); > > > + tree vectype = TREE_TYPE (t); > > > + if (VECTOR_TYPE_P (vectype) > > > + && known_eq (TYPE_VECTOR_SUBPARTS (vectype), 4u) > > > + && wi::to_wide (TYPE_SIZE (vectype)) == 128) > > > + { > > > > Since this is quite a specific pattern match, and since we now lower > > arm_neon.h vld1* to normal gimple accesses, I think we should try the > > “more generally” approach mentioned in the PR and see what the fallout > > is. That is, keep: > > > > if (!BYTES_BIG_ENDIAN > > && integer_all_onesp (arg0) > > > > If those conditions pass, create an Advanced SIMD access at address arg1, > > using similar code to the handling of: > > > > BUILTIN_VALL_F16 (LOAD1, ld1, 0, LOAD) > > BUILTIN_VDQ_I (LOAD1_U, ld1, 0, LOAD) > > BUILTIN_VALLP_NO_DI (LOAD1_P, ld1, 0, LOAD) > > > > in aarch64_general_gimple_fold_builtin. (Would be good to move the > > common code to aarch64.c so that both files can use it.) > > > > > + tree lhs = gimple_call_lhs (f.call); > > > + tree lhs_type = TREE_TYPE (lhs); > > > + int source_nelts = TYPE_VECTOR_SUBPARTS > > > (vectype).to_constant (); > > > + vec_perm_builder sel (TYPE_VECTOR_SUBPARTS (lhs_type), > > > source_nelts, 1); > > > + for (int i = 0; i < source_nelts; i++) > > > + sel.quick_push (i); > > > + > > > + vec_perm_indices indices (sel, 1, source_nelts); > > > + if (!can_vec_perm_const_p (TYPE_MODE (lhs_type), > > > indices)) > > > + return NULL; > > > > I don't think we need to check this: it should always be true. > > Probably worth keeping as a gcc_checking_assert though. 
> > > > > + > > > + tree mask = vec_perm_indices_to_tree (lhs_type, > > > indices); > > > + return gimple_build_assign (lhs, VEC_PERM_EXPR, t, t, > > > mask); > > > + } > > > + } > > > + } > > > + } > > > + > > > +return NULL; > > > + } > > > }; > > > > > > class svld1ro_impl : public load_replicate > > > diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c > > > index f07330cff4f..af27f550be3 100644 > > > --- a/gcc/config/aarch64/aarch64.c > > > +++ b/gcc/config/aarch64/aarch64.c > > > @@ -23002,8 +23002,32 @@ aarch64_evpc_sve_tbl (struct expand_vec_perm_d > > > *d) > > > > > >machine_mode sel_mode = related_int_vector_mode (d->vmode).require (); > > >rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm); > > > + > > >if (d->one_vector_p) > > > -emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mod
Re: [1/2] PR96463 - aarch64 specific changes
On Fri, 17 Dec 2021 at 17:03, Richard Sandiford wrote: > > Prathamesh Kulkarni writes: > > Hi, > > The patch folds: > > lhs = svld1rq ({-1, -1, -1, ...}, &v[0]) > > into: > > lhs = vec_perm_expr > > and expands above vec_perm_expr using aarch64_expand_sve_dupq. > > > > With patch, for following test: > > #include > > #include > > > > svint32_t > > foo (int32x4_t x) > > { > > return svld1rq (svptrue_b8 (), &x[0]); > > } > > > > it generates following code: > > foo: > > .LFB4350: > > dup z0.q, z0.q[0] > > ret > > > > and passes bootstrap+test on aarch64-linux-gnu. > > But I am not sure if the changes to aarch64_evpc_sve_tbl > > are correct. > > Just in case: I was only using int32x4_t in the PR as an example. > The same thing should work for all element types. > > > > > Thanks, > > Prathamesh > > > > diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc > > b/gcc/config/aarch64/aarch64-sve-builtins-base.cc > > index 02e42a71e5e..e21bbec360c 100644 > > --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc > > +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc > > @@ -1207,6 +1207,56 @@ public: > > insn_code icode = code_for_aarch64_sve_ld1rq (e.vector_mode (0)); > > return e.use_contiguous_load_insn (icode); > >} > > + > > + gimple * > > + fold (gimple_folder &f) const OVERRIDE > > + { > > +tree arg0 = gimple_call_arg (f.call, 0); > > +tree arg1 = gimple_call_arg (f.call, 1); > > + > > +/* Transform: > > + lhs = svld1rq ({-1, -1, ... }, &v[0]) > > + into: > > + lhs = vec_perm_expr. > > + on little endian target. 
*/ > > + > > +if (!BYTES_BIG_ENDIAN > > + && integer_all_onesp (arg0) > > + && TREE_CODE (arg1) == ADDR_EXPR) > > + { > > + tree t = TREE_OPERAND (arg1, 0); > > + if (TREE_CODE (t) == ARRAY_REF) > > + { > > + tree index = TREE_OPERAND (t, 1); > > + t = TREE_OPERAND (t, 0); > > + if (integer_zerop (index) && TREE_CODE (t) == VIEW_CONVERT_EXPR) > > + { > > + t = TREE_OPERAND (t, 0); > > + tree vectype = TREE_TYPE (t); > > + if (VECTOR_TYPE_P (vectype) > > + && known_eq (TYPE_VECTOR_SUBPARTS (vectype), 4u) > > + && wi::to_wide (TYPE_SIZE (vectype)) == 128) > > + { > > Since this is quite a specific pattern match, and since we now lower > arm_neon.h vld1* to normal gimple accesses, I think we should try the > “more generally” approach mentioned in the PR and see what the fallout > is. That is, keep: > > if (!BYTES_BIG_ENDIAN > && integer_all_onesp (arg0) > > If those conditions pass, create an Advanced SIMD access at address arg1, > using similar code to the handling of: > > BUILTIN_VALL_F16 (LOAD1, ld1, 0, LOAD) > BUILTIN_VDQ_I (LOAD1_U, ld1, 0, LOAD) > BUILTIN_VALLP_NO_DI (LOAD1_P, ld1, 0, LOAD) > > in aarch64_general_gimple_fold_builtin. (Would be good to move the > common code to aarch64.c so that both files can use it.) > > > + tree lhs = gimple_call_lhs (f.call); > > + tree lhs_type = TREE_TYPE (lhs); > > + int source_nelts = TYPE_VECTOR_SUBPARTS > > (vectype).to_constant (); > > + vec_perm_builder sel (TYPE_VECTOR_SUBPARTS (lhs_type), > > source_nelts, 1); > > + for (int i = 0; i < source_nelts; i++) > > + sel.quick_push (i); > > + > > + vec_perm_indices indices (sel, 1, source_nelts); > > + if (!can_vec_perm_const_p (TYPE_MODE (lhs_type), indices)) > > + return NULL; > > I don't think we need to check this: it should always be true. > Probably worth keeping as a gcc_checking_assert though. 
> > > + > > + tree mask = vec_perm_indices_to_tree (lhs_type, indices); > > + return gimple_build_assign (lhs, VEC_PERM_EXPR, t, t, > > mask); > > + } > > + } > > + } > > + } > > + > > +return NULL; > > + } > > }; > > > > class svld1ro_impl : public load_replicate > > diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c > > index f07330cff4f..af27f550be3 100644 > > --- a/gcc/config/aarch64/aarch64.c > > +++ b/gcc/config/aarch64/aarch64.c > > @@ -23002,8 +23002,32 @@ aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d) > > > >machine_mode sel_mode = related_int_vector_mode (d->vmode).require (); > >rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm); > > + > >if (d->one_vector_p) > > -emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, > > sel)); > > +{ > > + bool use_dupq = false; > > + /* Check if sel is dup vector with encoded elements {0, 1, 2, ... > > nelts} */ > > + if (GET_CODE (sel) == CONST_VECTOR > > + && !GET_MODE_NUNITS (GET_MODE (sel)).is_constant () > > + && CONST_VECTOR_DUPLICATE_P (sel)) > > + { > > + unsigned nelts =
Re: [1/2] PR96463 - aarch64 specific changes
Prathamesh Kulkarni writes: > Hi, > The patch folds: > lhs = svld1rq ({-1, -1, -1, ...}, &v[0]) > into: > lhs = vec_perm_expr > and expands above vec_perm_expr using aarch64_expand_sve_dupq. > > With patch, for following test: > #include <arm_neon.h> > #include <arm_sve.h> > > svint32_t > foo (int32x4_t x) > { > return svld1rq (svptrue_b8 (), &x[0]); > } > > it generates following code: > foo: > .LFB4350: > dup z0.q, z0.q[0] > ret > > and passes bootstrap+test on aarch64-linux-gnu. > But I am not sure if the changes to aarch64_evpc_sve_tbl > are correct. Just in case: I was only using int32x4_t in the PR as an example. The same thing should work for all element types. > > Thanks, > Prathamesh > > diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc > b/gcc/config/aarch64/aarch64-sve-builtins-base.cc > index 02e42a71e5e..e21bbec360c 100644 > --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc > +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc > @@ -1207,6 +1207,56 @@ public: > insn_code icode = code_for_aarch64_sve_ld1rq (e.vector_mode (0)); > return e.use_contiguous_load_insn (icode); >} > + > + gimple * > + fold (gimple_folder &f) const OVERRIDE > + { > +tree arg0 = gimple_call_arg (f.call, 0); > +tree arg1 = gimple_call_arg (f.call, 1); > + > +/* Transform: > + lhs = svld1rq ({-1, -1, ... }, &v[0]) > + into: > + lhs = vec_perm_expr. > + on little endian target. 
*/ > + > +if (!BYTES_BIG_ENDIAN > + && integer_all_onesp (arg0) > + && TREE_CODE (arg1) == ADDR_EXPR) > + { > + tree t = TREE_OPERAND (arg1, 0); > + if (TREE_CODE (t) == ARRAY_REF) > + { > + tree index = TREE_OPERAND (t, 1); > + t = TREE_OPERAND (t, 0); > + if (integer_zerop (index) && TREE_CODE (t) == VIEW_CONVERT_EXPR) > + { > + t = TREE_OPERAND (t, 0); > + tree vectype = TREE_TYPE (t); > + if (VECTOR_TYPE_P (vectype) > + && known_eq (TYPE_VECTOR_SUBPARTS (vectype), 4u) > + && wi::to_wide (TYPE_SIZE (vectype)) == 128) > + { Since this is quite a specific pattern match, and since we now lower arm_neon.h vld1* to normal gimple accesses, I think we should try the “more generally” approach mentioned in the PR and see what the fallout is. That is, keep: if (!BYTES_BIG_ENDIAN && integer_all_onesp (arg0) If those conditions pass, create an Advanced SIMD access at address arg1, using similar code to the handling of: BUILTIN_VALL_F16 (LOAD1, ld1, 0, LOAD) BUILTIN_VDQ_I (LOAD1_U, ld1, 0, LOAD) BUILTIN_VALLP_NO_DI (LOAD1_P, ld1, 0, LOAD) in aarch64_general_gimple_fold_builtin. (Would be good to move the common code to aarch64.c so that both files can use it.) > + tree lhs = gimple_call_lhs (f.call); > + tree lhs_type = TREE_TYPE (lhs); > + int source_nelts = TYPE_VECTOR_SUBPARTS > (vectype).to_constant (); > + vec_perm_builder sel (TYPE_VECTOR_SUBPARTS (lhs_type), > source_nelts, 1); > + for (int i = 0; i < source_nelts; i++) > + sel.quick_push (i); > + > + vec_perm_indices indices (sel, 1, source_nelts); > + if (!can_vec_perm_const_p (TYPE_MODE (lhs_type), indices)) > + return NULL; I don't think we need to check this: it should always be true. Probably worth keeping as a gcc_checking_assert though. 
> + > + tree mask = vec_perm_indices_to_tree (lhs_type, indices); > + return gimple_build_assign (lhs, VEC_PERM_EXPR, t, t, mask); > + } > + } > + } > + } > + > +return NULL; > + } > }; > > class svld1ro_impl : public load_replicate > diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c > index f07330cff4f..af27f550be3 100644 > --- a/gcc/config/aarch64/aarch64.c > +++ b/gcc/config/aarch64/aarch64.c > @@ -23002,8 +23002,32 @@ aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d) > >machine_mode sel_mode = related_int_vector_mode (d->vmode).require (); >rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm); > + >if (d->one_vector_p) > -emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, sel)); > +{ > + bool use_dupq = false; > + /* Check if sel is dup vector with encoded elements {0, 1, 2, ... > nelts} */ > + if (GET_CODE (sel) == CONST_VECTOR > + && !GET_MODE_NUNITS (GET_MODE (sel)).is_constant () > + && CONST_VECTOR_DUPLICATE_P (sel)) > + { > + unsigned nelts = const_vector_encoded_nelts (sel); > + unsigned i; > + for (i = 0; i < nelts; i++) > + { > + rtx elem = CONST_VECTOR_ENCODED_ELT(sel, i); > + if (!(CONST_INT_P (elem) && INTVAL(elem) == i)) > + break; > + } > + if (i == nelts) > + use_dupq = true;