[1/2] PR96463 - aarch64 specific changes

2021-12-17 Thread Prathamesh Kulkarni via Gcc-patches
Hi,
The patch folds:
lhs = svld1rq ({-1, -1, -1, ...}, &v[0])
into:
lhs = vec_perm_expr
and expands the above vec_perm_expr using aarch64_expand_sve_dupq.

With the patch, for the following test:
#include <arm_neon.h>
#include <arm_sve.h>

svint32_t
foo (int32x4_t x)
{
  return svld1rq (svptrue_b8 (), &x[0]);
}

it generates the following code:
foo:
.LFB4350:
dup z0.q, z0.q[0]
ret

and passes bootstrap+test on aarch64-linux-gnu.
But I am not sure if the changes to aarch64_evpc_sve_tbl
are correct.
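
For reference, the fold produces gimple along these lines for the test
above (SSA names here are illustrative, not taken from an actual dump):

  lhs_1 = VEC_PERM_EXPR <x_2(D), x_2(D), { 0, 1, 2, 3, ... }>;

i.e. a variable-length selector that repeats the four Advanced SIMD
lanes across the whole SVE vector, which aarch64_expand_sve_dupq then
emits as the dup above.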

Thanks,
Prathamesh
diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
index 02e42a71e5e..e21bbec360c 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
+++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
@@ -1207,6 +1207,56 @@ public:
 insn_code icode = code_for_aarch64_sve_ld1rq (e.vector_mode (0));
 return e.use_contiguous_load_insn (icode);
   }
+
+  gimple *
+  fold (gimple_folder &f) const OVERRIDE
+  {
+tree arg0 = gimple_call_arg (f.call, 0);
+tree arg1 = gimple_call_arg (f.call, 1);
+
+/* Transform:
+   lhs = svld1rq ({-1, -1, ... }, &v[0])
+   into:
+   lhs = vec_perm_expr
+   on a little-endian target.  */
+
+if (!BYTES_BIG_ENDIAN
+   && integer_all_onesp (arg0)
+   && TREE_CODE (arg1) == ADDR_EXPR)
+  {
+   tree t = TREE_OPERAND (arg1, 0);
+   if (TREE_CODE (t) == ARRAY_REF)
+ {
+   tree index = TREE_OPERAND (t, 1);
+   t = TREE_OPERAND (t, 0);
+   if (integer_zerop (index) && TREE_CODE (t) == VIEW_CONVERT_EXPR)
+ {
+   t = TREE_OPERAND (t, 0);
+   tree vectype = TREE_TYPE (t);
+   if (VECTOR_TYPE_P (vectype)
+   && known_eq (TYPE_VECTOR_SUBPARTS (vectype), 4u)
+   && wi::to_wide (TYPE_SIZE (vectype)) == 128)
+ {
+   tree lhs = gimple_call_lhs (f.call);
+   tree lhs_type = TREE_TYPE (lhs);
+   int source_nelts = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
+   vec_perm_builder sel (TYPE_VECTOR_SUBPARTS (lhs_type), source_nelts, 1);
+   for (int i = 0; i < source_nelts; i++)
+ sel.quick_push (i);
+
+   vec_perm_indices indices (sel, 1, source_nelts);
+   if (!can_vec_perm_const_p (TYPE_MODE (lhs_type), indices))
+ return NULL;
+
+   tree mask = vec_perm_indices_to_tree (lhs_type, indices);
+   return gimple_build_assign (lhs, VEC_PERM_EXPR, t, t, mask);
+ }
+ }
+ }
+  }
+
+return NULL;
+  }
 };
 
 class svld1ro_impl : public load_replicate
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index f07330cff4f..af27f550be3 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -23002,8 +23002,32 @@ aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
 
   machine_mode sel_mode = related_int_vector_mode (d->vmode).require ();
   rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
+
   if (d->one_vector_p)
-emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, sel));
+{
+  bool use_dupq = false;
+  /* Check if SEL is a dup vector with encoded elements {0, 1, ..., nelts - 1}.  */
+  if (GET_CODE (sel) == CONST_VECTOR
+ && !GET_MODE_NUNITS (GET_MODE (sel)).is_constant ()
+ && CONST_VECTOR_DUPLICATE_P (sel))
+ {
+   unsigned nelts = const_vector_encoded_nelts (sel);
+   unsigned i;
+   for (i = 0; i < nelts; i++)
+ {
+   rtx elem = CONST_VECTOR_ENCODED_ELT (sel, i);
+   if (!(CONST_INT_P (elem) && INTVAL (elem) == i))
+ break;
+ }
+   if (i == nelts)
+ use_dupq = true;
+ }
+
+  if (use_dupq)
+   aarch64_expand_sve_dupq (d->target, GET_MODE (d->target), d->op0);
+  else
+   emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, sel));
+}
   else
 aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
   return true;
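
For a concrete instance of the selector shape the aarch64.c hunk checks
for: the svld1rq fold above builds sel with npatterns = source_nelts and
one element per pattern, so for a VNx4SI result vec_perm_indices_to_rtx
produces a variable-length CONST_VECTOR whose encoding is the repeating
block {0, 1, 2, 3}:

  sel = { 0, 1, 2, 3, 0, 1, 2, 3, ... }

Hence const_vector_encoded_nelts (sel) is 4 and
CONST_VECTOR_ENCODED_ELT (sel, i) == i for each i, which is exactly the
"broadcast the first 128 bits" pattern that dup z0.q, z0.q[0] implements.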


Re: [1/2] PR96463 - aarch64 specific changes

2022-05-03 Thread Prathamesh Kulkarni via Gcc-patches
On Mon, 27 Dec 2021 at 15:54, Prathamesh Kulkarni wrote:
>
> On Fri, 17 Dec 2021 at 17:03, Richard Sandiford wrote:
> >
> > Prathamesh Kulkarni  writes:
> > > Hi,
> > > The patch folds:
> > > lhs = svld1rq ({-1, -1, -1, ...}, &v[0])
> > > into:
> > > lhs = vec_perm_expr
> > > and expands above vec_perm_expr using aarch64_expand_sve_dupq.
> > >
> > > With patch, for following test:
> > > #include <arm_neon.h>
> > > #include <arm_sve.h>
> > >
> > > svint32_t
> > > foo (int32x4_t x)
> > > {
> > >   return svld1rq (svptrue_b8 (), &x[0]);
> > > }
> > >
> > > it generates following code:
> > > foo:
> > > .LFB4350:
> > > dup z0.q, z0.q[0]
> > > ret
> > >
> > > and passes bootstrap+test on aarch64-linux-gnu.
> > > But I am not sure if the changes to aarch64_evpc_sve_tbl
> > > are correct.
> >
> > Just in case: I was only using int32x4_t in the PR as an example.
> > The same thing should work for all element types.
> >
> > >
> > > Thanks,
> > > Prathamesh
> > >
> > > diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc 
> > > b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> > > index 02e42a71e5e..e21bbec360c 100644
> > > --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> > > +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> > > @@ -1207,6 +1207,56 @@ public:
> > >  insn_code icode = code_for_aarch64_sve_ld1rq (e.vector_mode (0));
> > >  return e.use_contiguous_load_insn (icode);
> > >}
> > > +
> > > +  gimple *
> > > +  fold (gimple_folder &f) const OVERRIDE
> > > +  {
> > > +tree arg0 = gimple_call_arg (f.call, 0);
> > > +tree arg1 = gimple_call_arg (f.call, 1);
> > > +
> > > +/* Transform:
> > > +   lhs = svld1rq ({-1, -1, ... }, &v[0])
> > > +   into:
> > > +   lhs = vec_perm_expr.
> > > +   on little endian target.  */
> > > +
> > > +if (!BYTES_BIG_ENDIAN
> > > + && integer_all_onesp (arg0)
> > > + && TREE_CODE (arg1) == ADDR_EXPR)
> > > +  {
> > > + tree t = TREE_OPERAND (arg1, 0);
> > > + if (TREE_CODE (t) == ARRAY_REF)
> > > +   {
> > > + tree index = TREE_OPERAND (t, 1);
> > > + t = TREE_OPERAND (t, 0);
> > > + if (integer_zerop (index) && TREE_CODE (t) == VIEW_CONVERT_EXPR)
> > > +   {
> > > + t = TREE_OPERAND (t, 0);
> > > + tree vectype = TREE_TYPE (t);
> > > + if (VECTOR_TYPE_P (vectype)
> > > + && known_eq (TYPE_VECTOR_SUBPARTS (vectype), 4u)
> > > + && wi::to_wide (TYPE_SIZE (vectype)) == 128)
> > > +   {
> >
> > Since this is quite a specific pattern match, and since we now lower
> > arm_neon.h vld1* to normal gimple accesses, I think we should try the
> > “more generally” approach mentioned in the PR and see what the fallout
> > is.  That is, keep:
> >
> > if (!BYTES_BIG_ENDIAN
> > && integer_all_onesp (arg0)
> >
> > If those conditions pass, create an Advanced SIMD access at address arg1,
> > using similar code to the handling of:
> >
> >  BUILTIN_VALL_F16 (LOAD1, ld1, 0, LOAD)
> >  BUILTIN_VDQ_I (LOAD1_U, ld1, 0, LOAD)
> >  BUILTIN_VALLP_NO_DI (LOAD1_P, ld1, 0, LOAD)
> >
> > in aarch64_general_gimple_fold_builtin.  (Would be good to move the
> > common code to aarch64.c so that both files can use it.)
> >
> > > + tree lhs = gimple_call_lhs (f.call);
> > > + tree lhs_type = TREE_TYPE (lhs);
> > > + int source_nelts = TYPE_VECTOR_SUBPARTS 
> > > (vectype).to_constant ();
> > > + vec_perm_builder sel (TYPE_VECTOR_SUBPARTS (lhs_type), 
> > > source_nelts, 1);
> > > + for (int i = 0; i < source_nelts; i++)
> > > +   sel.quick_push (i);
> > > +
> > > + vec_perm_indices indices (sel, 1, source_nelts);
> > > + if (!can_vec_perm_const_p (TYPE_MODE (lhs_type), 
> > > indices))
> > > +   return NULL;
> >
> > I don't think we need to check this: it should always be true.
> > Probably worth keeping as a gcc_checking_assert though.
> >
> > > +
> > > + tree mask = vec_perm_indices_to_tree (lhs_type, 
> > > indices);
> > > + return gimple_build_assign (lhs, VEC_PERM_EXPR, t, t, 
> > > mask);
> > > +   }
> > > +   }
> > > +   }
> > > +  }
> > > +
> > > +return NULL;
> > > +  }
> > >  };
> > >
> > >  class svld1ro_impl : public load_replicate
> > > diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> > > index f07330cff4f..af27f550be3 100644
> > > --- a/gcc/config/aarch64/aarch64.c
> > > +++ b/gcc/config/aarch64/aarch64.c
> > > @@ -23002,8 +23002,32 @@ aarch64_evpc_sve_tbl (struct expand_vec_perm_d 
> > > *d)
> > >
> > >machine_mode sel_mode = related_int_vector_mode (d->vmode).require ();
> > >rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
> > > +
> > >if (d->one_vector_p)
> > > -emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mod

Re: [1/2] PR96463 - aarch64 specific changes

2022-05-06 Thread Richard Sandiford via Gcc-patches
Prathamesh Kulkarni  writes:
> diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc 
> b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> index c24c0548724..1ef4ea2087b 100644
> --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> @@ -44,6 +44,14 @@
>  #include "aarch64-sve-builtins-shapes.h"
>  #include "aarch64-sve-builtins-base.h"
>  #include "aarch64-sve-builtins-functions.h"
> +#include "aarch64-builtins.h"
> +#include "gimple-ssa.h"
> +#include "tree-phinodes.h"
> +#include "tree-ssa-operands.h"
> +#include "ssa-iterators.h"
> +#include "stringpool.h"
> +#include "value-range.h"
> +#include "tree-ssanames.h"

Minor, but: I think the preferred approach is to include "ssa.h"
rather than include some of these headers directly.

>  
>  using namespace aarch64_sve;
>  
> @@ -1207,6 +1215,56 @@ public:
>  insn_code icode = code_for_aarch64_sve_ld1rq (e.vector_mode (0));
>  return e.use_contiguous_load_insn (icode);
>}
> +
> +  gimple *
> +  fold (gimple_folder &f) const OVERRIDE
> +  {
> +tree arg0 = gimple_call_arg (f.call, 0);
> +tree arg1 = gimple_call_arg (f.call, 1);
> +
> +/* Transform:
> +   lhs = svld1rq ({-1, -1, ... }, arg1)
> +   into:
> +   tmp = mem_ref [(int * {ref-all}) arg1]
> +   lhs = vec_perm_expr.
> +   on little endian target.  */
> +
> +if (!BYTES_BIG_ENDIAN
> + && integer_all_onesp (arg0))
> +  {
> + tree lhs = gimple_call_lhs (f.call);
> + auto simd_type = aarch64_get_simd_info_for_type (Int32x4_t);

Does this work for other element sizes?  I would have expected it
to be the (128-bit) Advanced SIMD vector associated with the same
element type as the SVE vector.

The testcase should cover more than just int32x4_t -> svint32_t,
just to be sure.
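
For instance, a testcase sketch along these lines (function names are
illustrative; the overloaded form of svld1rq accepts any element type)
would exercise several element sizes, each of which should end up as
the same "dup z0.q, z0.q[0]":

  #include <arm_neon.h>
  #include <arm_sve.h>

  svint8_t    f_s8  (int8x16_t x)   { return svld1rq (svptrue_b8 (), &x[0]); }
  svint16_t   f_s16 (int16x8_t x)   { return svld1rq (svptrue_b8 (), &x[0]); }
  svint64_t   f_s64 (int64x2_t x)   { return svld1rq (svptrue_b8 (), &x[0]); }
  svfloat32_t f_f32 (float32x4_t x) { return svld1rq (svptrue_b8 (), &x[0]); }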

> +
> + tree elt_ptr_type
> +   = build_pointer_type_for_mode (simd_type.eltype, VOIDmode, true);
> + tree zero = build_zero_cst (elt_ptr_type);
> +
> + /* Use element type alignment.  */
> + tree access_type
> +   = build_aligned_type (simd_type.itype, TYPE_ALIGN (simd_type.eltype));
> +
> + tree tmp = make_ssa_name_fn (cfun, access_type, 0);
> + gimple *mem_ref_stmt
> +   = gimple_build_assign (tmp, fold_build2 (MEM_REF, access_type, arg1, 
> zero));

Long line.  Might be easier to format by assigning the fold_build2 result
to a temporary variable.
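
I.e., roughly:

  tree mem_ref = fold_build2 (MEM_REF, access_type, arg1, zero);
  gimple *mem_ref_stmt = gimple_build_assign (tmp, mem_ref);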

> + gsi_insert_before (f.gsi, mem_ref_stmt, GSI_SAME_STMT);
> +
> + tree mem_ref_lhs = gimple_get_lhs (mem_ref_stmt);
> + tree vectype = TREE_TYPE (mem_ref_lhs);
> + tree lhs_type = TREE_TYPE (lhs);

Is this necessary?  The code above supplied the types and I wouldn't
have expected them to change during the build process.

> +
> + int source_nelts = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
> + vec_perm_builder sel (TYPE_VECTOR_SUBPARTS (lhs_type), source_nelts, 1);
> + for (int i = 0; i < source_nelts; i++)
> +   sel.quick_push (i);
> +
> + vec_perm_indices indices (sel, 1, source_nelts);
> + gcc_checking_assert (can_vec_perm_const_p (TYPE_MODE (lhs_type), 
> indices));
> + tree mask = vec_perm_indices_to_tree (lhs_type, indices);
> + return gimple_build_assign (lhs, VEC_PERM_EXPR, mem_ref_lhs, 
> mem_ref_lhs, mask);

Nit: long line.

> +  }
> +
> +return NULL;
> +  }
>  };
>  
>  class svld1ro_impl : public load_replicate
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index f650abbc4ce..47810fec804 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -23969,6 +23969,35 @@ aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
>return true;
>  }
>  
> +/* Try to implement D using SVE dup instruction.  */
> +
> +static bool
> +aarch64_evpc_sve_dup (struct expand_vec_perm_d *d)
> +{
> +  if (BYTES_BIG_ENDIAN
> +  || d->perm.length ().is_constant ()
> +  || !d->one_vector_p
> +  || d->target == NULL
> +  || d->op0 == NULL

These last two lines mean that we always return false for d->testing_p.
The idea instead is that the return value should be the same for both
d->testing_p and !d->testing_p.  The difference is that for !d->testing_p
we also emit code to do the permute.
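
In other words, the expected shape is roughly the following (the shape
checks shown are abbreviated and illustrative):

  static bool
  aarch64_evpc_sve_dup (struct expand_vec_perm_d *d)
  {
    if (BYTES_BIG_ENDIAN || !d->one_vector_p)  /* ...other shape checks...  */
      return false;
    /* Same answer whether or not we are only testing...  */
    if (d->testing_p)
      return true;
    /* ...but only emit code when we are not.  */
    aarch64_expand_sve_dupq (d->target, GET_MODE (d->target), d->op0);
    return true;
  }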

> +  || GET_MODE_NUNITS (GET_MODE (d->target)).is_constant ()

Sorry, I've forgotten the context now, but: these positive tests
for is_constant surprised me.  Do we really only want to do this
for variable-length SVE code generation, rather than fixed-length?

> +  || !GET_MODE_NUNITS (GET_MODE (d->op0)).is_constant ())
> +return false;
> +
> +  if (d->testing_p)
> +return true;

This should happen after the later tests, once we're sure that the
permute vector has the right form.  If the issue is that op0 isn't
provided for testing then I think the hook needs to be passed the
input mode alongside the result mode.

It might then be better to test:

  aarch64_classify_vector_mode (...input_mode...) == VEC_ADVSIMD

(des

Re: [1/2] PR96463 - aarch64 specific changes

2022-05-10 Thread Prathamesh Kulkarni via Gcc-patches
On Fri, 6 May 2022 at 16:00, Richard Sandiford wrote:
>
> Prathamesh Kulkarni  writes:
> > diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc 
> > b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> > index c24c0548724..1ef4ea2087b 100644
> > --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> > +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> > @@ -44,6 +44,14 @@
> >  #include "aarch64-sve-builtins-shapes.h"
> >  #include "aarch64-sve-builtins-base.h"
> >  #include "aarch64-sve-builtins-functions.h"
> > +#include "aarch64-builtins.h"
> > +#include "gimple-ssa.h"
> > +#include "tree-phinodes.h"
> > +#include "tree-ssa-operands.h"
> > +#include "ssa-iterators.h"
> > +#include "stringpool.h"
> > +#include "value-range.h"
> > +#include "tree-ssanames.h"
>
> Minor, but: I think the preferred approach is to include "ssa.h"
> rather than include some of these headers directly.
>
> >
> >  using namespace aarch64_sve;
> >
> > @@ -1207,6 +1215,56 @@ public:
> >  insn_code icode = code_for_aarch64_sve_ld1rq (e.vector_mode (0));
> >  return e.use_contiguous_load_insn (icode);
> >}
> > +
> > +  gimple *
> > +  fold (gimple_folder &f) const OVERRIDE
> > +  {
> > +tree arg0 = gimple_call_arg (f.call, 0);
> > +tree arg1 = gimple_call_arg (f.call, 1);
> > +
> > +/* Transform:
> > +   lhs = svld1rq ({-1, -1, ... }, arg1)
> > +   into:
> > +   tmp = mem_ref [(int * {ref-all}) arg1]
> > +   lhs = vec_perm_expr.
> > +   on little endian target.  */
> > +
> > +if (!BYTES_BIG_ENDIAN
> > + && integer_all_onesp (arg0))
> > +  {
> > + tree lhs = gimple_call_lhs (f.call);
> > + auto simd_type = aarch64_get_simd_info_for_type (Int32x4_t);
>
> Does this work for other element sizes?  I would have expected it
> to be the (128-bit) Advanced SIMD vector associated with the same
> element type as the SVE vector.
>
> The testcase should cover more than just int32x4_t -> svint32_t,
> just to be sure.
In the attached patch, it obtains the corresponding Advanced SIMD type with:

tree eltype = TREE_TYPE (lhs_type);
unsigned nunits = 128 / TREE_INT_CST_LOW (TYPE_SIZE (eltype));
tree vectype = build_vector_type (eltype, nunits);

While this seems to work with different element sizes, I am not sure if it's
the correct approach?
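
(Worked examples of that computation: for svint16_t, eltype is int16_t
and nunits = 128 / 16 = 8, giving int16x8_t; for svfloat64_t, eltype is
float64_t and nunits = 128 / 64 = 2, giving float64x2_t.)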
>
> > +
> > + tree elt_ptr_type
> > +   = build_pointer_type_for_mode (simd_type.eltype, VOIDmode, true);
> > + tree zero = build_zero_cst (elt_ptr_type);
> > +
> > + /* Use element type alignment.  */
> > + tree access_type
> > +   = build_aligned_type (simd_type.itype, TYPE_ALIGN 
> > (simd_type.eltype));
> > +
> > + tree tmp = make_ssa_name_fn (cfun, access_type, 0);
> > + gimple *mem_ref_stmt
> > +   = gimple_build_assign (tmp, fold_build2 (MEM_REF, access_type, 
> > arg1, zero));
>
> Long line.  Might be easier to format by assigning the fold_build2 result
> to a temporary variable.
>
> > + gsi_insert_before (f.gsi, mem_ref_stmt, GSI_SAME_STMT);
> > +
> > + tree mem_ref_lhs = gimple_get_lhs (mem_ref_stmt);
> > + tree vectype = TREE_TYPE (mem_ref_lhs);
> > + tree lhs_type = TREE_TYPE (lhs);
>
> Is this necessary?  The code above supplied the types and I wouldn't
> have expected them to change during the build process.
>
> > +
> > + int source_nelts = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
> > + vec_perm_builder sel (TYPE_VECTOR_SUBPARTS (lhs_type), source_nelts, 
> > 1);
> > + for (int i = 0; i < source_nelts; i++)
> > +   sel.quick_push (i);
> > +
> > + vec_perm_indices indices (sel, 1, source_nelts);
> > + gcc_checking_assert (can_vec_perm_const_p (TYPE_MODE (lhs_type), 
> > indices));
> > + tree mask = vec_perm_indices_to_tree (lhs_type, indices);
> > + return gimple_build_assign (lhs, VEC_PERM_EXPR, mem_ref_lhs, 
> > mem_ref_lhs, mask);
>
> Nit: long line.
>
> > +  }
> > +
> > +return NULL;
> > +  }
> >  };
> >
> >  class svld1ro_impl : public load_replicate
> > diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> > index f650abbc4ce..47810fec804 100644
> > --- a/gcc/config/aarch64/aarch64.cc
> > +++ b/gcc/config/aarch64/aarch64.cc
> > @@ -23969,6 +23969,35 @@ aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
> >return true;
> >  }
> >
> > +/* Try to implement D using SVE dup instruction.  */
> > +
> > +static bool
> > +aarch64_evpc_sve_dup (struct expand_vec_perm_d *d)
> > +{
> > +  if (BYTES_BIG_ENDIAN
> > +  || d->perm.length ().is_constant ()
> > +  || !d->one_vector_p
> > +  || d->target == NULL
> > +  || d->op0 == NULL
>
> These last two lines mean that we always return false for d->testing.
> The idea instead is that the return value should be the same for both
> d->testing and !d->testing.  The difference is that for !d->testing we
> also emit code to do the permute.
>
> > +  || GET_MODE_NUNITS (GET_MODE (d->target)).is_constant ()
>
> Sorry, I've forgotten the context now, but: these

Re: [1/2] PR96463 - aarch64 specific changes

2022-05-11 Thread Richard Sandiford via Gcc-patches
Prathamesh Kulkarni  writes:
> On Fri, 6 May 2022 at 16:00, Richard Sandiford wrote:
>>
>> Prathamesh Kulkarni  writes:
>> > diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc 
>> > b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
>> > index c24c0548724..1ef4ea2087b 100644
>> > --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
>> > +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
>> > @@ -44,6 +44,14 @@
>> >  #include "aarch64-sve-builtins-shapes.h"
>> >  #include "aarch64-sve-builtins-base.h"
>> >  #include "aarch64-sve-builtins-functions.h"
>> > +#include "aarch64-builtins.h"
>> > +#include "gimple-ssa.h"
>> > +#include "tree-phinodes.h"
>> > +#include "tree-ssa-operands.h"
>> > +#include "ssa-iterators.h"
>> > +#include "stringpool.h"
>> > +#include "value-range.h"
>> > +#include "tree-ssanames.h"
>>
>> Minor, but: I think the preferred approach is to include "ssa.h"
>> rather than include some of these headers directly.
>>
>> >
>> >  using namespace aarch64_sve;
>> >
>> > @@ -1207,6 +1215,56 @@ public:
>> >  insn_code icode = code_for_aarch64_sve_ld1rq (e.vector_mode (0));
>> >  return e.use_contiguous_load_insn (icode);
>> >}
>> > +
>> > +  gimple *
>> > +  fold (gimple_folder &f) const OVERRIDE
>> > +  {
>> > +tree arg0 = gimple_call_arg (f.call, 0);
>> > +tree arg1 = gimple_call_arg (f.call, 1);
>> > +
>> > +/* Transform:
>> > +   lhs = svld1rq ({-1, -1, ... }, arg1)
>> > +   into:
>> > +   tmp = mem_ref [(int * {ref-all}) arg1]
>> > +   lhs = vec_perm_expr.
>> > +   on little endian target.  */
>> > +
>> > +if (!BYTES_BIG_ENDIAN
>> > + && integer_all_onesp (arg0))
>> > +  {
>> > + tree lhs = gimple_call_lhs (f.call);
>> > + auto simd_type = aarch64_get_simd_info_for_type (Int32x4_t);
>>
>> Does this work for other element sizes?  I would have expected it
>> to be the (128-bit) Advanced SIMD vector associated with the same
>> element type as the SVE vector.
>>
>> The testcase should cover more than just int32x4_t -> svint32_t,
>> just to be sure.
> In the attached patch, it obtains corresponding advsimd type with:
>
> tree eltype = TREE_TYPE (lhs_type);
> unsigned nunits = 128 / TREE_INT_CST_LOW (TYPE_SIZE (eltype));
> tree vectype = build_vector_type (eltype, nunits);
>
> While this seems to work with different element sizes, I am not sure if it's
> the correct approach ?

Yeah, that looks correct.  Other SVE code uses aarch64_vq_mode
to get the vector mode associated with a .Q “element”, so an
alternative would be:

machine_mode vq_mode = aarch64_vq_mode (TYPE_MODE (eltype)).require ();
tree vectype = build_vector_type_for_mode (eltype, vq_mode);

which is more explicit about wanting an Advanced SIMD vector.
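
(For example, with an svint16_t result, TYPE_MODE (eltype) is HImode,
aarch64_vq_mode (HImode) yields V8HImode, and build_vector_type_for_mode
then gives the int16x8_t vector directly, with no lane-count arithmetic.)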

>> > +
>> > + tree elt_ptr_type
>> > +   = build_pointer_type_for_mode (simd_type.eltype, VOIDmode, true);
>> > + tree zero = build_zero_cst (elt_ptr_type);
>> > +
>> > + /* Use element type alignment.  */
>> > + tree access_type
>> > +   = build_aligned_type (simd_type.itype, TYPE_ALIGN 
>> > (simd_type.eltype));
>> > +
>> > + tree tmp = make_ssa_name_fn (cfun, access_type, 0);
>> > + gimple *mem_ref_stmt
>> > +   = gimple_build_assign (tmp, fold_build2 (MEM_REF, access_type, 
>> > arg1, zero));
>>
>> Long line.  Might be easier to format by assigning the fold_build2 result
>> to a temporary variable.
>>
>> > + gsi_insert_before (f.gsi, mem_ref_stmt, GSI_SAME_STMT);
>> > +
>> > + tree mem_ref_lhs = gimple_get_lhs (mem_ref_stmt);
>> > + tree vectype = TREE_TYPE (mem_ref_lhs);
>> > + tree lhs_type = TREE_TYPE (lhs);
>>
>> Is this necessary?  The code above supplied the types and I wouldn't
>> have expected them to change during the build process.
>>
>> > +
>> > + int source_nelts = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
>> > + vec_perm_builder sel (TYPE_VECTOR_SUBPARTS (lhs_type), source_nelts, 
>> > 1);
>> > + for (int i = 0; i < source_nelts; i++)
>> > +   sel.quick_push (i);
>> > +
>> > + vec_perm_indices indices (sel, 1, source_nelts);
>> > + gcc_checking_assert (can_vec_perm_const_p (TYPE_MODE (lhs_type), 
>> > indices));
>> > + tree mask = vec_perm_indices_to_tree (lhs_type, indices);
>> > + return gimple_build_assign (lhs, VEC_PERM_EXPR, mem_ref_lhs, 
>> > mem_ref_lhs, mask);
>>
>> Nit: long line.
>>
>> > +  }
>> > +
>> > +return NULL;
>> > +  }
>> >  };
>> >
>> >  class svld1ro_impl : public load_replicate
>> > diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
>> > index f650abbc4ce..47810fec804 100644
>> > --- a/gcc/config/aarch64/aarch64.cc
>> > +++ b/gcc/config/aarch64/aarch64.cc
>> > @@ -23969,6 +23969,35 @@ aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
>> >return true;
>> >  }
>> >
>> > +/* Try to implement D using SVE dup instruction.  */
>> > +
>> > +static bool
>> > +aarch64_evpc_sve_dup (struct expand_vec_perm_d *d)
>> > +{
>> > +  if (BYTES_BIG_ENDIAN

Re: [1/2] PR96463 - aarch64 specific changes

2022-05-12 Thread Prathamesh Kulkarni via Gcc-patches
On Wed, 11 May 2022 at 12:44, Richard Sandiford wrote:
>
> Prathamesh Kulkarni  writes:
> > On Fri, 6 May 2022 at 16:00, Richard Sandiford wrote:
> >>
> >> Prathamesh Kulkarni  writes:
> >> > diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc 
> >> > b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> >> > index c24c0548724..1ef4ea2087b 100644
> >> > --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> >> > +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> >> > @@ -44,6 +44,14 @@
> >> >  #include "aarch64-sve-builtins-shapes.h"
> >> >  #include "aarch64-sve-builtins-base.h"
> >> >  #include "aarch64-sve-builtins-functions.h"
> >> > +#include "aarch64-builtins.h"
> >> > +#include "gimple-ssa.h"
> >> > +#include "tree-phinodes.h"
> >> > +#include "tree-ssa-operands.h"
> >> > +#include "ssa-iterators.h"
> >> > +#include "stringpool.h"
> >> > +#include "value-range.h"
> >> > +#include "tree-ssanames.h"
> >>
> >> Minor, but: I think the preferred approach is to include "ssa.h"
> >> rather than include some of these headers directly.
> >>
> >> >
> >> >  using namespace aarch64_sve;
> >> >
> >> > @@ -1207,6 +1215,56 @@ public:
> >> >  insn_code icode = code_for_aarch64_sve_ld1rq (e.vector_mode (0));
> >> >  return e.use_contiguous_load_insn (icode);
> >> >}
> >> > +
> >> > +  gimple *
> >> > +  fold (gimple_folder &f) const OVERRIDE
> >> > +  {
> >> > +tree arg0 = gimple_call_arg (f.call, 0);
> >> > +tree arg1 = gimple_call_arg (f.call, 1);
> >> > +
> >> > +/* Transform:
> >> > +   lhs = svld1rq ({-1, -1, ... }, arg1)
> >> > +   into:
> >> > +   tmp = mem_ref [(int * {ref-all}) arg1]
> >> > +   lhs = vec_perm_expr.
> >> > +   on little endian target.  */
> >> > +
> >> > +if (!BYTES_BIG_ENDIAN
> >> > + && integer_all_onesp (arg0))
> >> > +  {
> >> > + tree lhs = gimple_call_lhs (f.call);
> >> > + auto simd_type = aarch64_get_simd_info_for_type (Int32x4_t);
> >>
> >> Does this work for other element sizes?  I would have expected it
> >> to be the (128-bit) Advanced SIMD vector associated with the same
> >> element type as the SVE vector.
> >>
> >> The testcase should cover more than just int32x4_t -> svint32_t,
> >> just to be sure.
> > In the attached patch, it obtains corresponding advsimd type with:
> >
> > tree eltype = TREE_TYPE (lhs_type);
> > unsigned nunits = 128 / TREE_INT_CST_LOW (TYPE_SIZE (eltype));
> > tree vectype = build_vector_type (eltype, nunits);
> >
> > While this seems to work with different element sizes, I am not sure if it's
> > the correct approach ?
>
> Yeah, that looks correct.  Other SVE code uses aarch64_vq_mode
> to get the vector mode associated with a .Q “element”, so an
> alternative would be:
>
> machine_mode vq_mode = aarch64_vq_mode (TYPE_MODE (eltype)).require ();
> tree vectype = build_vector_type_for_mode (eltype, vq_mode);
>
> which is more explicit about wanting an Advanced SIMD vector.
>
> >> > +
> >> > + tree elt_ptr_type
> >> > +   = build_pointer_type_for_mode (simd_type.eltype, VOIDmode, true);
> >> > + tree zero = build_zero_cst (elt_ptr_type);
> >> > +
> >> > + /* Use element type alignment.  */
> >> > + tree access_type
> >> > +   = build_aligned_type (simd_type.itype, TYPE_ALIGN 
> >> > (simd_type.eltype));
> >> > +
> >> > + tree tmp = make_ssa_name_fn (cfun, access_type, 0);
> >> > + gimple *mem_ref_stmt
> >> > +   = gimple_build_assign (tmp, fold_build2 (MEM_REF, access_type, 
> >> > arg1, zero));
> >>
> >> Long line.  Might be easier to format by assigning the fold_build2 result
> >> to a temporary variable.
> >>
> >> > + gsi_insert_before (f.gsi, mem_ref_stmt, GSI_SAME_STMT);
> >> > +
> >> > + tree mem_ref_lhs = gimple_get_lhs (mem_ref_stmt);
> >> > + tree vectype = TREE_TYPE (mem_ref_lhs);
> >> > + tree lhs_type = TREE_TYPE (lhs);
> >>
> >> Is this necessary?  The code above supplied the types and I wouldn't
> >> have expected them to change during the build process.
> >>
> >> > +
> >> > + int source_nelts = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
> >> > + vec_perm_builder sel (TYPE_VECTOR_SUBPARTS (lhs_type), 
> >> > source_nelts, 1);
> >> > + for (int i = 0; i < source_nelts; i++)
> >> > +   sel.quick_push (i);
> >> > +
> >> > + vec_perm_indices indices (sel, 1, source_nelts);
> >> > + gcc_checking_assert (can_vec_perm_const_p (TYPE_MODE (lhs_type), 
> >> > indices));
> >> > + tree mask = vec_perm_indices_to_tree (lhs_type, indices);
> >> > + return gimple_build_assign (lhs, VEC_PERM_EXPR, mem_ref_lhs, 
> >> > mem_ref_lhs, mask);
> >>
> >> Nit: long line.
> >>
> >> > +  }
> >> > +
> >> > +return NULL;
> >> > +  }
> >> >  };
> >> >
> >> >  class svld1ro_impl : public load_replicate
> >> > diff --git a/gcc/config/aarch64/aarch64.cc 
> >> > b/gcc/config/aarch64/aarch64.cc
> >> > index f650abbc4ce..47810fec804 100644
> >> > --- a/gcc/config/aarch64/aarch64.cc
> >> > +++ 

Re: [1/2] PR96463 - aarch64 specific changes

2022-05-12 Thread Richard Sandiford via Gcc-patches
Prathamesh Kulkarni  writes:
> On Wed, 11 May 2022 at 12:44, Richard Sandiford wrote:
>>
>> Prathamesh Kulkarni  writes:
>> > On Fri, 6 May 2022 at 16:00, Richard Sandiford wrote:
>> >>
>> >> Prathamesh Kulkarni  writes:
>> >> > diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc 
>> >> > b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
>> >> > index c24c0548724..1ef4ea2087b 100644
>> >> > --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
>> >> > +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
>> >> > @@ -44,6 +44,14 @@
>> >> >  #include "aarch64-sve-builtins-shapes.h"
>> >> >  #include "aarch64-sve-builtins-base.h"
>> >> >  #include "aarch64-sve-builtins-functions.h"
>> >> > +#include "aarch64-builtins.h"
>> >> > +#include "gimple-ssa.h"
>> >> > +#include "tree-phinodes.h"
>> >> > +#include "tree-ssa-operands.h"
>> >> > +#include "ssa-iterators.h"
>> >> > +#include "stringpool.h"
>> >> > +#include "value-range.h"
>> >> > +#include "tree-ssanames.h"
>> >>
>> >> Minor, but: I think the preferred approach is to include "ssa.h"
>> >> rather than include some of these headers directly.
>> >>
>> >> >
>> >> >  using namespace aarch64_sve;
>> >> >
>> >> > @@ -1207,6 +1215,56 @@ public:
>> >> >  insn_code icode = code_for_aarch64_sve_ld1rq (e.vector_mode (0));
>> >> >  return e.use_contiguous_load_insn (icode);
>> >> >}
>> >> > +
>> >> > +  gimple *
>> >> > +  fold (gimple_folder &f) const OVERRIDE
>> >> > +  {
>> >> > +tree arg0 = gimple_call_arg (f.call, 0);
>> >> > +tree arg1 = gimple_call_arg (f.call, 1);
>> >> > +
>> >> > +/* Transform:
>> >> > +   lhs = svld1rq ({-1, -1, ... }, arg1)
>> >> > +   into:
>> >> > +   tmp = mem_ref [(int * {ref-all}) arg1]
>> >> > +   lhs = vec_perm_expr.
>> >> > +   on little endian target.  */
>> >> > +
>> >> > +if (!BYTES_BIG_ENDIAN
>> >> > + && integer_all_onesp (arg0))
>> >> > +  {
>> >> > + tree lhs = gimple_call_lhs (f.call);
>> >> > + auto simd_type = aarch64_get_simd_info_for_type (Int32x4_t);
>> >>
>> >> Does this work for other element sizes?  I would have expected it
>> >> to be the (128-bit) Advanced SIMD vector associated with the same
>> >> element type as the SVE vector.
>> >>
>> >> The testcase should cover more than just int32x4_t -> svint32_t,
>> >> just to be sure.
>> > In the attached patch, it obtains corresponding advsimd type with:
>> >
>> > tree eltype = TREE_TYPE (lhs_type);
>> > unsigned nunits = 128 / TREE_INT_CST_LOW (TYPE_SIZE (eltype));
>> > tree vectype = build_vector_type (eltype, nunits);
>> >
>> > While this seems to work with different element sizes, I am not sure if 
>> > it's
>> > the correct approach ?
>>
>> Yeah, that looks correct.  Other SVE code uses aarch64_vq_mode
>> to get the vector mode associated with a .Q “element”, so an
>> alternative would be:
>>
>> machine_mode vq_mode = aarch64_vq_mode (TYPE_MODE (eltype)).require ();
>> tree vectype = build_vector_type_for_mode (eltype, vq_mode);
>>
>> which is more explicit about wanting an Advanced SIMD vector.
>>
>> >> > +
>> >> > + tree elt_ptr_type
>> >> > +   = build_pointer_type_for_mode (simd_type.eltype, VOIDmode, 
>> >> > true);
>> >> > + tree zero = build_zero_cst (elt_ptr_type);
>> >> > +
>> >> > + /* Use element type alignment.  */
>> >> > + tree access_type
>> >> > +   = build_aligned_type (simd_type.itype, TYPE_ALIGN 
>> >> > (simd_type.eltype));
>> >> > +
>> >> > + tree tmp = make_ssa_name_fn (cfun, access_type, 0);
>> >> > + gimple *mem_ref_stmt
>> >> > +   = gimple_build_assign (tmp, fold_build2 (MEM_REF, access_type, 
>> >> > arg1, zero));
>> >>
>> >> Long line.  Might be easier to format by assigning the fold_build2 result
>> >> to a temporary variable.
>> >>
>> >> > + gsi_insert_before (f.gsi, mem_ref_stmt, GSI_SAME_STMT);
>> >> > +
>> >> > + tree mem_ref_lhs = gimple_get_lhs (mem_ref_stmt);
>> >> > + tree vectype = TREE_TYPE (mem_ref_lhs);
>> >> > + tree lhs_type = TREE_TYPE (lhs);
>> >>
>> >> Is this necessary?  The code above supplied the types and I wouldn't
>> >> have expected them to change during the build process.
>> >>
>> >> > +
>> >> > + int source_nelts = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
>> >> > + vec_perm_builder sel (TYPE_VECTOR_SUBPARTS (lhs_type), 
>> >> > source_nelts, 1);
>> >> > + for (int i = 0; i < source_nelts; i++)
>> >> > +   sel.quick_push (i);
>> >> > +
>> >> > + vec_perm_indices indices (sel, 1, source_nelts);
>> >> > + gcc_checking_assert (can_vec_perm_const_p (TYPE_MODE (lhs_type), 
>> >> > indices));
>> >> > + tree mask = vec_perm_indices_to_tree (lhs_type, indices);
>> >> > + return gimple_build_assign (lhs, VEC_PERM_EXPR, mem_ref_lhs, 
>> >> > mem_ref_lhs, mask);
>> >>
>> >> Nit: long line.
>> >>
>> >> > +  }
>> >> > +
>> >> > +return NULL;
>> >> > +  }
>> >> >  };
>> >> >
>> >> >  class svld1ro_impl : public load_replicate
>> >> > diff 

Re: [1/2] PR96463 - aarch64 specific changes

2022-05-31 Thread Prathamesh Kulkarni via Gcc-patches
On Thu, 12 May 2022 at 16:15, Richard Sandiford wrote:
>
> Prathamesh Kulkarni  writes:
> > On Wed, 11 May 2022 at 12:44, Richard Sandiford wrote:
> >>
> >> Prathamesh Kulkarni  writes:
> >> > On Fri, 6 May 2022 at 16:00, Richard Sandiford wrote:
> >> >>
> >> >> Prathamesh Kulkarni  writes:
> >> >> > diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc 
> >> >> > b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> >> >> > index c24c0548724..1ef4ea2087b 100644
> >> >> > --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> >> >> > +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> >> >> > @@ -44,6 +44,14 @@
> >> >> >  #include "aarch64-sve-builtins-shapes.h"
> >> >> >  #include "aarch64-sve-builtins-base.h"
> >> >> >  #include "aarch64-sve-builtins-functions.h"
> >> >> > +#include "aarch64-builtins.h"
> >> >> > +#include "gimple-ssa.h"
> >> >> > +#include "tree-phinodes.h"
> >> >> > +#include "tree-ssa-operands.h"
> >> >> > +#include "ssa-iterators.h"
> >> >> > +#include "stringpool.h"
> >> >> > +#include "value-range.h"
> >> >> > +#include "tree-ssanames.h"
> >> >>
> >> >> Minor, but: I think the preferred approach is to include "ssa.h"
> >> >> rather than include some of these headers directly.
> >> >>
> >> >> >
> >> >> >  using namespace aarch64_sve;
> >> >> >
> >> >> > @@ -1207,6 +1215,56 @@ public:
> >> >> >  insn_code icode = code_for_aarch64_sve_ld1rq (e.vector_mode (0));
> >> >> >  return e.use_contiguous_load_insn (icode);
> >> >> >}
> >> >> > +
> >> >> > +  gimple *
> >> >> > +  fold (gimple_folder &f) const OVERRIDE
> >> >> > +  {
> >> >> > +tree arg0 = gimple_call_arg (f.call, 0);
> >> >> > +tree arg1 = gimple_call_arg (f.call, 1);
> >> >> > +
> >> >> > +/* Transform:
> >> >> > +   lhs = svld1rq ({-1, -1, ... }, arg1)
> >> >> > +   into:
> >> >> > +   tmp = mem_ref [(int * {ref-all}) arg1]
> >> >> > +   lhs = vec_perm_expr.
> >> >> > +   on little endian target.  */
> >> >> > +
> >> >> > +if (!BYTES_BIG_ENDIAN
> >> >> > + && integer_all_onesp (arg0))
> >> >> > +  {
> >> >> > + tree lhs = gimple_call_lhs (f.call);
> >> >> > + auto simd_type = aarch64_get_simd_info_for_type (Int32x4_t);
> >> >>
> >> >> Does this work for other element sizes?  I would have expected it
> >> >> to be the (128-bit) Advanced SIMD vector associated with the same
> >> >> element type as the SVE vector.
> >> >>
> >> >> The testcase should cover more than just int32x4_t -> svint32_t,
> >> >> just to be sure.
> >> > In the attached patch, it obtains corresponding advsimd type with:
> >> >
> >> > tree eltype = TREE_TYPE (lhs_type);
> >> > unsigned nunits = 128 / TREE_INT_CST_LOW (TYPE_SIZE (eltype));
> >> > tree vectype = build_vector_type (eltype, nunits);
> >> >
> >> > While this seems to work with different element sizes, I am not sure if 
> >> > it's
> >> > the correct approach ?
> >>
> >> Yeah, that looks correct.  Other SVE code uses aarch64_vq_mode
> >> to get the vector mode associated with a .Q “element”, so an
> >> alternative would be:
> >>
> >> machine_mode vq_mode = aarch64_vq_mode (TYPE_MODE (eltype)).require ();
> >> tree vectype = build_vector_type_for_mode (eltype, vq_mode);
> >>
> >> which is more explicit about wanting an Advanced SIMD vector.
> >>
> >> >> > +
> >> >> > + tree elt_ptr_type
> >> >> > +   = build_pointer_type_for_mode (simd_type.eltype, VOIDmode, 
> >> >> > true);
> >> >> > + tree zero = build_zero_cst (elt_ptr_type);
> >> >> > +
> >> >> > + /* Use element type alignment.  */
> >> >> > + tree access_type
> >> >> > +   = build_aligned_type (simd_type.itype, TYPE_ALIGN 
> >> >> > (simd_type.eltype));
> >> >> > +
> >> >> > + tree tmp = make_ssa_name_fn (cfun, access_type, 0);
> >> >> > + gimple *mem_ref_stmt
> >> >> > +   = gimple_build_assign (tmp, fold_build2 (MEM_REF, 
> >> >> > access_type, arg1, zero));
> >> >>
> >> >> Long line.  Might be easier to format by assigning the fold_build2 
> >> >> result
> >> >> to a temporary variable.
> >> >>
> >> >> > + gsi_insert_before (f.gsi, mem_ref_stmt, GSI_SAME_STMT);
> >> >> > +
> >> >> > + tree mem_ref_lhs = gimple_get_lhs (mem_ref_stmt);
> >> >> > + tree vectype = TREE_TYPE (mem_ref_lhs);
> >> >> > + tree lhs_type = TREE_TYPE (lhs);
> >> >>
> >> >> Is this necessary?  The code above supplied the types and I wouldn't
> >> >> have expected them to change during the build process.
> >> >>
> >> >> > +
> >> >> > + int source_nelts = TYPE_VECTOR_SUBPARTS (vectype).to_constant 
> >> >> > ();
> >> >> > + vec_perm_builder sel (TYPE_VECTOR_SUBPARTS (lhs_type), 
> >> >> > source_nelts, 1);
> >> >> > + for (int i = 0; i < source_nelts; i++)
> >> >> > +   sel.quick_push (i);
> >> >> > +
> >> >> > + vec_perm_indices indices (sel, 1, source_nelts);
> >> >> > + gcc_checking_assert (can_vec_perm_const_p (TYPE_MODE 
> >> >> > (lhs_type), indices));
> >> >> > + tree mask = vec_perm_indice

Re: [1/2] PR96463 - aarch64 specific changes

2022-06-01 Thread Richard Sandiford via Gcc-patches
Prathamesh Kulkarni  writes:
> On Thu, 12 May 2022 at 16:15, Richard Sandiford wrote:
>>
>> Prathamesh Kulkarni  writes:
>> > On Wed, 11 May 2022 at 12:44, Richard Sandiford wrote:
>> >>
>> >> Prathamesh Kulkarni  writes:
>> >> > On Fri, 6 May 2022 at 16:00, Richard Sandiford wrote:
>> >> >>
>> >> >> Prathamesh Kulkarni  writes:
>> >> >> > diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc 
>> >> >> > b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
>> >> >> > index c24c0548724..1ef4ea2087b 100644
>> >> >> > --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
>> >> >> > +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
>> >> >> > @@ -44,6 +44,14 @@
>> >> >> >  #include "aarch64-sve-builtins-shapes.h"
>> >> >> >  #include "aarch64-sve-builtins-base.h"
>> >> >> >  #include "aarch64-sve-builtins-functions.h"
>> >> >> > +#include "aarch64-builtins.h"
>> >> >> > +#include "gimple-ssa.h"
>> >> >> > +#include "tree-phinodes.h"
>> >> >> > +#include "tree-ssa-operands.h"
>> >> >> > +#include "ssa-iterators.h"
>> >> >> > +#include "stringpool.h"
>> >> >> > +#include "value-range.h"
>> >> >> > +#include "tree-ssanames.h"
>> >> >>
>> >> >> Minor, but: I think the preferred approach is to include "ssa.h"
>> >> >> rather than include some of these headers directly.
>> >> >>
>> >> >> >
>> >> >> >  using namespace aarch64_sve;
>> >> >> >
>> >> >> > @@ -1207,6 +1215,56 @@ public:
>> >> >> >  insn_code icode = code_for_aarch64_sve_ld1rq (e.vector_mode 
>> >> >> > (0));
>> >> >> >  return e.use_contiguous_load_insn (icode);
>> >> >> >}
>> >> >> > +
>> >> >> > +  gimple *
>> >> >> > +  fold (gimple_folder &f) const OVERRIDE
>> >> >> > +  {
>> >> >> > +tree arg0 = gimple_call_arg (f.call, 0);
>> >> >> > +tree arg1 = gimple_call_arg (f.call, 1);
>> >> >> > +
>> >> >> > +/* Transform:
>> >> >> > +   lhs = svld1rq ({-1, -1, ... }, arg1)
>> >> >> > +   into:
>> >> >> > +   tmp = mem_ref [(int * {ref-all}) arg1]
>> >> >> > +   lhs = vec_perm_expr.
>> >> >> > +   on little endian target.  */
>> >> >> > +
>> >> >> > +if (!BYTES_BIG_ENDIAN
>> >> >> > + && integer_all_onesp (arg0))
>> >> >> > +  {
>> >> >> > + tree lhs = gimple_call_lhs (f.call);
>> >> >> > + auto simd_type = aarch64_get_simd_info_for_type (Int32x4_t);
>> >> >>
>> >> >> Does this work for other element sizes?  I would have expected it
>> >> >> to be the (128-bit) Advanced SIMD vector associated with the same
>> >> >> element type as the SVE vector.
>> >> >>
>> >> >> The testcase should cover more than just int32x4_t -> svint32_t,
>> >> >> just to be sure.
>> >> > In the attached patch, it obtains corresponding advsimd type with:
>> >> >
>> >> > tree eltype = TREE_TYPE (lhs_type);
>> >> > unsigned nunits = 128 / TREE_INT_CST_LOW (TYPE_SIZE (eltype));
>> >> > tree vectype = build_vector_type (eltype, nunits);
>> >> >
>> >> > While this seems to work with different element sizes, I am not sure if 
>> >> > it's
>> >> > the correct approach ?
>> >>
>> >> Yeah, that looks correct.  Other SVE code uses aarch64_vq_mode
>> >> to get the vector mode associated with a .Q “element”, so an
>> >> alternative would be:
>> >>
>> >> machine_mode vq_mode = aarch64_vq_mode (TYPE_MODE (eltype)).require 
>> >> ();
>> >> tree vectype = build_vector_type_for_mode (eltype, vq_mode);
>> >>
>> >> which is more explicit about wanting an Advanced SIMD vector.
>> >>
>> >> >> > +
>> >> >> > + tree elt_ptr_type
>> >> >> > +   = build_pointer_type_for_mode (simd_type.eltype, VOIDmode, 
>> >> >> > true);
>> >> >> > + tree zero = build_zero_cst (elt_ptr_type);
>> >> >> > +
>> >> >> > + /* Use element type alignment.  */
>> >> >> > + tree access_type
>> >> >> > +   = build_aligned_type (simd_type.itype, TYPE_ALIGN 
>> >> >> > (simd_type.eltype));
>> >> >> > +
>> >> >> > + tree tmp = make_ssa_name_fn (cfun, access_type, 0);
>> >> >> > + gimple *mem_ref_stmt
>> >> >> > +   = gimple_build_assign (tmp, fold_build2 (MEM_REF, 
>> >> >> > access_type, arg1, zero));
>> >> >>
>> >> >> Long line.  Might be easier to format by assigning the fold_build2 
>> >> >> result
>> >> >> to a temporary variable.
>> >> >>
>> >> >> > + gsi_insert_before (f.gsi, mem_ref_stmt, GSI_SAME_STMT);
>> >> >> > +
>> >> >> > + tree mem_ref_lhs = gimple_get_lhs (mem_ref_stmt);
>> >> >> > + tree vectype = TREE_TYPE (mem_ref_lhs);
>> >> >> > + tree lhs_type = TREE_TYPE (lhs);
>> >> >>
>> >> >> Is this necessary?  The code above supplied the types and I wouldn't
>> >> >> have expected them to change during the build process.
>> >> >>
>> >> >> > +
>> >> >> > + int source_nelts = TYPE_VECTOR_SUBPARTS (vectype).to_constant 
>> >> >> > ();
>> >> >> > + vec_perm_builder sel (TYPE_VECTOR_SUBPARTS (lhs_type), 
>> >> >> > source_nelts, 1);
>> >> >> > + for (int i = 0; i < source_nelts; i++)
>> >> >> > +   sel.quick_push (i);
>> >> >> > +
>> >> >> > + vec_perm_indices indic

Re: [1/2] PR96463 - aarch64 specific changes

2022-06-05 Thread Prathamesh Kulkarni via Gcc-patches
On Wed, 1 Jun 2022 at 14:12, Richard Sandiford wrote:
>
> Prathamesh Kulkarni  writes:
> > On Thu, 12 May 2022 at 16:15, Richard Sandiford wrote:
> >>
> >> Prathamesh Kulkarni  writes:
> >> > On Wed, 11 May 2022 at 12:44, Richard Sandiford wrote:
> >> >>
> >> >> Prathamesh Kulkarni  writes:
> >> >> > On Fri, 6 May 2022 at 16:00, Richard Sandiford wrote:
> >> >> >>
> >> >> >> Prathamesh Kulkarni  writes:
> >> >> >> > diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc 
> >> >> >> > b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> >> >> >> > index c24c0548724..1ef4ea2087b 100644
> >> >> >> > --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> >> >> >> > +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> >> >> >> > @@ -44,6 +44,14 @@
> >> >> >> >  #include "aarch64-sve-builtins-shapes.h"
> >> >> >> >  #include "aarch64-sve-builtins-base.h"
> >> >> >> >  #include "aarch64-sve-builtins-functions.h"
> >> >> >> > +#include "aarch64-builtins.h"
> >> >> >> > +#include "gimple-ssa.h"
> >> >> >> > +#include "tree-phinodes.h"
> >> >> >> > +#include "tree-ssa-operands.h"
> >> >> >> > +#include "ssa-iterators.h"
> >> >> >> > +#include "stringpool.h"
> >> >> >> > +#include "value-range.h"
> >> >> >> > +#include "tree-ssanames.h"
> >> >> >>
> >> >> >> Minor, but: I think the preferred approach is to include "ssa.h"
> >> >> >> rather than include some of these headers directly.
> >> >> >>
> >> >> >> >
> >> >> >> >  using namespace aarch64_sve;
> >> >> >> >
> >> >> >> > @@ -1207,6 +1215,56 @@ public:
> >> >> >> >  insn_code icode = code_for_aarch64_sve_ld1rq (e.vector_mode 
> >> >> >> > (0));
> >> >> >> >  return e.use_contiguous_load_insn (icode);
> >> >> >> >}
> >> >> >> > +
> >> >> >> > +  gimple *
> >> >> >> > +  fold (gimple_folder &f) const OVERRIDE
> >> >> >> > +  {
> >> >> >> > +tree arg0 = gimple_call_arg (f.call, 0);
> >> >> >> > +tree arg1 = gimple_call_arg (f.call, 1);
> >> >> >> > +
> >> >> >> > +/* Transform:
> >> >> >> > +   lhs = svld1rq ({-1, -1, ... }, arg1)
> >> >> >> > +   into:
> >> >> >> > +   tmp = mem_ref [(int * {ref-all}) arg1]
> >> >> >> > +   lhs = vec_perm_expr.
> >> >> >> > +   on little endian target.  */
> >> >> >> > +
> >> >> >> > +if (!BYTES_BIG_ENDIAN
> >> >> >> > + && integer_all_onesp (arg0))
> >> >> >> > +  {
> >> >> >> > + tree lhs = gimple_call_lhs (f.call);
> >> >> >> > + auto simd_type = aarch64_get_simd_info_for_type (Int32x4_t);
> >> >> >>
> >> >> >> Does this work for other element sizes?  I would have expected it
> >> >> >> to be the (128-bit) Advanced SIMD vector associated with the same
> >> >> >> element type as the SVE vector.
> >> >> >>
> >> >> >> The testcase should cover more than just int32x4_t -> svint32_t,
> >> >> >> just to be sure.
> >> >> > In the attached patch, it obtains corresponding advsimd type with:
> >> >> >
> >> >> > tree eltype = TREE_TYPE (lhs_type);
> >> >> > unsigned nunits = 128 / TREE_INT_CST_LOW (TYPE_SIZE (eltype));
> >> >> > tree vectype = build_vector_type (eltype, nunits);
> >> >> >
> >> >> > While this seems to work with different element sizes, I am not sure 
> >> >> > if it's
> >> >> > the correct approach ?
> >> >>
> >> >> Yeah, that looks correct.  Other SVE code uses aarch64_vq_mode
> >> >> to get the vector mode associated with a .Q “element”, so an
> >> >> alternative would be:
> >> >>
> >> >> machine_mode vq_mode = aarch64_vq_mode (TYPE_MODE (eltype)).require 
> >> >> ();
> >> >> tree vectype = build_vector_type_for_mode (eltype, vq_mode);
> >> >>
> >> >> which is more explicit about wanting an Advanced SIMD vector.
> >> >>
> >> >> >> > +
> >> >> >> > + tree elt_ptr_type
> >> >> >> > +   = build_pointer_type_for_mode (simd_type.eltype, VOIDmode, 
> >> >> >> > true);
> >> >> >> > + tree zero = build_zero_cst (elt_ptr_type);
> >> >> >> > +
> >> >> >> > + /* Use element type alignment.  */
> >> >> >> > + tree access_type
> >> >> >> > +   = build_aligned_type (simd_type.itype, TYPE_ALIGN 
> >> >> >> > (simd_type.eltype));
> >> >> >> > +
> >> >> >> > + tree tmp = make_ssa_name_fn (cfun, access_type, 0);
> >> >> >> > + gimple *mem_ref_stmt
> >> >> >> > +   = gimple_build_assign (tmp, fold_build2 (MEM_REF, 
> >> >> >> > access_type, arg1, zero));
> >> >> >>
> >> >> >> Long line.  Might be easier to format by assigning the fold_build2 
> >> >> >> result
> >> >> >> to a temporary variable.
> >> >> >>
> >> >> >> > + gsi_insert_before (f.gsi, mem_ref_stmt, GSI_SAME_STMT);
> >> >> >> > +
> >> >> >> > + tree mem_ref_lhs = gimple_get_lhs (mem_ref_stmt);
> >> >> >> > + tree vectype = TREE_TYPE (mem_ref_lhs);
> >> >> >> > + tree lhs_type = TREE_TYPE (lhs);
> >> >> >>
> >> >> >> Is this necessary?  The code above supplied the types and I wouldn't
> >> >> >> have expected them to change during the build process.
> >> >> >>
> >> >> >> > +
> >> >> >> > + int source_nelts = TYPE_VECTOR_S

Re: [1/2] PR96463 - aarch64 specific changes

2022-06-06 Thread Richard Sandiford via Gcc-patches
Prathamesh Kulkarni  writes:
>> >  {
>> >/* The pattern matching functions above are written to look for a small
>> >   number to begin the sequence (0, 1, N/2).  If we begin with an index
>> > @@ -24084,6 +24112,12 @@ aarch64_expand_vec_perm_const_1 (struct 
>> > expand_vec_perm_d *d)
>> > || d->vec_flags == VEC_SVE_PRED)
>> >&& known_gt (nelt, 1))
>> >  {
>> > +  /* If operand and result modes differ, then only check
>> > +  for dup case.  */
>> > +  if (d->vmode != op_mode)
>> > + return (d->vec_flags == VEC_SVE_DATA)
>> > + ? aarch64_evpc_sve_dup (d, op_mode) : false;
>> > +
>>
>> I think it'd be more future-proof to format this as:
>>
>> if (d->vmode == d->op_mode)
>>   {
>> …existing code…
>>   }
>> else
>>   {
>> if (aarch64_evpc_sve_dup (d))
>>   return true;
>>   }
>>
>> with the d->vec_flags == VEC_SVE_DATA check being in aarch64_evpc_sve_dup,
>> alongside the op_mode check.  I think we'll be adding more checks here
>> over time.
> Um I was wondering if we should structure it as:
> if (d->vmode == d->op_mode)
>   {
>  ...existing code...
>   }
> if (aarch64_evpc_sve_dup (d))
>   return true;
>
> So we check for dup irrespective of d->vmode == d->op_mode?

Yeah, I can see the attraction of that.  I think the else is better
though because the fallback TBL handling will (rightly) come at the end
of the existing code.  Without the else, we'd have specific tests like
DUP after generic ones like TBL, so the reader would have to work out
for themselves that DUP and TBL handle disjoint cases.
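
So the combined structure would look roughly like this (a sketch of
aarch64_expand_vec_perm_const_1, with the existing code elided):

  if (d->vmode == d->op_mode)
    {
      /* ...existing patterns, ending with the generic TBL fallback...  */
    }
  else
    {
      if (aarch64_evpc_sve_dup (d))
        return true;
    }
  return false;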

>> >if (aarch64_evpc_rev_local (d))
>> >   return true;
>> >else if (aarch64_evpc_rev_global (d))
>> > @@ -24105,7 +24139,12 @@ aarch64_expand_vec_perm_const_1 (struct 
>> > expand_vec_perm_d *d)
>> >else if (aarch64_evpc_reencode (d))
>> >   return true;
>> >if (d->vec_flags == VEC_SVE_DATA)
>> > - return aarch64_evpc_sve_tbl (d);
>> > + {
>> > +   if (aarch64_evpc_sve_tbl (d))
>> > + return true;
>> > +   else if (aarch64_evpc_sve_dup (d, op_mode))
>> > + return true;
>> > + }
>> >else if (d->vec_flags == VEC_ADVSIMD)
>> >   return aarch64_evpc_tbl (d);
>> >  }
>>
>> Is this part still needed, given the above?
>>
>> Thanks,
>> Richard
>>
>> > @@ -24119,9 +24158,6 @@ aarch64_vectorize_vec_perm_const (machine_mode 
>> > vmode, machine_mode op_mode,
>> > rtx target, rtx op0, rtx op1,
>> > const vec_perm_indices &sel)
>> >  {
>> > -  if (vmode != op_mode)
>> > -return false;
>> > -
>> >struct expand_vec_perm_d d;
>> >
>> >/* Check whether the mask can be applied to a single vector.  */
>> > @@ -24154,10 +24190,10 @@ aarch64_vectorize_vec_perm_const (machine_mode 
>> > vmode, machine_mode op_mode,
>> >d.testing_p = !target;
>> >
>> >if (!d.testing_p)
>> > -return aarch64_expand_vec_perm_const_1 (&d);
>> > +return aarch64_expand_vec_perm_const_1 (&d, op_mode);
>> >
>> >rtx_insn *last = get_last_insn ();
>> > -  bool ret = aarch64_expand_vec_perm_const_1 (&d);
>> > +  bool ret = aarch64_expand_vec_perm_const_1 (&d, op_mode);
>> >gcc_assert (last == get_last_insn ());
>> >
>> >return ret;
>
> diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc 
> b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> index bee410929bd..1a804b1ab73 100644
> --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> @@ -44,6 +44,7 @@
>  #include "aarch64-sve-builtins-shapes.h"
>  #include "aarch64-sve-builtins-base.h"
>  #include "aarch64-sve-builtins-functions.h"
> +#include "ssa.h"
>  
>  using namespace aarch64_sve;
>  
> @@ -1207,6 +1208,64 @@ public:
>  insn_code icode = code_for_aarch64_sve_ld1rq (e.vector_mode (0));
>  return e.use_contiguous_load_insn (icode);
>}
> +
> +  gimple *
> +  fold (gimple_folder &f) const override
> +  {
> +tree arg0 = gimple_call_arg (f.call, 0);
> +tree arg1 = gimple_call_arg (f.call, 1);
> +
> +/* Transform:
> +   lhs = svld1rq ({-1, -1, ... }, arg1)
> +   into:
> +   tmp = mem_ref [(int * {ref-all}) arg1]
> +   lhs = vec_perm_expr.
> +   on little endian target.
> +   vectype is the corresponding ADVSIMD type.  */
> +
> +if (!BYTES_BIG_ENDIAN
> + && integer_all_onesp (arg0))
> +  {
> + tree lhs = gimple_call_lhs (f.call);
> + tree lhs_type = TREE_TYPE (lhs);
> + poly_uint64 lhs_len = TYPE_VECTOR_SUBPARTS (lhs_type);
> + tree eltype = TREE_TYPE (lhs_type);
> +
> + scalar_mode elmode = GET_MODE_INNER (TYPE_MODE (lhs_type));
> + machine_mode vq_mode = aarch64_vq_mode (elmode).require ();
> + tree vectype = build_vector_type_for_mode (eltype, vq_mode);
> +
> + tree elt_ptr_type
> +   = build_pointer_type_for_mode (eltype, VOIDmode, true);
> + tree ze

Re: [1/2] PR96463 - aarch64 specific changes

2022-06-07 Thread Prathamesh Kulkarni via Gcc-patches
On Mon, 6 Jun 2022 at 16:29, Richard Sandiford wrote:
>
> Prathamesh Kulkarni  writes:
> >> >  {
> >> >/* The pattern matching functions above are written to look for a 
> >> > small
> >> >   number to begin the sequence (0, 1, N/2).  If we begin with an 
> >> > index
> >> > @@ -24084,6 +24112,12 @@ aarch64_expand_vec_perm_const_1 (struct 
> >> > expand_vec_perm_d *d)
> >> > || d->vec_flags == VEC_SVE_PRED)
> >> >&& known_gt (nelt, 1))
> >> >  {
> >> > +  /* If operand and result modes differ, then only check
> >> > +  for dup case.  */
> >> > +  if (d->vmode != op_mode)
> >> > + return (d->vec_flags == VEC_SVE_DATA)
> >> > + ? aarch64_evpc_sve_dup (d, op_mode) : false;
> >> > +
> >>
> >> I think it'd be more future-proof to format this as:
> >>
> >> if (d->vmode == d->op_mode)
> >>   {
> >> …existing code…
> >>   }
> >> else
> >>   {
> >> if (aarch64_evpc_sve_dup (d))
> >>   return true;
> >>   }
> >>
> >> with the d->vec_flags == VEC_SVE_DATA check being in aarch64_evpc_sve_dup,
> >> alongside the op_mode check.  I think we'll be adding more checks here
> >> over time.
> > Um I was wondering if we should structure it as:
> > if (d->vmode == d->op_mode)
> >   {
> >  ...existing code...
> >   }
> > if (aarch64_evpc_sve_dup (d))
> >   return true;
> >
> > So we check for dup irrespective of d->vmode == d->op_mode?
>
> Yeah, I can see the attraction of that.  I think the else is better
> though because the fallback TBL handling will (rightly) come at the end
> of the existing code.  Without the else, we'd have specific tests like
> DUP after generic ones like TBL, so the reader would have to work out
> for themselves that DUP and TBL handle disjoint cases.
>
> >> >if (aarch64_evpc_rev_local (d))
> >> >   return true;
> >> >else if (aarch64_evpc_rev_global (d))
> >> > @@ -24105,7 +24139,12 @@ aarch64_expand_vec_perm_const_1 (struct 
> >> > expand_vec_perm_d *d)
> >> >else if (aarch64_evpc_reencode (d))
> >> >   return true;
> >> >if (d->vec_flags == VEC_SVE_DATA)
> >> > - return aarch64_evpc_sve_tbl (d);
> >> > + {
> >> > +   if (aarch64_evpc_sve_tbl (d))
> >> > + return true;
> >> > +   else if (aarch64_evpc_sve_dup (d, op_mode))
> >> > + return true;
> >> > + }
> >> >else if (d->vec_flags == VEC_ADVSIMD)
> >> >   return aarch64_evpc_tbl (d);
> >> >  }
> >>
> >> Is this part still needed, given the above?
> >>
> >> Thanks,
> >> Richard
> >>
> >> > @@ -24119,9 +24158,6 @@ aarch64_vectorize_vec_perm_const (machine_mode 
> >> > vmode, machine_mode op_mode,
> >> > rtx target, rtx op0, rtx op1,
> >> > const vec_perm_indices &sel)
> >> >  {
> >> > -  if (vmode != op_mode)
> >> > -return false;
> >> > -
> >> >struct expand_vec_perm_d d;
> >> >
> >> >/* Check whether the mask can be applied to a single vector.  */
> >> > @@ -24154,10 +24190,10 @@ aarch64_vectorize_vec_perm_const (machine_mode 
> >> > vmode, machine_mode op_mode,
> >> >d.testing_p = !target;
> >> >
> >> >if (!d.testing_p)
> >> > -return aarch64_expand_vec_perm_const_1 (&d);
> >> > +return aarch64_expand_vec_perm_const_1 (&d, op_mode);
> >> >
> >> >rtx_insn *last = get_last_insn ();
> >> > -  bool ret = aarch64_expand_vec_perm_const_1 (&d);
> >> > +  bool ret = aarch64_expand_vec_perm_const_1 (&d, op_mode);
> >> >gcc_assert (last == get_last_insn ());
> >> >
> >> >return ret;
> >
> > diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc 
> > b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> > index bee410929bd..1a804b1ab73 100644
> > --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> > +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> > @@ -44,6 +44,7 @@
> >  #include "aarch64-sve-builtins-shapes.h"
> >  #include "aarch64-sve-builtins-base.h"
> >  #include "aarch64-sve-builtins-functions.h"
> > +#include "ssa.h"
> >
> >  using namespace aarch64_sve;
> >
> > @@ -1207,6 +1208,64 @@ public:
> >  insn_code icode = code_for_aarch64_sve_ld1rq (e.vector_mode (0));
> >  return e.use_contiguous_load_insn (icode);
> >}
> > +
> > +  gimple *
> > +  fold (gimple_folder &f) const override
> > +  {
> > +tree arg0 = gimple_call_arg (f.call, 0);
> > +tree arg1 = gimple_call_arg (f.call, 1);
> > +
> > +/* Transform:
> > +   lhs = svld1rq ({-1, -1, ... }, arg1)
> > +   into:
> > +   tmp = mem_ref [(int * {ref-all}) arg1]
> > +   lhs = vec_perm_expr.
> > +   on little endian target.
> > +   vectype is the corresponding ADVSIMD type.  */
> > +
> > +if (!BYTES_BIG_ENDIAN
> > + && integer_all_onesp (arg0))
> > +  {
> > + tree lhs = gimple_call_lhs (f.call);
> > + tree lhs_type = TREE_TYPE (lhs);
> > + poly_uint64 lhs_len = TYPE_VECTOR_SUBPARTS (lhs_type);
> > + tree eltype = TRE

Re: [1/2] PR96463 - aarch64 specific changes

2021-12-17 Thread Richard Sandiford via Gcc-patches
Prathamesh Kulkarni  writes:
> Hi,
> The patch folds:
> lhs = svld1rq ({-1, -1, -1, ...}, &v[0])
> into:
> lhs = vec_perm_expr
> and expands above vec_perm_expr using aarch64_expand_sve_dupq.
>
> With patch, for following test:
> #include 
> #include 
>
> svint32_t
> foo (int32x4_t x)
> {
>   return svld1rq (svptrue_b8 (), &x[0]);
> }
>
> it generates following code:
> foo:
> .LFB4350:
> dup z0.q, z0.q[0]
> ret
>
> and passes bootstrap+test on aarch64-linux-gnu.
> But I am not sure if the changes to aarch64_evpc_sve_tbl
> are correct.

Just in case: I was only using int32x4_t in the PR as an example.
The same thing should work for all element types.
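
For example, a float16 variant of the same test should fold the same
way (an illustrative example only, not one of the patch's testcases):

  #include <arm_neon.h>
  #include <arm_sve.h>

  svfloat16_t
  foo_f16 (float16x8_t x)
  {
    /* Expected to generate the same dup z0.q, z0.q[0].  */
    return svld1rq (svptrue_b8 (), &x[0]);
  }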

>
> Thanks,
> Prathamesh
>
> diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> index 02e42a71e5e..e21bbec360c 100644
> --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> @@ -1207,6 +1207,56 @@ public:
>  insn_code icode = code_for_aarch64_sve_ld1rq (e.vector_mode (0));
>  return e.use_contiguous_load_insn (icode);
>}
> +
> +  gimple *
> +  fold (gimple_folder &f) const OVERRIDE
> +  {
> +tree arg0 = gimple_call_arg (f.call, 0);
> +tree arg1 = gimple_call_arg (f.call, 1);
> +
> +/* Transform:
> +   lhs = svld1rq ({-1, -1, ... }, &v[0])
> +   into:
> +   lhs = vec_perm_expr.
> +   on little endian target.  */
> +
> +if (!BYTES_BIG_ENDIAN
> + && integer_all_onesp (arg0)
> + && TREE_CODE (arg1) == ADDR_EXPR)
> +  {
> + tree t = TREE_OPERAND (arg1, 0);
> + if (TREE_CODE (t) == ARRAY_REF)
> +   {
> + tree index = TREE_OPERAND (t, 1);
> + t = TREE_OPERAND (t, 0);
> + if (integer_zerop (index) && TREE_CODE (t) == VIEW_CONVERT_EXPR)
> +   {
> + t = TREE_OPERAND (t, 0);
> + tree vectype = TREE_TYPE (t);
> + if (VECTOR_TYPE_P (vectype)
> + && known_eq (TYPE_VECTOR_SUBPARTS (vectype), 4u)
> + && wi::to_wide (TYPE_SIZE (vectype)) == 128)
> +   {

Since this is quite a specific pattern match, and since we now lower
arm_neon.h vld1* to normal gimple accesses, I think we should try the
“more generally” approach mentioned in the PR and see what the fallout
is.  That is, keep:

if (!BYTES_BIG_ENDIAN
&& integer_all_onesp (arg0)

If those conditions pass, create an Advanced SIMD access at address arg1,
using similar code to the handling of:

 BUILTIN_VALL_F16 (LOAD1, ld1, 0, LOAD)
 BUILTIN_VDQ_I (LOAD1_U, ld1, 0, LOAD)
 BUILTIN_VALLP_NO_DI (LOAD1_P, ld1, 0, LOAD)

in aarch64_general_gimple_fold_builtin.  (Would be good to move the
common code to aarch64.c so that both files can use it.)
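
As a rough sketch of that direction (elt_type and simd_vec_type below
are placeholders for whatever the folder derives for the element and
128-bit vector types, not existing helpers):

  /* Build tmp = MEM_REF [(elt_type * {ref-all}) arg1] with the
     Advanced SIMD vector type, as the LOAD1 lowering does, and then
     use tmp as both inputs of the VEC_PERM_EXPR.  */
  tree elt_ptr_type
    = build_pointer_type_for_mode (elt_type, VOIDmode, true);
  tree zero = build_zero_cst (elt_ptr_type);
  tree access_type
    = build_aligned_type (simd_vec_type, TYPE_ALIGN (elt_type));
  tree tmp = make_ssa_name (access_type);
  gimple *load
    = gimple_build_assign (tmp, fold_build2 (MEM_REF, access_type,
                                             arg1, zero));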

> + tree lhs = gimple_call_lhs (f.call);
> + tree lhs_type = TREE_TYPE (lhs);
> + int source_nelts = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
> + vec_perm_builder sel (TYPE_VECTOR_SUBPARTS (lhs_type), source_nelts, 1);
> + for (int i = 0; i < source_nelts; i++)
> +   sel.quick_push (i);
> +
> + vec_perm_indices indices (sel, 1, source_nelts);
> + if (!can_vec_perm_const_p (TYPE_MODE (lhs_type), indices))
> +   return NULL;

I don't think we need to check this: it should always be true.
Probably worth keeping as a gcc_checking_assert though.
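
That is, a minimal sketch:

  gcc_checking_assert (can_vec_perm_const_p (TYPE_MODE (lhs_type),
                                             indices));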

> +
> + tree mask = vec_perm_indices_to_tree (lhs_type, indices);
> + return gimple_build_assign (lhs, VEC_PERM_EXPR, t, t, mask);
> +   }
> +   }
> +   }
> +  }
> +
> +return NULL;
> +  }
>  };
>  
>  class svld1ro_impl : public load_replicate
> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> index f07330cff4f..af27f550be3 100644
> --- a/gcc/config/aarch64/aarch64.c
> +++ b/gcc/config/aarch64/aarch64.c
> @@ -23002,8 +23002,32 @@ aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
>  
>machine_mode sel_mode = related_int_vector_mode (d->vmode).require ();
>rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
> +
>if (d->one_vector_p)
> -emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, sel));
> +{
> +  bool use_dupq = false;
> +  /* Check if sel is a dup vector with encoded elements {0, 1, 2, ... nelts}.  */
> +  if (GET_CODE (sel) == CONST_VECTOR
> +   && !GET_MODE_NUNITS (GET_MODE (sel)).is_constant ()
> +   && CONST_VECTOR_DUPLICATE_P (sel))
> +   {
> + unsigned nelts = const_vector_encoded_nelts (sel);
> + unsigned i;
> + for (i = 0; i < nelts; i++)
> +   {
> + rtx elem = CONST_VECTOR_ENCODED_ELT(sel, i);
> + if (!(CONST_INT_P (elem) && INTVAL(elem) == i))
> +   break;
> +   }
> + if (i == nelts)
> +   use_dupq = true;
