Hi Richard,

On Wed, 15 May 2019 at 16:57, Richard Sandiford
<richard.sandif...@arm.com> wrote:
>
> Thanks for doing this.
>
> kugan.vivekanandara...@linaro.org writes:
> > From: Kugan Vivekanandarajah <kugan.vivekanandara...@linaro.org>
> >
> > gcc/ChangeLog:
> >
> > 2019-05-15  Kugan Vivekanandarajah  <kugan.vivekanandara...@linaro.org>
> >
> >       PR target/88834
> >       * tree-ssa-loop-ivopts.c (get_mem_type_for_internal_fn): Handle
> >       IFN_MASK_LOAD_LANES and IFN_MASK_STORE_LANES.
> >       (find_interesting_uses_stmt): Likewise.
> >       (get_alias_ptr_type_for_ptr_address): Likewise.
> >       (add_iv_candidate_for_use): Add scaled index candidate if useful.
> >
> > Change-Id: I8e8151fe2dde2845dedf38b090103694da6fc9d1
> > ---
> >  gcc/tree-ssa-loop-ivopts.c | 60 +++++++++++++++++++++++++++++++++++++++++++++-
> >  1 file changed, 59 insertions(+), 1 deletion(-)
> >
> > diff --git a/gcc/tree-ssa-loop-ivopts.c b/gcc/tree-ssa-loop-ivopts.c
> > index 9864b59..115a70c 100644
> > --- a/gcc/tree-ssa-loop-ivopts.c
> > +++ b/gcc/tree-ssa-loop-ivopts.c
> > @@ -2451,11 +2451,13 @@ get_mem_type_for_internal_fn (gcall *call, tree *op_p)
> >    switch (gimple_call_internal_fn (call))
> >      {
> >      case IFN_MASK_LOAD:
> > +    case IFN_MASK_LOAD_LANES:
> >        if (op_p == gimple_call_arg_ptr (call, 0))
> >       return TREE_TYPE (gimple_call_lhs (call));
> >        return NULL_TREE;
> >
> >      case IFN_MASK_STORE:
> > +    case IFN_MASK_STORE_LANES:
> >        if (op_p == gimple_call_arg_ptr (call, 0))
> >       return TREE_TYPE (gimple_call_arg (call, 3));
> >        return NULL_TREE;
> > @@ -2545,7 +2547,7 @@ find_interesting_uses_stmt (struct ivopts_data *data, gimple *stmt)
> >         return;
> >       }
> >
> > -      /* TODO -- we should also handle address uses of type
> > +      /* TODO -- we should also handle all address uses of type
> >
> >        memory = call (whatever);
> >
> > @@ -2553,6 +2555,27 @@ find_interesting_uses_stmt (struct ivopts_data *data, gimple *stmt)
> >
> >        call (memory).  */
> >      }
> > +  else if (is_gimple_call (stmt))
> > +    {
> > +      gcall *call = dyn_cast <gcall *> (stmt);
> > +      if (call
> > +       && gimple_call_internal_p (call)
> > +       && (gimple_call_internal_fn (call) == IFN_MASK_LOAD_LANES
> > +           || gimple_call_internal_fn (call) == IFN_MASK_STORE_LANES))
> > +     {
> > +       tree *arg = gimple_call_arg_ptr (call, 0);
> > +       struct iv *civ = get_iv (data, *arg);
> > +       tree mem_type = get_mem_type_for_internal_fn (call, arg);
> > +       if (civ && mem_type)
> > +         {
> > +           civ = alloc_iv (data, civ->base, civ->step);
> > +           record_group_use (data, arg, civ, stmt, USE_PTR_ADDRESS,
> > +                             mem_type);
> > +           return;
> > +         }
> > +     }
> > +    }
> > +
>
> Why do you need to handle this specially?  Does:
>
>   FOR_EACH_PHI_OR_STMT_USE (use_p, stmt, iter, SSA_OP_USE)
>     {
>       op = USE_FROM_PTR (use_p);
>
>       if (TREE_CODE (op) != SSA_NAME)
>         continue;
>
>       iv = get_iv (data, op);
>       if (!iv)
>         continue;
>
>       if (!find_address_like_use (data, stmt, use_p->use, iv))
>         find_interesting_uses_op (data, op);
>     }
>
> not do the right thing for the load/store lane case?
Right, I initially thought load/store lanes should be handled
differently, but it turned out they can be handled the same way.
I should have removed the special case; done now.
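
For the archives, a sketch of the kind of loop this is about (along the
lines of the PR 88834 testcase, though not the exact one), which
vectorizes to LD2/ST2 with SVE:

  void
  f (double *restrict x, double *restrict y, int n)
  {
    for (int i = 0; i < n; i += 2)
      {
        x[i] += y[i];
        x[i + 1] += y[i + 1];
      }
  }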

>
> > @@ -3500,6 +3523,39 @@ add_iv_candidate_for_use (struct ivopts_data *data, struct iv_use *use)
> >      basetype = sizetype;
> >    record_common_cand (data, build_int_cst (basetype, 0), iv->step, use);
> >
> > +  /* Compare the cost of an address with an unscaled index with the cost of
> > +    an address with a scaled index and add candidate if useful. */
> > +  if (use != NULL && use->type == USE_PTR_ADDRESS)
>
> I think we want this for all address uses.  E.g. for SVE, masked and
> unmasked accesses would both benefit.
OK.
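
For the record, a sketch of an unmasked loop that should see the same
benefit once the candidate is added for all address uses (assuming it
vectorizes to contiguous accesses; with SVE's fully-masked loops the
same addresses show up as masked accesses):

  void
  g (double *restrict dst, double *restrict src, int n)
  {
    for (int i = 0; i < n; i++)
      dst[i] = src[i] + 1.0;
  }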

>
> > +    {
> > +      struct mem_address parts = {NULL_TREE, integer_one_node,
> > +                               NULL_TREE, NULL_TREE, NULL_TREE};
>
> Might be better to use "= {}" and initialise the fields that matter by
> assignment.  As it stands this uses integer_one_node as the base, but I
> couldn't tell if that was deliberate.

I just copied this part from get_address_cost, which does it the same
way. I have now changed it to the style you suggested, using the
values from get_address_cost.
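
To be concrete, the costing setup now reads as below (a sketch in the
style of get_address_cost; the dummy base and index just make
addr_for_mem_ref produce a valid address to cost):

  struct mem_address parts = {};
  parts.index = integer_one_node;
  parts.base = integer_one_node;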
>
> > +      poly_uint64 temp;
> > +      poly_int64 fact;
> > +      bool speed = optimize_loop_for_speed_p (data->current_loop);
> > +      poly_int64 poly_step = tree_to_poly_int64 (iv->step);
>
> The step could be variable, so we should check whether this holds
> using poly_int_tree_p.
OK.
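
i.e. guarded along these lines (iv->step may be an SSA name when the
step is variable, so check before converting):

  poly_int64 poly_step;
  if (tree_fits_poly_int64_p (iv->step))
    poly_step = tree_to_poly_int64 (iv->step);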

>
> > +      machine_mode mem_mode = TYPE_MODE (use->mem_type);
> > +      addr_space_t as = TYPE_ADDR_SPACE (TREE_TYPE (use->iv->base));
> > +
> > +      fact = GET_MODE_SIZE (GET_MODE_INNER (TYPE_MODE (use->mem_type)));
>
> This is simpler as:
>
>   GET_MODE_UNIT_SIZE (TYPE_MODE (use->mem_type));
>
OK.

> It's always a compile-time constant, so "fact" can be int/unsigned int
> rather than poly_int64.
OK.

>
> > +      parts.index = integer_one_node;
> > +
> > +      if (fact.is_constant ()
> > +       && can_div_trunc_p (poly_step, fact, &temp))
>
> I think it only makes sense to add the candidate if poly_step is an exact
> multiple of fact, so I think we should use multiple_p here.
OK.
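
i.e. something like the following, which is what the updated hunk below
does (multiple_p computes the exact quotient as a side effect, so the
separate can_div_trunc_p/exact_div pair goes away, and "step" is only
built once we know we are adding the candidate):

  poly_int64 new_step;
  if (fact != 1 && multiple_p (poly_step, fact, &new_step))
    {
      tree step = wide_int_to_tree (sizetype, new_step);
      add_candidate (data, size_int (0), step, true, NULL);
    }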
>
> > +     {
> > +       /* Addressing mode "base + index".  */
> > +       rtx addr = addr_for_mem_ref (&parts, as, false);
> > +       unsigned cost = address_cost (addr, mem_mode, as, speed);
> > +       tree step = wide_int_to_tree (sizetype,
> > +                                     exact_div (poly_step, fact));
>
> The multiple_p mentioned above would provide this result too.
> We only need to calculate "step" if we decided to add the candidate,
> so I think it should be in the "if" below.
OK.
>
> > +       parts.step = wide_int_to_tree (sizetype, fact);
> > +       /* Addressing mode "base + index << scale".  */
> > +       addr = addr_for_mem_ref (&parts, as, false);
> > +       unsigned new_cost = address_cost (addr, mem_mode, as, speed);
> > +       if (new_cost < cost)
>
> I think it'd be worth splitting the guts of this check out into a helper,
> since it's something that could be reusable.  Maybe:
>
>   unsigned int preferred_mem_scalar_factor (machine_mode);
>
> with the only supported values for now being 1 and GET_MODE_INNER_SIZE.
> (Could be extended later if a target needs it.)
Done in the attached patch. It isn't tested yet, but does this version
look OK to commit if it passes regression testing?

Thanks,
Kugan

> Thanks,
> Richard
From c41df9740018ea381d03958937d2addddee58081 Mon Sep 17 00:00:00 2001
From: Kugan Vivekanandarajah <kugan.vivekanandarajah@linaro.org>
Date: Wed, 15 May 2019 09:16:43 +1000
Subject: [PATCH 1/2] IVOPTS: handle masked load/store lanes and add scaled index candidates

gcc/ChangeLog:

2019-05-15  Kugan Vivekanandarajah  <kugan.vivekanandarajah@linaro.org>

	PR target/88834
	* tree-ssa-loop-ivopts.c (get_mem_type_for_internal_fn): Handle
	IFN_MASK_LOAD_LANES and IFN_MASK_STORE_LANES.
	(get_alias_ptr_type_for_ptr_address): Likewise.
	(preferred_mem_scale_factor): New function.
	(add_iv_candidate_for_use): Add scaled index candidate if useful.

Change-Id: Ib11c53e6566318bc36e28abfe9772a5b802d2f59
---
 gcc/tree-ssa-loop-ivopts.c | 55 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 55 insertions(+)

diff --git a/gcc/tree-ssa-loop-ivopts.c b/gcc/tree-ssa-loop-ivopts.c
index 9864b59..5e29114 100644
--- a/gcc/tree-ssa-loop-ivopts.c
+++ b/gcc/tree-ssa-loop-ivopts.c
@@ -2451,11 +2451,13 @@ get_mem_type_for_internal_fn (gcall *call, tree *op_p)
   switch (gimple_call_internal_fn (call))
     {
     case IFN_MASK_LOAD:
+    case IFN_MASK_LOAD_LANES:
       if (op_p == gimple_call_arg_ptr (call, 0))
 	return TREE_TYPE (gimple_call_lhs (call));
       return NULL_TREE;
 
     case IFN_MASK_STORE:
+    case IFN_MASK_STORE_LANES:
       if (op_p == gimple_call_arg_ptr (call, 0))
 	return TREE_TYPE (gimple_call_arg (call, 3));
       return NULL_TREE;
@@ -3479,6 +3481,35 @@ add_iv_candidate_derived_from_uses (struct ivopts_data *data)
   data->iv_common_cands.truncate (0);
 }
 
+/* Return the preferred mem scale factor for accessing MEM_MODE
+   of BASE in LOOP.  */
+static unsigned int
+preferred_mem_scale_factor (struct loop *loop,
+			    tree base, machine_mode mem_mode)
+{
+  struct mem_address parts = {};
+  bool speed = optimize_loop_for_speed_p (loop);
+  addr_space_t as = TYPE_ADDR_SPACE (TREE_TYPE (base));
+  unsigned int fact = GET_MODE_UNIT_SIZE (mem_mode);
+
+  /* Addressing mode "base + index".  */
+  parts.index = integer_one_node;
+  parts.base = integer_one_node;
+  rtx addr = addr_for_mem_ref (&parts, as, false);
+  unsigned cost = address_cost (addr, mem_mode, as, speed);
+
+  /* Addressing mode "base + index << scale".  */
+  parts.step = wide_int_to_tree (sizetype, fact);
+  addr = addr_for_mem_ref (&parts, as, false);
+  unsigned new_cost = address_cost (addr, mem_mode, as, speed);
+
+  /* Compare the cost of an address with an unscaled index with the cost
+     of a scaled index and return the scale factor if scaling is cheaper.  */
+  if (new_cost < cost)
+    return GET_MODE_UNIT_SIZE (mem_mode);
+  return 1;
+}
+
 /* Adds candidates based on the value of USE's iv.  */
 
 static void
@@ -3500,6 +3531,28 @@ add_iv_candidate_for_use (struct ivopts_data *data, struct iv_use *use)
     basetype = sizetype;
   record_common_cand (data, build_int_cst (basetype, 0), iv->step, use);
 
+  /* Compare the cost of an address with an unscaled index with the cost of
+     an address with a scaled index and add a candidate if useful.  */
+  if (use != NULL
+      && poly_int_tree_p (iv->step)
+      && tree_fits_poly_int64_p (iv->step)
+      && address_p (use->type))
+    {
+      poly_int64 new_step;
+      poly_int64 poly_step = tree_to_poly_int64 (iv->step);
+      unsigned int fact
+	= preferred_mem_scale_factor (data->current_loop,
+				       use->iv->base,
+				       TYPE_MODE (use->mem_type));
+
+      if ((fact != 1)
+	  && multiple_p (poly_step, fact, &new_step))
+	{
+	  tree step = wide_int_to_tree (sizetype, new_step);
+	  add_candidate (data, size_int (0), step, true, NULL);
+	}
+    }
+
   /* Record common candidate with constant offset stripped in base.
      Like the use itself, we also add candidate directly for it.  */
   base = strip_offset (iv->base, &offset);
@@ -7112,6 +7165,8 @@ get_alias_ptr_type_for_ptr_address (iv_use *use)
     {
     case IFN_MASK_LOAD:
     case IFN_MASK_STORE:
+    case IFN_MASK_LOAD_LANES:
+    case IFN_MASK_STORE_LANES:
       /* The second argument contains the correct alias type.  */
       gcc_assert (use->op_p == gimple_call_arg_ptr (call, 0));
       return TREE_TYPE (gimple_call_arg (call, 1));
-- 
2.7.4
