Hi Richard,

Thanks so much for your great review!

Richard Biener <rguent...@suse.de> writes:

> On Wed, 23 Aug 2023, Jiufu Guo wrote:
>
>> 
>> Hi,
>> 
>> I just updated the patch.  We could review this one.
>> 
>> Compare with previous patch:
>> https://gcc.gnu.org/pipermail/gcc-patches/2023-August/627287.html
>> This version:
>> * Supports bitfield access from one register.
>> * Allow return scalar registers cleaned via contructor.
>> 
>> Bootstrapped and regtested on x86_64-redhat-linux, and
>> powerpc64{,le}-linux-gnu.
>> 
>> Is it ok for trunk?
>
> Some comments inline - not a full review (and sorry for the delay).
>
>> 
>>      PR target/65421
>>      PR target/69143
>> 
>> gcc/ChangeLog:
>> 
>>      * cfgexpand.cc (extract_bit_field): Extern declare.
>>      (struct access): New class.
>>      (struct expand_sra): New class.
>>      (expand_sra::build_access): New member function.
>>      (expand_sra::visit_base): Likewise.
>>      (expand_sra::analyze_default_stmt): Likewise.
>>      (expand_sra::analyze_assign): Likewise.
>>      (expand_sra::add_sra_candidate): Likewise.
>>      (expand_sra::collect_sra_candidates): Likewise.
>>      (expand_sra::valid_scalariable_accesses): Likewise.
>>      (expand_sra::prepare_expander_sra): Likewise.
>>      (expand_sra::expand_sra): Class constructor.
>>      (expand_sra::~expand_sra): Class destructor.
>>      (expand_sra::get_scalarized_rtx): New member function.
>>      (extract_one_reg): New function.
>>      (extract_bitfield): New function.
>>      (expand_sra::scalarize_access): New member function.
>>      (expand_sra::scalarize_accesses): New member function.
>>      (get_scalar_rtx_for_aggregate_expr): New function.
>>      (set_scalar_rtx_for_aggregate_access): New function.
>>      (set_scalar_rtx_for_returns): New function.
>>      (expand_return): Call get_scalar_rtx_for_aggregate_expr.
>>      (expand_debug_expr): Call get_scalar_rtx_for_aggregate_expr.
>>      (pass_expand::execute): Update to use the expand_sra.
>>      * expr.cc (get_scalar_rtx_for_aggregate_expr): Extern declare.
>>      (expand_assignment): Call get_scalar_rtx_for_aggregate_expr.
>>      (expand_expr_real): Call get_scalar_rtx_for_aggregate_expr.
>>      * function.cc (set_scalar_rtx_for_aggregate_access):  Extern declare.
>>      (set_scalar_rtx_for_returns): Extern declare.
>>      (assign_parm_setup_block): Call set_scalar_rtx_for_aggregate_access.
>>      (assign_parms): Call set_scalar_rtx_for_aggregate_access. 
>>      (expand_function_start): Call set_scalar_rtx_for_returns.
>>      * tree-sra.h (struct base_access): New class.
>>      (struct default_analyzer): New class.
>>      (scan_function): New function template.
>> 
>> gcc/testsuite/ChangeLog:
>> 
>>      * g++.target/powerpc/pr102024.C: Updated.
>>      * gcc.target/powerpc/pr108073.c: New test.
>>      * gcc.target/powerpc/pr65421-1.c: New test.
>>      * gcc.target/powerpc/pr65421-2.c: New test.
>> 
>> ---
>>  gcc/cfgexpand.cc                             | 474 ++++++++++++++++++-
>>  gcc/expr.cc                                  |  29 +-
>>  gcc/function.cc                              |  28 +-
>>  gcc/tree-sra.h                               |  77 +++
>>  gcc/testsuite/g++.target/powerpc/pr102024.C  |   2 +-
>>  gcc/testsuite/gcc.target/powerpc/pr108073.c  |  29 ++
>>  gcc/testsuite/gcc.target/powerpc/pr65421-1.c |   6 +
>>  gcc/testsuite/gcc.target/powerpc/pr65421-2.c |  32 ++
>>  8 files changed, 668 insertions(+), 9 deletions(-)
>>  create mode 100644 gcc/testsuite/gcc.target/powerpc/pr108073.c
>>  create mode 100644 gcc/testsuite/gcc.target/powerpc/pr65421-1.c
>>  create mode 100644 gcc/testsuite/gcc.target/powerpc/pr65421-2.c
>> 
>> diff --git a/gcc/cfgexpand.cc b/gcc/cfgexpand.cc
>> index 
>> edf292cfbe95ac2711faee7769e839cb4edb0dd3..385b6c781aa2805e7ca40293a0ae84f87e23e0b6
>>  100644
>> --- a/gcc/cfgexpand.cc
>> +++ b/gcc/cfgexpand.cc
>> @@ -74,6 +74,7 @@ along with GCC; see the file COPYING3.  If not see
>>  #include "output.h"
>>  #include "builtins.h"
>>  #include "opts.h"
>> +#include "tree-sra.h"
>>  
>>  /* Some systems use __main in a way incompatible with its use in gcc, in 
>> these
>>     cases use the macros NAME__MAIN to give a quoted symbol and SYMBOL__MAIN 
>> to
>> @@ -97,6 +98,468 @@ static bool defer_stack_allocation (tree, bool);
>>  
>>  static void record_alignment_for_reg_var (unsigned int);
>>  
>> +extern rtx extract_bit_field (rtx, poly_uint64, poly_uint64, int, rtx,
>> +                          machine_mode, machine_mode, bool, rtx *);
>
> belongs in some header

Thanks. Will update.
>
>> +
>> +/* For light SRA in expander about paramaters and returns.  */
>> +struct access : public base_access
>> +{
>> +  /* The rtx for the access: link to incoming/returning register(s).  */
>> +  rtx rtx_val;
>> +};
>> +
>> +typedef struct access *access_p;
>> +
>> +struct expand_sra : public default_analyzer
>> +{
>
> Both 'base_access' and 'default_analyzer' need a more specific
> name I think.  Just throwing in two names here,
> 'sra_access_base' and 'sra_default_analyzer'
Thanks!
>
>> +  expand_sra ();
>> +  ~expand_sra ();
>> +
>> +  /* Now use default APIs, no actions for
>> +     pre_analyze_stmt, analyze_return.  */
>> +
>> +  /* overwrite analyze_default_stmt.  */
>> +  void analyze_default_stmt (gimple *);
>> +
>> +  /* overwrite analyze phi,call,asm .  */
>> +  void analyze_phi (gphi *stmt) { analyze_default_stmt (stmt); };
>> +  void analyze_call (gcall *stmt) { analyze_default_stmt (stmt); };
>> +  void analyze_asm (gasm *stmt) { analyze_default_stmt (stmt); };
>
> that looks odd

I would define another function with a name like:
protect_mem_access_in_stmt (gimple*)
"analyze_default_stmt and analyze_phi" call this function.

analyze_call and analyze_asm would be updated to analyze sra
accesses on call/asm stmt later.

>
>> +  /* overwrite analyze_assign.  */
>> +  void analyze_assign (gassign *);
>> +
>> +  /* Compute the scalar rtx(s) for all access of BASE from a parrallel 
>> REGS. */
>> +  bool scalarize_accesses (tree base, rtx regs);
>> +  /* Return the scalarized rtx for EXPR.  */
>> +  rtx get_scalarized_rtx (tree expr);
>> +
>> +private:
>> +  void prepare_expander_sra (void);
>> +
>> +  /* Return true if VAR is a candidate for SRA.  */
>> +  bool add_sra_candidate (tree var);
>> +
>> +  /* Collect the parameter and returns with type which is suitable for
>> +     scalarization.  */
>> +  bool collect_sra_candidates (void);
>> +
>> +  /* Return true if EXPR has interesting access to the sra candidates,
>> +     and created access, return false otherwise.  */
>> +  access_p build_access (tree expr, bool write);
>> +
>> +  /* Check if the accesses of BASE are scalarizbale.
>> +     Now support the parms only with reading or returns only with writing.  
>> */
>> +  bool valid_scalariable_accesses (vec<access_p> *access_vec, bool is_parm);
>> +
>> +  /* Compute the scalar rtx for one access ACC from a parrallel REGS. */
>> +  bool scalarize_access (access_p acc, rtx regs);
>> +
>> +  /* Callback of walk_stmt_load_store_addr_ops, used to remove
>> +     unscalarizable accesses.  */
>> +  static bool visit_base (gimple *, tree op, tree, void *data);
>> +
>> +  /* Expr (tree) -> Scalarized value (rtx) map.  */
>> +  hash_map<tree, rtx> *expr_rtx_vec;
>> +
>> +  /* Base (tree) -> Vector (vec<access_p> *) map.  */
>> +  hash_map<tree, auto_vec<access_p> > *base_access_vec;
>> +};
>> +
>> +access_p
>> +expand_sra::build_access (tree expr, bool write)
>> +{
>> +  enum tree_code code = TREE_CODE (expr);
>> +  if (code != VAR_DECL && code != PARM_DECL && code != COMPONENT_REF
>> +      && code != ARRAY_REF && code != ARRAY_RANGE_REF)
>> +    return NULL;
>> +
>> +  HOST_WIDE_INT offset, size;
>> +  bool reverse;
>> +  tree base = get_ref_base_and_extent_hwi (expr, &offset, &size, &reverse);
>> +  if (!base || !DECL_P (base))
>> +    return NULL;
>> +  if (storage_order_barrier_p (expr) || TREE_THIS_VOLATILE (expr))
>> +    {
>> +      base_access_vec->remove (base);
>> +      return NULL;
>> +    }
>> +
>> +  vec<access_p> *access_vec = base_access_vec->get (base);
>> +  if (!access_vec)
>> +    return NULL;
>> +
>> +  /* TODO: support reverse. */
>> +  if (reverse || size <= 0 || offset + size > tree_to_shwi (DECL_SIZE 
>> (base)))
>> +    {
>> +      base_access_vec->remove (base);
>> +      return NULL;
>> +    }
>> +
>> +  struct access *access = XNEWVEC (struct access, 1);
>> +
>> +  memset (access, 0, sizeof (struct access));
>> +  access->offset = offset;
>> +  access->size = size;
>> +  access->expr = expr;
>> +  access->write = write;
>> +  access->rtx_val = NULL_RTX;
>> +
>> +  access_vec->safe_push (access);
>
> doesn't look like this shares anything with SRA?

For this code in tree-sra and ipa-sra, they are just
"**seems**" similar, but they are different, even
extracting the "common" code slightly, it may look
crushed.

I would still try to common the code.

>
>> +  Return access;
>> +}
>> +
>> +bool
>> +expand_sra::visit_base (gimple *, tree op, tree, void *data)
>> +{
>> +  op = get_base_address (op);
>> +  if (op && DECL_P (op))
>> +    {
>> +      expand_sra *p = (expand_sra *) data;
>> +      p->base_access_vec->remove (op);
>> +    }
>> +  return false;
>> +}
>> +
>> +void
>> +expand_sra::analyze_default_stmt (gimple *stmt)
>> +{
>> +  if (base_access_vec && !base_access_vec->is_empty ())
>> +    walk_stmt_load_store_addr_ops (stmt, this, visit_base, visit_base,
>> +                               visit_base);
>
> comments would be helpful.  This seems to remove all bases for
> references based on a decl?!  Are you really interested in addrs?

Yes.  In tree-sra/ipa-sra, stmt (assign/call/asm/return) are
checked, and may disqualify(remove) the base if risk.
This patch is relatively conservative, it only keeps sra
candidates on assignment.

For example: "%_7=&arg", and %_7 is used for some purpose.
For this case, "arg" is addr taken, and then it needs a mem
for it, and then it would be at risk to do sra without deep
analyzing.

>
>> +}
>> +
>> +void
>> +expand_sra::analyze_assign (gassign *stmt)
>> +{
>> +  if (!base_access_vec || base_access_vec->is_empty ())
>> +    return;
>> +
>> +  if (gimple_assign_single_p (stmt) && !gimple_clobber_p (stmt))
>> +    {
>> +      tree rhs = gimple_assign_rhs1 (stmt);
>> +      tree lhs = gimple_assign_lhs (stmt);
>> +      bool res_r = build_access (rhs, false);
>> +      bool res_l = build_access (lhs, true);
>> +
>> +      if (res_l || res_r)
>
> does this assume we can analyze all accesses that are in the
> lhs/rhs?  Because if we have an aggregate copy there might be one
> in both but one(?) might fail?

For the light-expand-sra, "lhs" of an assignment which we care
about is only "return", or say, assigning to a return-var.
"rhs" of an assignment that we care about is a parameter(or part
of parameter).

It is also ok for aggregate copy between parameter and return-var.

>
> Why do we remove bases - since this is all about heuristics it should
> be OK to "ignore" accesses we cannot / do not analyze?

Removing the whole base is simple.
To support partial sra, we need more work.
- Need more analysis to make sure the scalarized part is not
  affected by the access to other parts of the same base.
- We need to allow the parameter to be stored to stack for
  non-sra access, and also allow access to the scalarized
  at the same time. 
  Need to figure out one way to let DECL_RTL registers and
  DECL_RTL of stack co-exist.

>
> Why doesn't this use walk_stmt_load_store_addr_ops as well?

Two reasons:
1. The callback of this walk has "base" as a parameter, but
   we need the 'access expr' which has "base, offset, size"
   info.
2. The most common case for the parameter sra is reading/
   writing field(s) of parmater. Then what we care about is:
   if "rhs/rhs" is a field/member access of an aggregate
   parameter/return.
   Then checking "lhs/rhs" directly would be simpler.
   
>
>> +    return;
>> +    }
>> +
>> +  analyze_default_stmt (stmt);
>> +}
>> +
>> +/* Return true if VAR is a candidate for SRA.  */
>> +
>> +bool
>> +expand_sra::add_sra_candidate (tree var)
>> +{
>> +  tree type = TREE_TYPE (var);
>> +
>> +  if (!AGGREGATE_TYPE_P (type) || !tree_fits_shwi_p (TYPE_SIZE (type))
>> +      || tree_to_shwi (TYPE_SIZE (type)) == 0 || TREE_THIS_VOLATILE (var)
>> +      || is_va_list_type (type))
>> +    return false;
>> +  gcc_assert (COMPLETE_TYPE_P (type));
>> +
>> +  base_access_vec->get_or_insert (var);
>> +
>> +  return true;
>> +}
>> +
>> +bool
>> +expand_sra::collect_sra_candidates (void)
>> +{
>> +  bool ret = false;
>> +
>> +  /* Collect parameters.  */
>> +  for (tree parm = DECL_ARGUMENTS (current_function_decl); parm;
>> +       parm = DECL_CHAIN (parm))
>> +    ret |= add_sra_candidate (parm);
>> +
>> +  /* Collect VARs on returns.  */
>> +  if (DECL_RESULT (current_function_decl))
>> +    {
>> +      edge_iterator ei;
>> +      edge e;
>> +      FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
>> +    if (greturn *r = safe_dyn_cast<greturn *> (*gsi_last_bb (e->src)))
>> +      {
>> +        tree val = gimple_return_retval (r);
>> +        /* To sclaraized the return, the return value should be only
>> +           writen, except this return stmt.
>> +           Then using 'true(write)' to create the access. */
>> +        if (val && VAR_P (val))
>
> what about RESULT_DECLs?  I think you want to check
> needs_to_live_in_memory.

If RESULT_DECL is on the return-stmts and return via registers,
then the DECL_RTL of RESULT_DECL is already as "PARALLEL" code.
We already handled this case fine.

In this patch, we only need to handle a special case: e.g.
D.2872 = XXX; return D.2872.
In this case, a VAR(D.2872) is allocated, and it is not the
RESULT_DECL, and return-stmt returns the VAR(D.2872).

>
>> +          ret |= add_sra_candidate (val) && build_access (val, true);
>> +      }
>> +    }
>> +
>> +  return ret;
>> +}
>> +
>> +bool
>> +expand_sra::valid_scalariable_accesses (vec<access_p> *access_vec, bool 
>> is_parm)
>> +{
>> +  if (access_vec->is_empty ())
>> +    return false;
>> +
>> +  for (unsigned int j = 0; j < access_vec->length (); j++)
>> +    {
>> +      struct access *access = (*access_vec)[j];
>> +      if (is_parm && access->write)
>> +    return false;
>> +
>> +      if (!is_parm && !access->write)
>> +    return false;
>
> Can you explain why you are making this restriction?

With this restriction, it would be safe/simple, and
would be able to cover most cases.

tree-sra/ipa-sra handle most of the cases already.
For writing to parameter: like "a.d += xx;"
The SSA code may like: "a$d_4 = a.d; _2 = _1 + a$d_4;"
There is no "writing to parameter" anymore.

While there are still cases about "writing" to parameter:
e.g. "a$d_11 = a.d; a.d = _2;  _7 = bar (a)"
To support this, we may need more analysis to do.

>
>> +    }
>> +
>> +  return true;
>> +}
>> +
>> +void
>> +expand_sra::prepare_expander_sra ()
>> +{
>> +  if (optimize <= 0)
>> +    return;
>> +
>> +  base_access_vec = new hash_map<tree, auto_vec<access_p> >;
>> +  expr_rtx_vec = new hash_map<tree, rtx>;
>> +
>> +  collect_sra_candidates ();
>> +}
>> +
>> +expand_sra::expand_sra () : expr_rtx_vec (NULL), base_access_vec (NULL)
>> +{
>> +  prepare_expander_sra ();
>
> inline this function here.
Thanks.
>
>> +}
>> +
>> +expand_sra::~expand_sra ()
>> +{
>> +  if (optimize <= 0)
>> +    return;
>> +  delete expr_rtx_vec;
>> +  expr_rtx_vec = NULL;
>> +  delete base_access_vec;
>> +  base_access_vec = NULL;
>> +}
>> +
>> +rtx
>> +expand_sra::get_scalarized_rtx (tree expr)
>> +{
>> +  if (!expr_rtx_vec)
>> +    return NULL_RTX;
>> +  rtx *val = expr_rtx_vec->get (expr);
>> +  return val ? *val : NULL_RTX;
>> +}
>> +
>> +/* Get the register at INDEX from a parallel REGS.  */
>> +
>> +static rtx
>> +extract_one_reg (rtx regs, int index)
>> +{
>> +  rtx orig_reg = XEXP (XVECEXP (regs, 0, index), 0);
>> +  if (!HARD_REGISTER_P (orig_reg))
>> +    return orig_reg;
>> +
>> +  /* Reading from param hard reg need to be moved to a temp.  */
>> +  rtx reg = gen_reg_rtx (GET_MODE (orig_reg));
>> +  emit_move_insn (reg, orig_reg);
>> +  return reg;
>> +}
>> +
>> +/* Extract bitfield from REG with SIZE as MODE, start from POS. */
>> +
>> +static rtx
>> +extract_bitfield (rtx reg, int size, int pos, machine_mode tmode, bool 
>> unsignedp)
>
> quite a bad name for extract_bit_field which we already have ...
> I wonder why you need this at all.

Thanks! extract_bit_field would be fine.

>
>> +{
>> +  scalar_int_mode imode;
>> +  if (!int_mode_for_mode (tmode).exists (&imode))
>> +    return NULL_RTX;
>> +
>> +  machine_mode mode = GET_MODE (reg);
>> +  bool reverse = false;
>> +  rtx bfld = extract_bit_field (reg, size, pos, unsignedp, NULL_RTX, mode,
>> +                            imode, reverse, NULL);
>> +  mode = GET_MODE (bfld);
>> +  if (mode != imode)
>> +    bfld = gen_lowpart (imode, bfld);
>> +  rtx result = gen_reg_rtx (imode);
>> +  emit_move_insn (result, bfld);
>> +
>> +  if (tmode != imode)
>> +    result = gen_lowpart (tmode, result);
>> +
>> +  return result;
>> +}
>> +
>> +bool
>> +expand_sra::scalarize_access (access_p acc, rtx regs)
>> +{
>> +  machine_mode expr_mode = TYPE_MODE (TREE_TYPE (acc->expr));
>> +
>> +  /* mode of mult registers. */
>> +  if (expr_mode != BLKmode
>> +      && known_gt (acc->size, GET_MODE_BITSIZE (word_mode)))
>> +    return false;
>> +
>
> I'm confused now - you are rewriting RTL?

In this patch, yes, the computed rtx(s) are generated and
emitted. e.g. %r123=%r3; %r124=%r4#0

"scalarize_access" is called at the time when parameters
are set up, and incoming registers are confirmed.
Then, "scalarize_accesses" checks the aggregate parameters,
and computes scalarized rtx according to the (base, offset,
size) of the expression for each access.

As you mentioned below, another idea is: here only confirm
if accesses can be scalarized or not and set DECL_RTL for
the parameter.

>
>> +  /* Compute the position of the access in the whole parallel rtx.  */
>> +  int start_index = -1;
>> +  int end_index = -1;
>> +  HOST_WIDE_INT left_bits = 0;
>> +  HOST_WIDE_INT right_bits = 0;
>> +  int cur_index = XEXP (XVECEXP (regs, 0, 0), 0) ? 0 : 1;
>> +  for (; cur_index < XVECLEN (regs, 0); cur_index++)
>> +    {
>> +      rtx slot = XVECEXP (regs, 0, cur_index);
>> +      HOST_WIDE_INT off = UINTVAL (XEXP (slot, 1)) * BITS_PER_UNIT;
>> +      machine_mode mode = GET_MODE (XEXP (slot, 0));
>> +      HOST_WIDE_INT size = GET_MODE_BITSIZE (mode).to_constant ();
>> +      if (off <= acc->offset && off + size > acc->offset)
>> +    {
>> +      start_index = cur_index;
>> +      left_bits = acc->offset - off;
>> +    }
>> +      if (off + size >= acc->offset + acc->size)
>> +    {
>> +      end_index = cur_index;
>> +      right_bits = off + size - (acc->offset + acc->size);
>> +      break;
>> +    }
>> +    }
>> +  /* Invalid access possition: padding or outof bound.  */
>> +  if (start_index < 0 || end_index < 0)
>> +    return false;
>> +
>> +  /* Need multi-registers in a parallel for the access.  */
>> +  if (expr_mode == BLKmode || end_index > start_index)
>> +    {
>> +      if (left_bits || right_bits)
>> +    return false;
>> +
>> +      int num_words = end_index - start_index + 1;
>> +      rtx *tmps = XALLOCAVEC (rtx, num_words);
>> +
>> +      int pos = 0;
>> +      HOST_WIDE_INT start;
>> +      start = UINTVAL (XEXP (XVECEXP (regs, 0, start_index), 1));
>> +      /* Extract whole registers.  */
>> +      for (; pos < num_words; pos++)
>> +    {
>> +      int index = start_index + pos;
>> +      rtx reg = extract_one_reg (regs, index);
>> +      machine_mode mode = GET_MODE (reg);
>> +      HOST_WIDE_INT off;
>> +      off = UINTVAL (XEXP (XVECEXP (regs, 0, index), 1)) - start;
>> +      tmps[pos] = gen_rtx_EXPR_LIST (mode, reg, GEN_INT (off));
>> +    }
>> +
>> +      rtx reg = gen_rtx_PARALLEL (expr_mode, gen_rtvec_v (pos, tmps));
>> +      acc->rtx_val = reg;
>> +      return true;
>> +    }
>> +
>> +  /* Just need one reg for the access.  */
>> +  if (end_index == start_index && left_bits == 0 && right_bits == 0)
>> +    {
>> +      rtx reg = extract_one_reg (regs, start_index);
>> +      if (GET_MODE (reg) != expr_mode)
>> +    reg = gen_lowpart (expr_mode, reg);
>> +
>> +      acc->rtx_val = reg;
>> +      return true;
>> +    }
>> +
>> +  /* Need to extract part reg for the access.  */
>> +  if (!acc->write && end_index == start_index)
>> +    {
>> +      rtx reg = XEXP (XVECEXP (regs, 0, start_index), 0);
>> +      bool sgn = TYPE_UNSIGNED (TREE_TYPE (acc->expr));
>> +      acc->rtx_val
>> +    = extract_bitfield (reg, acc->size, left_bits, expr_mode, sgn);
>> +      if (acc->rtx_val)
>> +    return true;
>> +    }
>> +
>> +  return false;
>> +}
>> +
>> +bool
>> +expand_sra::scalarize_accesses (tree base, rtx regs)
>> +{
>> +  if (!base_access_vec)
>> +    return false;
>> +  vec<access_p> *access_vec = base_access_vec->get (base);
>> +  if (!access_vec)
>> +    return false;
>> +  bool is_parm = TREE_CODE (base) == PARM_DECL;
>> +  if (!valid_scalariable_accesses (access_vec, is_parm))
>> +    return false;
>> +
>> +  /* Go through each access, compute corresponding rtx(regs or subregs)
>> +     for the expression.  */
>> +  int n = access_vec->length ();
>> +  int cur_access_index = 0;
>> +  for (; cur_access_index < n; cur_access_index++)
>> +    if (!scalarize_access ((*access_vec)[cur_access_index], regs))
>> +      break;
>> +
>> +  /* Avoid un-scalarized accesses. */
>> +  if (cur_access_index != n)
>> +    {
>> +      base_access_vec->remove (base);
>> +      return false;
>> +    }
>> +
>> +  /* Bind/map expr(tree) to scalarized rtx.  */
>> +  for (int j = 0; j < n; j++)
>> +    {
>> +      access_p access = (*access_vec)[j];
>> +      expr_rtx_vec->put (access->expr, access->rtx_val);
>
> why do we compute RTL for each access?  We only want to change
> the representation of the decl - how RTL expansion "scalarizes"
> an aggregate.  It does this by assigning it a mode which we want
> to change.

Changing the representation of the decl (e.g. set DECL_RTL
as parallel with registers) could handle most cases.

There may be one issue with partial scalarization. (which
is not supported by this patch either.) e.g.

"struct A { int i,j; long d;}" and insns:
"_3=arg.i+arg.j; foo (&arg.d);"

arg.i and arg.j may be accessed through register directly,
but arg.d may need to be handled in memory.

To support this kind of case, we may need a way to tell
which member of 'arg' can (or can not) be accessed through
the registers.

This is the reason why I'm trying to bind an RTL to a TREE
expression(not only DECL TREE): If a TREE expr has the
registers RTL then uses it, otherwise, the expr would be
in the stack.


>
> If we want to really dis-aggregate the aggregate in the IL then
> I believe we should do this on the GIMPLE level - which means
> having a way to tie the "default definitions" to the incoming RTL.
> Which I think we have.

Yes, most aggregates have been scalarized(dis-aggregate) by
tree-sra/ipa-sra on the GIMPLE level.

In expander, there are some special cases that need to be
handled:
- aggregate parameters/returns passing by registers, but
  combined as INT mode: e.g. "struct A{float a; float b}" may
  has DImode.
- aggregate parameters/returns passing by registers, but with
  BLKmode and stored to stack even if it is safe to use
  dis-aggregate.
   
>
> foo (struct X x)
> {
> <bb2>:
>   x_a_1 = x.a;
>   x_b_2 = x.b;
>   x_c_3 = x.c;
> ...
>
> }
>
> Another possibility would be to have the parameters be SSA
> names when the parameter is not (fully) passed in memory
> and have the stack materialization explicit as aggregate
> copy.
>
> I realize the complication is the way we transfer the incoming
> hard registers to pseudos - and I think we simply need to
> extend what DECL_RTL a PARM_DECL can have for the case we are
> after - allow sth like (vec:TI [(reg:SI ..) (reg:SI ..) ... ])
> or so and have access expansion handle that in addition to
> the already supported (concat:..).  There's some support for
> (parallel ...), at least for returns, but it might be too
> sparse.

Yes, it is complicated to handle various expressions (e.g.
extract/store_bit_fields) on "vec:TI, concat:, parallel:".

>
> What I'm missing is (or I didn't find it) is code creating
> accesses for the actual incoming / outgoing hardregister parts.
This part is handled.
It is where "set_scalar_rtx_for_aggregate_access" is
called.
e.g. in "assign_parm_setup_block".

But, this patch directly uses the hard registers from
incoming/outgoing which is computed by existing code, e.g.
----------
        set_decl_incoming_rtl (parm, data.entry_parm, false);
  
 +      rtx incoming = DECL_INCOMING_RTL (parm);
 +      if (GET_CODE (incoming) == PARALLEL)
 +      set_scalar_rtx_for_aggregate_access (parm, incoming);


> Where's that taken into account?  I also don't see any handling
> of overlapping accesses - that is, when the choice for the

This patch does not check overlapping access.  One reason:
this patch supports "reading" from parameter only. So it is
safe to use the registers/pseudo for reading expression, even
there are overlapping accesses.

(Yes, for some cases, it is also safe to use reg for writing.
But, this patch need to be enhanced carefully.)

> pieces isn't obvious?  In fact I can't see _any_ code that
> tries to compute the set of scalar replacements?  Consider
>
> struct X { int a; int b; };
>
> long foo (struct X x)
> {
>   long y;
>   memcpy (&y, &x, sizeof (long));
>   return y + x.a + x.b;
> }
>
> you'll get a 8 byte read and two 4 byte reads.  What's going
> to be your choice for decomposing 'X'?  On x86 it will have
> DImode, passed in %rdi so decomposing it will not improve
> code generation.

Since there is a call that takes the address of the parameter,
So, the current patch uses the stack for the parameter but does
not use the incoming registers.

>
> struct X { int a; int b; int c; int d; };
>
> long foo (struct X x)
> {
>   long y;
>   memcpy (&y, &x, sizeof (long));
>   return y + x.c + x.d;
> }
>
> on x86 we have TImode for X, but it's passed in two DImode
> registers.  For the first half you have an 8 byte access,
> for the second half you have two 4 byte accesses.  I'd
> expect to decompose this to two DImode halves, based on
> the incoming register layout and roughly matching
> accesses.

Assume there is no addr-taken, memcpy is sth like "y=x"
(via union).  Then, there are three accesses for "x":
(offset=0, size=8), (offset=8,size=4), (12,4).

Then three rtx is generated:
scalar1=%rx:DI, scalar2=%ry:DI#0, scalar3=(%ry>>32)#0

>
> struct X { int a; int b; int c; int d; };
>
> long foo (struct X x)
> {
>   long y;
>   memcpy (&y, &x.b, sizeof (long));
>   return y + x.a + x.d;
> }
>
> two DImode halves for the pseudos doens't make much sense
> in this case - we have 'y' overlapping both.  RTL opts
> recover nicely in all the above cases when using TImode
> so it's going to be important to not be overly aggressive.
>
> That IMHO also means when we need to do more fancy things
> like rewriting accesses (thus when RTL expansion cannot
> cope with our idea of how the base object expands) we should
> think of dealing with this on the GIMPLE level.
Thanks, I try to understand this. :)
>
> For power you seem to have the situation of BLKmode aggregates
> that are passed in registers?  I don't think we have this
> situation on x86 for example.

The mode of an aggregate may be DI/TI, and also BLK:
If the size of the aggregate is not greater than the
biggest INT, then it would use INT mode, otherwise,
BLK would be used.  If I remember correctly, x86 may
also be similar.

>
> That said - I'm not sure I reverse engineered what you do
> correctly.  A more thorough description with the problem
> you are solving plus how you attack it from a high level
> perspective would have been useful.

Thanks for pointing out this!
And thanks so much for your time on this review!

BR,
Jeff (Jiufu Guo)

>
> Thanks,
> Richard.
>
>> +    }
>> +
>> +  return true;
>> +}
>> +
>> +static expand_sra *current_sra = NULL;
>> +
>> +/* Check If there is an sra access for the expr.
>> +   Return the correspond scalar sym for the access. */
>> +
>> +rtx
>> +get_scalar_rtx_for_aggregate_expr (tree expr)
>> +{
>> +  return current_sra ? current_sra->get_scalarized_rtx (expr) : NULL_RTX;
>> +}
>> +
>> +/* Compute/Set RTX registers for those accesses on BASE.  */
>> +
>> +void
>> +set_scalar_rtx_for_aggregate_access (tree base, rtx regs)
>> +{
>> +  if (!current_sra)
>> +    return;
>> +  current_sra->scalarize_accesses (base, regs);
>> +}
>> +
>> +void
>> +set_scalar_rtx_for_returns ()
>> +{
>> +  if (!current_sra)
>> +    return;
>> +
>> +  tree res = DECL_RESULT (current_function_decl);
>> +  gcc_assert (res);
>> +  edge_iterator ei;
>> +  edge e;
>> +  FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
>> +    if (greturn *r = safe_dyn_cast<greturn *> (*gsi_last_bb (e->src)))
>> +      {
>> +    tree val = gimple_return_retval (r);
>> +    if (val && VAR_P (val))
>> +      current_sra->scalarize_accesses (val, DECL_RTL (res));
>> +      }
>> +}
>> +
>>  /* Return an expression tree corresponding to the RHS of GIMPLE
>>     statement STMT.  */
>>  
>> @@ -3778,7 +4241,8 @@ expand_return (tree retval)
>>  
>>    /* If we are returning the RESULT_DECL, then the value has already
>>       been stored into it, so we don't have to do anything special.  */
>> -  if (TREE_CODE (retval_rhs) == RESULT_DECL)
>> +  if (TREE_CODE (retval_rhs) == RESULT_DECL
>> +      || get_scalar_rtx_for_aggregate_expr (retval_rhs))
>>      expand_value_return (result_rtl);
>>  
>>    /* If the result is an aggregate that is being returned in one (or more)
>> @@ -4422,6 +4886,9 @@ expand_debug_expr (tree exp)
>>    int unsignedp = TYPE_UNSIGNED (TREE_TYPE (exp));
>>    addr_space_t as;
>>    scalar_int_mode op0_mode, op1_mode, addr_mode;
>> +  rtx x = get_scalar_rtx_for_aggregate_expr (exp);
>> +  if (x)
>> +    return NULL_RTX;/* optimized out.  */
>>  
>>    switch (TREE_CODE_CLASS (TREE_CODE (exp)))
>>      {
>> @@ -6624,6 +7091,9 @@ pass_expand::execute (function *fun)
>>    auto_bitmap forced_stack_vars;
>>    discover_nonconstant_array_refs (forced_stack_vars);
>>  
>> +  current_sra = new expand_sra;
>> +  scan_function (cfun, *current_sra);
>> +
>>    /* Make sure all values used by the optimization passes have sane
>>       defaults.  */
>>    reg_renumber = 0;
>> @@ -7052,6 +7522,8 @@ pass_expand::execute (function *fun)
>>        loop_optimizer_finalize ();
>>      }
>>  
>> +  delete current_sra;
>> +  current_sra = NULL;
>>    timevar_pop (TV_POST_EXPAND);
>>  
>>    return 0;
>> diff --git a/gcc/expr.cc b/gcc/expr.cc
>> index 
>> 174f8acb269ab5450fc799516471d5a2bd9b9efa..57b037040d6d8e8c98b2befcb556221c0c5604c4
>>  100644
>> --- a/gcc/expr.cc
>> +++ b/gcc/expr.cc
>> @@ -100,6 +100,7 @@ static void do_tablejump (rtx, machine_mode, rtx, rtx, 
>> rtx,
>>  static rtx const_vector_from_tree (tree);
>>  static tree tree_expr_size (const_tree);
>>  static void convert_mode_scalar (rtx, rtx, int);
>> +rtx get_scalar_rtx_for_aggregate_expr (tree);
>>  
>>  
>>  /* This is run to set up which modes can be used
>> @@ -5618,11 +5619,12 @@ expand_assignment (tree to, tree from, bool 
>> nontemporal)
>>       Assignment of an array element at a constant index, and assignment of
>>       an array element in an unaligned packed structure field, has the same
>>       problem.  Same for (partially) storing into a non-memory object.  */
>> -  if (handled_component_p (to)
>> -      || (TREE_CODE (to) == MEM_REF
>> -      && (REF_REVERSE_STORAGE_ORDER (to)
>> -          || mem_ref_refers_to_non_mem_p (to)))
>> -      || TREE_CODE (TREE_TYPE (to)) == ARRAY_TYPE)
>> +  if (!get_scalar_rtx_for_aggregate_expr (to)
>> +      && (handled_component_p (to)
>> +      || (TREE_CODE (to) == MEM_REF
>> +          && (REF_REVERSE_STORAGE_ORDER (to)
>> +              || mem_ref_refers_to_non_mem_p (to)))
>> +      || TREE_CODE (TREE_TYPE (to)) == ARRAY_TYPE))
>>      {
>>        machine_mode mode1;
>>        poly_int64 bitsize, bitpos;
>> @@ -8912,6 +8914,20 @@ expand_constructor (tree exp, rtx target, enum 
>> expand_modifier modifier,
>>        && ! mostly_zeros_p (exp))
>>      return NULL_RTX;
>>  
>> +  if (target && GET_CODE (target) == PARALLEL && all_zeros_p (exp))
>> +    {
>> +      int length = XVECLEN (target, 0);
>> +      int start = XEXP (XVECEXP (target, 0, 0), 0) ? 0 : 1;
>> +      for (int i = start; i < length; i++)
>> +    {
>> +      rtx dst = XEXP (XVECEXP (target, 0, i), 0);
>> +      rtx zero = CONST0_RTX (GET_MODE (dst));
>> +      gcc_assert (zero);
>> +      emit_move_insn (dst, zero);
>> +    }
>> +      return target;
>> +    }
>> +
>>    /* Handle calls that pass values in multiple non-contiguous
>>       locations.  The Irix 6 ABI has examples of this.  */
>>    if (target == 0 || ! safe_from_p (target, exp, 1)
>> @@ -9006,6 +9022,9 @@ expand_expr_real (tree exp, rtx target, machine_mode 
>> tmode,
>>        ret = CONST0_RTX (tmode);
>>        return ret ? ret : const0_rtx;
>>      }
>> +  rtx x = get_scalar_rtx_for_aggregate_expr (exp);
>> +  if (x)
>> +    return x;
>>  
>>    ret = expand_expr_real_1 (exp, target, tmode, modifier, alt_rtl,
>>                          inner_reference_p);
>> diff --git a/gcc/function.cc b/gcc/function.cc
>> index 
>> dd2c1136e0725f55673f28e0eeaf4c91ad18e93f..7fe927bd36beac11466ca9fca12800892b57f0be
>>  100644
>> --- a/gcc/function.cc
>> +++ b/gcc/function.cc
>> @@ -2740,6 +2740,9 @@ assign_parm_find_stack_rtl (tree parm, struct 
>> assign_parm_data_one *data)
>>    data->stack_parm = stack_parm;
>>  }
>>  
>> +extern void set_scalar_rtx_for_aggregate_access (tree, rtx);
>> +extern void set_scalar_rtx_for_returns ();
>> +
>>  /* A subroutine of assign_parms.  Adjust DATA->ENTRY_RTL such that it's
>>     always valid and contiguous.  */
>>  
>> @@ -3115,8 +3118,24 @@ assign_parm_setup_block (struct assign_parm_data_all 
>> *all,
>>        emit_move_insn (mem, entry_parm);
>>      }
>>        else
>> -    move_block_from_reg (REGNO (entry_parm), mem,
>> -                         size_stored / UNITS_PER_WORD);
>> +    {
>> +      int regno = REGNO (entry_parm);
>> +      int nregs = size_stored / UNITS_PER_WORD;
>> +      move_block_from_reg (regno, mem, nregs);
>> +
>> +      rtx *tmps = XALLOCAVEC (rtx, nregs);
>> +      machine_mode mode = word_mode;
>> +      HOST_WIDE_INT word_size = GET_MODE_SIZE (mode).to_constant ();
>> +      for (int i = 0; i < nregs; i++)
>> +        {
>> +          rtx reg = gen_rtx_REG (mode, regno + i);
>> +          rtx off = GEN_INT (word_size * i);
>> +          tmps[i] = gen_rtx_EXPR_LIST (VOIDmode, reg, off);
>> +        }
>> +
>> +      rtx regs = gen_rtx_PARALLEL (BLKmode, gen_rtvec_v (nregs, tmps));
>> +      set_scalar_rtx_for_aggregate_access (parm, regs);
>> +    }
>>      }
>>    else if (data->stack_parm == 0 && !TYPE_EMPTY_P (data->arg.type))
>>      {
>> @@ -3716,6 +3735,10 @@ assign_parms (tree fndecl)
>>        else
>>      set_decl_incoming_rtl (parm, data.entry_parm, false);
>>  
>> +      rtx incoming = DECL_INCOMING_RTL (parm);
>> +      if (GET_CODE (incoming) == PARALLEL)
>> +    set_scalar_rtx_for_aggregate_access (parm, incoming);
>> +
>>        assign_parm_adjust_stack_rtl (&data);
>>  
>>        if (assign_parm_setup_block_p (&data))
>> @@ -5136,6 +5159,7 @@ expand_function_start (tree subr)
>>          {
>>            gcc_assert (GET_CODE (hard_reg) == PARALLEL);
>>            set_parm_rtl (res, gen_group_rtx (hard_reg));
>> +          set_scalar_rtx_for_returns ();
>>          }
>>      }
>>  
>> diff --git a/gcc/tree-sra.h b/gcc/tree-sra.h
>> index 
>> f20266c46226f7840299a768cb575f6f92b54207..bd0396e672b30f7ef66c305d8d131e91639039d7
>>  100644
>> --- a/gcc/tree-sra.h
>> +++ b/gcc/tree-sra.h
>> @@ -19,6 +19,83 @@ You should have received a copy of the GNU General Public 
>> License
>>  along with GCC; see the file COPYING3.  If not see
>>  <http://www.gnu.org/licenses/>.  */
>>  
>> +struct base_access
>> +{
>> +  /* Values returned by get_ref_base_and_extent, indicates the
>> +     OFFSET, SIZE and BASE of the access.  */
>> +  HOST_WIDE_INT offset;
>> +  HOST_WIDE_INT size;
>> +  tree base;
>> +
>> +  /* The context expression of this access.  */
>> +  tree expr;
>> +
>> +  /* Indicates this is a write access.  */
>> +  bool write : 1;
>> +
>> +  /* Indicates if this access is made in reverse storage order.  */
>> +  bool reverse : 1;
>> +};
>> +
>> +/* Default template for sra_scan_function.  */
>> +
>> +struct default_analyzer
>> +{
>> +  /* Template analyze functions.  */
>> +  void analyze_phi (gphi *){};
>> +  void pre_analyze_stmt (gimple *){};
>> +  void analyze_return (greturn *){};
>> +  void analyze_assign (gassign *){};
>> +  void analyze_call (gcall *){};
>> +  void analyze_asm (gasm *){};
>> +  void analyze_default_stmt (gimple *){};
>> +};
>> +
>> +/* Scan function and look for interesting expressions.  */
>> +
>> +template <typename analyzer>
>> +void
>> +scan_function (struct function *fun, analyzer &a)
>> +{
>> +  basic_block bb;
>> +  FOR_EACH_BB_FN (bb, fun)
>> +    {
>> +      for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
>> +       gsi_next (&gsi))
>> +    a.analyze_phi (gsi.phi ());
>> +
>> +      for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
>> +       gsi_next (&gsi))
>> +    {
>> +      gimple *stmt = gsi_stmt (gsi);
>> +      a.pre_analyze_stmt (stmt);
>> +
>> +      switch (gimple_code (stmt))
>> +        {
>> +        case GIMPLE_RETURN:
>> +          a.analyze_return (as_a<greturn *> (stmt));
>> +          break;
>> +
>> +        case GIMPLE_ASSIGN:
>> +          a.analyze_assign (as_a<gassign *> (stmt));
>> +          break;
>> +
>> +        case GIMPLE_CALL:
>> +          a.analyze_call (as_a<gcall *> (stmt));
>> +          break;
>> +
>> +        case GIMPLE_ASM:
>> +          a.analyze_asm (as_a<gasm *> (stmt));
>> +          break;
>> +
>> +        default:
>> +          a.analyze_default_stmt (stmt);
>> +          break;
>> +        }
>> +    }
>> +    }
>> +}
>> +
>>  bool type_internals_preclude_sra_p (tree type, const char **msg);
>>  
>>  /* Return true iff TYPE is stdarg va_list type (which early SRA and IPA-SRA
>> diff --git a/gcc/testsuite/g++.target/powerpc/pr102024.C 
>> b/gcc/testsuite/g++.target/powerpc/pr102024.C
>> index 
>> 769585052b507ad971868795f861106230c976e3..c8995cae707bb6e2e849275b823d2ba14d24a966
>>  100644
>> --- a/gcc/testsuite/g++.target/powerpc/pr102024.C
>> +++ b/gcc/testsuite/g++.target/powerpc/pr102024.C
>> @@ -5,7 +5,7 @@
>>  // Test that a zero-width bit field in an otherwise homogeneous aggregate
>>  // generates a psabi warning and passes arguments in GPRs.
>>  
>> -// { dg-final { scan-assembler-times {\mstd\M} 4 } }
>> +// { dg-final { scan-assembler-times {\mmtvsrd\M} 4 } }
>>  
>>  struct a_thing
>>  {
>> diff --git a/gcc/testsuite/gcc.target/powerpc/pr108073.c 
>> b/gcc/testsuite/gcc.target/powerpc/pr108073.c
>> new file mode 100644
>> index 
>> 0000000000000000000000000000000000000000..7dd1a4a326a181e0f35c9418af20a9bebabdfe4b
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/powerpc/pr108073.c
>> @@ -0,0 +1,29 @@
>> +/* { dg-do run } */
>> +/* { dg-options "-O2 -save-temps" } */
>> +
>> +typedef struct DF {double a[4]; short s1; short s2; short s3; short s4; } 
>> DF;
>> +typedef struct SF {float a[4]; int i1; int i2; } SF;
>> +
>> +/* { dg-final { scan-assembler-times {\mmtvsrd\M} 3 {target { 
>> has_arch_ppc64 && has_arch_pwr8 } } } } */
>> +/* { dg-final { scan-assembler-not {\mlwz\M} {target { has_arch_ppc64 && 
>> has_arch_pwr8 } } } } */
>> +/* { dg-final { scan-assembler-not {\mlhz\M} {target { has_arch_ppc64 && 
>> has_arch_pwr8 } } } } */
>> +short  __attribute__ ((noipa)) foo_hi (DF a, int flag){if (flag == 2)return 
>> a.s2+a.s3;return 0;}
>> +int  __attribute__ ((noipa)) foo_si (SF a, int flag){if (flag == 2)return 
>> a.i2+a.i1;return 0;}
>> +double __attribute__ ((noipa)) foo_df (DF arg, int flag){if (flag == 
>> 2)return arg.a[3];else return 0.0;}
>> +float  __attribute__ ((noipa)) foo_sf (SF arg, int flag){if (flag == 
>> 2)return arg.a[2]; return 0;}
>> +float  __attribute__ ((noipa)) foo_sf1 (SF arg, int flag){if (flag == 
>> 2)return arg.a[1];return 0;}
>> +
>> +DF gdf = {{1.0,2.0,3.0,4.0}, 1, 2, 3, 4};
>> +SF gsf = {{1.0f,2.0f,3.0f,4.0f}, 1, 2};
>> +
>> +int main()
>> +{
>> +  if (!(foo_hi (gdf, 2) == 5 && foo_si (gsf, 2) == 3 && foo_df (gdf, 2) == 
>> 4.0
>> +    && foo_sf (gsf, 2) == 3.0 && foo_sf1 (gsf, 2) == 2.0))
>> +    __builtin_abort ();
>> +  if (!(foo_hi (gdf, 1) == 0 && foo_si (gsf, 1) == 0 && foo_df (gdf, 1) == 0
>> +    && foo_sf (gsf, 1) == 0 && foo_sf1 (gsf, 1) == 0))
>> +    __builtin_abort ();
>> +  return 0;
>> +}
>> +
>> diff --git a/gcc/testsuite/gcc.target/powerpc/pr65421-1.c 
>> b/gcc/testsuite/gcc.target/powerpc/pr65421-1.c
>> new file mode 100644
>> index 
>> 0000000000000000000000000000000000000000..4e1f87f7939cbf1423772023ee392fc5200b6708
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/powerpc/pr65421-1.c
>> @@ -0,0 +1,6 @@
>> +/* PR target/65421 */
>> +/* { dg-options "-O2" } */
>> +
>> +typedef struct LARGE {double a[4]; int arr[32];} LARGE;
>> +LARGE foo (LARGE a){return a;}
>> +/* { dg-final { scan-assembler-times {\mmemcpy\M} 1 } } */
>> diff --git a/gcc/testsuite/gcc.target/powerpc/pr65421-2.c 
>> b/gcc/testsuite/gcc.target/powerpc/pr65421-2.c
>> new file mode 100644
>> index 
>> 0000000000000000000000000000000000000000..8a8e1a0e9962317ba2c0942af8891b3c51f4d3a4
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/powerpc/pr65421-2.c
>> @@ -0,0 +1,32 @@
>> +/* PR target/65421 */
>> +/* { dg-options "-O2" } */
>> +/* { dg-require-effective-target powerpc_elfv2 } */
>> +/* { dg-require-effective-target has_arch_ppc64 } */
>> +
>> +typedef struct FLOATS
>> +{
>> +  double a[3];
>> +} FLOATS;
>> +
>> +/* 3 lfd after returns also optimized */
>> +/* FLOATS ret_arg_pt (FLOATS *a){return *a;} */
>> +
>> +/* 3 stfd */
>> +void st_arg (FLOATS a, FLOATS *p) {*p = a;}
>> +/* { dg-final { scan-assembler-times {\mstfd\M} 3 } } */
>> +
>> +/* blr */
>> +FLOATS ret_arg (FLOATS a) {return a;}
>> +
>> +typedef struct MIX
>> +{
>> +  double a[2];
>> +  long l;
>> +} MIX;
>> +
>> +/* std 3 param regs to return slot */
>> +MIX ret_arg1 (MIX a) {return a;}
>> +/* { dg-final { scan-assembler-times {\mstd\M} 3 } } */
>> +
>> +/* count insns */
>> +/* { dg-final { scan-assembler-times {(?n)^\s+[a-z]} 9 } } */
>> 

Reply via email to