Hi,

Jiufu Guo via Gcc-patches <gcc-patches@gcc.gnu.org> writes:

> Hi,
>
> Richard Biener <rguent...@suse.de> writes:
>
>> On Mon, 24 Jul 2023, Jiufu Guo wrote:
>>
>>> 
>>> Hi Martin,
>>> 
>>> Not sure about your current option about re-using the ipa-sra code
>>> in the light-expander-sra. And if anything I could input please
>>> let me know.
>>>
...
>>
>> What I was hoping for is shared stmt-level analysis and a shared
>> data structure for the "access"(es) a stmt performs.  Because that
>> can come up handy in multiple places.  The existing SRA data
>> structures could easily embed that subset for example if sharing
>> the whole data structure of [IPA] SRA seems too unwieldly.
>
> Understand.
> The stmt-level analysis and "access" data structure are similar
> between ipa-sra/tree-sra and the expander-sra.
>
> I just update the patch, this version does not change the behaviors of
> the previous version.  It is just cleaning/merging some functions only.
> The patch is attached.
>
> This version (and tree-sra/ipa-sra) is still using the similar
> "stmt analyze" and "access struct"".  This could be extracted as
> shared code.
> I'm thinking to update the code to use the same "base_access" and
> "walk function".

I'm drafting code for the shared stmt analysis and access structure.
The code may look like the sketch below.

BR,
Jeff (Jiufu Guo)

-----------------------
/* Description of a single access: where it is (BASE, OFFSET, SIZE), the
   expression it came from, and whether it writes and/or uses reverse
   storage order.  Other access structures can derive from it.  */
struct base_access
{
  /* OFFSET, SIZE and BASE of the access, i.e. the values returned by
     get_ref_base_and_extent.  */
  HOST_WIDE_INT offset;
  HOST_WIDE_INT size;
  tree base;

  /* The context expression of this access.  */
  tree expr;

  /* True if this is a write access.  */
  bool write : 1;

  /* True if this access is made in reverse storage order.  */
  bool reverse : 1;
};

/* Baseline analyzer for sra_scan_function: every hook is an empty no-op.
   An analyzer can derive from this struct and redefine only the hooks it
   is interested in.  */

struct default_analyzer
{
  /* Called for each statement before the hook for its kind.  */
  void pre_analyze_stmt (gimple *) {}

  /* Hooks for the individually dispatched kinds.  */
  void analyze_phi (gphi *) {}
  void analyze_return (greturn *) {}
  void analyze_assign (gassign *) {}
  void analyze_call (gcall *) {}
  void analyze_asm (gasm *) {}

  /* Hook for statements of any other kind.  */
  void analyze_default_stmt (gimple *) {}
};

/* Walk all basic blocks of FUN and feed them to analyzer A.

   For each block, every PHI node is passed to A.analyze_phi first.  Then,
   for every statement of the block, A.pre_analyze_stmt is called, followed
   by exactly one of analyze_return, analyze_assign, analyze_call,
   analyze_asm or analyze_default_stmt, selected by the statement code.

   The hooks are resolved at compile time through the ANALYZER template
   parameter, so A only has to provide members with these names.  */

template <typename analyzer>
void
sra_scan_function (struct function *fun, analyzer &a)
{
  basic_block bb;
  FOR_EACH_BB_FN (bb, fun)
    {
      /* PHI nodes of the block.  */
      gphi_iterator phi_it = gsi_start_phis (bb);
      while (!gsi_end_p (phi_it))
        {
          a.analyze_phi (phi_it.phi ());
          gsi_next (&phi_it);
        }

      /* Statements of the block.  */
      gimple_stmt_iterator stmt_it = gsi_start_bb (bb);
      while (!gsi_end_p (stmt_it))
        {
          gimple *cur = gsi_stmt (stmt_it);
          a.pre_analyze_stmt (cur);

          /* Read the code only after the pre-hook has run, as the
             original dispatch did.  */
          const enum gimple_code code = gimple_code (cur);
          if (code == GIMPLE_RETURN)
            a.analyze_return (as_a<greturn *> (cur));
          else if (code == GIMPLE_ASSIGN)
            a.analyze_assign (as_a<gassign *> (cur));
          else if (code == GIMPLE_CALL)
            a.analyze_call (as_a<gcall *> (cur));
          else if (code == GIMPLE_ASM)
            a.analyze_asm (as_a<gasm *> (cur));
          else
            a.analyze_default_stmt (cur);

          gsi_next (&stmt_it);
        }
    }
}


/* A base_access extended with the rtx associated with the access.  */
struct access : public base_access
{
  /* The rtx for the access: links to the incoming/returning register(s).  */
  rtx rtx_val;
};

struct expand_access_analyzer : public default_analyzer
{
  /* pre_analyze_stmt and analyze_return are not redefined here, so the
     empty versions inherited from default_analyzer are used.  */

  /* Hooks with their own implementation; they are only declared here.
     These hide (they do not virtually override) the default_analyzer
     members of the same name.  */
  void analyze_assign (gassign *);
  void analyze_default_stmt (gimple *);

  /* PHIs, calls and asms get no special treatment: forward each of them
     to analyze_default_stmt.  */
  void analyze_phi (gphi *phi) { analyze_default_stmt (phi); }
  void analyze_call (gcall *call) { analyze_default_stmt (call); }
  void analyze_asm (gasm *asm_stmt) { analyze_default_stmt (asm_stmt); }
};


>
>>
>> With a stmt-leve API using FOR_EACH_IMM_USE_STMT would still be
>> possible (though RTL expansion pre-walks all stmts anyway).
>
> Yeap, I also notice that "FOR_EACH_IMM_USE_STMT" is not enough.
> For struct parameters, walking stmt is needed.
>
>
> BR,
> Jeff (Jiufu Guo)
>
> -----------------------------
> diff --git a/gcc/cfgexpand.cc b/gcc/cfgexpand.cc
> index edf292cfbe9..8c36ad5df79 100644
> --- a/gcc/cfgexpand.cc
> +++ b/gcc/cfgexpand.cc
> @@ -97,6 +97,502 @@ static bool defer_stack_allocation (tree, bool);
>  
>  static void record_alignment_for_reg_var (unsigned int);
>  
> +extern rtx
> +expand_shift (enum tree_code, machine_mode, rtx, poly_int64, rtx, int);
> +
> +/* For light SRA in expander about paramaters and returns.  */
> +namespace
> +{
> +
> +struct access
> +{
> +  /* Each accessing on the aggragate is about OFFSET/SIZE.  */
> +  HOST_WIDE_INT offset;
> +  HOST_WIDE_INT size;
> +
> +  bool writing;
> +
> +  /* The context expression of this access.  */
> +  tree expr;
> +
> +  /* The rtx for the access: link to incoming/returning register(s).  */
> +  rtx rtx_val;
> +};
> +
> +typedef struct access *access_p;
> +
> +/* Expr (tree) -> Scalarized value (rtx) map.  */
> +static hash_map<tree, rtx> *expr_rtx_vec;
> +
> +/* Base (tree) -> Vector (vec<access_p> *) map.  */
> +static hash_map<tree, auto_vec<access_p> > *base_access_vec;
> +
> +/* Return true if EXPR has interesting access to the sra candidates,
> +   and created access, return false otherwise.  */
> +
> +static struct access *
> +build_access (tree expr, bool write)
> +{
> +  enum tree_code code = TREE_CODE (expr);
> +  if (code != VAR_DECL && code != PARM_DECL && code != COMPONENT_REF
> +      && code != ARRAY_REF && code != ARRAY_RANGE_REF)
> +    return NULL;
> +
> +  HOST_WIDE_INT offset, size;
> +  bool reverse;
> +  tree base = get_ref_base_and_extent_hwi (expr, &offset, &size, &reverse);
> +  if (!base || !DECL_P (base))
> +    return NULL;
> +
> +  vec<access_p> *access_vec = base_access_vec->get (base);
> +  if (!access_vec)
> +    return NULL;
> +
> +  /* TODO: support reverse. */
> +  if (reverse || size <= 0 || offset + size > tree_to_shwi (DECL_SIZE 
> (base)))
> +    {
> +      base_access_vec->remove (base);
> +      return NULL;
> +    }
> +
> +  struct access *access = XNEWVEC (struct access, 1);
> +
> +  memset (access, 0, sizeof (struct access));
> +  access->offset = offset;
> +  access->size = size;
> +  access->expr = expr;
> +  access->writing = write;
> +  access->rtx_val = NULL_RTX;
> +
> +  access_vec->safe_push (access);
> +
> +  return access;
> +}
> +
> +/* Callback of walk_stmt_load_store_addr_ops visit_base used to remove
> +   operands with address taken.  */
> +
> +static bool
> +visit_base (gimple *, tree op, tree, void *)
> +{
> +  op = get_base_address (op);
> +  if (op && DECL_P (op))
> +    base_access_vec->remove (op);
> +
> +  return false;
> +}
> +
> +/* Scan function and look for interesting expressions and create access
> +   structures for them.  */
> +
> +static void
> +collect_acccesses (void)
> +{
> +  basic_block bb;
> +
> +  FOR_EACH_BB_FN (bb, cfun)
> +    {
> +      for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
> +        gsi_next (&gsi))
> +     walk_stmt_load_store_addr_ops (gsi.phi (), NULL, NULL, NULL,
> +                                    visit_base);
> +
> +      for (gimple_stmt_iterator gsi = gsi_start_nondebug_after_labels_bb 
> (bb);
> +        !gsi_end_p (gsi); gsi_next_nondebug (&gsi))
> +     {
> +       gimple *stmt = gsi_stmt (gsi);
> +       switch (gimple_code (stmt))
> +         {
> +         case GIMPLE_RETURN:
> +           continue;
> +         case GIMPLE_ASSIGN:
> +           if (gimple_assign_single_p (stmt) && !gimple_clobber_p (stmt))
> +             {
> +               tree rhs = gimple_assign_rhs1 (stmt);
> +               tree lhs = gimple_assign_lhs (stmt);
> +               bool res_r = build_access (rhs, false);
> +               bool res_l = build_access (lhs, true);
> +               if (res_l && TREE_CODE (rhs) != CONSTRUCTOR)
> +                 base_access_vec->remove (get_base_address (lhs));
> +
> +               if (res_l || res_r)
> +                 continue;
> +             }
> +           break;
> +         default:
> +           break;
> +         }
> +
> +       walk_stmt_load_store_addr_ops (stmt, NULL, visit_base, visit_base,
> +                                      visit_base);
> +     }
> +    }
> +}
> +
> +/* Return true if VAR is a candidate for SRA.  */
> +
> +static bool
> +add_sra_candidate (tree var)
> +{
> +  tree type = TREE_TYPE (var);
> +
> +  if (!AGGREGATE_TYPE_P (type) || !tree_fits_shwi_p (TYPE_SIZE (type))
> +      || tree_to_shwi (TYPE_SIZE (type)) == 0 || TREE_THIS_VOLATILE (var)
> +      || TYPE_MAIN_VARIANT (type) == TYPE_MAIN_VARIANT (va_list_type_node))
> +    return false;
> +  gcc_assert (COMPLETE_TYPE_P (type));
> +
> +  base_access_vec->get_or_insert (var);
> +
> +  return true;
> +}
> +
> +/* Collect the parameter and returns with type which is suitable for
> + * scalarization.  */
> +
> +static bool
> +collect_sra_candidates (void)
> +{
> +  bool ret = false;
> +
> +  /* Collect parameters.  */
> +  for (tree parm = DECL_ARGUMENTS (current_function_decl); parm;
> +       parm = DECL_CHAIN (parm))
> +    ret |= add_sra_candidate (parm);
> +
> +  /* Collect VARs on returns.  */
> +  if (DECL_RESULT (current_function_decl))
> +    {
> +      edge_iterator ei;
> +      edge e;
> +      FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
> +     if (greturn *r = safe_dyn_cast<greturn *> (*gsi_last_bb (e->src)))
> +       {
> +         tree val = gimple_return_retval (r);
> +         /* To sclaraized the return, the return value should be only
> +            writen, except this return stmt.
> +            Then using 'true(write)' to create the access. */
> +         if (val && VAR_P (val))
> +           ret |= add_sra_candidate (val) && build_access (val, true);
> +       }
> +    }
> +
> +  return ret;
> +}
> +
> +/* Check if the accesses of BASE are scalarizbale.
> +   Now, only scalarize the parms only with reading
> +   or returns only with writing.  */
> +
> +static bool
> +with_scalariable_accesses (vec<access_p> *access_vec, bool is_parm)
> +{
> +  if (access_vec->is_empty ())
> +    return false;
> +
> +  for (unsigned int j = 0; j < access_vec->length (); j++)
> +    {
> +      struct access *access = (*access_vec)[j];
> +      /* Writing to a field of parameter. */
> +      if (is_parm && access->writing)
> +     return false;
> +
> +      /* Writing to a field of parameter. */
> +      if (!is_parm && !access->writing)
> +     return false;
> +    }
> +
> +  return true;
> +}
> +
> +static void
> +prepare_expander_sra ()
> +{
> +  if (optimize <= 0)
> +    return;
> +
> +  base_access_vec = new hash_map<tree, auto_vec<access_p> >;
> +  expr_rtx_vec = new hash_map<tree, rtx>;
> +
> +  if (collect_sra_candidates ())
> +    collect_acccesses ();
> +}
> +
> +static void
> +free_expander_sra ()
> +{
> +  if (optimize <= 0)
> +    return;
> +  delete expr_rtx_vec;
> +  expr_rtx_vec = 0;
> +  delete base_access_vec;
> +  base_access_vec = 0;
> +}
> +} /* namespace */
> +
> +namespace
> +{
> +/* Get the register at INDEX from a parallel REGS.  */
> +
> +static rtx
> +extract_parallel_reg (rtx regs, int index)
> +{
> +  rtx orig_reg = XEXP (XVECEXP (regs, 0, index), 0);
> +  if (!HARD_REGISTER_P (orig_reg))
> +    return orig_reg;
> +
> +  /* Reading from param hard reg need to be moved to a temp.  */
> +  rtx reg = gen_reg_rtx (GET_MODE (orig_reg));
> +  emit_move_insn (reg, orig_reg);
> +  return reg;
> +}
> +
> +/* Get IMODE part from REG at OFF_BITS.  */
> +
> +static rtx
> +extract_sub_reg (rtx orig_reg, int off_bits, machine_mode mode)
> +{
> +  scalar_int_mode imode;
> +  if (!int_mode_for_mode (mode).exists (&imode))
> +    return NULL_RTX;
> +
> +  machine_mode orig_mode = GET_MODE (orig_reg);
> +  gcc_assert (GET_MODE_CLASS (orig_mode) == MODE_INT);
> +
> +  poly_uint64 lowpart_off = subreg_lowpart_offset (imode, orig_mode);
> +  int lowpart_off_bits = lowpart_off.to_constant () * BITS_PER_UNIT;
> +  int shift_bits;
> +  if (lowpart_off_bits >= off_bits)
> +    shift_bits = lowpart_off_bits - off_bits;
> +  else
> +    shift_bits = off_bits - lowpart_off_bits;
> +
> +  rtx reg = orig_reg;
> +  if (shift_bits > 0)
> +    reg = expand_shift (RSHIFT_EXPR, orig_mode, reg, shift_bits, NULL, 1);
> +
> +  rtx subreg = gen_lowpart (imode, reg);
> +  rtx result = gen_reg_rtx (imode);
> +  emit_move_insn (result, subreg);
> +
> +  if (mode != imode)
> +    result = gen_lowpart (mode, result);
> +
> +  return result;
> +}
> +
> +/* Extract subfields from the REG at START bits to TARGET at OFF,
> +   BITS parameter is the total number extract bits.  */
> +
> +static int
> +extract_fields_from_reg (rtx reg, int bits, int start, rtx *target,
> +                      HOST_WIDE_INT off)
> +{
> +  machine_mode mode_aux[] = {SImode, HImode, QImode};
> +  int margins = sizeof (mode_aux) / sizeof (mode_aux[0]);
> +  HOST_WIDE_INT off_bits = start;
> +  rtx *p = target;
> +  for (int n = 0; n < margins; n++)
> +    {
> +      machine_mode mode = mode_aux[n];
> +      HOST_WIDE_INT bitsize = GET_MODE_BITSIZE (mode).to_constant ();
> +      if (bits < bitsize)
> +     continue;
> +
> +      rtx subreg = extract_sub_reg (reg, off_bits, mode);
> +      *p++ = gen_rtx_EXPR_LIST (mode, subreg, GEN_INT (off));
> +      off += bitsize / BITS_PER_UNIT;
> +      off_bits += bitsize;
> +      bits -= bitsize;
> +    }
> +
> +  return p - target;
> +}
> +} /* namespace */
> +
> +/* Check If there is an sra access for the expr.
> +   Return the correspond scalar sym for the access. */
> +
> +rtx
> +get_scalar_rtx_for_aggregate_expr (tree expr)
> +{
> +  if (!expr_rtx_vec)
> +    return NULL_RTX;
> +  rtx *val = expr_rtx_vec->get (expr);
> +  return val ? *val : NULL_RTX;
> +}
> +
> +/* Compute/Set RTX registers for those accesses on BASE.  */
> +
> +void
> +set_scalar_rtx_for_aggregate_access (tree base, rtx regs)
> +{
> +  if (!base_access_vec)
> +    return;
> +  vec<access_p> *access_vec = base_access_vec->get (base);
> +  if (!access_vec)
> +    return;
> +  bool is_parm = TREE_CODE (base) == PARM_DECL;
> +  if (!with_scalariable_accesses (access_vec, is_parm))
> +    return;
> +
> +  /* Go through each access, compute corresponding rtx(regs or subregs)
> +     for the expression.  */
> +  int n = access_vec->length ();
> +  int cur_access_index = 0;
> +  for (; cur_access_index < n; cur_access_index++)
> +    {
> +      access_p acc = (*access_vec)[cur_access_index];
> +      machine_mode expr_mode = TYPE_MODE (TREE_TYPE (acc->expr));
> +
> +      /* mode of mult registers. */
> +      if (expr_mode != BLKmode
> +       && known_gt (acc->size, GET_MODE_BITSIZE (word_mode)))
> +     break;
> +
> +      /* Compute the position of the access in the whole parallel rtx.  */
> +      int start_index = -1;
> +      int end_index = -1;
> +      HOST_WIDE_INT left_bits = 0;
> +      HOST_WIDE_INT right_bits = 0;
> +      int cur_index = XEXP (XVECEXP (regs, 0, 0), 0) ? 0 : 1;
> +      for (; cur_index < XVECLEN (regs, 0); cur_index++)
> +     {
> +       rtx slot = XVECEXP (regs, 0, cur_index);
> +       HOST_WIDE_INT off = UINTVAL (XEXP (slot, 1)) * BITS_PER_UNIT;
> +       machine_mode mode = GET_MODE (XEXP (slot, 0));
> +       HOST_WIDE_INT size = GET_MODE_BITSIZE (mode).to_constant ();
> +       if (off <= acc->offset && off + size > acc->offset)
> +         {
> +           start_index = cur_index;
> +           left_bits = acc->offset - off;
> +         }
> +       if (off + size >= acc->offset + acc->size)
> +         {
> +           end_index = cur_index;
> +           right_bits = off + size - (acc->offset + acc->size);
> +           break;
> +         }
> +     }
> +      /* Invalid access possition: padding or outof bound.  */
> +      if (start_index < 0 || end_index < 0)
> +     break;
> +
> +      /* Need a parallel for possible multi-registers. */
> +      if (expr_mode == BLKmode || end_index > start_index)
> +     {
> +       /* More possible space for SI, HI, QI.  */
> +       machine_mode mode_aux[] = {SImode, HImode, QImode};
> +       int margins = sizeof (mode_aux) / sizeof (mode_aux[0]);
> +       int extra = (right_bits ? margins : 0) + (left_bits ? margins : 0);
> +       int num_words = end_index - start_index + 1;
> +       num_words -= (right_bits ? 1 : 0);
> +       num_words -= (left_bits ? 1 : 0);
> +       rtx *tmps = XALLOCAVEC (rtx, num_words + extra);
> +
> +       int pos = 0;
> +       /* There are extra fields from the left part of the start reg.  */
> +       if (left_bits)
> +         {
> +           gcc_assert (!acc->writing);
> +           gcc_assert ((left_bits % BITS_PER_UNIT) == 0);
> +
> +           rtx reg = XEXP (XVECEXP (regs, 0, start_index), 0);
> +           machine_mode mode = GET_MODE (reg);
> +           int reg_size = GET_MODE_BITSIZE (mode).to_constant ();
> +           int bits = reg_size - left_bits;
> +           pos = extract_fields_from_reg (reg, bits, left_bits, tmps, 0);
> +         }
> +
> +       HOST_WIDE_INT start;
> +       start = UINTVAL (XEXP (XVECEXP (regs, 0, start_index), 1));
> +       start -= left_bits / BITS_PER_UNIT;
> +       /* Extract whole registers.  */
> +       for (; pos < num_words; pos++)
> +         {
> +           int index = start_index + pos;
> +           rtx reg = extract_parallel_reg (regs, index);
> +           machine_mode mode = GET_MODE (reg);
> +           HOST_WIDE_INT off;
> +           off = UINTVAL (XEXP (XVECEXP (regs, 0, index), 1)) - start;
> +           tmps[pos] = gen_rtx_EXPR_LIST (mode, reg, GEN_INT (off));
> +         }
> +
> +       /* No more fields.   */
> +       if (right_bits == 0)
> +         {
> +           rtx reg = gen_rtx_PARALLEL (expr_mode, gen_rtvec_v (pos, tmps));
> +           acc->rtx_val = reg;
> +           continue;
> +         }
> +
> +       /* There are extra fields from the part of register.   */
> +       gcc_assert (!acc->writing);
> +       gcc_assert ((right_bits % BITS_PER_UNIT) == 0);
> +
> +       HOST_WIDE_INT off;
> +       off = UINTVAL (XEXP (XVECEXP (regs, 0, end_index), 1)) - start;
> +       rtx reg = XEXP (XVECEXP (regs, 0, end_index), 0);
> +       machine_mode mode = GET_MODE (reg);
> +       int reg_size = GET_MODE_BITSIZE (mode).to_constant ();
> +       int bits = reg_size - right_bits;
> +       pos += extract_fields_from_reg (reg, bits, 0, tmps + pos, off);
> +
> +       /* Currently, PARALLELs with register elements for param/returns
> +          are using BLKmode.  */
> +       acc->rtx_val = gen_rtx_PARALLEL (expr_mode, gen_rtvec_v (pos, tmps));
> +       continue;
> +     }
> +
> +      /* Just need one reg for the correspond access.  */
> +      if (end_index == start_index && left_bits == 0 && right_bits == 0)
> +     {
> +       rtx reg = extract_parallel_reg (regs, start_index);
> +       if (GET_MODE (reg) != expr_mode)
> +         reg = gen_lowpart (expr_mode, reg);
> +
> +       acc->rtx_val = reg;
> +       continue;
> +     }
> +
> +      /* Need to shift to extract a part reg for the access.  */
> +      if (!acc->writing && end_index == start_index)
> +     {
> +       rtx orig_reg = XEXP (XVECEXP (regs, 0, start_index), 0);
> +       acc->rtx_val = extract_sub_reg (orig_reg, left_bits, expr_mode);
> +       if (acc->rtx_val)
> +         continue;
> +     }
> +
> +      break;
> +    }
> +
> +  /* If all access expr(s) are not scalarized,
> +     bind/map all expr(tree) to sclarized rtx.  */
> +  if (cur_access_index == n)
> +    for (int j = 0; j < n; j++)
> +      {
> +     access_p access = (*access_vec)[j];
> +     expr_rtx_vec->put (access->expr, access->rtx_val);
> +      }
> +}
> +
> +void
> +set_scalar_rtx_for_returns ()
> +{
> +  tree res = DECL_RESULT (current_function_decl);
> +  gcc_assert (res);
> +  edge_iterator ei;
> +  edge e;
> +  FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
> +    if (greturn *r = safe_dyn_cast<greturn *> (*gsi_last_bb (e->src)))
> +      {
> +     tree val = gimple_return_retval (r);
> +     if (val && VAR_P (val))
> +       set_scalar_rtx_for_aggregate_access (val, DECL_RTL (res));
> +      }
> +}
> +
>  /* Return an expression tree corresponding to the RHS of GIMPLE
>     statement STMT.  */
>  
> @@ -3778,7 +4274,8 @@ expand_return (tree retval)
>  
>    /* If we are returning the RESULT_DECL, then the value has already
>       been stored into it, so we don't have to do anything special.  */
> -  if (TREE_CODE (retval_rhs) == RESULT_DECL)
> +  if (TREE_CODE (retval_rhs) == RESULT_DECL
> +      || get_scalar_rtx_for_aggregate_expr (retval_rhs))
>      expand_value_return (result_rtl);
>  
>    /* If the result is an aggregate that is being returned in one (or more)
> @@ -4422,6 +4919,9 @@ expand_debug_expr (tree exp)
>    int unsignedp = TYPE_UNSIGNED (TREE_TYPE (exp));
>    addr_space_t as;
>    scalar_int_mode op0_mode, op1_mode, addr_mode;
> +  rtx x = get_scalar_rtx_for_aggregate_expr (exp);
> +  if (x)
> +    return NULL_RTX;/* optimized out.  */
>  
>    switch (TREE_CODE_CLASS (TREE_CODE (exp)))
>      {
> @@ -6620,6 +7120,8 @@ pass_expand::execute (function *fun)
>           avoid_deep_ter_for_debug (gsi_stmt (gsi), 0);
>      }
>  
> +  prepare_expander_sra ();
> +
>    /* Mark arrays indexed with non-constant indices with TREE_ADDRESSABLE.  */
>    auto_bitmap forced_stack_vars;
>    discover_nonconstant_array_refs (forced_stack_vars);
> @@ -7052,6 +7554,7 @@ pass_expand::execute (function *fun)
>        loop_optimizer_finalize ();
>      }
>  
> +  free_expander_sra ();
>    timevar_pop (TV_POST_EXPAND);
>  
>    return 0;
> diff --git a/gcc/expr.cc b/gcc/expr.cc
> index fff09dc9951..d487fe3b53b 100644
> --- a/gcc/expr.cc
> +++ b/gcc/expr.cc
> @@ -100,6 +100,7 @@ static void do_tablejump (rtx, machine_mode, rtx, rtx, 
> rtx,
>  static rtx const_vector_from_tree (tree);
>  static tree tree_expr_size (const_tree);
>  static void convert_mode_scalar (rtx, rtx, int);
> +rtx get_scalar_rtx_for_aggregate_expr (tree);
>  
>  
>  /* This is run to set up which modes can be used
> @@ -5623,11 +5624,12 @@ expand_assignment (tree to, tree from, bool 
> nontemporal)
>       Assignment of an array element at a constant index, and assignment of
>       an array element in an unaligned packed structure field, has the same
>       problem.  Same for (partially) storing into a non-memory object.  */
> -  if (handled_component_p (to)
> -      || (TREE_CODE (to) == MEM_REF
> -       && (REF_REVERSE_STORAGE_ORDER (to)
> -           || mem_ref_refers_to_non_mem_p (to)))
> -      || TREE_CODE (TREE_TYPE (to)) == ARRAY_TYPE)
> +  if (!get_scalar_rtx_for_aggregate_expr (to)
> +      && (handled_component_p (to)
> +       || (TREE_CODE (to) == MEM_REF
> +           && (REF_REVERSE_STORAGE_ORDER (to)
> +               || mem_ref_refers_to_non_mem_p (to)))
> +       || TREE_CODE (TREE_TYPE (to)) == ARRAY_TYPE))
>      {
>        machine_mode mode1;
>        poly_int64 bitsize, bitpos;
> @@ -9011,6 +9013,9 @@ expand_expr_real (tree exp, rtx target, machine_mode 
> tmode,
>        ret = CONST0_RTX (tmode);
>        return ret ? ret : const0_rtx;
>      }
> +  rtx x = get_scalar_rtx_for_aggregate_expr (exp);
> +  if (x)
> +    return x;
>  
>    ret = expand_expr_real_1 (exp, target, tmode, modifier, alt_rtl,
>                           inner_reference_p);
> diff --git a/gcc/function.cc b/gcc/function.cc
> index dd2c1136e07..7fe927bd36b 100644
> --- a/gcc/function.cc
> +++ b/gcc/function.cc
> @@ -2740,6 +2740,9 @@ assign_parm_find_stack_rtl (tree parm, struct 
> assign_parm_data_one *data)
>    data->stack_parm = stack_parm;
>  }
>  
> +extern void set_scalar_rtx_for_aggregate_access (tree, rtx);
> +extern void set_scalar_rtx_for_returns ();
> +
>  /* A subroutine of assign_parms.  Adjust DATA->ENTRY_RTL such that it's
>     always valid and contiguous.  */
>  
> @@ -3115,8 +3118,24 @@ assign_parm_setup_block (struct assign_parm_data_all 
> *all,
>         emit_move_insn (mem, entry_parm);
>       }
>        else
> -     move_block_from_reg (REGNO (entry_parm), mem,
> -                          size_stored / UNITS_PER_WORD);
> +     {
> +       int regno = REGNO (entry_parm);
> +       int nregs = size_stored / UNITS_PER_WORD;
> +       move_block_from_reg (regno, mem, nregs);
> +
> +       rtx *tmps = XALLOCAVEC (rtx, nregs);
> +       machine_mode mode = word_mode;
> +       HOST_WIDE_INT word_size = GET_MODE_SIZE (mode).to_constant ();
> +       for (int i = 0; i < nregs; i++)
> +         {
> +           rtx reg = gen_rtx_REG (mode, regno + i);
> +           rtx off = GEN_INT (word_size * i);
> +           tmps[i] = gen_rtx_EXPR_LIST (VOIDmode, reg, off);
> +         }
> +
> +       rtx regs = gen_rtx_PARALLEL (BLKmode, gen_rtvec_v (nregs, tmps));
> +       set_scalar_rtx_for_aggregate_access (parm, regs);
> +     }
>      }
>    else if (data->stack_parm == 0 && !TYPE_EMPTY_P (data->arg.type))
>      {
> @@ -3716,6 +3735,10 @@ assign_parms (tree fndecl)
>        else
>       set_decl_incoming_rtl (parm, data.entry_parm, false);
>  
> +      rtx incoming = DECL_INCOMING_RTL (parm);
> +      if (GET_CODE (incoming) == PARALLEL)
> +     set_scalar_rtx_for_aggregate_access (parm, incoming);
> +
>        assign_parm_adjust_stack_rtl (&data);
>  
>        if (assign_parm_setup_block_p (&data))
> @@ -5136,6 +5159,7 @@ expand_function_start (tree subr)
>           {
>             gcc_assert (GET_CODE (hard_reg) == PARALLEL);
>             set_parm_rtl (res, gen_group_rtx (hard_reg));
> +           set_scalar_rtx_for_returns ();
>           }
>       }
>  
> diff --git a/gcc/testsuite/g++.target/powerpc/pr102024.C 
> b/gcc/testsuite/g++.target/powerpc/pr102024.C
> index 769585052b5..c8995cae707 100644
> --- a/gcc/testsuite/g++.target/powerpc/pr102024.C
> +++ b/gcc/testsuite/g++.target/powerpc/pr102024.C
> @@ -5,7 +5,7 @@
>  // Test that a zero-width bit field in an otherwise homogeneous aggregate
>  // generates a psabi warning and passes arguments in GPRs.
>  
> -// { dg-final { scan-assembler-times {\mstd\M} 4 } }
> +// { dg-final { scan-assembler-times {\mmtvsrd\M} 4 } }
>  
>  struct a_thing
>  {
> diff --git a/gcc/testsuite/gcc.target/powerpc/pr108073.c 
> b/gcc/testsuite/gcc.target/powerpc/pr108073.c
> new file mode 100644
> index 00000000000..7dd1a4a326a
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/powerpc/pr108073.c
> @@ -0,0 +1,29 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -save-temps" } */
> +
> +typedef struct DF {double a[4]; short s1; short s2; short s3; short s4; } DF;
> +typedef struct SF {float a[4]; int i1; int i2; } SF;
> +
> +/* { dg-final { scan-assembler-times {\mmtvsrd\M} 3 {target { has_arch_ppc64 
> && has_arch_pwr8 } } } } */
> +/* { dg-final { scan-assembler-not {\mlwz\M} {target { has_arch_ppc64 && 
> has_arch_pwr8 } } } } */
> +/* { dg-final { scan-assembler-not {\mlhz\M} {target { has_arch_ppc64 && 
> has_arch_pwr8 } } } } */
> +short  __attribute__ ((noipa)) foo_hi (DF a, int flag){if (flag == 2)return 
> a.s2+a.s3;return 0;}
> +int  __attribute__ ((noipa)) foo_si (SF a, int flag){if (flag == 2)return 
> a.i2+a.i1;return 0;}
> +double __attribute__ ((noipa)) foo_df (DF arg, int flag){if (flag == 
> 2)return arg.a[3];else return 0.0;}
> +float  __attribute__ ((noipa)) foo_sf (SF arg, int flag){if (flag == 
> 2)return arg.a[2]; return 0;}
> +float  __attribute__ ((noipa)) foo_sf1 (SF arg, int flag){if (flag == 
> 2)return arg.a[1];return 0;}
> +
> +DF gdf = {{1.0,2.0,3.0,4.0}, 1, 2, 3, 4};
> +SF gsf = {{1.0f,2.0f,3.0f,4.0f}, 1, 2};
> +
> +int main()
> +{
> +  if (!(foo_hi (gdf, 2) == 5 && foo_si (gsf, 2) == 3 && foo_df (gdf, 2) == 
> 4.0
> +     && foo_sf (gsf, 2) == 3.0 && foo_sf1 (gsf, 2) == 2.0))
> +    __builtin_abort ();
> +  if (!(foo_hi (gdf, 1) == 0 && foo_si (gsf, 1) == 0 && foo_df (gdf, 1) == 0
> +     && foo_sf (gsf, 1) == 0 && foo_sf1 (gsf, 1) == 0))
> +    __builtin_abort ();
> +  return 0;
> +}
> +
> diff --git a/gcc/testsuite/gcc.target/powerpc/pr65421-1.c 
> b/gcc/testsuite/gcc.target/powerpc/pr65421-1.c
> new file mode 100644
> index 00000000000..4e1f87f7939
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/powerpc/pr65421-1.c
> @@ -0,0 +1,6 @@
> +/* PR target/65421 */
> +/* { dg-options "-O2" } */
> +
> +typedef struct LARGE {double a[4]; int arr[32];} LARGE;
> +LARGE foo (LARGE a){return a;}
> +/* { dg-final { scan-assembler-times {\mmemcpy\M} 1 } } */
> diff --git a/gcc/testsuite/gcc.target/powerpc/pr65421-2.c 
> b/gcc/testsuite/gcc.target/powerpc/pr65421-2.c
> new file mode 100644
> index 00000000000..8a8e1a0e996
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/powerpc/pr65421-2.c
> @@ -0,0 +1,32 @@
> +/* PR target/65421 */
> +/* { dg-options "-O2" } */
> +/* { dg-require-effective-target powerpc_elfv2 } */
> +/* { dg-require-effective-target has_arch_ppc64 } */
> +
> +typedef struct FLOATS
> +{
> +  double a[3];
> +} FLOATS;
> +
> +/* 3 lfd after returns also optimized */
> +/* FLOATS ret_arg_pt (FLOATS *a){return *a;} */
> +
> +/* 3 stfd */
> +void st_arg (FLOATS a, FLOATS *p) {*p = a;}
> +/* { dg-final { scan-assembler-times {\mstfd\M} 3 } } */
> +
> +/* blr */
> +FLOATS ret_arg (FLOATS a) {return a;}
> +
> +typedef struct MIX
> +{
> +  double a[2];
> +  long l;
> +} MIX;
> +
> +/* std 3 param regs to return slot */
> +MIX ret_arg1 (MIX a) {return a;}
> +/* { dg-final { scan-assembler-times {\mstd\M} 3 } } */
> +
> +/* count insns */
> +/* { dg-final { scan-assembler-times {(?n)^\s+[a-z]} 9 } } */
>
>>
>> Richard.
>>
>>> 
>>> BR,
>>> Jeff (Jiufu Guo)
>>> 
>>> 
>>> Jiufu Guo via Gcc-patches <gcc-patches@gcc.gnu.org> writes:
>>> 
>>> > Hi Martin,
>>> >
>>> > Jiufu Guo via Gcc-patches <gcc-patches@gcc.gnu.org> writes:
>>> >
>>> >> Hi,
>>> >>
>>> >> Martin Jambor <mjam...@suse.cz> writes:
>>> >>
>>> >>> Hi,
>>> >>>
>>> >>> On Tue, May 30 2023, Richard Biener wrote:
>>> >>>> On Mon, 29 May 2023, Jiufu Guo wrote:
>>> >>>>
>>> >>>>> Hi,
>>> >>>>> 
>>> >>>>> Previously, I was investigating some struct parameters and returns 
>>> >>>>> related
>>> >>>>> PRs 69143/65421/108073.
>>> >>>>> 
>>> >>>>> Investigating the issues case by case, and drafting patches for each 
>>> >>>>> of
>>> >>>>> them one by one. This would help us to enhance code incrementally.
>>> >>>>> While, this way, patches would interact with each other and implement
>>> >>>>> different codes for similar issues (because of the different paths in
>>> >>>>> gimple/rtl).  We may have a common fix for those issues.
>>> >>>>> 
>>> >>>>> We know a few other related PRs(such as meta-bug PR101926) exist. For 
>>> >>>>> those
>>> >>>>> PRs in different targets with different symptoms (and also different 
>>> >>>>> root
>>> >>>>> cause), I would expect a method could help some of them, but it may
>>> >>>>> be hard to handle all of them in one fix.
>>> >>>>> 
>>> >>>>> With investigation and check discussion for the issues, I remember a
>>> >>>>> suggestion from Richard: it would be nice to perform some SRA-like 
>>> >>>>> analysis
>>> >>>>> for the accesses on the structs (parameter/returns).
>>> >>>>> https://gcc.gnu.org/pipermail/gcc-patches/2022-November/605117.html
>>> >>>>> This may be a 'fairly common method' for those issues. With this idea,
>>> >>>>> I drafted a patch as below in this mail.
>>> >>>>> 
>>> >>>>> I also thought about directly using tree-sra.cc, e.g. enhance it and 
>>> >>>>> rerun it
>>> >>>>> at the end of GIMPLE passes. While since some issues are introduced 
>>> >>>>> inside
>>> >>>>> the expander, so below patch also co-works with other parts of the 
>>> >>>>> expander.
>>> >>>>> And since we already have tree-sra in gimple pass, we only need to 
>>> >>>>> take more
>>> >>>>> care on parameter and return in this patch: other decls could be 
>>> >>>>> handled
>>> >>>>> well in tree-sra.
>>> >>>>> 
>>> >>>>> The steps of this patch are:
>>> >>>>> 1. Collect struct type parameters and returns, and then scan the 
>>> >>>>> function to
>>> >>>>> get the accesses on them. And figure out the accesses which would be 
>>> >>>>> profitable
>>> >>>>> to be scalarized (using registers of the parameter/return ). Now, 
>>> >>>>> reading on
>>> >>>>> parameter and writing on returns are checked in the current patch.
>>> >>>>> 2. When/after the scalar registers are determined/expanded for the 
>>> >>>>> return or
>>> >>>>> parameters, compute the corresponding scalar register(s) for each 
>>> >>>>> accesses of
>>> >>>>> the return/parameter, and prepare the scalar RTLs for those accesses.
>>> >>>>> 3. When using/expanding the accesses expression, leverage the 
>>> >>>>> computed/prepared
>>> >>>>> scalars directly.
>>> >>>>> 
>>> >>>>> This patch is tested on ppc64 both LE and BE.
>>> >>>>> To continue, I would ask for comments and suggestions first. And then 
>>> >>>>> I would
>>> >>>>> update/enhance accordingly.  Thanks in advance!
>>> >>>>
>>> >>>> Thanks for working on this - the description above sounds exactly like
>>> >>>> what should be done.
>>> >>>>
>>> >>>> Now - I'd like the code to re-use the access tree data structure from
>>> >>>> SRA plus at least the worker creating the accesses from a stmt.
>>> >>>
>>> >
>>> > I'm thinking about which part of the code can be re-used from
>>> > ipa-sra and tree-sra.
>>> > It seems there are some similar concepts between them:
>>> > "access with offset/size", "collect and check candidates",
>>> > "analyze accesses"...
>>> >
>>> > However, because the purposes are different, the logic and behavior
>>> > between them (ipa-sra, tree-sra, and expander-sra) are different,
>>> > even for similar concepts.
>>> >
>>> > The shared behaviors and similar concepts may be reusable. The list
>>> > below may be part of them.
>>> > *. allocate and maintain access
>>> >    basic access structure: offset, size, reverse
>>> > *. type or expr checking
>>> > *. disqualify
>>> > *. scan and build expr access
>>> > *. scan and walk stmts (return/assign/call/asm)
>>> > *. collect candidates
>>> > *. initialize/deinitialize
>>> > *. access dump
>>> >
>>> > There are different behaviors for a similar concept.
>>> > For examples:
>>> > *. Access has grp/queues in tree-sra, access has nonarg in ipa-sra,
>>> > and expander-sra does not check access's child/sibling yet.
>>> > *. for same stmt(assign/call), different sra checks different logic.
>>> > *. candidates have different checking logic: ipa-sra checks more stuff.
>>> >
>>> > Does this align with your thoughts?  Thanks for comments!
>>> >
>>> > BR,
>>> > Jeff (Jiufu Guo)
>>> >
>>> >> Thanks Martin for your reply and thanks for your time!
>>> >>
>>> >>> I have had a first look at the patch but still need to look into it more
>>> >>> to understand how it uses the information it gathers.
>>> >>>
>>> >>> My plan is to make the access-tree infrastructure of IPA-SRA more
>>> >>> generic and hopefully usable even for this purpose, rather than the one
>>> >>> in tree-sra.cc.  But that really builds a tree of accesses, bailing out
>>> >>> on any partial overlaps, for example, which may not be the right thing
>>> >>> here since I don't see any tree-building here.
>>> >>
>>> >> Yeap, both in tree-sra and ipa-sra, there are concepts about
>>> >> "access" and "scan functions/stmts". In this light-sra, these concepts
>>> >> are also used. And you may notice that ipa-sra and tree-sra have more
>>> >> logic than the current 'light-expand-sra'.
>>> >>
>>> >> Currently, the 'light-expand-sra' just takes care of a few things: reading
>>> >> from parameter, writing to returns, and disabling sra if address-taken.
>>> >> As you notice, now the "access" in this patch is not in a 'tree-struct',
>>> >> it is just a 'flat' (or say map & vector). And overlaps between
>>> >> accesses are not checked because they are all just reading (for parm).
>>> >>
>>> >> When we take care of more stuff (passing as a call argument, occurring in
>>> >> a memory assignment, occurring in inline asm...), this light-expander-sra
>>> >> would be more and more like tree-sra and ipa-sra. And it would be good to leverage
>>> >> more capabilities from tree-sra and ipa-sra. So, I agree that it would be
>>> >> a great idea to share and reuse the same struct.
>>> >>
>>> >>> But I still need to
>>> >>> properly read set_scalar_rtx_for_aggregate_access function in the patch,
>>> >>> which I plan to do next week.
>>> >>
>>> >> set_scalar_rtx_for_aggregate_access is another key part of this patch.
>>> >> Different from tree-sra/ipa-sra (which creates new scalars SSA for each
>>> >> access), this patch invokes "set_scalar_rtx_for_aggregate_access" to
>>> >> create an rtx expression for each access. Now, this part may not be common
>>> >> with tree-sra and ipa-sra.
>>> >>
>>> >> This function is invoked for each parameter if the parameter is
>>> >> aggregate type and passed via registers.
>>> >> For each access about this parameter, the function creates an rtx
>>> >> according to the offset/size/mode of the access. The created rtx may be:
>>> >> 1. one rtx pseudo corresponds to an incoming reg,
>>> >> 2. one rtx pseudo which is assigned by a part of incoming reg after
>>> >> shift and mode adjust,
>>> >> 3. a parallel rtx contains a few rtx pseudos corresponding to the
>>> >> incoming registers.
>>> >> For return, only 1 and 3 are ok.
>>> >>
>>> >> BR,
>>> >> Jeff (Jiufu Guo)
>>> >>
>>> >>>
>>> >>> Thanks,
>>> >>>
>>> >>> Martin
>>> >>>
>>> >>>>
>>> >>>> The RTL expansion code already does a sweep over stmts in
>>> >>>> discover_nonconstant_array_refs which makes sure RTL expansion doesn't
>>> >>>> scalarize (aka assign non-stack) to variables which have accesses
>>> >>>> that would later eventually FAIL to expand when operating on registers.
>>> >>>> That's very much related to the task at hand so we should try to
>>> >>>> at least merge the CFG walks of both (it produces a forced_stack_vars
>>> >>>> bitmap).
>>> >>>>
>>> >>>> Can you work together with Martin to split out the access tree
>>> >>>> data structure and share it?
>>> >>>>
>>> >>>> I didn't look in detail as of how you make use of the information
>>> >>>> yet.
>>> >>>>
>>> >>>> Thanks,
>>> >>>> Richard.
>>> >>>>
>>> >>>>> 
>>> >>>>> BR,
>>> >>>>> Jeff (Jiufu)
>>> >>>>> 
>>> >>>>> 
>>> >>>>> ---
>>> >>>>>  gcc/cfgexpand.cc                             | 567 
>>> >>>>> ++++++++++++++++++-
>>> >>>>>  gcc/expr.cc                                  |  15 +-
>>> >>>>>  gcc/function.cc                              |  26 +-
>>> >>>>>  gcc/opts.cc                                  |   8 +-
>>> >>>>>  gcc/testsuite/g++.target/powerpc/pr102024.C  |   2 +-
>>> >>>>>  gcc/testsuite/gcc.target/powerpc/pr108073.c  |  29 +
>>> >>>>>  gcc/testsuite/gcc.target/powerpc/pr65421-1.c |   6 +
>>> >>>>>  gcc/testsuite/gcc.target/powerpc/pr65421-2.c |  32 ++
>>> >>>>>  8 files changed, 675 insertions(+), 10 deletions(-)
>>> >>>>>  create mode 100644 gcc/testsuite/gcc.target/powerpc/pr108073.c
>>> >>>>>  create mode 100644 gcc/testsuite/gcc.target/powerpc/pr65421-1.c
>>> >>>>>  create mode 100644 gcc/testsuite/gcc.target/powerpc/pr65421-2.c
>>> >>>>> 
>>> >>>>> diff --git a/gcc/cfgexpand.cc b/gcc/cfgexpand.cc
>>> >>>>> index 85a93a547c0..95c29b6b6fe 100644
>>> >>>>> --- a/gcc/cfgexpand.cc
>>> >>>>> +++ b/gcc/cfgexpand.cc
>>> >>>>> @@ -97,6 +97,564 @@ static bool defer_stack_allocation (tree, bool);
>>> >>>>>  
>>> >>>>>  static void record_alignment_for_reg_var (unsigned int);
>>> >>>>>  
>>> >>>>> +/* For light SRA in expander about paramaters and returns.  */
>>> >>>>> +namespace {
>>> >>>>> +
>>> >>>>> +struct access
>>> >>>>> +{
>>> >>>>> +  /* Each accessing on the aggragate is about OFFSET/SIZE and BASE.  
>>> >>>>> */
>>> >>>>> +  HOST_WIDE_INT offset;
>>> >>>>> +  HOST_WIDE_INT size;
>>> >>>>> +  tree base;
>>> >>>>> +  bool writing;
>>> >>>>> +
>>> >>>>> +  /* The context expression of this access.  */
>>> >>>>> +  tree expr;
>>> >>>>> +
>>> >>>>> +  /* The rtx for the access: link to incoming/returning register(s). 
>>> >>>>>  */
>>> >>>>> +  rtx rtx_val;
>>> >>>>> +};
>>> >>>>> +
>>> >>>>> +typedef struct access *access_p;
>>> >>>>> +
>>> >>>>> +/* Expr (tree) -> Acess (access_p) map.  */
>>> >>>>> +static hash_map<tree, access_p> *expr_access_vec;
>>> >>>>> +
>>> >>>>> +/* Base (tree) -> Vector (vec<access_p> *) map.  */
>>> >>>>> +static hash_map<tree, auto_vec<access_p> > *base_access_vec;
>>> >>>>> +
>>> >>>>> +/* Return a vector of pointers to accesses for the variable given in 
>>> >>>>> BASE or
>>> >>>>> + NULL if there is none.  */
>>> >>>>> +
>>> >>>>> +static vec<access_p> *
>>> >>>>> +get_base_access_vector (tree base)
>>> >>>>> +{
>>> >>>>> +  return base_access_vec->get (base);
>>> >>>>> +}
>>> >>>>> +
>>> >>>>> +/* Remove DECL from candidates for SRA.  */
>>> >>>>> +static void
>>> >>>>> +disqualify_candidate (tree decl)
>>> >>>>> +{
>>> >>>>> +  decl = get_base_address (decl);
>>> >>>>> +  base_access_vec->remove (decl);
>>> >>>>> +}
>>> >>>>> +
>>> >>>>> +/* Create and insert access for EXPR. Return created access, or NULL 
>>> >>>>> if it is
>>> >>>>> +   not possible.  */
>>> >>>>> +static struct access *
>>> >>>>> +create_access (tree expr, bool write)
>>> >>>>> +{
>>> >>>>> +  poly_int64 poffset, psize, pmax_size;
>>> >>>>> +  bool reverse;
>>> >>>>> +
>>> >>>>> +  tree base
>>> >>>>> +    = get_ref_base_and_extent (expr, &poffset, &psize, &pmax_size, 
>>> >>>>> &reverse);
>>> >>>>> +
>>> >>>>> +  if (!DECL_P (base))
>>> >>>>> +    return NULL;
>>> >>>>> +
>>> >>>>> +  vec<access_p> *access_vec = get_base_access_vector (base);
>>> >>>>> +  if (!access_vec)
>>> >>>>> +    return NULL;
>>> >>>>> +
>>> >>>>> +  /* TODO: support reverse. */
>>> >>>>> +  if (reverse)
>>> >>>>> +    {
>>> >>>>> +      disqualify_candidate (expr);
>>> >>>>> +      return NULL;
>>> >>>>> +    }
>>> >>>>> +
>>> >>>>> +  HOST_WIDE_INT offset, size, max_size;
>>> >>>>> +  if (!poffset.is_constant (&offset) || !psize.is_constant (&size)
>>> >>>>> +      || !pmax_size.is_constant (&max_size))
>>> >>>>> +    return NULL;
>>> >>>>> +
>>> >>>>> +  if (size != max_size || size == 0 || offset < 0 || size < 0
>>> >>>>> +      || offset + size > tree_to_shwi (DECL_SIZE (base)))
>>> >>>>> +    return NULL;
>>> >>>>> +
>>> >>>>> +  struct access *access = XNEWVEC (struct access, 1);
>>> >>>>> +
>>> >>>>> +  memset (access, 0, sizeof (struct access));
>>> >>>>> +  access->base = base;
>>> >>>>> +  access->offset = offset;
>>> >>>>> +  access->size = size;
>>> >>>>> +  access->expr = expr;
>>> >>>>> +  access->writing = write;
>>> >>>>> +  access->rtx_val = NULL_RTX;
>>> >>>>> +
>>> >>>>> +  access_vec->safe_push (access);
>>> >>>>> +
>>> >>>>> +  return access;
>>> >>>>> +}
>>> >>>>> +
>>> >>>>> +/* Return true if VAR is a candidate for SRA.  */
>>> >>>>> +static bool
>>> >>>>> +add_sra_candidate (tree var)
>>> >>>>> +{
>>> >>>>> +  tree type = TREE_TYPE (var);
>>> >>>>> +
>>> >>>>> +  if (!AGGREGATE_TYPE_P (type) || TREE_THIS_VOLATILE (var)
>>> >>>>> +      || !COMPLETE_TYPE_P (type) || !tree_fits_shwi_p (TYPE_SIZE 
>>> >>>>> (type))
>>> >>>>> +      || tree_to_shwi (TYPE_SIZE (type)) == 0
>>> >>>>> +      || TYPE_MAIN_VARIANT (type) == TYPE_MAIN_VARIANT 
>>> >>>>> (va_list_type_node))
>>> >>>>> +    return false;
>>> >>>>> +
>>> >>>>> +  base_access_vec->get_or_insert (var);
>>> >>>>> +
>>> >>>>> +  return true;
>>> >>>>> +}
>>> >>>>> +
>>> >>>>> +/* Callback of walk_stmt_load_store_addr_ops visit_addr used to 
>>> >>>>> remove
>>> >>>>> +   operands with address taken.  */
>>> >>>>> +static tree
>>> >>>>> +visit_addr (tree *tp, int *, void *)
>>> >>>>> +{
>>> >>>>> +  tree op = *tp;
>>> >>>>> +  if (op && DECL_P (op))
>>> >>>>> +    disqualify_candidate (op);
>>> >>>>> +
>>> >>>>> +  return NULL;
>>> >>>>> +}
>>> >>>>> +
>>> >>>>> +/* Scan expression EXPR and create access structures for all 
>>> >>>>> accesses to
>>> >>>>> +   candidates for scalarization.  Return the created access or NULL 
>>> >>>>> if none is
>>> >>>>> +   created.  */
>>> >>>>> +static struct access *
>>> >>>>> +build_access_from_expr (tree expr, bool write)
>>> >>>>> +{
>>> >>>>> +  if (TREE_CODE (expr) == VIEW_CONVERT_EXPR)
>>> >>>>> +    expr = TREE_OPERAND (expr, 0);
>>> >>>>> +
>>> >>>>> +  if (TREE_CODE (expr) == BIT_FIELD_REF || storage_order_barrier_p 
>>> >>>>> (expr)
>>> >>>>> +      || TREE_THIS_VOLATILE (expr))
>>> >>>>> +    {
>>> >>>>> +      disqualify_candidate (expr);
>>> >>>>> +      return NULL;
>>> >>>>> +    }
>>> >>>>> +
>>> >>>>> +  switch (TREE_CODE (expr))
>>> >>>>> +    {
>>> >>>>> +      case MEM_REF: {
>>> >>>>> +     tree op = TREE_OPERAND (expr, 0);
>>> >>>>> +     if (TREE_CODE (op) == ADDR_EXPR)
>>> >>>>> +       disqualify_candidate (TREE_OPERAND (op, 0));
>>> >>>>> +     break;
>>> >>>>> +      }
>>> >>>>> +    case ADDR_EXPR:
>>> >>>>> +    case IMAGPART_EXPR:
>>> >>>>> +    case REALPART_EXPR:
>>> >>>>> +      disqualify_candidate (TREE_OPERAND (expr, 0));
>>> >>>>> +      break;
>>> >>>>> +    case VAR_DECL:
>>> >>>>> +    case PARM_DECL:
>>> >>>>> +    case RESULT_DECL:
>>> >>>>> +    case COMPONENT_REF:
>>> >>>>> +    case ARRAY_REF:
>>> >>>>> +    case ARRAY_RANGE_REF:
>>> >>>>> +      return create_access (expr, write);
>>> >>>>> +      break;
>>> >>>>> +    default:
>>> >>>>> +      break;
>>> >>>>> +    }
>>> >>>>> +
>>> >>>>> +  return NULL;
>>> >>>>> +}
>>> >>>>> +
>>> >>>>> +/* Scan function and look for interesting expressions and create 
>>> >>>>> access
>>> >>>>> +   structures for them.  */
>>> >>>>> +static void
>>> >>>>> +scan_function (void)
>>> >>>>> +{
>>> >>>>> +  basic_block bb;
>>> >>>>> +
>>> >>>>> +  FOR_EACH_BB_FN (bb, cfun)
>>> >>>>> +    {
>>> >>>>> +      for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
>>> >>>>> +        gsi_next (&gsi))
>>> >>>>> +     {
>>> >>>>> +       gphi *phi = gsi.phi ();
>>> >>>>> +       for (size_t i = 0; i < gimple_phi_num_args (phi); i++)
>>> >>>>> +         {
>>> >>>>> +           tree t = gimple_phi_arg_def (phi, i);
>>> >>>>> +           walk_tree (&t, visit_addr, NULL, NULL);
>>> >>>>> +         }
>>> >>>>> +     }
>>> >>>>> +
>>> >>>>> +      for (gimple_stmt_iterator gsi = 
>>> >>>>> gsi_start_nondebug_after_labels_bb (bb);
>>> >>>>> +        !gsi_end_p (gsi); gsi_next_nondebug (&gsi))
>>> >>>>> +     {
>>> >>>>> +       gimple *stmt = gsi_stmt (gsi);
>>> >>>>> +       switch (gimple_code (stmt))
>>> >>>>> +         {
>>> >>>>> +           case GIMPLE_RETURN: {
>>> >>>>> +             tree r = gimple_return_retval (as_a<greturn *> (stmt));
>>> >>>>> +             if (r && VAR_P (r) && r != DECL_RESULT 
>>> >>>>> (current_function_decl))
>>> >>>>> +               build_access_from_expr (r, true);
>>> >>>>> +           }
>>> >>>>> +           break;
>>> >>>>> +         case GIMPLE_ASSIGN:
>>> >>>>> +           if (gimple_assign_single_p (stmt) && !gimple_clobber_p 
>>> >>>>> (stmt))
>>> >>>>> +             {
>>> >>>>> +               tree lhs = gimple_assign_lhs (stmt);
>>> >>>>> +               tree rhs = gimple_assign_rhs1 (stmt);
>>> >>>>> +               if (TREE_CODE (rhs) == CONSTRUCTOR)
>>> >>>>> +                 disqualify_candidate (lhs);
>>> >>>>> +               else
>>> >>>>> +                 {
>>> >>>>> +                   build_access_from_expr (rhs, false);
>>> >>>>> +                   build_access_from_expr (lhs, true);
>>> >>>>> +                 }
>>> >>>>> +             }
>>> >>>>> +           break;
>>> >>>>> +         default:
>>> >>>>> +           walk_gimple_op (stmt, visit_addr, NULL);
>>> >>>>> +           break;
>>> >>>>> +         }
>>> >>>>> +     }
>>> >>>>> +    }
>>> >>>>> +}
>>> >>>>> +
>>> >>>>> +/* Collect the parameter and returns with type which is suitable for
>>> >>>>> + * scalarization.  */
>>> >>>>> +static bool
>>> >>>>> +collect_light_sra_candidates (void)
>>> >>>>> +{
>>> >>>>> +  bool ret = false;
>>> >>>>> +
>>> >>>>> +  /* Collect parameters.  */
>>> >>>>> +  for (tree parm = DECL_ARGUMENTS (current_function_decl); parm;
>>> >>>>> +       parm = DECL_CHAIN (parm))
>>> >>>>> +    ret |= add_sra_candidate (parm);
>>> >>>>> +
>>> >>>>> +  /* Collect VARs on returns.  */
>>> >>>>> +  if (DECL_RESULT (current_function_decl))
>>> >>>>> +    {
>>> >>>>> +      edge_iterator ei;
>>> >>>>> +      edge e;
>>> >>>>> +      FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
>>> >>>>> +     if (greturn *r = safe_dyn_cast<greturn *> (*gsi_last_bb 
>>> >>>>> (e->src)))
>>> >>>>> +       {
>>> >>>>> +         tree val = gimple_return_retval (r);
>>> >>>>> +         if (val && VAR_P (val))
>>> >>>>> +           ret |= add_sra_candidate (val);
>>> >>>>> +       }
>>> >>>>> +    }
>>> >>>>> +
>>> >>>>> +  return ret;
>>> >>>>> +}
>>> >>>>> +
>>> >>>>> +/* Now, only scalarize the parms only with reading
>>> >>>>> +   or returns only with writing.  */
>>> >>>>> +bool
>>> >>>>> +check_access_vec (tree const &base, auto_vec<access_p> const 
>>> >>>>> &access_vec,
>>> >>>>> +               auto_vec<tree> *unqualify_vec)
>>> >>>>> +{
>>> >>>>> +  bool read = false;
>>> >>>>> +  bool write = false;
>>> >>>>> +  for (unsigned int j = 0; j < access_vec.length (); j++)
>>> >>>>> +    {
>>> >>>>> +      struct access *access = access_vec[j];
>>> >>>>> +      if (access->writing)
>>> >>>>> +     write = true;
>>> >>>>> +      else
>>> >>>>> +     read = true;
>>> >>>>> +
>>> >>>>> +      if (write && read)
>>> >>>>> +     break;
>>> >>>>> +    }
>>> >>>>> +  if ((write && read) || (!write && !read))
>>> >>>>> +    unqualify_vec->safe_push (base);
>>> >>>>> +
>>> >>>>> +  return true;
>>> >>>>> +}
>>> >>>>> +
>>> >>>>> +/* Analyze all the accesses, remove those inprofitable candidates.
>>> >>>>> +   And build the expr->access map.  */
>>> >>>>> +static void
>>> >>>>> +analyze_accesses ()
>>> >>>>> +{
>>> >>>>> +  auto_vec<tree> unqualify_vec;
>>> >>>>> +  base_access_vec->traverse<auto_vec<tree> *, check_access_vec> (
>>> >>>>> +    &unqualify_vec);
>>> >>>>> +
>>> >>>>> +  tree base;
>>> >>>>> +  unsigned i;
>>> >>>>> +  FOR_EACH_VEC_ELT (unqualify_vec, i, base)
>>> >>>>> +    disqualify_candidate (base);
>>> >>>>> +}
>>> >>>>> +
>>> >>>>> +static void
>>> >>>>> +prepare_expander_sra ()
>>> >>>>> +{
>>> >>>>> +  if (optimize <= 0)
>>> >>>>> +    return;
>>> >>>>> +
>>> >>>>> +  base_access_vec = new hash_map<tree, auto_vec<access_p> >;
>>> >>>>> +  expr_access_vec = new hash_map<tree, access_p>;
>>> >>>>> +
>>> >>>>> +  if (collect_light_sra_candidates ())
>>> >>>>> +    {
>>> >>>>> +      scan_function ();
>>> >>>>> +      analyze_accesses ();
>>> >>>>> +    }
>>> >>>>> +}
>>> >>>>> +
>>> >>>>> +static void
>>> >>>>> +free_expander_sra ()
>>> >>>>> +{
>>> >>>>> +  if (optimize <= 0 || !expr_access_vec)
>>> >>>>> +    return;
>>> >>>>> +  delete expr_access_vec;
>>> >>>>> +  expr_access_vec = 0;
>>> >>>>> +  delete base_access_vec;
>>> >>>>> +  base_access_vec = 0;
>>> >>>>> +}
>>> >>>>> +} /* namespace */
>>> >>>>> +
>>> >>>>> +/* Check If there is an sra access for the expr.
>>> >>>>> +   Return the correspond scalar sym for the access. */
>>> >>>>> +rtx
>>> >>>>> +get_scalar_rtx_for_aggregate_expr (tree expr)
>>> >>>>> +{
>>> >>>>> +  if (!expr_access_vec)
>>> >>>>> +    return NULL_RTX;
>>> >>>>> +  access_p *access = expr_access_vec->get (expr);
>>> >>>>> +  return access ? (*access)->rtx_val : NULL_RTX;
>>> >>>>> +}
>>> >>>>> +
>>> >>>>> +extern rtx
>>> >>>>> +expand_shift (enum tree_code, machine_mode, rtx, poly_int64, rtx, 
>>> >>>>> int);
>>> >>>>> +
>>> >>>>> +/* Compute/Set RTX registers for those accesses on BASE.  */
>>> >>>>> +void
>>> >>>>> +set_scalar_rtx_for_aggregate_access (tree base, rtx regs)
>>> >>>>> +{
>>> >>>>> +  if (!base_access_vec)
>>> >>>>> +    return;
>>> >>>>> +  vec<access_p> *access_vec = get_base_access_vector (base);
>>> >>>>> +  if (!access_vec)
>>> >>>>> +    return;
>>> >>>>> +
>>> >>>>> +  /* Go through each access, compute corresponding rtx(regs or 
>>> >>>>> subregs)
>>> >>>>> +     for the expression.  */
>>> >>>>> +  int n = access_vec->length ();
>>> >>>>> +  int cur_access_index = 0;
>>> >>>>> +  for (; cur_access_index < n; cur_access_index++)
>>> >>>>> +    {
>>> >>>>> +      access_p acc = (*access_vec)[cur_access_index];
>>> >>>>> +      machine_mode expr_mode = TYPE_MODE (TREE_TYPE (acc->expr));
>>> >>>>> +      /* non BLK in mult registers*/
>>> >>>>> +      if (expr_mode != BLKmode
>>> >>>>> +       && known_gt (acc->size, GET_MODE_BITSIZE (word_mode)))
>>> >>>>> +     break;
>>> >>>>> +
>>> >>>>> +      int start_index = -1;
>>> >>>>> +      int end_index = -1;
>>> >>>>> +      HOST_WIDE_INT left_margin_bits = 0;
>>> >>>>> +      HOST_WIDE_INT right_margin_bits = 0;
>>> >>>>> +      int cur_index = XEXP (XVECEXP (regs, 0, 0), 0) ? 0 : 1;
>>> >>>>> +      for (; cur_index < XVECLEN (regs, 0); cur_index++)
>>> >>>>> +     {
>>> >>>>> +       rtx slot = XVECEXP (regs, 0, cur_index);
>>> >>>>> +       HOST_WIDE_INT off = UINTVAL (XEXP (slot, 1)) * BITS_PER_UNIT;
>>> >>>>> +       HOST_WIDE_INT size
>>> >>>>> +         = GET_MODE_BITSIZE (GET_MODE (XEXP (slot, 0))).to_constant 
>>> >>>>> ();
>>> >>>>> +       if (off <= acc->offset && off + size > acc->offset)
>>> >>>>> +         {
>>> >>>>> +           start_index = cur_index;
>>> >>>>> +           left_margin_bits = acc->offset - off;
>>> >>>>> +         }
>>> >>>>> +       if (off + size >= acc->offset + acc->size)
>>> >>>>> +         {
>>> >>>>> +           end_index = cur_index;
>>> >>>>> +           right_margin_bits = off + size - (acc->offset + 
>>> >>>>> acc->size);
>>> >>>>> +           break;
>>> >>>>> +         }
>>> >>>>> +     }
>>> >>>>> +      /* accessing pading and outof bound.  */
>>> >>>>> +      if (start_index < 0 || end_index < 0)
>>> >>>>> +     break;
>>> >>>>> +
>>> >>>>> +      /* Need a parallel for possible multi-registers. */
>>> >>>>> +      if (expr_mode == BLKmode || end_index > start_index)
>>> >>>>> +     {
>>> >>>>> +       /* Can not support start from middle of a register.  */
>>> >>>>> +       if (left_margin_bits != 0)
>>> >>>>> +         break;
>>> >>>>> +
>>> >>>>> +       int len = end_index - start_index + 1;
>>> >>>>> +       const int margin = 3; /* more space for SI, HI, QI.  */
>>> >>>>> +       rtx *tmps = XALLOCAVEC (rtx, len + (right_margin_bits ? 
>>> >>>>> margin : 0));
>>> >>>>> +
>>> >>>>> +       HOST_WIDE_INT start_off
>>> >>>>> +         = UINTVAL (XEXP (XVECEXP (regs, 0, start_index), 1));
>>> >>>>> +       int pos = 0;
>>> >>>>> +       for (; pos < len - (right_margin_bits ? 1 : 0); pos++)
>>> >>>>> +         {
>>> >>>>> +           int index = start_index + pos;
>>> >>>>> +           rtx orig_reg = XEXP (XVECEXP (regs, 0, index), 0);
>>> >>>>> +           machine_mode mode = GET_MODE (orig_reg);
>>> >>>>> +           rtx reg = NULL_RTX;
>>> >>>>> +           if (HARD_REGISTER_P (orig_reg))
>>> >>>>> +             {
>>> >>>>> +               /* Reading from param hard reg need to be moved to a 
>>> >>>>> temp.  */
>>> >>>>> +               gcc_assert (!acc->writing);
>>> >>>>> +               reg = gen_reg_rtx (mode);
>>> >>>>> +               emit_move_insn (reg, orig_reg);
>>> >>>>> +             }
>>> >>>>> +           else
>>> >>>>> +             reg = orig_reg;
>>> >>>>> +
>>> >>>>> +           HOST_WIDE_INT off = UINTVAL (XEXP (XVECEXP (regs, 0, 
>>> >>>>> index), 1));
>>> >>>>> +           tmps[pos]
>>> >>>>> +             = gen_rtx_EXPR_LIST (mode, reg, GEN_INT (off - 
>>> >>>>> start_off));
>>> >>>>> +         }
>>> >>>>> +
>>> >>>>> +       /* There are some fields are in part of registers.   */
>>> >>>>> +       if (right_margin_bits != 0)
>>> >>>>> +         {
>>> >>>>> +           if (acc->writing)
>>> >>>>> +             break;
>>> >>>>> +
>>> >>>>> +           gcc_assert ((right_margin_bits % BITS_PER_UNIT) == 0);
>>> >>>>> +           HOST_WIDE_INT off_byte
>>> >>>>> +             = UINTVAL (XEXP (XVECEXP (regs, 0, end_index), 1)) - 
>>> >>>>> start_off;
>>> >>>>> +           rtx orig_reg = XEXP (XVECEXP (regs, 0, end_index), 0);
>>> >>>>> +           machine_mode orig_mode = GET_MODE (orig_reg);
>>> >>>>> +           gcc_assert (GET_MODE_CLASS (orig_mode) == MODE_INT);
>>> >>>>> +
>>> >>>>> +           machine_mode mode_aux[] = {SImode, HImode, QImode};
>>> >>>>> +           HOST_WIDE_INT reg_size
>>> >>>>> +             = GET_MODE_BITSIZE (orig_mode).to_constant ();
>>> >>>>> +           HOST_WIDE_INT off_bits = 0;
>>> >>>>> +           for (unsigned long j = 0;
>>> >>>>> +                j < sizeof (mode_aux) / sizeof (mode_aux[0]); j++)
>>> >>>>> +             {
>>> >>>>> +               HOST_WIDE_INT submode_bitsize
>>> >>>>> +                 = GET_MODE_BITSIZE (mode_aux[j]).to_constant ();
>>> >>>>> +               if (reg_size - right_margin_bits - off_bits
>>> >>>>> +                   >= submode_bitsize)
>>> >>>>> +                 {
>>> >>>>> +                   rtx reg = gen_reg_rtx (orig_mode);
>>> >>>>> +                   emit_move_insn (reg, orig_reg);
>>> >>>>> +
>>> >>>>> +                   poly_uint64 lowpart_off
>>> >>>>> +                     = subreg_lowpart_offset (mode_aux[j], 
>>> >>>>> orig_mode);
>>> >>>>> +                   int lowpart_off_bits
>>> >>>>> +                     = lowpart_off.to_constant () * BITS_PER_UNIT;
>>> >>>>> +                   int shift_bits = lowpart_off_bits >= off_bits
>>> >>>>> +                                      ? (lowpart_off_bits - off_bits)
>>> >>>>> +                                      : (off_bits - 
>>> >>>>> lowpart_off_bits);
>>> >>>>> +                   if (shift_bits > 0)
>>> >>>>> +                     reg = expand_shift (RSHIFT_EXPR, orig_mode, reg,
>>> >>>>> +                                         shift_bits, NULL, 1);
>>> >>>>> +                   rtx subreg = gen_lowpart (mode_aux[j], reg);
>>> >>>>> +                   rtx off = GEN_INT (off_byte);
>>> >>>>> +                   tmps[pos++]
>>> >>>>> +                     = gen_rtx_EXPR_LIST (mode_aux[j], subreg, off);
>>> >>>>> +                   off_byte += submode_bitsize / BITS_PER_UNIT;
>>> >>>>> +                   off_bits += submode_bitsize;
>>> >>>>> +                 }
>>> >>>>> +             }
>>> >>>>> +         }
>>> >>>>> +
>>> >>>>> +       /* Currently, PARALLELs with register elements for 
>>> >>>>> param/returns
>>> >>>>> +          are using BLKmode.  */
>>> >>>>> +       acc->rtx_val = gen_rtx_PARALLEL (TYPE_MODE (TREE_TYPE 
>>> >>>>> (acc->expr)),
>>> >>>>> +                                        gen_rtvec_v (pos, tmps));
>>> >>>>> +       continue;
>>> >>>>> +     }
>>> >>>>> +
>>> >>>>> +      /* The access corresponds to one reg.  */
>>> >>>>> +      if (end_index == start_index && left_margin_bits == 0
>>> >>>>> +       && right_margin_bits == 0)
>>> >>>>> +     {
>>> >>>>> +       rtx orig_reg = XEXP (XVECEXP (regs, 0, start_index), 0);
>>> >>>>> +       rtx reg = NULL_RTX;
>>> >>>>> +       if (HARD_REGISTER_P (orig_reg))
>>> >>>>> +         {
>>> >>>>> +           /* Reading from param hard reg need to be moved to a 
>>> >>>>> temp.  */
>>> >>>>> +           gcc_assert (!acc->writing);
>>> >>>>> +           reg = gen_reg_rtx (GET_MODE (orig_reg));
>>> >>>>> +           emit_move_insn (reg, orig_reg);
>>> >>>>> +         }
>>> >>>>> +       else
>>> >>>>> +         reg = orig_reg;
>>> >>>>> +       if (GET_MODE (orig_reg) != expr_mode)
>>> >>>>> +         reg = gen_lowpart (expr_mode, reg);
>>> >>>>> +
>>> >>>>> +       acc->rtx_val = reg;
>>> >>>>> +       continue;
>>> >>>>> +     }
>>> >>>>> +
>>> >>>>> +      /* It is accessing a filed which is part of a register.  */
>>> >>>>> +      scalar_int_mode imode;
>>> >>>>> +      if (!acc->writing && end_index == start_index
>>> >>>>> +       && int_mode_for_size (acc->size, 1).exists (&imode))
>>> >>>>> +     {
>>> >>>>> +       /* get and copy original register inside the param.  */
>>> >>>>> +       rtx orig_reg = XEXP (XVECEXP (regs, 0, start_index), 0);
>>> >>>>> +       machine_mode mode = GET_MODE (orig_reg);
>>> >>>>> +       gcc_assert (GET_MODE_CLASS (mode) == MODE_INT);
>>> >>>>> +       rtx reg = gen_reg_rtx (mode);
>>> >>>>> +       emit_move_insn (reg, orig_reg);
>>> >>>>> +
>>> >>>>> +       /* shift to expect part. */
>>> >>>>> +       poly_uint64 lowpart_off = subreg_lowpart_offset (imode, mode);
>>> >>>>> +       int lowpart_off_bits = lowpart_off.to_constant () * 
>>> >>>>> BITS_PER_UNIT;
>>> >>>>> +       int shift_bits = lowpart_off_bits >= left_margin_bits
>>> >>>>> +                          ? (lowpart_off_bits - left_margin_bits)
>>> >>>>> +                          : (left_margin_bits - lowpart_off_bits);
>>> >>>>> +       if (shift_bits > 0)
>>> >>>>> +         reg = expand_shift (RSHIFT_EXPR, mode, reg, shift_bits, 
>>> >>>>> NULL, 1);
>>> >>>>> +
>>> >>>>> +       /* move corresond part subreg to result.  */
>>> >>>>> +       rtx subreg = gen_lowpart (imode, reg);
>>> >>>>> +       rtx result = gen_reg_rtx (imode);
>>> >>>>> +       emit_move_insn (result, subreg);
>>> >>>>> +
>>> >>>>> +       if (expr_mode != imode)
>>> >>>>> +         result = gen_lowpart (expr_mode, result);
>>> >>>>> +
>>> >>>>> +       acc->rtx_val = result;
>>> >>>>> +       continue;
>>> >>>>> +     }
>>> >>>>> +
>>> >>>>> +      break;
>>> >>>>> +    }
>>> >>>>> +
>>> >>>>> +  /* Some access expr(s) are not scalarized.  */
>>> >>>>> +  if (cur_access_index != n)
>>> >>>>> +    disqualify_candidate (base);
>>> >>>>> +  else
>>> >>>>> +    {
>>> >>>>> +      /* Add elements to expr->access map.  */
>>> >>>>> +      for (int j = 0; j < n; j++)
>>> >>>>> +     {
>>> >>>>> +       access_p access = (*access_vec)[j];
>>> >>>>> +       expr_access_vec->put (access->expr, access);
>>> >>>>> +     }
>>> >>>>> +    }
>>> >>>>> +}
>>> >>>>> +
>>> >>>>> +void
>>> >>>>> +set_scalar_rtx_for_returns ()
>>> >>>>> +{
>>> >>>>> +  tree res = DECL_RESULT (current_function_decl);
>>> >>>>> +  gcc_assert (res);
>>> >>>>> +  edge_iterator ei;
>>> >>>>> +  edge e;
>>> >>>>> +  FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
>>> >>>>> +    if (greturn *r = safe_dyn_cast<greturn *> (*gsi_last_bb 
>>> >>>>> (e->src)))
>>> >>>>> +      {
>>> >>>>> +     tree val = gimple_return_retval (r);
>>> >>>>> +     if (val && VAR_P (val))
>>> >>>>> +       set_scalar_rtx_for_aggregate_access (val, DECL_RTL (res));
>>> >>>>> +      }
>>> >>>>> +}
>>> >>>>> +
>>> >>>>>  /* Return an expression tree corresponding to the RHS of GIMPLE
>>> >>>>>     statement STMT.  */
>>> >>>>>  
>>> >>>>> @@ -3778,7 +4336,8 @@ expand_return (tree retval)
>>> >>>>>  
>>> >>>>>    /* If we are returning the RESULT_DECL, then the value has already
>>> >>>>>       been stored into it, so we don't have to do anything special.  
>>> >>>>> */
>>> >>>>> -  if (TREE_CODE (retval_rhs) == RESULT_DECL)
>>> >>>>> +  if (TREE_CODE (retval_rhs) == RESULT_DECL
>>> >>>>> +      || get_scalar_rtx_for_aggregate_expr (retval_rhs))
>>> >>>>>      expand_value_return (result_rtl);
>>> >>>>>  
>>> >>>>>    /* If the result is an aggregate that is being returned in one (or 
>>> >>>>> more)
>>> >>>>> @@ -4422,6 +4981,9 @@ expand_debug_expr (tree exp)
>>> >>>>>    int unsignedp = TYPE_UNSIGNED (TREE_TYPE (exp));
>>> >>>>>    addr_space_t as;
>>> >>>>>    scalar_int_mode op0_mode, op1_mode, addr_mode;
>>> >>>>> +  rtx x = get_scalar_rtx_for_aggregate_expr (exp);
>>> >>>>> +  if (x)
>>> >>>>> +    return NULL_RTX;/* optimized out.  */
>>> >>>>>  
>>> >>>>>    switch (TREE_CODE_CLASS (TREE_CODE (exp)))
>>> >>>>>      {
>>> >>>>> @@ -6630,6 +7192,8 @@ pass_expand::execute (function *fun)
>>> >>>>>           avoid_deep_ter_for_debug (gsi_stmt (gsi), 0);
>>> >>>>>      }
>>> >>>>>  
>>> >>>>> +  prepare_expander_sra ();
>>> >>>>> +
>>> >>>>>    /* Mark arrays indexed with non-constant indices with 
>>> >>>>> TREE_ADDRESSABLE.  */
>>> >>>>>    auto_bitmap forced_stack_vars;
>>> >>>>>    discover_nonconstant_array_refs (forced_stack_vars);
>>> >>>>> @@ -7062,6 +7626,7 @@ pass_expand::execute (function *fun)
>>> >>>>>        loop_optimizer_finalize ();
>>> >>>>>      }
>>> >>>>>  
>>> >>>>> +  free_expander_sra ();
>>> >>>>>    timevar_pop (TV_POST_EXPAND);
>>> >>>>>  
>>> >>>>>    return 0;
>>> >>>>> diff --git a/gcc/expr.cc b/gcc/expr.cc
>>> >>>>> index 56b51876f80..b970f98e689 100644
>>> >>>>> --- a/gcc/expr.cc
>>> >>>>> +++ b/gcc/expr.cc
>>> >>>>> @@ -100,6 +100,7 @@ static void do_tablejump (rtx, machine_mode, rtx, 
>>> >>>>> rtx, rtx,
>>> >>>>>  static rtx const_vector_from_tree (tree);
>>> >>>>>  static tree tree_expr_size (const_tree);
>>> >>>>>  static void convert_mode_scalar (rtx, rtx, int);
>>> >>>>> +rtx get_scalar_rtx_for_aggregate_expr (tree);
>>> >>>>>  
>>> >>>>>  
>>> >>>>>  /* This is run to set up which modes can be used
>>> >>>>> @@ -5623,11 +5624,12 @@ expand_assignment (tree to, tree from, bool 
>>> >>>>> nontemporal)
>>> >>>>>       Assignment of an array element at a constant index, and 
>>> >>>>> assignment of
>>> >>>>>       an array element in an unaligned packed structure field, has 
>>> >>>>> the same
>>> >>>>>       problem.  Same for (partially) storing into a non-memory 
>>> >>>>> object.  */
>>> >>>>> -  if (handled_component_p (to)
>>> >>>>> -      || (TREE_CODE (to) == MEM_REF
>>> >>>>> -       && (REF_REVERSE_STORAGE_ORDER (to)
>>> >>>>> -           || mem_ref_refers_to_non_mem_p (to)))
>>> >>>>> -      || TREE_CODE (TREE_TYPE (to)) == ARRAY_TYPE)
>>> >>>>> +  if (!get_scalar_rtx_for_aggregate_expr (to)
>>> >>>>> +      && (handled_component_p (to)
>>> >>>>> +       || (TREE_CODE (to) == MEM_REF
>>> >>>>> +           && (REF_REVERSE_STORAGE_ORDER (to)
>>> >>>>> +               || mem_ref_refers_to_non_mem_p (to)))
>>> >>>>> +       || TREE_CODE (TREE_TYPE (to)) == ARRAY_TYPE))
>>> >>>>>      {
>>> >>>>>        machine_mode mode1;
>>> >>>>>        poly_int64 bitsize, bitpos;
>>> >>>>> @@ -8995,6 +8997,9 @@ expand_expr_real (tree exp, rtx target, 
>>> >>>>> machine_mode tmode,
>>> >>>>>        ret = CONST0_RTX (tmode);
>>> >>>>>        return ret ? ret : const0_rtx;
>>> >>>>>      }
>>> >>>>> +  rtx x = get_scalar_rtx_for_aggregate_expr (exp);
>>> >>>>> +  if (x)
>>> >>>>> +    return x;
>>> >>>>>  
>>> >>>>>    ret = expand_expr_real_1 (exp, target, tmode, modifier, alt_rtl,
>>> >>>>>                           inner_reference_p);
>>> >>>>> diff --git a/gcc/function.cc b/gcc/function.cc
>>> >>>>> index 82102ed78d7..262d3f17e72 100644
>>> >>>>> --- a/gcc/function.cc
>>> >>>>> +++ b/gcc/function.cc
>>> >>>>> @@ -2742,6 +2742,9 @@ assign_parm_find_stack_rtl (tree parm, struct 
>>> >>>>> assign_parm_data_one *data)
>>> >>>>>    data->stack_parm = stack_parm;
>>> >>>>>  }
>>> >>>>>  
>>> >>>>> +extern void
>>> >>>>> +set_scalar_rtx_for_aggregate_access (tree, rtx);
>>> >>>>> +
>>> >>>>>  /* A subroutine of assign_parms.  Adjust DATA->ENTRY_RTL such that 
>>> >>>>> it's
>>> >>>>>     always valid and contiguous.  */
>>> >>>>>  
>>> >>>>> @@ -3117,8 +3120,21 @@ assign_parm_setup_block (struct 
>>> >>>>> assign_parm_data_all *all,
>>> >>>>>         emit_move_insn (mem, entry_parm);
>>> >>>>>       }
>>> >>>>>        else
>>> >>>>> -     move_block_from_reg (REGNO (entry_parm), mem,
>>> >>>>> -                          size_stored / UNITS_PER_WORD);
>>> >>>>> +     {
>>> >>>>> +       int regno = REGNO (entry_parm);
>>> >>>>> +       int nregs = size_stored / UNITS_PER_WORD;
>>> >>>>> +       move_block_from_reg (regno, mem, nregs);
>>> >>>>> +
>>> >>>>> +       rtx *tmps = XALLOCAVEC (rtx, nregs);
>>> >>>>> +       machine_mode mode = word_mode;
>>> >>>>> +       for (int i = 0; i < nregs; i++)
>>> >>>>> +         tmps[i] = gen_rtx_EXPR_LIST (
>>> >>>>> +           VOIDmode, gen_rtx_REG (mode, regno + i),
>>> >>>>> +           GEN_INT (GET_MODE_SIZE (mode).to_constant () * i));
>>> >>>>> +
>>> >>>>> +       rtx regs = gen_rtx_PARALLEL (BLKmode, gen_rtvec_v (nregs, 
>>> >>>>> tmps));
>>> >>>>> +       set_scalar_rtx_for_aggregate_access (parm, regs);
>>> >>>>> +     }
>>> >>>>>      }
>>> >>>>>    else if (data->stack_parm == 0 && !TYPE_EMPTY_P (data->arg.type))
>>> >>>>>      {
>>> >>>>> @@ -3718,6 +3734,10 @@ assign_parms (tree fndecl)
>>> >>>>>        else
>>> >>>>>       set_decl_incoming_rtl (parm, data.entry_parm, false);
>>> >>>>>  
>>> >>>>> +      rtx incoming = DECL_INCOMING_RTL (parm);
>>> >>>>> +      if (GET_CODE (incoming) == PARALLEL)
>>> >>>>> +     set_scalar_rtx_for_aggregate_access (parm, incoming);
>>> >>>>> +
>>> >>>>>        assign_parm_adjust_stack_rtl (&data);
>>> >>>>>  
>>> >>>>>        if (assign_parm_setup_block_p (&data))
>>> >>>>> @@ -5037,6 +5057,7 @@ stack_protect_epilogue (void)
>>> >>>>>     the function's parameters, which must be run at any return 
>>> >>>>> statement.  */
>>> >>>>>  
>>> >>>>>  bool currently_expanding_function_start;
>>> >>>>> +extern void set_scalar_rtx_for_returns ();
>>> >>>>>  void
>>> >>>>>  expand_function_start (tree subr)
>>> >>>>>  {
>>> >>>>> @@ -5138,6 +5159,7 @@ expand_function_start (tree subr)
>>> >>>>>           {
>>> >>>>>             gcc_assert (GET_CODE (hard_reg) == PARALLEL);
>>> >>>>>             set_parm_rtl (res, gen_group_rtx (hard_reg));
>>> >>>>> +           set_scalar_rtx_for_returns ();
>>> >>>>>           }
>>> >>>>>       }
>>> >>>>>  
>>> >>>>> diff --git a/gcc/opts.cc b/gcc/opts.cc
>>> >>>>> index 86b94d62b58..5e129a1cc49 100644
>>> >>>>> --- a/gcc/opts.cc
>>> >>>>> +++ b/gcc/opts.cc
>>> >>>>> @@ -1559,6 +1559,10 @@ public:
>>> >>>>>    vec<const char *> m_values;
>>> >>>>>  };
>>> >>>>>  
>>> >>>>> +#ifdef __GNUC__
>>> >>>>> +#pragma GCC diagnostic push
>>> >>>>> +#pragma GCC diagnostic ignored "-Wformat-truncation"
>>> >>>>> +#endif
>>> >>>>>  /* Print help for a specific front-end, etc.  */
>>> >>>>>  static void
>>> >>>>>  print_filtered_help (unsigned int include_flags,
>>> >>>>> @@ -1913,7 +1917,9 @@ print_filtered_help (unsigned int include_flags,
>>> >>>>>        printf ("\n\n");
>>> >>>>>      }
>>> >>>>>  }
>>> >>>>> -
>>> >>>>> +#ifdef __GNUC__
>>> >>>>> +#pragma GCC diagnostic pop
>>> >>>>> +#endif
>>> >>>>>  /* Display help for a specified type of option.
>>> >>>>>     The options must have ALL of the INCLUDE_FLAGS set
>>> >>>>>     ANY of the flags in the ANY_FLAGS set
>>> >>>>> diff --git a/gcc/testsuite/g++.target/powerpc/pr102024.C 
>>> >>>>> b/gcc/testsuite/g++.target/powerpc/pr102024.C
>>> >>>>> index 769585052b5..c8995cae707 100644
>>> >>>>> --- a/gcc/testsuite/g++.target/powerpc/pr102024.C
>>> >>>>> +++ b/gcc/testsuite/g++.target/powerpc/pr102024.C
>>> >>>>> @@ -5,7 +5,7 @@
>>> >>>>>  // Test that a zero-width bit field in an otherwise homogeneous 
>>> >>>>> aggregate
>>> >>>>>  // generates a psabi warning and passes arguments in GPRs.
>>> >>>>>  
>>> >>>>> -// { dg-final { scan-assembler-times {\mstd\M} 4 } }
>>> >>>>> +// { dg-final { scan-assembler-times {\mmtvsrd\M} 4 } }
>>> >>>>>  
>>> >>>>>  struct a_thing
>>> >>>>>  {
>>> >>>>> diff --git a/gcc/testsuite/gcc.target/powerpc/pr108073.c 
>>> >>>>> b/gcc/testsuite/gcc.target/powerpc/pr108073.c
>>> >>>>> new file mode 100644
>>> >>>>> index 00000000000..7dd1a4a326a
>>> >>>>> --- /dev/null
>>> >>>>> +++ b/gcc/testsuite/gcc.target/powerpc/pr108073.c
>>> >>>>> @@ -0,0 +1,29 @@
>>> >>>>> +/* { dg-do run } */
>>> >>>>> +/* { dg-options "-O2 -save-temps" } */
>>> >>>>> +
>>> >>>>> +typedef struct DF {double a[4]; short s1; short s2; short s3; short 
>>> >>>>> s4; } DF;
>>> >>>>> +typedef struct SF {float a[4]; int i1; int i2; } SF;
>>> >>>>> +
>>> >>>>> +/* { dg-final { scan-assembler-times {\mmtvsrd\M} 3 {target { 
>>> >>>>> has_arch_ppc64 && has_arch_pwr8 } } } } */
>>> >>>>> +/* { dg-final { scan-assembler-not {\mlwz\M} {target { 
>>> >>>>> has_arch_ppc64 && has_arch_pwr8 } } } } */
>>> >>>>> +/* { dg-final { scan-assembler-not {\mlhz\M} {target { 
>>> >>>>> has_arch_ppc64 && has_arch_pwr8 } } } } */
>>> >>>>> +short  __attribute__ ((noipa)) foo_hi (DF a, int flag){if (flag == 
>>> >>>>> 2)return a.s2+a.s3;return 0;}
>>> >>>>> +int  __attribute__ ((noipa)) foo_si (SF a, int flag){if (flag == 
>>> >>>>> 2)return a.i2+a.i1;return 0;}
>>> >>>>> +double __attribute__ ((noipa)) foo_df (DF arg, int flag){if (flag == 
>>> >>>>> 2)return arg.a[3];else return 0.0;}
>>> >>>>> +float  __attribute__ ((noipa)) foo_sf (SF arg, int flag){if (flag == 
>>> >>>>> 2)return arg.a[2]; return 0;}
>>> >>>>> +float  __attribute__ ((noipa)) foo_sf1 (SF arg, int flag){if (flag 
>>> >>>>> == 2)return arg.a[1];return 0;}
>>> >>>>> +
>>> >>>>> +DF gdf = {{1.0,2.0,3.0,4.0}, 1, 2, 3, 4};
>>> >>>>> +SF gsf = {{1.0f,2.0f,3.0f,4.0f}, 1, 2};
>>> >>>>> +
>>> >>>>> +int main()
>>> >>>>> +{
>>> >>>>> +  if (!(foo_hi (gdf, 2) == 5 && foo_si (gsf, 2) == 3 && foo_df (gdf, 
>>> >>>>> 2) == 4.0
>>> >>>>> +     && foo_sf (gsf, 2) == 3.0 && foo_sf1 (gsf, 2) == 2.0))
>>> >>>>> +    __builtin_abort ();
>>> >>>>> +  if (!(foo_hi (gdf, 1) == 0 && foo_si (gsf, 1) == 0 && foo_df (gdf, 
>>> >>>>> 1) == 0
>>> >>>>> +     && foo_sf (gsf, 1) == 0 && foo_sf1 (gsf, 1) == 0))
>>> >>>>> +    __builtin_abort ();
>>> >>>>> +  return 0;
>>> >>>>> +}
>>> >>>>> +
>>> >>>>> diff --git a/gcc/testsuite/gcc.target/powerpc/pr65421-1.c 
>>> >>>>> b/gcc/testsuite/gcc.target/powerpc/pr65421-1.c
>>> >>>>> new file mode 100644
>>> >>>>> index 00000000000..4e1f87f7939
>>> >>>>> --- /dev/null
>>> >>>>> +++ b/gcc/testsuite/gcc.target/powerpc/pr65421-1.c
>>> >>>>> @@ -0,0 +1,6 @@
>>> >>>>> +/* PR target/65421 */
>>> >>>>> +/* { dg-options "-O2" } */
>>> >>>>> +
>>> >>>>> +typedef struct LARGE {double a[4]; int arr[32];} LARGE;
>>> >>>>> +LARGE foo (LARGE a){return a;}
>>> >>>>> +/* { dg-final { scan-assembler-times {\mmemcpy\M} 1 } } */
>>> >>>>> diff --git a/gcc/testsuite/gcc.target/powerpc/pr65421-2.c 
>>> >>>>> b/gcc/testsuite/gcc.target/powerpc/pr65421-2.c
>>> >>>>> new file mode 100644
>>> >>>>> index 00000000000..8a8e1a0e996
>>> >>>>> --- /dev/null
>>> >>>>> +++ b/gcc/testsuite/gcc.target/powerpc/pr65421-2.c
>>> >>>>> @@ -0,0 +1,32 @@
>>> >>>>> +/* PR target/65421 */
>>> >>>>> +/* { dg-options "-O2" } */
>>> >>>>> +/* { dg-require-effective-target powerpc_elfv2 } */
>>> >>>>> +/* { dg-require-effective-target has_arch_ppc64 } */
>>> >>>>> +
>>> >>>>> +typedef struct FLOATS
>>> >>>>> +{
>>> >>>>> +  double a[3];
>>> >>>>> +} FLOATS;
>>> >>>>> +
>>> >>>>> +/* 3 lfd after returns also optimized */
>>> >>>>> +/* FLOATS ret_arg_pt (FLOATS *a){return *a;} */
>>> >>>>> +
>>> >>>>> +/* 3 stfd */
>>> >>>>> +void st_arg (FLOATS a, FLOATS *p) {*p = a;}
>>> >>>>> +/* { dg-final { scan-assembler-times {\mstfd\M} 3 } } */
>>> >>>>> +
>>> >>>>> +/* blr */
>>> >>>>> +FLOATS ret_arg (FLOATS a) {return a;}
>>> >>>>> +
>>> >>>>> +typedef struct MIX
>>> >>>>> +{
>>> >>>>> +  double a[2];
>>> >>>>> +  long l;
>>> >>>>> +} MIX;
>>> >>>>> +
>>> >>>>> +/* std 3 param regs to return slot */
>>> >>>>> +MIX ret_arg1 (MIX a) {return a;}
>>> >>>>> +/* { dg-final { scan-assembler-times {\mstd\M} 3 } } */
>>> >>>>> +
>>> >>>>> +/* count insns */
>>> >>>>> +/* { dg-final { scan-assembler-times {(?n)^\s+[a-z]} 9 } } */
>>> >>>>> 
>>> >>>>
>>> >>>> -- 
>>> >>>> Richard Biener <rguent...@suse.de>
>>> >>>> SUSE Software Solutions Germany GmbH, Frankenstrasse 146, 90461 Nuernberg,
>>> >>>> Germany; GF: Ivo Totev, Andrew Myers, Andrew McDonald, Boudien Moerman;
>>> >>>> HRB 36809 (AG Nuernberg)
>>> 

Reply via email to