Hi,

As known there are a few PRs (meta-bug PR101926) about
accessing aggregate param/returns which are passed through registers.

We could use the current SRA pass in a special mode right before
RTL expansion for the incoming/outgoing part, as the suggestion from:
https://gcc.gnu.org/pipermail/gcc-patches/2023-November/637935.html

Compared to previous version:
https://gcc.gnu.org/pipermail/gcc-patches/2024-February/646625.html
This version enhances expand_ARG_PARTS to pass bootstrap on aarch64.
It also merges the previous three patches into one patch for
review.

As mentioned in previous version:
This patch is using IFN ARG_PARTS and SET_RET_PARTS for parameters
and returns. And expand the IFNs according to the incoming/outgoing
registers.

Bootstrapped/regtested on ppc64{,le}, x86_64 and aarch64.

Again, there are a few things that could be enhanced for this patch:
* Multi-registers access
* Parameter access across calls
* Optimizing access to parameters which are in memory
* More cases/targets checking

I would like to ask for comments to avoid major flaws.

BR,
Jeff (Jiufu Guo)


        PR target/108073
        PR target/65421
        PR target/69143

gcc/ChangeLog:

        * cfgexpand.cc (expand_value_return): Update for rtx eq checking.
        (expand_return): Update for scalarized returns.
        * internal-fn.cc (query_position_in_parallel): New function.
        (construct_reg_seq): New function.
        (get_incoming_element): New function.
        (reference_alias_ptr_type): Extern declare.
        (expand_ARG_PARTS): New IFN expand.
        (store_outgoing_element): New function.
        (expand_SET_RET_PARTS): New IFN expand.
        (expand_SET_RET_LAST_PARTS): New IFN expand.
        * internal-fn.def (ARG_PARTS): New IFN.
        (SET_RET_PARTS): New IFN.
        (SET_RET_LAST_PARTS): New IFN.
        * passes.def (pass_sra_final): Add new pass.
        * tree-pass.h (make_pass_sra_final): New function.
        * tree-sra.cc (enum sra_mode): New enum item SRA_MODE_FINAL_INTRA.
        (build_accesses_from_assign): Accept SRA_MODE_FINAL_INTRA.
        (scan_function): Update for argument in fsra.
        (find_var_candidates): Collect candidates for SRA_MODE_FINAL_INTRA.
        (analyze_access_subtree): Update analyze for fsra.
        (generate_subtree_copies): Update to generate new IFNs.
        (final_intra_sra): New function.
        (class pass_sra_final): New class.
        (make_pass_sra_final): New function.

gcc/testsuite/ChangeLog:

        * g++.target/powerpc/pr102024.C: Update instructions.
        * gcc.target/powerpc/pr108073-1.c: New test.
        * gcc.target/powerpc/pr108073.c: New test.
        * gcc.target/powerpc/pr65421.c: New test.
        * gcc.target/powerpc/pr69143.c: New test.

---
 gcc/cfgexpand.cc                              |   6 +-
 gcc/internal-fn.cc                            | 254 ++++++++++++++++++
 gcc/internal-fn.def                           |   9 +
 gcc/passes.def                                |   2 +
 gcc/tree-pass.h                               |   1 +
 gcc/tree-sra.cc                               | 157 ++++++++++-
 gcc/testsuite/g++.target/powerpc/pr102024.C   |   2 +-
 gcc/testsuite/gcc.target/powerpc/pr108073-1.c |  76 ++++++
 gcc/testsuite/gcc.target/powerpc/pr108073.c   |  74 +++++
 gcc/testsuite/gcc.target/powerpc/pr65421.c    |  10 +
 gcc/testsuite/gcc.target/powerpc/pr69143.c    |  23 ++
 11 files changed, 598 insertions(+), 16 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/powerpc/pr108073-1.c
 create mode 100644 gcc/testsuite/gcc.target/powerpc/pr108073.c
 create mode 100644 gcc/testsuite/gcc.target/powerpc/pr65421.c
 create mode 100644 gcc/testsuite/gcc.target/powerpc/pr69143.c

diff --git a/gcc/cfgexpand.cc b/gcc/cfgexpand.cc
index eef565eddb5..1ec6c2d8102 100644
--- a/gcc/cfgexpand.cc
+++ b/gcc/cfgexpand.cc
@@ -3759,7 +3759,7 @@ expand_value_return (rtx val)
 
   tree decl = DECL_RESULT (current_function_decl);
   rtx return_reg = DECL_RTL (decl);
-  if (return_reg != val)
+  if (!rtx_equal_p (return_reg, val))
     {
       tree funtype = TREE_TYPE (current_function_decl);
       tree type = TREE_TYPE (decl);
@@ -3832,6 +3832,10 @@ expand_return (tree retval)
      been stored into it, so we don't have to do anything special.  */
   if (TREE_CODE (retval_rhs) == RESULT_DECL)
     expand_value_return (result_rtl);
+  /* return is scalarized by fsra: TODO use FLAG. */
+  else if (VAR_P (retval_rhs)
+          && rtx_equal_p (result_rtl, DECL_RTL (retval_rhs)))
+    expand_null_return_1 ();
 
   /* If the result is an aggregate that is being returned in one (or more)
      registers, load the registers here.  */
diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
index fcf47c7fa12..905ee7da005 100644
--- a/gcc/internal-fn.cc
+++ b/gcc/internal-fn.cc
@@ -3394,6 +3394,260 @@ expand_DEFERRED_INIT (internal_fn, gcall *stmt)
     }
 }
 
+/* In the parallel rtx register series REGS, compute the register position for
+   given {BITPOS, BITSIZE}.  The results are stored into START_INDEX, 
END_INDEX,
+   LEFT_BITS and RIGHT_BITS.  */
+
+void
+query_position_in_parallel (HOST_WIDE_INT bitpos, HOST_WIDE_INT bitsize,
+                           rtx regs, int &start_index, int &end_index,
+                           HOST_WIDE_INT &left_bits, HOST_WIDE_INT &right_bits)
+{
+  int cur_index = XEXP (XVECEXP (regs, 0, 0), 0) ? 0 : 1;
+  for (; cur_index < XVECLEN (regs, 0); cur_index++)
+    {
+      rtx slot = XVECEXP (regs, 0, cur_index);
+      HOST_WIDE_INT off = UINTVAL (XEXP (slot, 1)) * BITS_PER_UNIT;
+      machine_mode mode = GET_MODE (XEXP (slot, 0));
+      HOST_WIDE_INT size = GET_MODE_BITSIZE (mode).to_constant ();
+      if (off <= bitpos && off + size > bitpos)
+       {
+         start_index = cur_index;
+         left_bits = bitpos - off;
+       }
+      if (off + size >= bitpos + bitsize)
+       {
+         end_index = cur_index;
+         right_bits = off + size - (bitpos + bitsize);
+         break;
+       }
+    }
+}
+
+/* Create a serial registers which start at FIRST_REG,
+   and SIZE is the total size of those registers.  */
+
+static rtx
+construct_reg_seq (HOST_WIDE_INT size, rtx first_reg)
+{
+  int nregs = size / UNITS_PER_WORD + (((size % UNITS_PER_WORD) != 0) ? 1 : 0);
+  rtx *tmps = XALLOCAVEC (rtx, nregs);
+  int regno = REGNO (first_reg);
+  machine_mode mode = word_mode;
+  HOST_WIDE_INT word_size = GET_MODE_SIZE (mode).to_constant ();
+  for (int i = 0; i < nregs; i++)
+    {
+      rtx reg = gen_rtx_REG (mode, regno + i);
+      rtx off = GEN_INT (word_size * i);
+      tmps[i] = gen_rtx_EXPR_LIST (VOIDmode, reg, off);
+    }
+  return gen_rtx_PARALLEL (BLKmode, gen_rtvec_v (nregs, tmps));
+}
+
+static rtx
+get_incoming_element (tree arg, HOST_WIDE_INT bitpos, HOST_WIDE_INT bitsize,
+                     bool reversep, tree expr)
+{
+  rtx regs = DECL_INCOMING_RTL (arg);
+  if (REG_P (regs) && GET_MODE (regs) == BLKmode)
+    {
+      HOST_WIDE_INT size = int_size_in_bytes (TREE_TYPE (arg));
+      /* For possible REG_PADDING */
+      if (size < UNITS_PER_WORD)
+       {
+         regs = gen_rtx_REG (word_mode, REGNO (regs));
+         rtx arg_rtx = arg->decl_with_rtl.rtl;//DECL_RTL
+         emit_move_insn (regs, adjust_address (arg_rtx, word_mode, 0));
+       }
+      regs = construct_reg_seq (size, regs);
+    }
+
+  if (GET_CODE (regs) != PARALLEL)
+    return NULL_RTX;
+
+  int start_index = -1;
+  int end_index = -1;
+  HOST_WIDE_INT left_bits = 0;
+  HOST_WIDE_INT right_bits = 0;
+  query_position_in_parallel (bitpos, bitsize, regs, start_index, end_index,
+                             left_bits, right_bits);
+
+  if (start_index < 0 || end_index < 0)
+    return NULL_RTX;
+
+  machine_mode expr_mode = TYPE_MODE (TREE_TYPE (expr));
+  /* Just need one reg for the access.  */
+  if (end_index != start_index)
+    return NULL_RTX;
+
+  rtx reg = XEXP (XVECEXP (regs, 0, start_index), 0);
+  /* Just need one reg for the access.  */
+  if (left_bits == 0 && right_bits == 0)
+    {
+      if (GET_MODE (reg) != expr_mode)
+       reg = gen_lowpart (expr_mode, reg);
+      return reg;
+    }
+
+  /* Need to extract bitfield part reg for the access.
+     left_bits != 0 or right_bits != 0 */
+  scalar_int_mode imode;
+  if (!int_mode_for_mode (expr_mode).exists (&imode))
+    return NULL_RTX;
+
+  if (expr_mode != imode
+      && known_gt (GET_MODE_SIZE (GET_MODE (regs)), UNITS_PER_WORD))
+    return NULL_RTX;
+
+  machine_mode mode = GET_MODE (reg);
+  bool sgn = TYPE_UNSIGNED (TREE_TYPE (expr));
+  rtx bfld = extract_bit_field (reg, bitsize, left_bits, sgn, NULL_RTX, mode,
+                               imode, reversep, NULL);
+
+  if (GET_MODE (bfld) != imode)
+    bfld = gen_lowpart (imode, bfld);
+
+  if (expr_mode == imode)
+    return bfld;
+
+  /* expr_mode != imode, e.g. SF != SI.  */
+  rtx result = gen_reg_rtx (imode);
+  emit_move_insn (result, bfld);
+  return gen_lowpart (expr_mode, result);
+}
+
+tree
+reference_alias_ptr_type (tree t);
+
+static void
+expand_ARG_PARTS (internal_fn, gcall *stmt)
+{
+  tree lhs = gimple_call_lhs (stmt);
+  tree arg = gimple_call_arg (stmt, 0);
+  HOST_WIDE_INT offset = tree_to_shwi (gimple_call_arg (stmt, 1));
+  HOST_WIDE_INT size = tree_to_shwi (gimple_call_arg (stmt, 2));
+  int reversep = tree_to_shwi (gimple_call_arg (stmt, 3));
+  rtx sub_elem = get_incoming_element (arg, offset, size, reversep, lhs);
+  if (sub_elem)
+    {
+      rtx to_rtx = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE);
+      if (to_rtx && REG_P (to_rtx))
+       {
+         emit_move_insn (to_rtx, sub_elem);
+         return;
+       }
+    }
+  /* Fall back to normal expand method. */
+  if ((offset % BITS_PER_WORD == 0) && (size % BITS_PER_WORD == 0))
+    {
+      tree base = build_fold_addr_expr (arg);
+      tree type = reference_alias_ptr_type (arg);
+      tree off = build_int_cst (type, offset / BITS_PER_UNIT);
+      location_t loc = EXPR_LOCATION (arg);
+      tree rhs = fold_build2_loc (loc, MEM_REF, TREE_TYPE (lhs), base, off);
+      REF_REVERSE_STORAGE_ORDER (rhs) = reversep;
+      expand_assignment (lhs, rhs, false);
+    }
+  else
+    {
+      tree type = TREE_TYPE (lhs);
+      machine_mode mode = TYPE_MODE (type);
+      rtx op0
+       = expand_expr_real (arg, NULL, VOIDmode, EXPAND_NORMAL, NULL, true);
+      op0 = extract_bit_field (op0, size, offset, TYPE_UNSIGNED (type), NULL,
+                              mode, mode, reversep, NULL);
+      rtx dest = expand_expr (lhs, NULL, VOIDmode, EXPAND_WRITE);
+      if (GET_CODE (dest) == SUBREG && SUBREG_PROMOTED_VAR_P (dest))
+       convert_move (SUBREG_REG (dest), op0, SUBREG_PROMOTED_SIGN (dest));
+      else
+       emit_move_insn (dest, op0);
+    }
+}
+
+static bool
+store_outgoing_element (rtx regs, HOST_WIDE_INT bitpos, HOST_WIDE_INT bitsize,
+                       tree rhs)
+{
+  if (GET_CODE (regs) != PARALLEL)
+    return false;
+
+  int start_index = -1;
+  int end_index = -1;
+  HOST_WIDE_INT left_bits = 0;
+  HOST_WIDE_INT right_bits = 0;
+  query_position_in_parallel (bitpos, bitsize, regs, start_index, end_index,
+                             left_bits, right_bits);
+
+  if (start_index < 0 || end_index < 0)
+    return false;
+
+  if (end_index != start_index)
+    return false;
+
+  if (!((left_bits == 0 && !BITS_BIG_ENDIAN)
+       || (right_bits == 0 && BITS_BIG_ENDIAN)))
+    return false;
+
+  /* Just need one reg for the access.  */
+  rtx dest = XEXP (XVECEXP (regs, 0, start_index), 0);
+  machine_mode mode = GET_MODE (dest);
+
+  if (left_bits != 0 || right_bits != 0)
+    {
+      machine_mode small_mode;
+      if (!SCALAR_INT_MODE_P (mode)
+         || !mode_for_size (bitsize, GET_MODE_CLASS (mode), 0)
+               .exists (&small_mode))
+       return false;
+
+      dest = gen_lowpart (small_mode, dest);
+      mode = small_mode;
+    }
+
+  rtx src = expand_expr (rhs, NULL_RTX, VOIDmode, EXPAND_NORMAL);
+  if (!src)
+    return false;
+
+  machine_mode src_mode = GET_MODE (src);
+  if (mode != src_mode)
+    src = gen_lowpart (mode, src);
+
+  emit_move_insn (dest, src);
+
+  return true;
+}
+
+static void
+expand_SET_RET_PARTS (internal_fn, gcall *stmt)
+{
+  HOST_WIDE_INT offset = tree_to_shwi (gimple_call_arg (stmt, 1));
+  HOST_WIDE_INT size = tree_to_shwi (gimple_call_arg (stmt, 2));
+  tree decl = DECL_RESULT (current_function_decl);
+  rtx dest_regs = decl->decl_with_rtl.rtl; // DECL_RTL (base);
+  tree rhs = gimple_call_arg (stmt, 3);
+  bool res = store_outgoing_element (dest_regs, offset, size, rhs);
+  if (!res)
+    {
+      tree base = gimple_call_arg (stmt, 0);
+      tree lhs = gimple_call_lhs (stmt);
+      expand_assignment (base, decl, false);
+      expand_assignment (lhs, rhs, false);
+      expand_assignment (decl, base, false);
+    }
+}
+
+static void
+expand_SET_RET_LAST_PARTS (internal_fn, gcall *stmt)
+{
+  expand_SET_RET_PARTS (IFN_SET_RET_PARTS, stmt);
+
+  tree decl = DECL_RESULT (current_function_decl);
+  rtx dest_regs = decl->decl_with_rtl.rtl; // DECL_RTL (base);
+  tree base = gimple_call_arg (stmt, 0);
+  base->decl_with_rtl.rtl = dest_regs; // SET_DECL_RTL
+}
+
+
 /* The size of an OpenACC compute dimension.  */
 
 static void
diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
index 848bb9dbff3..59a44e13161 100644
--- a/gcc/internal-fn.def
+++ b/gcc/internal-fn.def
@@ -510,6 +510,15 @@ DEF_INTERNAL_FN (PHI, 0, NULL)
    automatic variable.  */
 DEF_INTERNAL_FN (DEFERRED_INIT, ECF_CONST | ECF_LEAF | ECF_NOTHROW, NULL)
 
+/* A function to extract element(s) from an aggregate argument in fsra. */
+DEF_INTERNAL_FN (ARG_PARTS, ECF_CONST | ECF_LEAF | ECF_NOTHROW, NULL)
+
+/* Functions to set/construct element(s) for a 'return' aggregate. */
+DEF_INTERNAL_FN (SET_RET_PARTS, ECF_LEAF | ECF_NOTHROW, NULL)
+/* Functions to set/construct element(s) for a 'return' aggregate just before
+a return statement. */
+DEF_INTERNAL_FN (SET_RET_LAST_PARTS, ECF_LEAF | ECF_NOTHROW, NULL)
+
 /* DIM_SIZE and DIM_POS return the size of a particular compute
    dimension and the executing thread's position within that
    dimension.  DIM_POS is pure (and not const) so that it isn't
diff --git a/gcc/passes.def b/gcc/passes.def
index 1cbbd413097..183c1becd65 100644
--- a/gcc/passes.def
+++ b/gcc/passes.def
@@ -449,6 +449,8 @@ along with GCC; see the file COPYING3.  If not see
   NEXT_PASS (pass_harden_conditional_branches);
   NEXT_PASS (pass_harden_compares);
   NEXT_PASS (pass_warn_access, /*early=*/false);
+  NEXT_PASS (pass_sra_final);
+
   NEXT_PASS (pass_cleanup_cfg_post_optimizing);
   NEXT_PASS (pass_warn_function_noreturn);
 
diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h
index 29267589eeb..2d0e12bd1bb 100644
--- a/gcc/tree-pass.h
+++ b/gcc/tree-pass.h
@@ -366,6 +366,7 @@ extern gimple_opt_pass *make_pass_early_tree_profile 
(gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_cleanup_eh (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_sra (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_sra_early (gcc::context *ctxt);
+extern gimple_opt_pass *make_pass_sra_final (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_tail_recursion (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_tail_calls (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_fix_loops (gcc::context *ctxt);
diff --git a/gcc/tree-sra.cc b/gcc/tree-sra.cc
index f8e71ec48b9..2b2c9d9f83a 100644
--- a/gcc/tree-sra.cc
+++ b/gcc/tree-sra.cc
@@ -21,14 +21,16 @@ along with GCC; see the file COPYING3.  If not see
 <http://www.gnu.org/licenses/>.  */
 
 /* This file implements Scalar Reduction of Aggregates (SRA).  SRA is run
-   twice, once in the early stages of compilation (early SRA) and once in the
-   late stages (late SRA).  The aim of both is to turn references to scalar
-   parts of aggregates into uses of independent scalar variables.
+   three times: early in compilation (early SRA), late (late SRA), and right
+   before RTL expansion (final SRA).  The aim of them is to turn references to
+   scalar parts of aggregates into uses of independent scalar variables.
 
-   The two passes are nearly identical, the only difference is that early SRA
+   The three passes are nearly identical, the differences are that early SRA
    does not scalarize unions which are used as the result in a GIMPLE_RETURN
    statement because together with inlining this can lead to weird type
-   conversions.
+   conversions.  The third pass cares more about parameters and returns;
+   it is helpful for parameters and returns which are passed through
+   registers.
 
    Both passes operate in four stages:
 
@@ -104,6 +106,7 @@ along with GCC; see the file COPYING3.  If not see
 /* Enumeration of all aggregate reductions we can do.  */
 enum sra_mode { SRA_MODE_EARLY_IPA,   /* early call regularization */
                SRA_MODE_EARLY_INTRA, /* early intraprocedural SRA */
+               SRA_MODE_FINAL_INTRA, /* final gimple intraprocedural SRA */
                SRA_MODE_INTRA };     /* late intraprocedural SRA */
 
 /* Global variable describing which aggregate reduction we are performing at
@@ -1437,7 +1440,8 @@ build_accesses_from_assign (gimple *stmt)
     }
 
   if (lacc && racc
-      && (sra_mode == SRA_MODE_EARLY_INTRA || sra_mode == SRA_MODE_INTRA)
+      && (sra_mode == SRA_MODE_EARLY_INTRA || sra_mode == SRA_MODE_INTRA
+         || sra_mode == SRA_MODE_FINAL_INTRA)
       && !lacc->grp_unscalarizable_region
       && !racc->grp_unscalarizable_region
       && AGGREGATE_TYPE_P (TREE_TYPE (lhs))
@@ -1504,7 +1508,8 @@ scan_function (void)
          tree t;
          unsigned i;
 
-         if (gimple_code (stmt) != GIMPLE_CALL)
+         if (gimple_code (stmt) != GIMPLE_CALL
+             || sra_mode == SRA_MODE_FINAL_INTRA)
            walk_stmt_load_store_addr_ops (stmt, NULL, NULL, NULL,
                                           scan_visit_addr);
 
@@ -2149,6 +2154,24 @@ find_var_candidates (void)
        parm = DECL_CHAIN (parm))
     ret |= maybe_add_sra_candidate (parm);
 
+  /* fsra only cares about parameters and returns.  */
+  if (sra_mode == SRA_MODE_FINAL_INTRA)
+    {
+      if (!DECL_RESULT (current_function_decl))
+       return ret;
+
+      edge_iterator ei;
+      edge e;
+      FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
+       if (greturn *r = safe_dyn_cast<greturn *> (*gsi_last_bb (e->src)))
+         {
+           tree val = gimple_return_retval (r);
+           if (val && VAR_P (val))
+             ret |= maybe_add_sra_candidate (val);
+         }
+      return ret;
+    }
+
   FOR_EACH_LOCAL_DECL (cfun, i, var)
     {
       if (!VAR_P (var))
@@ -2745,12 +2768,31 @@ analyze_access_subtree (struct access *root, struct 
access *parent,
        hole = true;
     }
 
+  auto check_rw = [] (struct access *root) -> bool {
+    if ((root->grp_scalar_read || root->grp_assignment_read)
+       && (root->grp_scalar_write || root->grp_assignment_write))
+      return true;
+    if (sra_mode != SRA_MODE_FINAL_INTRA)
+      return false;
+    if ((root->grp_scalar_read || root->grp_assignment_read)
+       && TREE_CODE (root->base) == PARM_DECL)
+      return true;
+    /* Now in fsra (SRA_MODE_FINAL_INTRA), only PARAM and RETURNS
+       are candidates, so if "VAR_P (root->base)", then it is used by
+       a return stmt.
+       TODO: add a flag to root->base to indicate it is used by return
+       stmt.*/
+    if ((root->grp_scalar_write || root->grp_assignment_write)
+       && VAR_P (root->base))
+      return true;
+
+    return false;
+  };
+
+  /* In fsra, a parameter is scalarizable even with no writing to it.  */
   if (allow_replacements && scalar && !root->first_child
       && (totally || !root->grp_total_scalarization)
-      && (totally
-         || root->grp_hint
-         || ((root->grp_scalar_read || root->grp_assignment_read)
-             && (root->grp_scalar_write || root->grp_assignment_write))))
+      && (totally || root->grp_hint || check_rw (root)))
     {
       /* Always create access replacements that cover the whole access.
          For integral types this means the precision has to match.
@@ -2819,6 +2861,15 @@ analyze_access_subtree (struct access *root, struct 
access *parent,
     root->grp_covered = 1;
   else if (root->grp_write || comes_initialized_p (root->base))
     root->grp_unscalarized_data = 1; /* not covered and written to */
+
+  if (sra_mode == SRA_MODE_FINAL_INTRA)
+    {/* Does not support writes to PARAM and partial-unscalarized RET yet.  */
+      if (root->grp_unscalarized_data && (VAR_P (root->base)))
+       return false;
+      if (root->grp_write && TREE_CODE (root->base) == PARM_DECL)
+       return false;
+    }
+
   return sth_created;
 }
 
@@ -3780,7 +3831,7 @@ generate_subtree_copies (struct access *access, tree agg,
              || access->offset + access->size > start_offset))
        {
          tree expr, repl = get_access_replacement (access);
-         gassign *stmt;
+         gimple *stmt;
 
          expr = build_ref_for_model (loc, agg, access->offset - top_offset,
                                      access, gsi, insert_after);
@@ -3792,7 +3843,20 @@ generate_subtree_copies (struct access *access, tree agg,
                                                 !insert_after,
                                                 insert_after ? GSI_NEW_STMT
                                                 : GSI_SAME_STMT);
-             stmt = gimple_build_assign (repl, expr);
+             if (sra_mode == SRA_MODE_FINAL_INTRA
+                 && TREE_CODE (access->base) == PARM_DECL
+                 && (access->grp_scalar_read || access->grp_assignment_read))
+               {
+                 gimple *call = gimple_build_call_internal (
+                   IFN_ARG_PARTS, 4, access->base,
+                   wide_int_to_tree (sizetype, access->offset),
+                   wide_int_to_tree (sizetype, access->size),
+                   wide_int_to_tree (sizetype, access->reverse));
+                 gimple_call_set_lhs (call, repl);
+                 stmt = call;
+               }
+             else
+               stmt = gimple_build_assign (repl, expr);
            }
          else
            {
@@ -3802,7 +3866,25 @@ generate_subtree_copies (struct access *access, tree agg,
                                                 !insert_after,
                                                 insert_after ? GSI_NEW_STMT
                                                 : GSI_SAME_STMT);
-             stmt = gimple_build_assign (expr, repl);
+             if (sra_mode == SRA_MODE_FINAL_INTRA && VAR_P (access->base)
+                 && (access->grp_scalar_write || access->grp_assignment_write))
+               {
+                 enum internal_fn fcode;
+                 if (access->first_child == NULL
+                     && access->next_sibling == NULL)
+                   fcode = IFN_SET_RET_LAST_PARTS;
+                 else
+                   fcode = IFN_SET_RET_PARTS;
+
+                 gimple *call = gimple_build_call_internal (
+                   fcode, 4, access->base,
+                   wide_int_to_tree (sizetype, access->offset),
+                   wide_int_to_tree (sizetype, access->size), repl);
+                 gimple_call_set_lhs (call, expr);
+                 stmt = call;
+               }
+             else
+               stmt = gimple_build_assign (expr, repl);
            }
          gimple_set_location (stmt, loc);
 
@@ -5017,6 +5099,14 @@ late_intra_sra (void)
   return perform_intra_sra ();
 }
 
+/* Perform "final sra" intraprocedural SRA just before expander.  */
+static unsigned int
+final_intra_sra (void)
+{
+  sra_mode = SRA_MODE_FINAL_INTRA;
+  return perform_intra_sra ();
+}
+
 
 static bool
 gate_intra_sra (void)
@@ -5099,3 +5189,42 @@ make_pass_sra (gcc::context *ctxt)
 {
   return new pass_sra (ctxt);
 }
+
+namespace
+{
+const pass_data pass_data_sra_final = {
+  GIMPLE_PASS,          /* type */
+  "fsra",               /* name */
+  OPTGROUP_NONE,        /* optinfo_flags */
+  TV_TREE_SRA,          /* tv_id */
+  (PROP_cfg | PROP_ssa), /* properties_required */
+  0,                    /* properties_provided */
+  0,                    /* properties_destroyed */
+  0,                    /* todo_flags_start */
+  TODO_update_ssa,      /* todo_flags_finish */
+};
+
+class pass_sra_final : public gimple_opt_pass
+{
+public:
+  pass_sra_final (gcc::context *ctxt)
+    : gimple_opt_pass (pass_data_sra_final, ctxt)
+  {
+  }
+
+  /* opt_pass methods: */
+  bool gate (function *) final override { return gate_intra_sra (); }
+  unsigned int execute (function *) final override
+  {
+    return final_intra_sra ();
+  }
+
+}; // class pass_sra_final
+
+} // namespace
+
+gimple_opt_pass *
+make_pass_sra_final (gcc::context *ctxt)
+{
+  return new pass_sra_final (ctxt);
+}
diff --git a/gcc/testsuite/g++.target/powerpc/pr102024.C 
b/gcc/testsuite/g++.target/powerpc/pr102024.C
index 769585052b5..c8995cae707 100644
--- a/gcc/testsuite/g++.target/powerpc/pr102024.C
+++ b/gcc/testsuite/g++.target/powerpc/pr102024.C
@@ -5,7 +5,7 @@
 // Test that a zero-width bit field in an otherwise homogeneous aggregate
 // generates a psabi warning and passes arguments in GPRs.
 
-// { dg-final { scan-assembler-times {\mstd\M} 4 } }
+// { dg-final { scan-assembler-times {\mmtvsrd\M} 4 } }
 
 struct a_thing
 {
diff --git a/gcc/testsuite/gcc.target/powerpc/pr108073-1.c 
b/gcc/testsuite/gcc.target/powerpc/pr108073-1.c
new file mode 100644
index 00000000000..4892716e85f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr108073-1.c
@@ -0,0 +1,76 @@
+/* { dg-do run } */
+/* { dg-require-effective-target hard_float } */
+/* { dg-options "-O2 -save-temps" } */
+
+typedef struct DF
+{
+  double a[4];
+  short s1;
+  short s2;
+  short s3;
+  short s4;
+} DF;
+typedef struct SF
+{
+  float a[4];
+  int i1;
+  int i2;
+} SF;
+
+/* { dg-final { scan-assembler-times {\mmtvsrd|mtvsrws\M} 3 {target { lp64 && 
has_arch_pwr8 } } } } */
+/* { dg-final { scan-assembler-not {\mlwz\M} {target { lp64 && has_arch_pwr8 } 
} } } */
+/* { dg-final { scan-assembler-not {\mlhz\M} {target { lp64 && has_arch_pwr8 } 
} } } */
+
+#define NOIPA __attribute__ ((noipa))
+
+short NOIPA
+foo_hi (DF a, int flag)
+{
+  if (flag == 2)
+    return a.s2 + a.s3;
+  return 0;
+}
+int NOIPA
+foo_si (SF a, int flag)
+{
+  if (flag == 2)
+    return a.i2 + a.i1;
+  return 0;
+}
+double NOIPA
+foo_df (DF arg, int flag)
+{
+  if (flag == 2)
+    return arg.a[3];
+  else
+    return 0.0;
+}
+float NOIPA
+foo_sf (SF arg, int flag)
+{
+  if (flag == 2)
+    return arg.a[2];
+  return 0;
+}
+float NOIPA
+foo_sf1 (SF arg, int flag)
+{
+  if (flag == 2)
+    return arg.a[1];
+  return 0;
+}
+
+DF gdf = {{1.0, 2.0, 3.0, 4.0}, 1, 2, 3, 4};
+SF gsf = {{1.0f, 2.0f, 3.0f, 4.0f}, 1, 2};
+
+int
+main ()
+{
+  if (!(foo_hi (gdf, 2) == 5 && foo_si (gsf, 2) == 3 && foo_df (gdf, 2) == 4.0
+       && foo_sf (gsf, 2) == 3.0 && foo_sf1 (gsf, 2) == 2.0))
+    __builtin_abort ();
+  if (!(foo_hi (gdf, 1) == 0 && foo_si (gsf, 1) == 0 && foo_df (gdf, 1) == 0
+       && foo_sf (gsf, 1) == 0 && foo_sf1 (gsf, 1) == 0))
+    __builtin_abort ();
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/powerpc/pr108073.c 
b/gcc/testsuite/gcc.target/powerpc/pr108073.c
new file mode 100644
index 00000000000..4e7feaa6810
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr108073.c
@@ -0,0 +1,74 @@
+/* { dg-do run } */
+/* { dg-require-effective-target hard_float } */
+/* { dg-options "-O2 -save-temps" } */
+
+/* { dg-final { scan-assembler-times {\mmtvsrd|mtvsrws\M} 5 {target { lp64 && 
{ has_arch_pwr8 && be } } } } } */
+/* { dg-final { scan-assembler-times {\mxscvspdpn\M} 4 {target { lp64 && { 
has_arch_pwr8 && be } } } } } */
+/* { dg-final { scan-assembler-times {\mmtvsrd|mtvsrws\M} 3 {target { lp64 && 
{ has_arch_pwr8 && le } } } } } */
+/* { dg-final { scan-assembler-times {\mxscvspdpn\M} 2 {target { lp64 && { 
has_arch_pwr8 && le } } } } } */
+/* { dg-final { scan-assembler-times {\mfadds\M} 2 {target { lp64 && 
has_arch_pwr8 } } } } */
+
+#define NOIPA __attribute__ ((noipa))
+typedef struct X
+{
+  float x;
+  float y;
+} X;
+
+float NOIPA
+fooX (X y)
+{
+  y.x += 1;
+  return y.x + y.y;
+}
+
+typedef struct Y
+{
+  double a[4];
+  long l;
+} Y;
+
+double NOIPA
+fooY (Y arg)
+{
+  return arg.a[3];
+}
+
+typedef struct Z
+{
+  float a[4];
+  short l;
+} Z;
+
+float NOIPA
+fooZ (Z arg)
+{
+  return arg.a[3];
+}
+
+float NOIPA
+fooZ2 (Z arg)
+{
+  return arg.a[2];
+}
+
+X x = {1.0f, 2.0f};
+Y y = {1.0, 2.0, 3.0, 4.0, 1};
+Z z = {1.0f, 2.0f, 3.0f, 4.0f, 1};
+int
+main ()
+{
+  if (fooX (x) != 4.0f)
+    __builtin_abort ();
+
+  if (fooY (y) != 4.0)
+    __builtin_abort ();
+
+  if (fooZ (z) != 4.0f)
+    __builtin_abort ();
+
+  if (fooZ2 (z) != 3.0f)
+    __builtin_abort ();
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/powerpc/pr65421.c 
b/gcc/testsuite/gcc.target/powerpc/pr65421.c
new file mode 100644
index 00000000000..ea86b53afbb
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr65421.c
@@ -0,0 +1,10 @@
+/* { dg-require-effective-target hard_float } */
+/* { dg-require-effective-target powerpc_elfv2 } */
+/* { dg-options "-O2" } */
+
+/* { dg-final { scan-assembler-times {\mlfd\M} 4 {target { lp64 && 
has_arch_pwr8 } } } } */
+/* { dg-final { scan-assembler-not {\mstd\M} {target { lp64 && has_arch_pwr8 } 
} } } */
+/* { dg-final { scan-assembler-not {\mld\M} {target { lp64 && has_arch_pwr8 } 
} } } */
+
+typedef struct { double a[4]; } A;
+A foo (const A *a) { return *a; }
diff --git a/gcc/testsuite/gcc.target/powerpc/pr69143.c 
b/gcc/testsuite/gcc.target/powerpc/pr69143.c
new file mode 100644
index 00000000000..216a270fb7b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr69143.c
@@ -0,0 +1,23 @@
+/* { dg-require-effective-target hard_float } */
+/* { dg-require-effective-target powerpc_elfv2 } */
+/* { dg-options "-O2" } */
+
+/* { dg-final { scan-assembler-times {\mfmr\M} 3 {target { lp64 && 
has_arch_pwr8 } } } } */
+/* { dg-final { scan-assembler-not {\mxscvspdpn\M} {target { lp64 && 
has_arch_pwr8 } } } } */
+
+struct foo1
+{
+  float x;
+  float y;
+};
+
+struct foo1
+blah1 (struct foo1 y)
+{
+  struct foo1 x;
+
+  x.x = y.y;
+  x.y = y.x;
+
+  return x;
+}
-- 
2.25.1

Reply via email to