Hi,
I just updated the patch.  Please review this version.

Compare with the previous patch:
https://gcc.gnu.org/pipermail/gcc-patches/2023-August/627287.html

This version:
* Supports bitfield accesses that fit in one register.
* Allows the scalar return registers to be zeroed via a constructor.

Bootstrapped and regtested on x86_64-redhat-linux and
powerpc64{,le}-linux-gnu.

Is it ok for trunk?

	PR target/65421
	PR target/69143

gcc/ChangeLog:

	* cfgexpand.cc (extract_bit_field): Extern declare.
	(struct access): New class.
	(struct expand_sra): New class.
	(expand_sra::build_access): New member function.
	(expand_sra::visit_base): Likewise.
	(expand_sra::analyze_default_stmt): Likewise.
	(expand_sra::analyze_assign): Likewise.
	(expand_sra::add_sra_candidate): Likewise.
	(expand_sra::collect_sra_candidates): Likewise.
	(expand_sra::valid_scalariable_accesses): Likewise.
	(expand_sra::prepare_expander_sra): Likewise.
	(expand_sra::expand_sra): Class constructor.
	(expand_sra::~expand_sra): Class destructor.
	(expand_sra::get_scalarized_rtx): New member function.
	(extract_one_reg): New function.
	(extract_bitfield): New function.
	(expand_sra::scalarize_access): New member function.
	(expand_sra::scalarize_accesses): New member function.
	(get_scalar_rtx_for_aggregate_expr): New function.
	(set_scalar_rtx_for_aggregate_access): New function.
	(set_scalar_rtx_for_returns): New function.
	(expand_return): Call get_scalar_rtx_for_aggregate_expr.
	(expand_debug_expr): Call get_scalar_rtx_for_aggregate_expr.
	(pass_expand::execute): Update to use the expand_sra.
	* expr.cc (get_scalar_rtx_for_aggregate_expr): Extern declare.
	(expand_assignment): Call get_scalar_rtx_for_aggregate_expr.
	(expand_expr_real): Call get_scalar_rtx_for_aggregate_expr.
	* function.cc (set_scalar_rtx_for_aggregate_access): Extern declare.
	(set_scalar_rtx_for_returns): Extern declare.
	(assign_parm_setup_block): Call set_scalar_rtx_for_aggregate_access.
	(assign_parms): Call set_scalar_rtx_for_aggregate_access.
	(expand_function_start): Call set_scalar_rtx_for_returns.
	* tree-sra.h (struct base_access): New class.
	(struct default_analyzer): New class.
	(scan_function): New function template.

gcc/testsuite/ChangeLog:

	* g++.target/powerpc/pr102024.C: Updated.
	* gcc.target/powerpc/pr108073.c: New test.
	* gcc.target/powerpc/pr65421-1.c: New test.
	* gcc.target/powerpc/pr65421-2.c: New test.

---
 gcc/cfgexpand.cc                             | 474 ++++++++++++++++++-
 gcc/expr.cc                                  |  29 +-
 gcc/function.cc                              |  28 +-
 gcc/tree-sra.h                               |  77 +++
 gcc/testsuite/g++.target/powerpc/pr102024.C  |   2 +-
 gcc/testsuite/gcc.target/powerpc/pr108073.c  |  29 ++
 gcc/testsuite/gcc.target/powerpc/pr65421-1.c |   6 +
 gcc/testsuite/gcc.target/powerpc/pr65421-2.c |  32 ++
 8 files changed, 668 insertions(+), 9 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/powerpc/pr108073.c
 create mode 100644 gcc/testsuite/gcc.target/powerpc/pr65421-1.c
 create mode 100644 gcc/testsuite/gcc.target/powerpc/pr65421-2.c

diff --git a/gcc/cfgexpand.cc b/gcc/cfgexpand.cc
index edf292cfbe95ac2711faee7769e839cb4edb0dd3..385b6c781aa2805e7ca40293a0ae84f87e23e0b6 100644
--- a/gcc/cfgexpand.cc
+++ b/gcc/cfgexpand.cc
@@ -74,6 +74,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "output.h"
 #include "builtins.h"
 #include "opts.h"
+#include "tree-sra.h"
 
 /* Some systems use __main in a way incompatible with its use in gcc, in these
    cases use the macros NAME__MAIN to give a quoted symbol and SYMBOL__MAIN to
@@ -97,6 +98,468 @@ static bool defer_stack_allocation (tree, bool);
 
 static void record_alignment_for_reg_var (unsigned int);
 
+extern rtx extract_bit_field (rtx, poly_uint64, poly_uint64, int, rtx,
+			      machine_mode, machine_mode, bool, rtx *);
+
+/* For light SRA in the expander for parameters and returns.  */
+struct access : public base_access
+{
+  /* The rtx for the access: link to incoming/returning register(s).  */
+  rtx rtx_val;
+};
+
+typedef struct access *access_p;
+
+struct expand_sra : public default_analyzer
+{
+  expand_sra ();
+  ~expand_sra ();
+
+  /* Use the default APIs; no actions for
+     pre_analyze_stmt and analyze_return.  */
+
+  /* Override analyze_default_stmt.  */
+  void analyze_default_stmt (gimple *);
+
+  /* Override analyze_phi, analyze_call and analyze_asm.  */
+  void analyze_phi (gphi *stmt) { analyze_default_stmt (stmt); };
+  void analyze_call (gcall *stmt) { analyze_default_stmt (stmt); };
+  void analyze_asm (gasm *stmt) { analyze_default_stmt (stmt); };
+  /* Override analyze_assign.  */
+  void analyze_assign (gassign *);
+
+  /* Compute the scalar rtx(s) for all accesses of BASE from a parallel
+     REGS.  */
+  bool scalarize_accesses (tree base, rtx regs);
+  /* Return the scalarized rtx for EXPR.  */
+  rtx get_scalarized_rtx (tree expr);
+
+private:
+  void prepare_expander_sra (void);
+
+  /* Return true if VAR is a candidate for SRA.  */
+  bool add_sra_candidate (tree var);
+
+  /* Collect the parameters and returns whose type is suitable for
+     scalarization.  */
+  bool collect_sra_candidates (void);
+
+  /* Build and return an access if EXPR is an interesting access to an
+     SRA candidate; return NULL otherwise.  */
+  access_p build_access (tree expr, bool write);
+
+  /* Check if the accesses of BASE are scalarizable.  Currently only
+     parameters that are only read and returns that are only written
+     are supported.  */
+  bool valid_scalariable_accesses (vec<access_p> *access_vec, bool is_parm);
+
+  /* Compute the scalar rtx for one access ACC from a parallel REGS.  */
+  bool scalarize_access (access_p acc, rtx regs);
+
+  /* Callback of walk_stmt_load_store_addr_ops, used to remove
+     unscalarizable accesses.  */
+  static bool visit_base (gimple *, tree op, tree, void *data);
+
+  /* Expr (tree) -> Scalarized value (rtx) map.  */
+  hash_map<tree, rtx> *expr_rtx_vec;
+
+  /* Base (tree) -> Vector (vec<access_p> *) map.  */
+  hash_map<tree, auto_vec<access_p> > *base_access_vec;
+};
+
+access_p
+expand_sra::build_access (tree expr, bool write)
+{
+  enum tree_code code = TREE_CODE (expr);
+  if (code != VAR_DECL && code != PARM_DECL && code != COMPONENT_REF
+      && code != ARRAY_REF && code != ARRAY_RANGE_REF)
+    return NULL;
+
+  HOST_WIDE_INT offset, size;
+  bool reverse;
+  tree base = get_ref_base_and_extent_hwi (expr, &offset, &size, &reverse);
+  if (!base || !DECL_P (base))
+    return NULL;
+  if (storage_order_barrier_p (expr) || TREE_THIS_VOLATILE (expr))
+    {
+      base_access_vec->remove (base);
+      return NULL;
+    }
+
+  vec<access_p> *access_vec = base_access_vec->get (base);
+  if (!access_vec)
+    return NULL;
+
+  /* TODO: support reverse.  */
+  if (reverse || size <= 0 || offset + size > tree_to_shwi (DECL_SIZE (base)))
+    {
+      base_access_vec->remove (base);
+      return NULL;
+    }
+
+  struct access *access = XNEWVEC (struct access, 1);
+
+  memset (access, 0, sizeof (struct access));
+  access->offset = offset;
+  access->size = size;
+  access->expr = expr;
+  access->write = write;
+  access->rtx_val = NULL_RTX;
+
+  access_vec->safe_push (access);
+
+  return access;
+}
+
+bool
+expand_sra::visit_base (gimple *, tree op, tree, void *data)
+{
+  op = get_base_address (op);
+  if (op && DECL_P (op))
+    {
+      expand_sra *p = (expand_sra *) data;
+      p->base_access_vec->remove (op);
+    }
+  return false;
+}
+
+void
+expand_sra::analyze_default_stmt (gimple *stmt)
+{
+  if (base_access_vec && !base_access_vec->is_empty ())
+    walk_stmt_load_store_addr_ops (stmt, this, visit_base, visit_base,
+				   visit_base);
+}
+
+void
+expand_sra::analyze_assign (gassign *stmt)
+{
+  if (!base_access_vec || base_access_vec->is_empty ())
+    return;
+
+  if (gimple_assign_single_p (stmt) && !gimple_clobber_p (stmt))
+    {
+      tree rhs = gimple_assign_rhs1 (stmt);
+      tree lhs = gimple_assign_lhs (stmt);
+      bool res_r = build_access (rhs, false);
+      bool res_l = build_access (lhs, true);
+
+      if (res_l || res_r)
+	return;
+    }
+
+  analyze_default_stmt (stmt);
+}
+
+/* Return true if VAR is a candidate for SRA.  */
+
+bool
+expand_sra::add_sra_candidate (tree var)
+{
+  tree type = TREE_TYPE (var);
+
+  if (!AGGREGATE_TYPE_P (type) || !tree_fits_shwi_p (TYPE_SIZE (type))
+      || tree_to_shwi (TYPE_SIZE (type)) == 0 || TREE_THIS_VOLATILE (var)
+      || is_va_list_type (type))
+    return false;
+  gcc_assert (COMPLETE_TYPE_P (type));
+
+  base_access_vec->get_or_insert (var);
+
+  return true;
+}
+
+bool
+expand_sra::collect_sra_candidates (void)
+{
+  bool ret = false;
+
+  /* Collect parameters.  */
+  for (tree parm = DECL_ARGUMENTS (current_function_decl); parm;
+       parm = DECL_CHAIN (parm))
+    ret |= add_sra_candidate (parm);
+
+  /* Collect VARs on returns.  */
+  if (DECL_RESULT (current_function_decl))
+    {
+      edge_iterator ei;
+      edge e;
+      FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
+	if (greturn *r = safe_dyn_cast<greturn *> (*gsi_last_bb (e->src)))
+	  {
+	    tree val = gimple_return_retval (r);
+	    /* To scalarize the return, the return value should only be
+	       written, except by this return stmt.  So use 'true' (write)
+	       to create the access.  */
+	    if (val && VAR_P (val))
+	      ret |= add_sra_candidate (val) && build_access (val, true);
+	  }
+    }
+
+  return ret;
+}
+
+bool
+expand_sra::valid_scalariable_accesses (vec<access_p> *access_vec, bool is_parm)
+{
+  if (access_vec->is_empty ())
+    return false;
+
+  for (unsigned int j = 0; j < access_vec->length (); j++)
+    {
+      struct access *access = (*access_vec)[j];
+      if (is_parm && access->write)
+	return false;
+
+      if (!is_parm && !access->write)
+	return false;
+    }
+
+  return true;
+}
+
+void
+expand_sra::prepare_expander_sra ()
+{
+  if (optimize <= 0)
+    return;
+
+  base_access_vec = new hash_map<tree, auto_vec<access_p> >;
+  expr_rtx_vec = new hash_map<tree, rtx>;
+
+  collect_sra_candidates ();
+}
+
+expand_sra::expand_sra () : expr_rtx_vec (NULL), base_access_vec (NULL)
+{
+  prepare_expander_sra ();
+}
+
+expand_sra::~expand_sra ()
+{
+  if (optimize <= 0)
+    return;
+  delete expr_rtx_vec;
+  expr_rtx_vec = NULL;
+  delete base_access_vec;
+  base_access_vec = NULL;
+}
+
+rtx
+expand_sra::get_scalarized_rtx (tree expr)
+{
+  if (!expr_rtx_vec)
+    return NULL_RTX;
+  rtx *val = expr_rtx_vec->get (expr);
+  return val ? *val : NULL_RTX;
+}
+
+/* Get the register at INDEX from a parallel REGS.  */
+
+static rtx
+extract_one_reg (rtx regs, int index)
+{
+  rtx orig_reg = XEXP (XVECEXP (regs, 0, index), 0);
+  if (!HARD_REGISTER_P (orig_reg))
+    return orig_reg;
+
+  /* Reading from a param hard reg needs to go through a temp.  */
+  rtx reg = gen_reg_rtx (GET_MODE (orig_reg));
+  emit_move_insn (reg, orig_reg);
+  return reg;
+}
+
+/* Extract a bitfield of SIZE bits starting at POS from REG, as TMODE.  */
+
+static rtx
+extract_bitfield (rtx reg, int size, int pos, machine_mode tmode, bool unsignedp)
+{
+  scalar_int_mode imode;
+  if (!int_mode_for_mode (tmode).exists (&imode))
+    return NULL_RTX;
+
+  machine_mode mode = GET_MODE (reg);
+  bool reverse = false;
+  rtx bfld = extract_bit_field (reg, size, pos, unsignedp, NULL_RTX, mode,
+				imode, reverse, NULL);
+  mode = GET_MODE (bfld);
+  if (mode != imode)
+    bfld = gen_lowpart (imode, bfld);
+  rtx result = gen_reg_rtx (imode);
+  emit_move_insn (result, bfld);
+
+  if (tmode != imode)
+    result = gen_lowpart (tmode, result);
+
+  return result;
+}
+
+bool
+expand_sra::scalarize_access (access_p acc, rtx regs)
+{
+  machine_mode expr_mode = TYPE_MODE (TREE_TYPE (acc->expr));
+
+  /* A non-BLK mode that needs multiple registers is not supported.  */
+  if (expr_mode != BLKmode
+      && known_gt (acc->size, GET_MODE_BITSIZE (word_mode)))
+    return false;
+
+  /* Compute the position of the access in the whole parallel rtx.  */
+  int start_index = -1;
+  int end_index = -1;
+  HOST_WIDE_INT left_bits = 0;
+  HOST_WIDE_INT right_bits = 0;
+  int cur_index = XEXP (XVECEXP (regs, 0, 0), 0) ? 0 : 1;
+  for (; cur_index < XVECLEN (regs, 0); cur_index++)
+    {
+      rtx slot = XVECEXP (regs, 0, cur_index);
+      HOST_WIDE_INT off = UINTVAL (XEXP (slot, 1)) * BITS_PER_UNIT;
+      machine_mode mode = GET_MODE (XEXP (slot, 0));
+      HOST_WIDE_INT size = GET_MODE_BITSIZE (mode).to_constant ();
+      if (off <= acc->offset && off + size > acc->offset)
+	{
+	  start_index = cur_index;
+	  left_bits = acc->offset - off;
+	}
+      if (off + size >= acc->offset + acc->size)
+	{
+	  end_index = cur_index;
+	  right_bits = off + size - (acc->offset + acc->size);
+	  break;
+	}
+    }
+  /* Invalid access position: padding or out of bounds.  */
+  if (start_index < 0 || end_index < 0)
+    return false;
+
+  /* Need multiple registers in a parallel for the access.  */
+  if (expr_mode == BLKmode || end_index > start_index)
+    {
+      if (left_bits || right_bits)
+	return false;
+
+      int num_words = end_index - start_index + 1;
+      rtx *tmps = XALLOCAVEC (rtx, num_words);
+
+      int pos = 0;
+      HOST_WIDE_INT start;
+      start = UINTVAL (XEXP (XVECEXP (regs, 0, start_index), 1));
+      /* Extract whole registers.  */
+      for (; pos < num_words; pos++)
+	{
+	  int index = start_index + pos;
+	  rtx reg = extract_one_reg (regs, index);
+	  machine_mode mode = GET_MODE (reg);
+	  HOST_WIDE_INT off;
+	  off = UINTVAL (XEXP (XVECEXP (regs, 0, index), 1)) - start;
+	  tmps[pos] = gen_rtx_EXPR_LIST (mode, reg, GEN_INT (off));
+	}
+
+      rtx reg = gen_rtx_PARALLEL (expr_mode, gen_rtvec_v (pos, tmps));
+      acc->rtx_val = reg;
+      return true;
+    }
+
+  /* Just need one reg for the access.  */
+  if (end_index == start_index && left_bits == 0 && right_bits == 0)
+    {
+      rtx reg = extract_one_reg (regs, start_index);
+      if (GET_MODE (reg) != expr_mode)
+	reg = gen_lowpart (expr_mode, reg);
+
+      acc->rtx_val = reg;
+      return true;
+    }
+
+  /* Need to extract part of a register for the access.  */
+  if (!acc->write && end_index == start_index)
+    {
+      rtx reg = XEXP (XVECEXP (regs, 0, start_index), 0);
+      bool sgn = TYPE_UNSIGNED (TREE_TYPE (acc->expr));
+      acc->rtx_val
+	= extract_bitfield (reg, acc->size, left_bits, expr_mode, sgn);
+      if (acc->rtx_val)
+	return true;
+    }
+
+  return false;
+}
+
+bool
+expand_sra::scalarize_accesses (tree base, rtx regs)
+{
+  if (!base_access_vec)
+    return false;
+  vec<access_p> *access_vec = base_access_vec->get (base);
+  if (!access_vec)
+    return false;
+  bool is_parm = TREE_CODE (base) == PARM_DECL;
+  if (!valid_scalariable_accesses (access_vec, is_parm))
+    return false;
+
+  /* Go through each access and compute the corresponding rtx (regs or
+     subregs) for the expression.  */
+  int n = access_vec->length ();
+  int cur_access_index = 0;
+  for (; cur_access_index < n; cur_access_index++)
+    if (!scalarize_access ((*access_vec)[cur_access_index], regs))
+      break;
+
+  /* Bail out if any access could not be scalarized.  */
+  if (cur_access_index != n)
+    {
+      base_access_vec->remove (base);
+      return false;
+    }
+
+  /* Bind/map each expr (tree) to its scalarized rtx.  */
+  for (int j = 0; j < n; j++)
+    {
+      access_p access = (*access_vec)[j];
+      expr_rtx_vec->put (access->expr, access->rtx_val);
+    }
+
+  return true;
+}
+
+static expand_sra *current_sra = NULL;
+
+/* Check if there is an SRA access for EXPR.
+   Return the corresponding scalarized rtx for the access.  */
+
+rtx
+get_scalar_rtx_for_aggregate_expr (tree expr)
+{
+  return current_sra ? current_sra->get_scalarized_rtx (expr) : NULL_RTX;
+}
+
+/* Compute/set the RTX registers for the accesses on BASE.  */
+
+void
+set_scalar_rtx_for_aggregate_access (tree base, rtx regs)
+{
+  if (!current_sra)
+    return;
+  current_sra->scalarize_accesses (base, regs);
+}
+
+void
+set_scalar_rtx_for_returns ()
+{
+  if (!current_sra)
+    return;
+
+  tree res = DECL_RESULT (current_function_decl);
+  gcc_assert (res);
+  edge_iterator ei;
+  edge e;
+  FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
+    if (greturn *r = safe_dyn_cast<greturn *> (*gsi_last_bb (e->src)))
+      {
+	tree val = gimple_return_retval (r);
+	if (val && VAR_P (val))
+	  current_sra->scalarize_accesses (val, DECL_RTL (res));
+      }
+}
+
 /* Return an expression tree corresponding to the RHS of GIMPLE
    statement STMT.  */
 
@@ -3778,7 +4241,8 @@ expand_return (tree retval)
 
   /* If we are returning the RESULT_DECL, then the value has already
      been stored into it, so we don't have to do anything special.  */
-  if (TREE_CODE (retval_rhs) == RESULT_DECL)
+  if (TREE_CODE (retval_rhs) == RESULT_DECL
+      || get_scalar_rtx_for_aggregate_expr (retval_rhs))
     expand_value_return (result_rtl);
 
   /* If the result is an aggregate that is being returned in one (or more)
@@ -4422,6 +4886,9 @@ expand_debug_expr (tree exp)
   int unsignedp = TYPE_UNSIGNED (TREE_TYPE (exp));
   addr_space_t as;
   scalar_int_mode op0_mode, op1_mode, addr_mode;
+  rtx x = get_scalar_rtx_for_aggregate_expr (exp);
+  if (x)
+    return NULL_RTX;  /* Optimized out.  */
 
   switch (TREE_CODE_CLASS (TREE_CODE (exp)))
     {
@@ -6624,6 +7091,9 @@ pass_expand::execute (function *fun)
   auto_bitmap forced_stack_vars;
   discover_nonconstant_array_refs (forced_stack_vars);
 
+  current_sra = new expand_sra;
+  scan_function (cfun, *current_sra);
+
   /* Make sure all values used by the optimization passes have sane
      defaults.  */
   reg_renumber = 0;
@@ -7052,6 +7522,8 @@ pass_expand::execute (function *fun)
       loop_optimizer_finalize ();
     }
 
+  delete current_sra;
+  current_sra = NULL;
   timevar_pop (TV_POST_EXPAND);
 
   return 0;
diff --git a/gcc/expr.cc b/gcc/expr.cc
index 174f8acb269ab5450fc799516471d5a2bd9b9efa..57b037040d6d8e8c98b2befcb556221c0c5604c4 100644
--- a/gcc/expr.cc
+++ b/gcc/expr.cc
@@ -100,6 +100,7 @@ static void do_tablejump (rtx, machine_mode, rtx, rtx, rtx,
 static rtx const_vector_from_tree (tree);
 static tree tree_expr_size (const_tree);
 static void convert_mode_scalar (rtx, rtx, int);
+rtx get_scalar_rtx_for_aggregate_expr (tree);
 
 
 /* This is run to set up which modes can be used
@@ -5618,11 +5619,12 @@ expand_assignment (tree to, tree from, bool nontemporal)
      Assignment of an array element at a constant index, and assignment of
      an array element in an unaligned packed structure field, has the same
      problem.  Same for (partially) storing into a non-memory object.  */
-  if (handled_component_p (to)
-      || (TREE_CODE (to) == MEM_REF
-	  && (REF_REVERSE_STORAGE_ORDER (to)
-	      || mem_ref_refers_to_non_mem_p (to)))
-      || TREE_CODE (TREE_TYPE (to)) == ARRAY_TYPE)
+  if (!get_scalar_rtx_for_aggregate_expr (to)
+      && (handled_component_p (to)
+	  || (TREE_CODE (to) == MEM_REF
+	      && (REF_REVERSE_STORAGE_ORDER (to)
+		  || mem_ref_refers_to_non_mem_p (to)))
+	  || TREE_CODE (TREE_TYPE (to)) == ARRAY_TYPE))
     {
       machine_mode mode1;
       poly_int64 bitsize, bitpos;
@@ -8912,6 +8914,20 @@ expand_constructor (tree exp, rtx target, enum expand_modifier modifier,
       && ! mostly_zeros_p (exp))
     return NULL_RTX;
 
+  if (target && GET_CODE (target) == PARALLEL && all_zeros_p (exp))
+    {
+      int length = XVECLEN (target, 0);
+      int start = XEXP (XVECEXP (target, 0, 0), 0) ? 0 : 1;
+      for (int i = start; i < length; i++)
+	{
+	  rtx dst = XEXP (XVECEXP (target, 0, i), 0);
+	  rtx zero = CONST0_RTX (GET_MODE (dst));
+	  gcc_assert (zero);
+	  emit_move_insn (dst, zero);
+	}
+      return target;
+    }
+
   /* Handle calls that pass values in multiple non-contiguous locations.
      The Irix 6 ABI has examples of this.  */
   if (target == 0 || ! safe_from_p (target, exp, 1)
@@ -9006,6 +9022,9 @@ expand_expr_real (tree exp, rtx target, machine_mode tmode,
 	ret = CONST0_RTX (tmode);
       return ret ? ret : const0_rtx;
     }
+  rtx x = get_scalar_rtx_for_aggregate_expr (exp);
+  if (x)
+    return x;
 
   ret = expand_expr_real_1 (exp, target, tmode, modifier, alt_rtl,
 			    inner_reference_p);
diff --git a/gcc/function.cc b/gcc/function.cc
index dd2c1136e0725f55673f28e0eeaf4c91ad18e93f..7fe927bd36beac11466ca9fca12800892b57f0be 100644
--- a/gcc/function.cc
+++ b/gcc/function.cc
@@ -2740,6 +2740,9 @@ assign_parm_find_stack_rtl (tree parm, struct assign_parm_data_one *data)
   data->stack_parm = stack_parm;
 }
 
+extern void set_scalar_rtx_for_aggregate_access (tree, rtx);
+extern void set_scalar_rtx_for_returns ();
+
 /* A subroutine of assign_parms.  Adjust DATA->ENTRY_RTL such that it's
    always valid and contiguous.  */
 
@@ -3115,8 +3118,24 @@ assign_parm_setup_block (struct assign_parm_data_all *all,
 	    emit_move_insn (mem, entry_parm);
 	}
       else
-	move_block_from_reg (REGNO (entry_parm), mem,
-			     size_stored / UNITS_PER_WORD);
+	{
+	  int regno = REGNO (entry_parm);
+	  int nregs = size_stored / UNITS_PER_WORD;
+	  move_block_from_reg (regno, mem, nregs);
+
+	  rtx *tmps = XALLOCAVEC (rtx, nregs);
+	  machine_mode mode = word_mode;
+	  HOST_WIDE_INT word_size = GET_MODE_SIZE (mode).to_constant ();
+	  for (int i = 0; i < nregs; i++)
+	    {
+	      rtx reg = gen_rtx_REG (mode, regno + i);
+	      rtx off = GEN_INT (word_size * i);
+	      tmps[i] = gen_rtx_EXPR_LIST (VOIDmode, reg, off);
+	    }
+
+	  rtx regs = gen_rtx_PARALLEL (BLKmode, gen_rtvec_v (nregs, tmps));
+	  set_scalar_rtx_for_aggregate_access (parm, regs);
+	}
     }
   else if (data->stack_parm == 0 && !TYPE_EMPTY_P (data->arg.type))
     {
@@ -3716,6 +3735,10 @@ assign_parms (tree fndecl)
       else
 	set_decl_incoming_rtl (parm, data.entry_parm, false);
 
+      rtx incoming = DECL_INCOMING_RTL (parm);
+      if (GET_CODE (incoming) == PARALLEL)
+	set_scalar_rtx_for_aggregate_access (parm, incoming);
+
       assign_parm_adjust_stack_rtl (&data);
 
       if (assign_parm_setup_block_p (&data))
@@ -5136,6 +5159,7 @@ expand_function_start (tree subr)
 	{
 	  gcc_assert (GET_CODE (hard_reg) == PARALLEL);
 	  set_parm_rtl (res, gen_group_rtx (hard_reg));
+	  set_scalar_rtx_for_returns ();
 	}
     }
 
diff --git a/gcc/tree-sra.h b/gcc/tree-sra.h
index f20266c46226f7840299a768cb575f6f92b54207..bd0396e672b30f7ef66c305d8d131e91639039d7 100644
--- a/gcc/tree-sra.h
+++ b/gcc/tree-sra.h
@@ -19,6 +19,83 @@ You should have received a copy of the GNU General Public License
 along with GCC; see the file COPYING3.  If not see
 <http://www.gnu.org/licenses/>.  */
 
+struct base_access
+{
+  /* Values returned by get_ref_base_and_extent: the OFFSET, SIZE and
+     BASE of the access.  */
+  HOST_WIDE_INT offset;
+  HOST_WIDE_INT size;
+  tree base;
+
+  /* The context expression of this access.  */
+  tree expr;
+
+  /* Indicates this is a write access.  */
+  bool write : 1;
+
+  /* Indicates if this access is made in reverse storage order.  */
+  bool reverse : 1;
+};
+
+/* Default analyzer template for scan_function.  */
+
+struct default_analyzer
+{
+  /* Template analyze functions.  */
+  void analyze_phi (gphi *){};
+  void pre_analyze_stmt (gimple *){};
+  void analyze_return (greturn *){};
+  void analyze_assign (gassign *){};
+  void analyze_call (gcall *){};
+  void analyze_asm (gasm *){};
+  void analyze_default_stmt (gimple *){};
+};
+
+/* Scan function and look for interesting expressions.  */
+
+template <typename analyzer>
+void
+scan_function (struct function *fun, analyzer &a)
+{
+  basic_block bb;
+  FOR_EACH_BB_FN (bb, fun)
+    {
+      for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
+	   gsi_next (&gsi))
+	a.analyze_phi (gsi.phi ());
+
+      for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
+	   gsi_next (&gsi))
+	{
+	  gimple *stmt = gsi_stmt (gsi);
+	  a.pre_analyze_stmt (stmt);
+
+	  switch (gimple_code (stmt))
+	    {
+	    case GIMPLE_RETURN:
+	      a.analyze_return (as_a<greturn *> (stmt));
+	      break;
+
+	    case GIMPLE_ASSIGN:
+	      a.analyze_assign (as_a<gassign *> (stmt));
+	      break;
+
+	    case GIMPLE_CALL:
+	      a.analyze_call (as_a<gcall *> (stmt));
+	      break;
+
+	    case GIMPLE_ASM:
+	      a.analyze_asm (as_a<gasm *> (stmt));
+	      break;
+
+	    default:
+	      a.analyze_default_stmt (stmt);
+	      break;
+	    }
+	}
+    }
+}
+
 bool type_internals_preclude_sra_p (tree type, const char **msg);
 
 /* Return true iff TYPE is stdarg va_list type (which early SRA and IPA-SRA
diff --git a/gcc/testsuite/g++.target/powerpc/pr102024.C b/gcc/testsuite/g++.target/powerpc/pr102024.C
index 769585052b507ad971868795f861106230c976e3..c8995cae707bb6e2e849275b823d2ba14d24a966 100644
--- a/gcc/testsuite/g++.target/powerpc/pr102024.C
+++ b/gcc/testsuite/g++.target/powerpc/pr102024.C
@@ -5,7 +5,7 @@
 // Test that a zero-width bit field in an otherwise homogeneous aggregate
 // generates a psabi warning and passes arguments in GPRs.
 
-// { dg-final { scan-assembler-times {\mstd\M} 4 } }
+// { dg-final { scan-assembler-times {\mmtvsrd\M} 4 } }
 
 struct a_thing
 {
diff --git a/gcc/testsuite/gcc.target/powerpc/pr108073.c b/gcc/testsuite/gcc.target/powerpc/pr108073.c
new file mode 100644
index 0000000000000000000000000000000000000000..7dd1a4a326a181e0f35c9418af20a9bebabdfe4b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr108073.c
@@ -0,0 +1,29 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -save-temps" } */
+
+typedef struct DF {double a[4]; short s1; short s2; short s3; short s4; } DF;
+typedef struct SF {float a[4]; int i1; int i2; } SF;
+
+/* { dg-final { scan-assembler-times {\mmtvsrd\M} 3 {target { has_arch_ppc64 && has_arch_pwr8 } } } } */
+/* { dg-final { scan-assembler-not {\mlwz\M} {target { has_arch_ppc64 && has_arch_pwr8 } } } } */
+/* { dg-final { scan-assembler-not {\mlhz\M} {target { has_arch_ppc64 && has_arch_pwr8 } } } } */
+short __attribute__ ((noipa)) foo_hi (DF a, int flag){if (flag == 2)return a.s2+a.s3;return 0;}
+int __attribute__ ((noipa)) foo_si (SF a, int flag){if (flag == 2)return a.i2+a.i1;return 0;}
+double __attribute__ ((noipa)) foo_df (DF arg, int flag){if (flag == 2)return arg.a[3];else return 0.0;}
+float __attribute__ ((noipa)) foo_sf (SF arg, int flag){if (flag == 2)return arg.a[2]; return 0;}
+float __attribute__ ((noipa)) foo_sf1 (SF arg, int flag){if (flag == 2)return arg.a[1];return 0;}
+
+DF gdf = {{1.0,2.0,3.0,4.0}, 1, 2, 3, 4};
+SF gsf = {{1.0f,2.0f,3.0f,4.0f}, 1, 2};
+
+int main()
+{
+  if (!(foo_hi (gdf, 2) == 5 && foo_si (gsf, 2) == 3 && foo_df (gdf, 2) == 4.0
+	&& foo_sf (gsf, 2) == 3.0 && foo_sf1 (gsf, 2) == 2.0))
+    __builtin_abort ();
+  if (!(foo_hi (gdf, 1) == 0 && foo_si (gsf, 1) == 0 && foo_df (gdf, 1) == 0
+	&& foo_sf (gsf, 1) == 0 && foo_sf1 (gsf, 1) == 0))
+    __builtin_abort ();
+  return 0;
+}
+
diff --git a/gcc/testsuite/gcc.target/powerpc/pr65421-1.c b/gcc/testsuite/gcc.target/powerpc/pr65421-1.c
new file mode 100644
index 0000000000000000000000000000000000000000..4e1f87f7939cbf1423772023ee392fc5200b6708
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr65421-1.c
@@ -0,0 +1,6 @@
+/* PR target/65421 */
+/* { dg-options "-O2" } */
+
+typedef struct LARGE {double a[4]; int arr[32];} LARGE;
+LARGE foo (LARGE a){return a;}
+/* { dg-final { scan-assembler-times {\mmemcpy\M} 1 } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/pr65421-2.c b/gcc/testsuite/gcc.target/powerpc/pr65421-2.c
new file mode 100644
index 0000000000000000000000000000000000000000..8a8e1a0e9962317ba2c0942af8891b3c51f4d3a4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr65421-2.c
@@ -0,0 +1,32 @@
+/* PR target/65421 */
+/* { dg-options "-O2" } */
+/* { dg-require-effective-target powerpc_elfv2 } */
+/* { dg-require-effective-target has_arch_ppc64 } */
+
+typedef struct FLOATS
+{
+  double a[3];
+} FLOATS;
+
+/* 3 lfd after returns also optimized */
+/* FLOATS ret_arg_pt (FLOATS *a){return *a;} */
+
+/* 3 stfd */
+void st_arg (FLOATS a, FLOATS *p) {*p = a;}
+/* { dg-final { scan-assembler-times {\mstfd\M} 3 } } */
+
+/* blr */
+FLOATS ret_arg (FLOATS a) {return a;}
+
+typedef struct MIX
+{
+  double a[2];
+  long l;
+} MIX;
+
+/* std 3 param regs to return slot */
+MIX ret_arg1 (MIX a) {return a;}
+/* { dg-final { scan-assembler-times {\mstd\M} 3 } } */
+
+/* count insns */
+/* { dg-final { scan-assembler-times {(?n)^\s+[a-z]} 9 } } */
--
2.25.1

BR,
Jeff (Jiufu Guo)

Jiufu Guo <guoji...@linux.ibm.com> writes:

> Hi,
>
> There are a few PRs about the issues on the struct parameters and
> returns, like PRs 69143/65421/108073.
>
> We could consider introducing a light SRA in the expander to
> handle those parameters and returns in aggregate type, if they
> are passed through registers.  For accesses to the fields of
> the parameters or returns, the corresponding scalar registers
> can be used.
>
> As discussed:
> https://gcc.gnu.org/pipermail/gcc-patches/2023-May/619884.html
>
> This is an initial patch for the light-expander-sra.
>
> Bootstrapped and regtested on x86_64-redhat-linux, and
> powerpc64{,le}-linux-gnu.
>
> Is it ok for trunk?
>
> BR,
> Jeff (Jiufu Guo)
>
> [... rest of the previous version of the patch snipped; see the link above ...]
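P.S.  To illustrate the kind of code the light expander SRA helps (a
minimal sketch distilled from the pr65421-2.c and pr108073.c testcases
above; the exact assembly depends on the target, assuming -O2 on
powerpc64le/ELFv2 here):

  /* 'a' arrives in FPRs f1..f3, and the result is returned in f1..f3.
     Without the patch the aggregate is spilled to the stack and
     reloaded; with it, the accesses below use the registers directly.  */
  typedef struct FLOATS { double a[3]; } FLOATS;

  FLOATS ret_arg (FLOATS a) { return a; }     /* Now just "blr".  */

  double field (FLOATS a) { return a.a[1]; }  /* A register move from f2.  */

  /* The constructor case: the return registers are zeroed directly,
     with no stack temporary.  */
  FLOATS zero_ret (void) { FLOATS z = {{0.0, 0.0, 0.0}}; return z; }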