Hi Richard,
Thanks so much for your great review! Richard Biener <rguent...@suse.de> writes: > On Wed, 23 Aug 2023, Jiufu Guo wrote: > >> >> Hi, >> >> I just updated the patch. We could review this one. >> >> Compare with previous patch: >> https://gcc.gnu.org/pipermail/gcc-patches/2023-August/627287.html >> This version: >> * Supports bitfield access from one register. >> * Allow return scalar registers cleaned via contructor. >> >> Bootstrapped and regtested on x86_64-redhat-linux, and >> powerpc64{,le}-linux-gnu. >> >> Is it ok for trunk? > > Some comments inline - not a full review (and sorry for the delay). > >> >> PR target/65421 >> PR target/69143 >> >> gcc/ChangeLog: >> >> * cfgexpand.cc (extract_bit_field): Extern declare. >> (struct access): New class. >> (struct expand_sra): New class. >> (expand_sra::build_access): New member function. >> (expand_sra::visit_base): Likewise. >> (expand_sra::analyze_default_stmt): Likewise. >> (expand_sra::analyze_assign): Likewise. >> (expand_sra::add_sra_candidate): Likewise. >> (expand_sra::collect_sra_candidates): Likewise. >> (expand_sra::valid_scalariable_accesses): Likewise. >> (expand_sra::prepare_expander_sra): Likewise. >> (expand_sra::expand_sra): Class constructor. >> (expand_sra::~expand_sra): Class destructor. >> (expand_sra::get_scalarized_rtx): New member function. >> (extract_one_reg): New function. >> (extract_bitfield): New function. >> (expand_sra::scalarize_access): New member function. >> (expand_sra::scalarize_accesses): New member function. >> (get_scalar_rtx_for_aggregate_expr): New function. >> (set_scalar_rtx_for_aggregate_access): New function. >> (set_scalar_rtx_for_returns): New function. >> (expand_return): Call get_scalar_rtx_for_aggregate_expr. >> (expand_debug_expr): Call get_scalar_rtx_for_aggregate_expr. >> (pass_expand::execute): Update to use the expand_sra. >> * expr.cc (get_scalar_rtx_for_aggregate_expr): Extern declare. >> (expand_assignment): Call get_scalar_rtx_for_aggregate_expr. 
>> (expand_expr_real): Call get_scalar_rtx_for_aggregate_expr. >> * function.cc (set_scalar_rtx_for_aggregate_access): Extern declare. >> (set_scalar_rtx_for_returns): Extern declare. >> (assign_parm_setup_block): Call set_scalar_rtx_for_aggregate_access. >> (assign_parms): Call set_scalar_rtx_for_aggregate_access. >> (expand_function_start): Call set_scalar_rtx_for_returns. >> * tree-sra.h (struct base_access): New class. >> (struct default_analyzer): New class. >> (scan_function): New function template. >> >> gcc/testsuite/ChangeLog: >> >> * g++.target/powerpc/pr102024.C: Updated. >> * gcc.target/powerpc/pr108073.c: New test. >> * gcc.target/powerpc/pr65421-1.c: New test. >> * gcc.target/powerpc/pr65421-2.c: New test. >> >> --- >> gcc/cfgexpand.cc | 474 ++++++++++++++++++- >> gcc/expr.cc | 29 +- >> gcc/function.cc | 28 +- >> gcc/tree-sra.h | 77 +++ >> gcc/testsuite/g++.target/powerpc/pr102024.C | 2 +- >> gcc/testsuite/gcc.target/powerpc/pr108073.c | 29 ++ >> gcc/testsuite/gcc.target/powerpc/pr65421-1.c | 6 + >> gcc/testsuite/gcc.target/powerpc/pr65421-2.c | 32 ++ >> 8 files changed, 668 insertions(+), 9 deletions(-) >> create mode 100644 gcc/testsuite/gcc.target/powerpc/pr108073.c >> create mode 100644 gcc/testsuite/gcc.target/powerpc/pr65421-1.c >> create mode 100644 gcc/testsuite/gcc.target/powerpc/pr65421-2.c >> >> diff --git a/gcc/cfgexpand.cc b/gcc/cfgexpand.cc >> index >> edf292cfbe95ac2711faee7769e839cb4edb0dd3..385b6c781aa2805e7ca40293a0ae84f87e23e0b6 >> 100644 >> --- a/gcc/cfgexpand.cc >> +++ b/gcc/cfgexpand.cc >> @@ -74,6 +74,7 @@ along with GCC; see the file COPYING3. 
If not see >> #include "output.h" >> #include "builtins.h" >> #include "opts.h" >> +#include "tree-sra.h" >> >> /* Some systems use __main in a way incompatible with its use in gcc, in >> these >> cases use the macros NAME__MAIN to give a quoted symbol and SYMBOL__MAIN >> to >> @@ -97,6 +98,468 @@ static bool defer_stack_allocation (tree, bool); >> >> static void record_alignment_for_reg_var (unsigned int); >> >> +extern rtx extract_bit_field (rtx, poly_uint64, poly_uint64, int, rtx, >> + machine_mode, machine_mode, bool, rtx *); > > belongs in some header Thanks. Will update. > >> + >> +/* For light SRA in expander about paramaters and returns. */ >> +struct access : public base_access >> +{ >> + /* The rtx for the access: link to incoming/returning register(s). */ >> + rtx rtx_val; >> +}; >> + >> +typedef struct access *access_p; >> + >> +struct expand_sra : public default_analyzer >> +{ > > Both 'base_access' and 'default_analyzer' need a more specific > name I think. Just throwing in two names here, > 'sra_access_base' and 'sra_default_analyzer' Thanks! > >> + expand_sra (); >> + ~expand_sra (); >> + >> + /* Now use default APIs, no actions for >> + pre_analyze_stmt, analyze_return. */ >> + >> + /* overwrite analyze_default_stmt. */ >> + void analyze_default_stmt (gimple *); >> + >> + /* overwrite analyze phi,call,asm . */ >> + void analyze_phi (gphi *stmt) { analyze_default_stmt (stmt); }; >> + void analyze_call (gcall *stmt) { analyze_default_stmt (stmt); }; >> + void analyze_asm (gasm *stmt) { analyze_default_stmt (stmt); }; > > that looks odd I would define another function with a name like: protect_mem_access_in_stmt (gimple*) "analyze_default_stmt and analyze_phi" call this function. analyze_call and analyze_asm would be updated to analyze sra accesses on call/asm stmt later. > >> + /* overwrite analyze_assign. */ >> + void analyze_assign (gassign *); >> + >> + /* Compute the scalar rtx(s) for all access of BASE from a parrallel >> REGS. 
*/ >> + bool scalarize_accesses (tree base, rtx regs); >> + /* Return the scalarized rtx for EXPR. */ >> + rtx get_scalarized_rtx (tree expr); >> + >> +private: >> + void prepare_expander_sra (void); >> + >> + /* Return true if VAR is a candidate for SRA. */ >> + bool add_sra_candidate (tree var); >> + >> + /* Collect the parameter and returns with type which is suitable for >> + scalarization. */ >> + bool collect_sra_candidates (void); >> + >> + /* Return true if EXPR has interesting access to the sra candidates, >> + and created access, return false otherwise. */ >> + access_p build_access (tree expr, bool write); >> + >> + /* Check if the accesses of BASE are scalarizbale. >> + Now support the parms only with reading or returns only with writing. >> */ >> + bool valid_scalariable_accesses (vec<access_p> *access_vec, bool is_parm); >> + >> + /* Compute the scalar rtx for one access ACC from a parrallel REGS. */ >> + bool scalarize_access (access_p acc, rtx regs); >> + >> + /* Callback of walk_stmt_load_store_addr_ops, used to remove >> + unscalarizable accesses. */ >> + static bool visit_base (gimple *, tree op, tree, void *data); >> + >> + /* Expr (tree) -> Scalarized value (rtx) map. */ >> + hash_map<tree, rtx> *expr_rtx_vec; >> + >> + /* Base (tree) -> Vector (vec<access_p> *) map. 
*/ >> + hash_map<tree, auto_vec<access_p> > *base_access_vec; >> +}; >> + >> +access_p >> +expand_sra::build_access (tree expr, bool write) >> +{ >> + enum tree_code code = TREE_CODE (expr); >> + if (code != VAR_DECL && code != PARM_DECL && code != COMPONENT_REF >> + && code != ARRAY_REF && code != ARRAY_RANGE_REF) >> + return NULL; >> + >> + HOST_WIDE_INT offset, size; >> + bool reverse; >> + tree base = get_ref_base_and_extent_hwi (expr, &offset, &size, &reverse); >> + if (!base || !DECL_P (base)) >> + return NULL; >> + if (storage_order_barrier_p (expr) || TREE_THIS_VOLATILE (expr)) >> + { >> + base_access_vec->remove (base); >> + return NULL; >> + } >> + >> + vec<access_p> *access_vec = base_access_vec->get (base); >> + if (!access_vec) >> + return NULL; >> + >> + /* TODO: support reverse. */ >> + if (reverse || size <= 0 || offset + size > tree_to_shwi (DECL_SIZE >> (base))) >> + { >> + base_access_vec->remove (base); >> + return NULL; >> + } >> + >> + struct access *access = XNEWVEC (struct access, 1); >> + >> + memset (access, 0, sizeof (struct access)); >> + access->offset = offset; >> + access->size = size; >> + access->expr = expr; >> + access->write = write; >> + access->rtx_val = NULL_RTX; >> + >> + access_vec->safe_push (access); > > doesn't look like this shares anything with SRA? For this code in tree-sra and ipa-sra, they are just "**seems**" similar, but they are different, even extracting the "common" code slightly, it may look crushed. I would still try to common the code. 
> >> + return access; >> +} >> + >> +bool >> +expand_sra::visit_base (gimple *, tree op, tree, void *data) >> +{ >> + op = get_base_address (op); >> + if (op && DECL_P (op)) >> + { >> + expand_sra *p = (expand_sra *) data; >> + p->base_access_vec->remove (op); >> + } >> + return false; >> +} >> + >> +void >> +expand_sra::analyze_default_stmt (gimple *stmt) >> +{ >> + if (base_access_vec && !base_access_vec->is_empty ()) >> + walk_stmt_load_store_addr_ops (stmt, this, visit_base, visit_base, >> + visit_base); > > comments would be helpful. This seems to remove all bases for > references based on a decl?! Are you really interested in addrs? Yes. In tree-sra/ipa-sra, stmt (assign/call/asm/return) are checked, and may disqualify (remove) the base if there is a risk. This patch is relatively conservative, it only keeps sra candidates on assignment. For example: "%_7=&arg", and %_7 is used for some purpose. For this case, "arg" is address-taken, and then it needs a mem for it, and then it would be risky to do sra without deeper analysis. 
> > Why do we remove bases - since this is all about heuristics it should > be OK to "ignore" accesses we cannot / do not analyze? Removing the whole base is simple. To support partial sra, we need more work. - Need more analysis to make sure the scalarized part is not affected by the access to other parts of the same base. - We need to allow the parameter to be stored to stack for non-sra access, and also allow access to the scalarized registers at the same time. Need to figure out one way to let DECL_RTL registers and DECL_RTL of stack co-exist. > > Why doesn't this use walk_stmt_load_store_addr_ops as well? Two reasons: 1. The callback of this walk has "base" as a parameter, but we need the 'access expr' which has "base, offset, size" info. 2. The most common case for the parameter sra is reading/writing field(s) of a parameter. Then what we care about is: if "lhs/rhs" is a field/member access of an aggregate parameter/return. Then checking "lhs/rhs" directly would be simpler. > >> + return; >> + } >> + >> + analyze_default_stmt (stmt); >> +} >> + >> +/* Return true if VAR is a candidate for SRA. */ >> + >> +bool >> +expand_sra::add_sra_candidate (tree var) >> +{ >> + tree type = TREE_TYPE (var); >> + >> + if (!AGGREGATE_TYPE_P (type) || !tree_fits_shwi_p (TYPE_SIZE (type)) >> + || tree_to_shwi (TYPE_SIZE (type)) == 0 || TREE_THIS_VOLATILE (var) >> + || is_va_list_type (type)) >> + return false; >> + gcc_assert (COMPLETE_TYPE_P (type)); >> + >> + base_access_vec->get_or_insert (var); >> + >> + return true; >> +} >> + >> +bool >> +expand_sra::collect_sra_candidates (void) >> +{ >> + bool ret = false; >> + >> + /* Collect parameters. */ >> + for (tree parm = DECL_ARGUMENTS (current_function_decl); parm; >> + parm = DECL_CHAIN (parm)) >> + ret |= add_sra_candidate (parm); >> + >> + /* Collect VARs on returns. 
*/ >> + if (DECL_RESULT (current_function_decl)) >> + { >> + edge_iterator ei; >> + edge e; >> + FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds) >> + if (greturn *r = safe_dyn_cast<greturn *> (*gsi_last_bb (e->src))) >> + { >> + tree val = gimple_return_retval (r); >> + /* To sclaraized the return, the return value should be only >> + writen, except this return stmt. >> + Then using 'true(write)' to create the access. */ >> + if (val && VAR_P (val)) > > what about RESULT_DECLs? I think you want to check > needs_to_live_in_memory. If RESULT_DECL is on the return-stmts and return via registers, then the DECL_RTL of RESULT_DECL is already as "PARALLEL" code. We already handled this case fine. In this patch, we only need to handle a special case: e.g. D.2872 = XXX; return D.2872. In this case, a VAR(D.2872) is allocated, and it is not the RESULT_DECL, and return-stmt returns the VAR(D.2872). > >> + ret |= add_sra_candidate (val) && build_access (val, true); >> + } >> + } >> + >> + return ret; >> +} >> + >> +bool >> +expand_sra::valid_scalariable_accesses (vec<access_p> *access_vec, bool >> is_parm) >> +{ >> + if (access_vec->is_empty ()) >> + return false; >> + >> + for (unsigned int j = 0; j < access_vec->length (); j++) >> + { >> + struct access *access = (*access_vec)[j]; >> + if (is_parm && access->write) >> + return false; >> + >> + if (!is_parm && !access->write) >> + return false; > > Can you explain why you are making this restriction? With this restriction, it would be safe/simple, and would be able to cover most cases. tree-sra/ipa-sra handle most of the cases already. For writing to parameter: like "a.d += xx;" The SSA code may like: "a$d_4 = a.d; _2 = _1 + a$d_4;" There is no "writing to parameter" anymore. While there are still cases about "writing" to parameter: e.g. "a$d_11 = a.d; a.d = _2; _7 = bar (a)" To support this, we may need more analysis to do. 
> >> + } >> + >> + return true; >> +} >> + >> +void >> +expand_sra::prepare_expander_sra () >> +{ >> + if (optimize <= 0) >> + return; >> + >> + base_access_vec = new hash_map<tree, auto_vec<access_p> >; >> + expr_rtx_vec = new hash_map<tree, rtx>; >> + >> + collect_sra_candidates (); >> +} >> + >> +expand_sra::expand_sra () : expr_rtx_vec (NULL), base_access_vec (NULL) >> +{ >> + prepare_expander_sra (); > > inline this function here. Thanks. > >> +} >> + >> +expand_sra::~expand_sra () >> +{ >> + if (optimize <= 0) >> + return; >> + delete expr_rtx_vec; >> + expr_rtx_vec = NULL; >> + delete base_access_vec; >> + base_access_vec = NULL; >> +} >> + >> +rtx >> +expand_sra::get_scalarized_rtx (tree expr) >> +{ >> + if (!expr_rtx_vec) >> + return NULL_RTX; >> + rtx *val = expr_rtx_vec->get (expr); >> + return val ? *val : NULL_RTX; >> +} >> + >> +/* Get the register at INDEX from a parallel REGS. */ >> + >> +static rtx >> +extract_one_reg (rtx regs, int index) >> +{ >> + rtx orig_reg = XEXP (XVECEXP (regs, 0, index), 0); >> + if (!HARD_REGISTER_P (orig_reg)) >> + return orig_reg; >> + >> + /* Reading from param hard reg need to be moved to a temp. */ >> + rtx reg = gen_reg_rtx (GET_MODE (orig_reg)); >> + emit_move_insn (reg, orig_reg); >> + return reg; >> +} >> + >> +/* Extract bitfield from REG with SIZE as MODE, start from POS. */ >> + >> +static rtx >> +extract_bitfield (rtx reg, int size, int pos, machine_mode tmode, bool >> unsignedp) > > quite a bad name for extract_bit_field which we already have ... > I wonder why you need this at all. Thanks! extract_bit_field would be fine. 
> >> +{ >> + scalar_int_mode imode; >> + if (!int_mode_for_mode (tmode).exists (&imode)) >> + return NULL_RTX; >> + >> + machine_mode mode = GET_MODE (reg); >> + bool reverse = false; >> + rtx bfld = extract_bit_field (reg, size, pos, unsignedp, NULL_RTX, mode, >> + imode, reverse, NULL); >> + mode = GET_MODE (bfld); >> + if (mode != imode) >> + bfld = gen_lowpart (imode, bfld); >> + rtx result = gen_reg_rtx (imode); >> + emit_move_insn (result, bfld); >> + >> + if (tmode != imode) >> + result = gen_lowpart (tmode, result); >> + >> + return result; >> +} >> + >> +bool >> +expand_sra::scalarize_access (access_p acc, rtx regs) >> +{ >> + machine_mode expr_mode = TYPE_MODE (TREE_TYPE (acc->expr)); >> + >> + /* mode of mult registers. */ >> + if (expr_mode != BLKmode >> + && known_gt (acc->size, GET_MODE_BITSIZE (word_mode))) >> + return false; >> + > > I'm confused now - you are rewriting RTL? In this patch, yes, the computed rtx(s) are generated and emitted. e.g. %r123=%r3; %r124=%r4#0 "scalarize_access" is called at the time when parameters are set up, and incoming registers are confirmed. Then, "scalarize_accesses" checks the aggregate parameters, and computes scalarized rtx according to the (base, offset, size) of the expression for each access. As you mentioned below, another idea is: here only confirm if accesses can be scalarized or not and set DECL_RTL for the parameter. > >> + /* Compute the position of the access in the whole parallel rtx. */ >> + int start_index = -1; >> + int end_index = -1; >> + HOST_WIDE_INT left_bits = 0; >> + HOST_WIDE_INT right_bits = 0; >> + int cur_index = XEXP (XVECEXP (regs, 0, 0), 0) ? 
0 : 1; >> + for (; cur_index < XVECLEN (regs, 0); cur_index++) >> + { >> + rtx slot = XVECEXP (regs, 0, cur_index); >> + HOST_WIDE_INT off = UINTVAL (XEXP (slot, 1)) * BITS_PER_UNIT; >> + machine_mode mode = GET_MODE (XEXP (slot, 0)); >> + HOST_WIDE_INT size = GET_MODE_BITSIZE (mode).to_constant (); >> + if (off <= acc->offset && off + size > acc->offset) >> + { >> + start_index = cur_index; >> + left_bits = acc->offset - off; >> + } >> + if (off + size >= acc->offset + acc->size) >> + { >> + end_index = cur_index; >> + right_bits = off + size - (acc->offset + acc->size); >> + break; >> + } >> + } >> + /* Invalid access possition: padding or outof bound. */ >> + if (start_index < 0 || end_index < 0) >> + return false; >> + >> + /* Need multi-registers in a parallel for the access. */ >> + if (expr_mode == BLKmode || end_index > start_index) >> + { >> + if (left_bits || right_bits) >> + return false; >> + >> + int num_words = end_index - start_index + 1; >> + rtx *tmps = XALLOCAVEC (rtx, num_words); >> + >> + int pos = 0; >> + HOST_WIDE_INT start; >> + start = UINTVAL (XEXP (XVECEXP (regs, 0, start_index), 1)); >> + /* Extract whole registers. */ >> + for (; pos < num_words; pos++) >> + { >> + int index = start_index + pos; >> + rtx reg = extract_one_reg (regs, index); >> + machine_mode mode = GET_MODE (reg); >> + HOST_WIDE_INT off; >> + off = UINTVAL (XEXP (XVECEXP (regs, 0, index), 1)) - start; >> + tmps[pos] = gen_rtx_EXPR_LIST (mode, reg, GEN_INT (off)); >> + } >> + >> + rtx reg = gen_rtx_PARALLEL (expr_mode, gen_rtvec_v (pos, tmps)); >> + acc->rtx_val = reg; >> + return true; >> + } >> + >> + /* Just need one reg for the access. */ >> + if (end_index == start_index && left_bits == 0 && right_bits == 0) >> + { >> + rtx reg = extract_one_reg (regs, start_index); >> + if (GET_MODE (reg) != expr_mode) >> + reg = gen_lowpart (expr_mode, reg); >> + >> + acc->rtx_val = reg; >> + return true; >> + } >> + >> + /* Need to extract part reg for the access. 
*/ >> + if (!acc->write && end_index == start_index) >> + { >> + rtx reg = XEXP (XVECEXP (regs, 0, start_index), 0); >> + bool sgn = TYPE_UNSIGNED (TREE_TYPE (acc->expr)); >> + acc->rtx_val >> + = extract_bitfield (reg, acc->size, left_bits, expr_mode, sgn); >> + if (acc->rtx_val) >> + return true; >> + } >> + >> + return false; >> +} >> + >> +bool >> +expand_sra::scalarize_accesses (tree base, rtx regs) >> +{ >> + if (!base_access_vec) >> + return false; >> + vec<access_p> *access_vec = base_access_vec->get (base); >> + if (!access_vec) >> + return false; >> + bool is_parm = TREE_CODE (base) == PARM_DECL; >> + if (!valid_scalariable_accesses (access_vec, is_parm)) >> + return false; >> + >> + /* Go through each access, compute corresponding rtx(regs or subregs) >> + for the expression. */ >> + int n = access_vec->length (); >> + int cur_access_index = 0; >> + for (; cur_access_index < n; cur_access_index++) >> + if (!scalarize_access ((*access_vec)[cur_access_index], regs)) >> + break; >> + >> + /* Avoid un-scalarized accesses. */ >> + if (cur_access_index != n) >> + { >> + base_access_vec->remove (base); >> + return false; >> + } >> + >> + /* Bind/map expr(tree) to scalarized rtx. */ >> + for (int j = 0; j < n; j++) >> + { >> + access_p access = (*access_vec)[j]; >> + expr_rtx_vec->put (access->expr, access->rtx_val); > > why do we compute RTL for each access? We only want to change > the representation of the decl - how RTL expansion "scalarizes" > an aggregate. It does this by assigning it a mode which we want > to change. Changing the representation of the decl (e.g. set DECL_RTL as parallel with registers) could handle most cases. There may be one issue with partial scalarization. (which is not supported by this patch either.) e.g. "struct A { int i,j; long d;}" and insns: "_3=arg.i+arg.j; foo (&arg.d);" arg.i and arg.j may be accessed through register directly, but arg.d may need to be handled in memory. 
To support this kind of case, we may need a way to tell which member of 'arg' can (or can not) be accessed through the registers. This is the reason why I'm trying to bind an RTL to a TREE expression(not only DECL TREE): If a TREE expr has the registers RTL then uses it, otherwise, the expr would be in the stack. > > If we want to really dis-aggregate the aggregate in the IL then > I believe we should do this on the GIMPLE level - which means > having a way to tie the "default definitions" to the incoming RTL. > Which I think we have. Yes, most aggregates have been scalarized(dis-aggregate) by tree-sra/ipa-sra on the GIMPLE level. In expander, there are some special cases that need to be handled: - aggregate parameters/returns passing by registers, but combined as INT mode: e.g. "struct A{float a; float b}" may has DImode. - aggregate parameters/returns passing by registers, but with BLKmode and stored to stack even if it is safe to use dis-aggregate. > > foo (struct X x) > { > <bb2>: > x_a_1 = x.a; > x_b_2 = x.b; > x_c_3 = x.c; > ... > > } > > Another possibility would be to have the parameters be SSA > names when the parameter is not (fully) passed in memory > and have the stack materialization explicit as aggregate > copy. > > I realize the complication is the way we transfer the incoming > hard registers to pseudos - and I think we simply need to > extend what DECL_RTL a PARM_DECL can have for the case we are > after - allow sth like (vec:TI [(reg:SI ..) (reg:SI ..) ... ]) > or so and have access expansion handle that in addition to > the already supported (concat:..). There's some support for > (parallel ...), at least for returns, but it might be too > sparse. Yes, it is complicated to handle various expressions (e.g. extract/store_bit_fields) on "vec:TI, concat:, parallel:". > > What I'm missing is (or I didn't find it) is code creating > accesses for the actual incoming / outgoing hardregister parts. This part is handled. 
It is where "set_scalar_rtx_for_aggregate_access" is called. e.g. in "assign_parm_setup_block". But, this patch directly uses the hard registers from incoming/outgoing which is computed by existing code, e.g. ---------- set_decl_incoming_rtl (parm, data.entry_parm, false); + rtx incoming = DECL_INCOMING_RTL (parm); + if (GET_CODE (incoming) == PARALLEL) + set_scalar_rtx_for_aggregate_access (parm, incoming); > Where's that taken into account? I also don't see any handling > of overlapping accesses - that is, when the choice for the This patch does not check overlapping access. One reason: this patch supports "reading" from parameter only. So it is safe to use the registers/pseudo for reading expression, even there are overlapping accesses. (Yes, for some cases, it is also safe to use reg for writing. But, this patch need to be enhanced carefully.) > pieces isn't obvious? In fact I can't see _any_ code that > tries to compute the set of scalar replacements? Consider > > struct X { int a; int b; }; > > long foo (struct X x) > { > long y; > memcpy (&y, &x, sizeof (long)); > return y + x.a + x.b; > } > > you'll get a 8 byte read and two 4 byte reads. What's going > to be your choice for decomposing 'X'? On x86 it will have > DImode, passed in %rdi so decomposing it will not improve > code generation. Since there is a call that takes the address of the parameter, So, the current patch uses the stack for the parameter but does not use the incoming registers. > > struct X { int a; int b; int c; int d; }; > > long foo (struct X x) > { > long y; > memcpy (&y, &x, sizeof (long)); > return y + x.c + x.d; > } > > on x86 we have TImode for X, but it's passed in two DImode > registers. For the first half you have an 8 byte access, > for the second half you have two 4 byte accesses. I'd > expect to decompose this to two DImode halves, based on > the incoming register layout and roughly matching > accesses. Assume there is no addr-taken, memcpy is sth like "y=x" (via union). 
Then, there are three accesses for "x": (offset=0, size=8), (offset=8,size=4), (12,4). Then three rtx is generated: scalar1=%rx:DI, scalar2=%ry:DI#0, scalar3=(%ry>>32)#0 > > struct X { int a; int b; int c; int d; }; > > long foo (struct X x) > { > long y; > memcpy (&y, &x.b, sizeof (long)); > return y + x.a + x.d; > } > > two DImode halves for the pseudos doens't make much sense > in this case - we have 'y' overlapping both. RTL opts > recover nicely in all the above cases when using TImode > so it's going to be important to not be overly aggressive. > > That IMHO also means when we need to do more fancy things > like rewriting accesses (thus when RTL expansion cannot > cope with our idea of how the base object expands) we should > think of dealing with this on the GIMPLE level. Thanks, I try to understand this. :) > > For power you seem to have the situation of BLKmode aggregates > that are passed in registers? I don't think we have this > situation on x86 for example. The mode of an aggregate may be DI/TI, and also BLK: If the size of the aggregate is not greater than the biggest INT, then it would use INT mode, otherwise, BLK would be used. If I remember correctly, x86 may also be similar. > > That said - I'm not sure I reverse engineered what you do > correctly. A more thorough description with the problem > you are solving plus how you attack it from a high level > perspective would have been useful. Thanks for pointing out this! And thanks so much for your time on this review! BR, Jeff (Jiufu Guo) > > Thanks, > Richard. > >> + } >> + >> + return true; >> +} >> + >> +static expand_sra *current_sra = NULL; >> + >> +/* Check If there is an sra access for the expr. >> + Return the correspond scalar sym for the access. */ >> + >> +rtx >> +get_scalar_rtx_for_aggregate_expr (tree expr) >> +{ >> + return current_sra ? current_sra->get_scalarized_rtx (expr) : NULL_RTX; >> +} >> + >> +/* Compute/Set RTX registers for those accesses on BASE. 
*/ >> + >> +void >> +set_scalar_rtx_for_aggregate_access (tree base, rtx regs) >> +{ >> + if (!current_sra) >> + return; >> + current_sra->scalarize_accesses (base, regs); >> +} >> + >> +void >> +set_scalar_rtx_for_returns () >> +{ >> + if (!current_sra) >> + return; >> + >> + tree res = DECL_RESULT (current_function_decl); >> + gcc_assert (res); >> + edge_iterator ei; >> + edge e; >> + FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds) >> + if (greturn *r = safe_dyn_cast<greturn *> (*gsi_last_bb (e->src))) >> + { >> + tree val = gimple_return_retval (r); >> + if (val && VAR_P (val)) >> + current_sra->scalarize_accesses (val, DECL_RTL (res)); >> + } >> +} >> + >> /* Return an expression tree corresponding to the RHS of GIMPLE >> statement STMT. */ >> >> @@ -3778,7 +4241,8 @@ expand_return (tree retval) >> >> /* If we are returning the RESULT_DECL, then the value has already >> been stored into it, so we don't have to do anything special. */ >> - if (TREE_CODE (retval_rhs) == RESULT_DECL) >> + if (TREE_CODE (retval_rhs) == RESULT_DECL >> + || get_scalar_rtx_for_aggregate_expr (retval_rhs)) >> expand_value_return (result_rtl); >> >> /* If the result is an aggregate that is being returned in one (or more) >> @@ -4422,6 +4886,9 @@ expand_debug_expr (tree exp) >> int unsignedp = TYPE_UNSIGNED (TREE_TYPE (exp)); >> addr_space_t as; >> scalar_int_mode op0_mode, op1_mode, addr_mode; >> + rtx x = get_scalar_rtx_for_aggregate_expr (exp); >> + if (x) >> + return NULL_RTX;/* optimized out. */ >> >> switch (TREE_CODE_CLASS (TREE_CODE (exp))) >> { >> @@ -6624,6 +7091,9 @@ pass_expand::execute (function *fun) >> auto_bitmap forced_stack_vars; >> discover_nonconstant_array_refs (forced_stack_vars); >> >> + current_sra = new expand_sra; >> + scan_function (cfun, *current_sra); >> + >> /* Make sure all values used by the optimization passes have sane >> defaults. 
*/ >> reg_renumber = 0; >> @@ -7052,6 +7522,8 @@ pass_expand::execute (function *fun) >> loop_optimizer_finalize (); >> } >> >> + delete current_sra; >> + current_sra = NULL; >> timevar_pop (TV_POST_EXPAND); >> >> return 0; >> diff --git a/gcc/expr.cc b/gcc/expr.cc >> index >> 174f8acb269ab5450fc799516471d5a2bd9b9efa..57b037040d6d8e8c98b2befcb556221c0c5604c4 >> 100644 >> --- a/gcc/expr.cc >> +++ b/gcc/expr.cc >> @@ -100,6 +100,7 @@ static void do_tablejump (rtx, machine_mode, rtx, rtx, >> rtx, >> static rtx const_vector_from_tree (tree); >> static tree tree_expr_size (const_tree); >> static void convert_mode_scalar (rtx, rtx, int); >> +rtx get_scalar_rtx_for_aggregate_expr (tree); >> >> >> /* This is run to set up which modes can be used >> @@ -5618,11 +5619,12 @@ expand_assignment (tree to, tree from, bool >> nontemporal) >> Assignment of an array element at a constant index, and assignment of >> an array element in an unaligned packed structure field, has the same >> problem. Same for (partially) storing into a non-memory object. */ >> - if (handled_component_p (to) >> - || (TREE_CODE (to) == MEM_REF >> - && (REF_REVERSE_STORAGE_ORDER (to) >> - || mem_ref_refers_to_non_mem_p (to))) >> - || TREE_CODE (TREE_TYPE (to)) == ARRAY_TYPE) >> + if (!get_scalar_rtx_for_aggregate_expr (to) >> + && (handled_component_p (to) >> + || (TREE_CODE (to) == MEM_REF >> + && (REF_REVERSE_STORAGE_ORDER (to) >> + || mem_ref_refers_to_non_mem_p (to))) >> + || TREE_CODE (TREE_TYPE (to)) == ARRAY_TYPE)) >> { >> machine_mode mode1; >> poly_int64 bitsize, bitpos; >> @@ -8912,6 +8914,20 @@ expand_constructor (tree exp, rtx target, enum >> expand_modifier modifier, >> && ! mostly_zeros_p (exp)) >> return NULL_RTX; >> >> + if (target && GET_CODE (target) == PARALLEL && all_zeros_p (exp)) >> + { >> + int length = XVECLEN (target, 0); >> + int start = XEXP (XVECEXP (target, 0, 0), 0) ? 
0 : 1; >> + for (int i = start; i < length; i++) >> + { >> + rtx dst = XEXP (XVECEXP (target, 0, i), 0); >> + rtx zero = CONST0_RTX (GET_MODE (dst)); >> + gcc_assert (zero); >> + emit_move_insn (dst, zero); >> + } >> + return target; >> + } >> + >> /* Handle calls that pass values in multiple non-contiguous >> locations. The Irix 6 ABI has examples of this. */ >> if (target == 0 || ! safe_from_p (target, exp, 1) >> @@ -9006,6 +9022,9 @@ expand_expr_real (tree exp, rtx target, machine_mode >> tmode, >> ret = CONST0_RTX (tmode); >> return ret ? ret : const0_rtx; >> } >> + rtx x = get_scalar_rtx_for_aggregate_expr (exp); >> + if (x) >> + return x; >> >> ret = expand_expr_real_1 (exp, target, tmode, modifier, alt_rtl, >> inner_reference_p); >> diff --git a/gcc/function.cc b/gcc/function.cc >> index >> dd2c1136e0725f55673f28e0eeaf4c91ad18e93f..7fe927bd36beac11466ca9fca12800892b57f0be >> 100644 >> --- a/gcc/function.cc >> +++ b/gcc/function.cc >> @@ -2740,6 +2740,9 @@ assign_parm_find_stack_rtl (tree parm, struct >> assign_parm_data_one *data) >> data->stack_parm = stack_parm; >> } >> >> +extern void set_scalar_rtx_for_aggregate_access (tree, rtx); >> +extern void set_scalar_rtx_for_returns (); >> + >> /* A subroutine of assign_parms. Adjust DATA->ENTRY_RTL such that it's >> always valid and contiguous. 
*/ >> >> @@ -3115,8 +3118,24 @@ assign_parm_setup_block (struct assign_parm_data_all >> *all, >> emit_move_insn (mem, entry_parm); >> } >> else >> - move_block_from_reg (REGNO (entry_parm), mem, >> - size_stored / UNITS_PER_WORD); >> + { >> + int regno = REGNO (entry_parm); >> + int nregs = size_stored / UNITS_PER_WORD; >> + move_block_from_reg (regno, mem, nregs); >> + >> + rtx *tmps = XALLOCAVEC (rtx, nregs); >> + machine_mode mode = word_mode; >> + HOST_WIDE_INT word_size = GET_MODE_SIZE (mode).to_constant (); >> + for (int i = 0; i < nregs; i++) >> + { >> + rtx reg = gen_rtx_REG (mode, regno + i); >> + rtx off = GEN_INT (word_size * i); >> + tmps[i] = gen_rtx_EXPR_LIST (VOIDmode, reg, off); >> + } >> + >> + rtx regs = gen_rtx_PARALLEL (BLKmode, gen_rtvec_v (nregs, tmps)); >> + set_scalar_rtx_for_aggregate_access (parm, regs); >> + } >> } >> else if (data->stack_parm == 0 && !TYPE_EMPTY_P (data->arg.type)) >> { >> @@ -3716,6 +3735,10 @@ assign_parms (tree fndecl) >> else >> set_decl_incoming_rtl (parm, data.entry_parm, false); >> >> + rtx incoming = DECL_INCOMING_RTL (parm); >> + if (GET_CODE (incoming) == PARALLEL) >> + set_scalar_rtx_for_aggregate_access (parm, incoming); >> + >> assign_parm_adjust_stack_rtl (&data); >> >> if (assign_parm_setup_block_p (&data)) >> @@ -5136,6 +5159,7 @@ expand_function_start (tree subr) >> { >> gcc_assert (GET_CODE (hard_reg) == PARALLEL); >> set_parm_rtl (res, gen_group_rtx (hard_reg)); >> + set_scalar_rtx_for_returns (); >> } >> } >> >> diff --git a/gcc/tree-sra.h b/gcc/tree-sra.h >> index >> f20266c46226f7840299a768cb575f6f92b54207..bd0396e672b30f7ef66c305d8d131e91639039d7 >> 100644 >> --- a/gcc/tree-sra.h >> +++ b/gcc/tree-sra.h >> @@ -19,6 +19,83 @@ You should have received a copy of the GNU General Public >> License >> along with GCC; see the file COPYING3. If not see >> <http://www.gnu.org/licenses/>. 
*/ >> >> +struct base_access >> +{ >> + /* Values returned by get_ref_base_and_extent, indicates the >> + OFFSET, SIZE and BASE of the access. */ >> + HOST_WIDE_INT offset; >> + HOST_WIDE_INT size; >> + tree base; >> + >> + /* The context expression of this access. */ >> + tree expr; >> + >> + /* Indicates this is a write access. */ >> + bool write : 1; >> + >> + /* Indicates if this access is made in reverse storage order. */ >> + bool reverse : 1; >> +}; >> + >> +/* Default template for sra_scan_function. */ >> + >> +struct default_analyzer >> +{ >> + /* Template analyze functions. */ >> + void analyze_phi (gphi *){}; >> + void pre_analyze_stmt (gimple *){}; >> + void analyze_return (greturn *){}; >> + void analyze_assign (gassign *){}; >> + void analyze_call (gcall *){}; >> + void analyze_asm (gasm *){}; >> + void analyze_default_stmt (gimple *){}; >> +}; >> + >> +/* Scan function and look for interesting expressions. */ >> + >> +template <typename analyzer> >> +void >> +scan_function (struct function *fun, analyzer &a) >> +{ >> + basic_block bb; >> + FOR_EACH_BB_FN (bb, fun) >> + { >> + for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi); >> + gsi_next (&gsi)) >> + a.analyze_phi (gsi.phi ()); >> + >> + for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi); >> + gsi_next (&gsi)) >> + { >> + gimple *stmt = gsi_stmt (gsi); >> + a.pre_analyze_stmt (stmt); >> + >> + switch (gimple_code (stmt)) >> + { >> + case GIMPLE_RETURN: >> + a.analyze_return (as_a<greturn *> (stmt)); >> + break; >> + >> + case GIMPLE_ASSIGN: >> + a.analyze_assign (as_a<gassign *> (stmt)); >> + break; >> + >> + case GIMPLE_CALL: >> + a.analyze_call (as_a<gcall *> (stmt)); >> + break; >> + >> + case GIMPLE_ASM: >> + a.analyze_asm (as_a<gasm *> (stmt)); >> + break; >> + >> + default: >> + a.analyze_default_stmt (stmt); >> + break; >> + } >> + } >> + } >> +} >> + >> bool type_internals_preclude_sra_p (tree type, const char **msg); >> >> /* Return true iff TYPE is stdarg 
va_list type (which early SRA and IPA-SRA >> diff --git a/gcc/testsuite/g++.target/powerpc/pr102024.C >> b/gcc/testsuite/g++.target/powerpc/pr102024.C >> index >> 769585052b507ad971868795f861106230c976e3..c8995cae707bb6e2e849275b823d2ba14d24a966 >> 100644 >> --- a/gcc/testsuite/g++.target/powerpc/pr102024.C >> +++ b/gcc/testsuite/g++.target/powerpc/pr102024.C >> @@ -5,7 +5,7 @@ >> // Test that a zero-width bit field in an otherwise homogeneous aggregate >> // generates a psabi warning and passes arguments in GPRs. >> >> -// { dg-final { scan-assembler-times {\mstd\M} 4 } } >> +// { dg-final { scan-assembler-times {\mmtvsrd\M} 4 } } >> >> struct a_thing >> { >> diff --git a/gcc/testsuite/gcc.target/powerpc/pr108073.c >> b/gcc/testsuite/gcc.target/powerpc/pr108073.c >> new file mode 100644 >> index >> 0000000000000000000000000000000000000000..7dd1a4a326a181e0f35c9418af20a9bebabdfe4b >> --- /dev/null >> +++ b/gcc/testsuite/gcc.target/powerpc/pr108073.c >> @@ -0,0 +1,29 @@ >> +/* { dg-do run } */ >> +/* { dg-options "-O2 -save-temps" } */ >> + >> +typedef struct DF {double a[4]; short s1; short s2; short s3; short s4; } >> DF; >> +typedef struct SF {float a[4]; int i1; int i2; } SF; >> + >> +/* { dg-final { scan-assembler-times {\mmtvsrd\M} 3 {target { >> has_arch_ppc64 && has_arch_pwr8 } } } } */ >> +/* { dg-final { scan-assembler-not {\mlwz\M} {target { has_arch_ppc64 && >> has_arch_pwr8 } } } } */ >> +/* { dg-final { scan-assembler-not {\mlhz\M} {target { has_arch_ppc64 && >> has_arch_pwr8 } } } } */ >> +short __attribute__ ((noipa)) foo_hi (DF a, int flag){if (flag == 2)return >> a.s2+a.s3;return 0;} >> +int __attribute__ ((noipa)) foo_si (SF a, int flag){if (flag == 2)return >> a.i2+a.i1;return 0;} >> +double __attribute__ ((noipa)) foo_df (DF arg, int flag){if (flag == >> 2)return arg.a[3];else return 0.0;} >> +float __attribute__ ((noipa)) foo_sf (SF arg, int flag){if (flag == >> 2)return arg.a[2]; return 0;} >> +float __attribute__ ((noipa)) foo_sf1 (SF arg, 
int flag){if (flag == >> 2)return arg.a[1];return 0;} >> + >> +DF gdf = {{1.0,2.0,3.0,4.0}, 1, 2, 3, 4}; >> +SF gsf = {{1.0f,2.0f,3.0f,4.0f}, 1, 2}; >> + >> +int main() >> +{ >> + if (!(foo_hi (gdf, 2) == 5 && foo_si (gsf, 2) == 3 && foo_df (gdf, 2) == >> 4.0 >> + && foo_sf (gsf, 2) == 3.0 && foo_sf1 (gsf, 2) == 2.0)) >> + __builtin_abort (); >> + if (!(foo_hi (gdf, 1) == 0 && foo_si (gsf, 1) == 0 && foo_df (gdf, 1) == 0 >> + && foo_sf (gsf, 1) == 0 && foo_sf1 (gsf, 1) == 0)) >> + __builtin_abort (); >> + return 0; >> +} >> + >> diff --git a/gcc/testsuite/gcc.target/powerpc/pr65421-1.c >> b/gcc/testsuite/gcc.target/powerpc/pr65421-1.c >> new file mode 100644 >> index >> 0000000000000000000000000000000000000000..4e1f87f7939cbf1423772023ee392fc5200b6708 >> --- /dev/null >> +++ b/gcc/testsuite/gcc.target/powerpc/pr65421-1.c >> @@ -0,0 +1,6 @@ >> +/* PR target/65421 */ >> +/* { dg-options "-O2" } */ >> + >> +typedef struct LARGE {double a[4]; int arr[32];} LARGE; >> +LARGE foo (LARGE a){return a;} >> +/* { dg-final { scan-assembler-times {\mmemcpy\M} 1 } } */ >> diff --git a/gcc/testsuite/gcc.target/powerpc/pr65421-2.c >> b/gcc/testsuite/gcc.target/powerpc/pr65421-2.c >> new file mode 100644 >> index >> 0000000000000000000000000000000000000000..8a8e1a0e9962317ba2c0942af8891b3c51f4d3a4 >> --- /dev/null >> +++ b/gcc/testsuite/gcc.target/powerpc/pr65421-2.c >> @@ -0,0 +1,32 @@ >> +/* PR target/65421 */ >> +/* { dg-options "-O2" } */ >> +/* { dg-require-effective-target powerpc_elfv2 } */ >> +/* { dg-require-effective-target has_arch_ppc64 } */ >> + >> +typedef struct FLOATS >> +{ >> + double a[3]; >> +} FLOATS; >> + >> +/* 3 lfd after returns also optimized */ >> +/* FLOATS ret_arg_pt (FLOATS *a){return *a;} */ >> + >> +/* 3 stfd */ >> +void st_arg (FLOATS a, FLOATS *p) {*p = a;} >> +/* { dg-final { scan-assembler-times {\mstfd\M} 3 } } */ >> + >> +/* blr */ >> +FLOATS ret_arg (FLOATS a) {return a;} >> + >> +typedef struct MIX >> +{ >> + double a[2]; >> + long l; >> +} 
MIX; >> + >> +/* std 3 param regs to return slot */ >> +MIX ret_arg1 (MIX a) {return a;} >> +/* { dg-final { scan-assembler-times {\mstd\M} 3 } } */ >> + >> +/* count insns */ >> +/* { dg-final { scan-assembler-times {(?n)^\s+[a-z]} 9 } } */ >>